2005-06-24 22:50:50 -04:00
|
|
|
#
|
|
|
|
# kconv.rb - Kanji Converter.
|
|
|
|
#
|
|
|
|
# $Id$
|
|
|
|
#
|
2006-06-19 10:40:23 -04:00
|
|
|
# ----
|
|
|
|
#
|
|
|
|
# kconv.rb implements the Kconv module for Kanji conversion. Additionally,
|
|
|
|
# some methods are added to the String class to allow easy conversion.
|
|
|
|
#
|
2005-06-24 22:50:50 -04:00
|
|
|
|
1999-08-13 01:37:52 -04:00
|
|
|
require 'nkf'
|
|
|
|
|
2006-06-19 10:40:23 -04:00
|
|
|
#
|
|
|
|
# Kanji Converter for Ruby.
|
|
|
|
#
|
1999-08-13 01:37:52 -04:00
|
|
|
module Kconv
  #
  # Public Constants
  #

  # Constants of Encoding (all re-exported from NKF)

  # Auto-Detect
  AUTO = NKF::AUTO
  # ISO-2022-JP
  JIS = NKF::JIS
  # EUC-JP
  EUC = NKF::EUC
  # Shift_JIS
  SJIS = NKF::SJIS
  # BINARY
  BINARY = NKF::BINARY
  # NOCONV
  NOCONV = NKF::NOCONV
  # ASCII
  ASCII = NKF::ASCII
  # UTF-8
  UTF8 = NKF::UTF8
  # UTF-16
  UTF16 = NKF::UTF16
  # UTF-32
  UTF32 = NKF::UTF32
  # UNKNOWN
  UNKNOWN = NKF::UNKNOWN

  #
  # Private Constants
  #

  # Revision of kconv.rb
  REVISION = %q$Revision$

  # Regexps of Encoding
  #
  # All three patterns are byte-oriented (/n), ignore embedded whitespace (/x)
  # and are anchored with \A ... \z, so they only match when the *whole*
  # string is a sequence of well-formed characters.

  # Regexp of Shift_JIS string (private constant)
  #
  # Single bytes (ASCII and halfwidth katakana) or a lead byte followed by
  # a trail byte.
  RegexpShiftjis = /\A(?:
    [\x00-\x7f\xa1-\xdf] |
    [\x81-\x9f\xe0-\xfc][\x40-\x7e\x80-\xfc]
  )*\z/nx

  # Regexp of EUC-JP string (private constant)
  #
  # ASCII, 0x8e-prefixed two-byte sequences, 0x8f-prefixed three-byte
  # sequences, or plain two-byte sequences.
  RegexpEucjp = /\A(?:
    [\x00-\x7f]                  |
    \x8e [\xa1-\xdf]             |
    \x8f [\xa1-\xfe] [\xa1-\xfe] |
    [\xa1-\xfe] [\xa1-\xfe]
  )*\z/nx

  # Regexp of UTF-8 string (private constant)
  #
  # The lead byte 0xed is handled separately from the other three-byte
  # lead bytes: its second byte is limited to 0x80-0x9f so that the
  # encoded forms of U+D800..U+DFFF (UTF-16 surrogates), which are not
  # valid UTF-8, are rejected.
  RegexpUtf8 = /\A(?:
    [\x00-\x7f]                                     |
    [\xc2-\xdf] [\x80-\xbf]                         |
    \xe0 [\xa0-\xbf] [\x80-\xbf]                    |
    [\xe1-\xec\xee\xef] [\x80-\xbf] [\x80-\xbf]     |
    \xed [\x80-\x9f] [\x80-\xbf]                    |
    \xf0 [\x90-\xbf] [\x80-\xbf] [\x80-\xbf]        |
    [\xf1-\xf3] [\x80-\xbf] [\x80-\xbf] [\x80-\xbf] |
    \xf4 [\x80-\x8f] [\x80-\xbf] [\x80-\xbf]
  )*\z/nx

  #
  # Public Methods
  #

  # call-seq:
  #    Kconv.kconv(str, out_code, in_code = Kconv::AUTO)
  #
  # Convert <code>str</code> to out_code.
  # <code>out_code</code> and <code>in_code</code> are given as constants of Kconv.
  #
  # If <code>out_code</code> is Kconv::NOCONV, <code>str</code> itself is
  # returned without calling NKF. Codes that are not listed below add no
  # flag; when neither code adds a flag, NKF.nkf is called with an empty
  # option string.
  #
  # *Note*
  # This method decodes MIME encoded strings and
  # converts halfwidth katakana to fullwidth katakana.
  # If you don't want to decode them, use NKF.nkf.
  def kconv(str, out_code, in_code = AUTO)
    # Input-encoding flag (upper case); stays nil for AUTO or unlisted codes.
    in_flag = nil
    case in_code
    when ::NKF::JIS   then in_flag = 'J'
    when ::NKF::EUC   then in_flag = 'E'
    when ::NKF::SJIS  then in_flag = 'S'
    when ::NKF::UTF8  then in_flag = 'W'
    when ::NKF::UTF16 then in_flag = 'W16'
    when ::NKF::UTF32 then in_flag = 'W32'
    end

    # Output-encoding flag (lower case); stays nil for unlisted codes.
    out_flag = nil
    case out_code
    when ::NKF::JIS   then out_flag = 'j'
    when ::NKF::EUC   then out_flag = 'e'
    when ::NKF::SJIS  then out_flag = 's'
    when ::NKF::UTF8  then out_flag = 'w'
    when ::NKF::UTF16 then out_flag = 'w16'
    when ::NKF::UTF32 then out_flag = 'w32'
    when ::NKF::NOCONV
      return str
    end

    # nil interpolates as an empty string, so a bare '-' means "no flags".
    opt = "-#{in_flag}#{out_flag}"
    opt = '' if opt == '-'

    ::NKF::nkf(opt, str)
  end
  module_function :kconv

  #
  # Encode to
  #

  # call-seq:
  #    Kconv.tojis(str)   -> string
  #
  # Convert <code>str</code> to ISO-2022-JP
  #
  # *Note*
  # This method decodes MIME encoded strings and
  # converts halfwidth katakana to fullwidth katakana.
  # If you don't want it, use NKF.nkf('-jxm0', str).
  def tojis(str)
    ::NKF::nkf('-jm', str)
  end
  module_function :tojis

  # call-seq:
  #    Kconv.toeuc(str)   -> string
  #
  # Convert <code>str</code> to EUC-JP
  #
  # *Note*
  # This method decodes MIME encoded strings and
  # converts halfwidth katakana to fullwidth katakana.
  # If you don't want it, use NKF.nkf('-exm0', str).
  def toeuc(str)
    ::NKF::nkf('-em', str)
  end
  module_function :toeuc

  # call-seq:
  #    Kconv.tosjis(str)   -> string
  #
  # Convert <code>str</code> to Shift_JIS
  #
  # *Note*
  # This method decodes MIME encoded strings and
  # converts halfwidth katakana to fullwidth katakana.
  # If you don't want it, use NKF.nkf('-sxm0', str).
  def tosjis(str)
    ::NKF::nkf('-sm', str)
  end
  module_function :tosjis

  # call-seq:
  #    Kconv.toutf8(str)   -> string
  #
  # Convert <code>str</code> to UTF-8
  #
  # *Note*
  # This method decodes MIME encoded strings and
  # converts halfwidth katakana to fullwidth katakana.
  # If you don't want it, use NKF.nkf('-wxm0', str).
  def toutf8(str)
    ::NKF::nkf('-wm', str)
  end
  module_function :toutf8

  # call-seq:
  #    Kconv.toutf16(str)   -> string
  #
  # Convert <code>str</code> to UTF-16
  #
  # *Note*
  # This method decodes MIME encoded strings and
  # converts halfwidth katakana to fullwidth katakana.
  # If you don't want it, use NKF.nkf('-w16xm0', str).
  def toutf16(str)
    ::NKF::nkf('-w16m', str)
  end
  module_function :toutf16

  # call-seq:
  #    Kconv.toutf32(str)   -> string
  #
  # Convert <code>str</code> to UTF-32
  #
  # *Note*
  # This method decodes MIME encoded strings and
  # converts halfwidth katakana to fullwidth katakana.
  # If you don't want it, use NKF.nkf('-w32xm0', str).
  def toutf32(str)
    ::NKF::nkf('-w32m', str)
  end
  module_function :toutf32

  #
  # guess
  #

  # call-seq:
  #    Kconv.guess(str)   -> integer
  #
  # Guess input encoding by NKF.guess
  def guess(str)
    ::NKF::guess(str)
  end
  module_function :guess

  # call-seq:
  #    Kconv.guess_old(str)   -> integer
  #
  # Guess input encoding by NKF.guess1
  def guess_old(str)
    ::NKF::guess1(str)
  end
  module_function :guess_old

  #
  # isEncoding
  #

  # call-seq:
  #    Kconv.iseuc(str)   -> obj or nil
  #
  # Returns whether input encoding is EUC-JP or not
  # (the whole of <code>str</code> must match RegexpEucjp).
  #
  # *Note* do not rely on the return value being a MatchData;
  # use it only as a true/false value.
  def iseuc(str)
    RegexpEucjp.match(str)
  end
  module_function :iseuc

  # call-seq:
  #    Kconv.issjis(str)   -> obj or nil
  #
  # Returns whether input encoding is Shift_JIS or not
  # (the whole of <code>str</code> must match RegexpShiftjis).
  #
  # *Note* do not rely on the return value being a MatchData;
  # use it only as a true/false value.
  def issjis(str)
    RegexpShiftjis.match(str)
  end
  module_function :issjis

  # call-seq:
  #    Kconv.isutf8(str)   -> obj or nil
  #
  # Returns whether input encoding is UTF-8 or not
  # (the whole of <code>str</code> must match RegexpUtf8, which rejects
  # encoded UTF-16 surrogates).
  #
  # *Note* do not rely on the return value being a MatchData;
  # use it only as a true/false value.
  def isutf8(str)
    RegexpUtf8.match(str)
  end
  module_function :isutf8
end
|
|
|
|
|
|
|
|
class String
  # call-seq:
  #    String#kconv(out_code, in_code = Kconv::AUTO)
  #
  # Converts the receiver to <code>out_code</code> by handing it to
  # Kconv.kconv. Both <code>out_code</code> and <code>in_code</code> are
  # expected to be Kconv constants.
  #
  # *Note*
  # This method decodes MIME encoded strings and
  # converts halfwidth katakana to fullwidth katakana.
  # If you don't want to decode them, use NKF.nkf.
  def kconv(out_code, in_code = Kconv::AUTO)
    Kconv.kconv(self, out_code, in_code)
  end

  #
  # to Encoding
  #

  # call-seq:
  #    String#tojis   -> string
  #
  # ISO-2022-JP version of the receiver (delegates to Kconv.tojis).
  #
  # *Note*
  # This method decodes MIME encoded strings and
  # converts halfwidth katakana to fullwidth katakana.
  # If you don't want it, use NKF.nkf('-jxm0', str).
  def tojis
    Kconv.tojis(self)
  end

  # call-seq:
  #    String#toeuc   -> string
  #
  # EUC-JP version of the receiver (delegates to Kconv.toeuc).
  #
  # *Note*
  # This method decodes MIME encoded strings and
  # converts halfwidth katakana to fullwidth katakana.
  # If you don't want it, use NKF.nkf('-exm0', str).
  def toeuc
    Kconv.toeuc(self)
  end

  # call-seq:
  #    String#tosjis   -> string
  #
  # Shift_JIS version of the receiver (delegates to Kconv.tosjis).
  #
  # *Note*
  # This method decodes MIME encoded strings and
  # converts halfwidth katakana to fullwidth katakana.
  # If you don't want it, use NKF.nkf('-sxm0', str).
  def tosjis
    Kconv.tosjis(self)
  end

  # call-seq:
  #    String#toutf8   -> string
  #
  # UTF-8 version of the receiver (delegates to Kconv.toutf8).
  #
  # *Note*
  # This method decodes MIME encoded strings and
  # converts halfwidth katakana to fullwidth katakana.
  # If you don't want it, use NKF.nkf('-wxm0', str).
  def toutf8
    Kconv.toutf8(self)
  end

  # call-seq:
  #    String#toutf16   -> string
  #
  # UTF-16 version of the receiver (delegates to Kconv.toutf16).
  #
  # *Note*
  # This method decodes MIME encoded strings and
  # converts halfwidth katakana to fullwidth katakana.
  # If you don't want it, use NKF.nkf('-w16xm0', str).
  def toutf16
    Kconv.toutf16(self)
  end

  # call-seq:
  #    String#toutf32   -> string
  #
  # UTF-32 version of the receiver (delegates to Kconv.toutf32).
  #
  # *Note*
  # This method decodes MIME encoded strings and
  # converts halfwidth katakana to fullwidth katakana.
  # If you don't want it, use NKF.nkf('-w32xm0', str).
  def toutf32
    Kconv.toutf32(self)
  end

  #
  # is Encoding
  #

  # call-seq:
  #    String#iseuc   -> obj or nil
  #
  # Tells whether the receiver looks like EUC-JP, as decided by Kconv.iseuc.
  #
  # *Note* do not rely on the return value being a MatchData.
  def iseuc
    Kconv.iseuc(self)
  end

  # call-seq:
  #    String#issjis   -> obj or nil
  #
  # Tells whether the receiver looks like Shift_JIS, as decided by Kconv.issjis.
  #
  # *Note* do not rely on the return value being a MatchData.
  def issjis
    Kconv.issjis(self)
  end

  # call-seq:
  #    String#isutf8   -> obj or nil
  #
  # Tells whether the receiver looks like UTF-8, as decided by Kconv.isutf8.
  #
  # *Note* do not rely on the return value being a MatchData.
  def isutf8
    Kconv.isutf8(self)
  end
end
|