#
# kconv.rb - Kanji Converter.
#
# $Id$
#
# ----
#
# kconv.rb implements the Kconv module for Kanji conversion.  Additionally,
# some methods are added to the String class to allow easy conversion.
#

require 'nkf'

#
# Kanji Converter for Ruby.
#
module Kconv
  #
  # Public Constants
  #

  # Encoding constants.  Each one simply re-exports the NKF constant of the
  # same name so callers can write Kconv::EUC instead of NKF::EUC.

  # Auto-Detect
  AUTO = NKF::AUTO
  # ISO-2022-JP
  JIS = NKF::JIS
  # EUC-JP
  EUC = NKF::EUC
  # Shift_JIS
  SJIS = NKF::SJIS
  # BINARY
  BINARY = NKF::BINARY
  # NOCONV
  NOCONV = NKF::NOCONV
  # ASCII
  ASCII = NKF::ASCII
  # UTF-8
  UTF8 = NKF::UTF8
  # UTF-16
  UTF16 = NKF::UTF16
  # UTF-32
  UTF32 = NKF::UTF32
  # UNKNOWN
  UNKNOWN = NKF::UNKNOWN

  #
  # Public Methods
  #

  # Every method defined below is exposed both as Kconv.<name> and as a
  # private instance method of anything that includes Kconv.
  module_function

  # call-seq:
  #    Kconv.kconv(str, to_enc, from_enc=nil)
  #
  # Convert <code>str</code> to <code>to_enc</code> by calling NKF.nkf.
  # <code>to_enc</code> and <code>from_enc</code> are given as constants of
  # Kconv.  When <code>from_enc</code> is nil or false no input-encoding
  # option is passed to NKF; likewise no output-encoding option is passed
  # when <code>to_enc</code> is nil or false.
  def kconv(str, to_enc, from_enc=nil)
    # The option string keeps the leading space before each flag, so the
    # result is e.g. " --ic=EUC-JP --oc=UTF-8" (or "" when both are absent).
    input_flag  = from_enc ? " --ic=#{from_enc}" : ''
    output_flag = to_enc ? " --oc=#{to_enc}" : ''

    ::NKF.nkf(input_flag + output_flag, str)
  end

  #
  # Encode to
  #

  # call-seq:
  #    Kconv.tojis(str)   => string
  #
  # Convert <code>str</code> to ISO-2022-JP (Kconv::JIS).
  def tojis(str)
    kconv(str, JIS)
  end

  # call-seq:
  #    Kconv.toeuc(str)   => string
  #
  # Convert <code>str</code> to EUC-JP (Kconv::EUC).
  def toeuc(str)
    kconv(str, EUC)
  end

  # call-seq:
  #    Kconv.tosjis(str)   => string
  #
  # Convert <code>str</code> to Shift_JIS (Kconv::SJIS).
  def tosjis(str)
    kconv(str, SJIS)
  end

  # call-seq:
  #    Kconv.toutf8(str)   => string
  #
  # Convert <code>str</code> to UTF-8 (Kconv::UTF8).
  def toutf8(str)
    kconv(str, UTF8)
  end

  # call-seq:
  #    Kconv.toutf16(str)   => string
  #
  # Convert <code>str</code> to UTF-16 (Kconv::UTF16).
  def toutf16(str)
    kconv(str, UTF16)
  end

  # call-seq:
  #    Kconv.toutf32(str)   => string
  #
  # Convert <code>str</code> to UTF-32 (Kconv::UTF32).
  def toutf32(str)
    kconv(str, UTF32)
  end

  # call-seq:
  #    Kconv.tolocale(str)   => string
  #
  # Convert <code>str</code> to the locale encoding, i.e. the charmap name
  # returned by Encoding.locale_charmap.
  def tolocale(str)
    kconv(str, Encoding.locale_charmap)
  end

  #
  # guess
  #

  # call-seq:
  #    Kconv.guess(str)   => encoding
  #
  # Guess the input encoding of <code>str</code>; delegates to NKF.guess.
  def guess(str)
    ::NKF.guess(str)
  end

  #
  # isEncoding
  #

  # call-seq:
  #    Kconv.iseuc(str)   => true or false
  #
  # Returns whether the bytes of <code>str</code> form a valid string when
  # tagged with the Kconv::EUC (EUC-JP) encoding.  A copy is tagged, so
  # <code>str</code> itself is left unchanged.
  #
  # *Note* the return value is a boolean; do not expect a MatchData.
  def iseuc(str)
    candidate = str.dup
    candidate.force_encoding(EUC)
    candidate.valid_encoding?
  end

  # call-seq:
  #    Kconv.issjis(str)   => true or false
  #
  # Returns whether the bytes of <code>str</code> form a valid string when
  # tagged with the Kconv::SJIS (Shift_JIS) encoding.  A copy is tagged, so
  # <code>str</code> itself is left unchanged.
  def issjis(str)
    candidate = str.dup
    candidate.force_encoding(SJIS)
    candidate.valid_encoding?
  end

  # call-seq:
  #    Kconv.isjis(str)   => true or false
  #
  # Returns whether <code>str</code> looks like ISO-2022-JP or not.
  #
  # The whole byte string must consist of printable ASCII / tab / CR / LF,
  # optionally interleaved with escape-introduced runs (ESC ( I, ESC ( J,
  # ESC $ @, ESC $ B, ESC $ ( D) that are closed by ESC ( B.
  def isjis(str)
    iso2022jp = /\A [\t\n\r\x20-\x7E]*
      (?:
        (?:\x1b \x28 I      [\x21-\x7E]*
          |\x1b \x28 J      [\x21-\x7E]*
          |\x1b \x24 @      (?:[\x21-\x7E]{2})*
          |\x1b \x24 B      (?:[\x21-\x7E]{2})*
          |\x1b \x24 \x28 D (?:[\x21-\x7E]{2})*
        )*
        \x1b \x28 B [\t\n\r\x20-\x7E]*
      )*
     \z/nox

    # Match against raw bytes so the string's own encoding tag is irrelevant.
    raw = str.dup.force_encoding('BINARY')
    !(iso2022jp =~ raw).nil?
  end

  # call-seq:
  #    Kconv.isutf8(str)   => true or false
  #
  # Returns whether the bytes of <code>str</code> form a valid string when
  # tagged with the Kconv::UTF8 (UTF-8) encoding.  A copy is tagged, so
  # <code>str</code> itself is left unchanged.
  def isutf8(str)
    candidate = str.dup
    candidate.force_encoding(UTF8)
    candidate.valid_encoding?
  end
end
|
|
|
|
|
|
|
|
class String
  # call-seq:
  #    String#kconv(to_enc, from_enc=nil)
  #
  # Convert <code>self</code> to <code>to_enc</code> by delegating to
  # Kconv.kconv.  <code>to_enc</code> and <code>from_enc</code> are given as
  # constants of Kconv.
  #
  # When <code>from_enc</code> is omitted, the receiver's own encoding is
  # used as the input encoding, unless that encoding is the first entry of
  # Encoding.list (ASCII-8BIT / binary), in which case <code>from_enc</code>
  # stays nil and no input encoding is passed along.
  def kconv(to_enc, from_enc=nil)
    # Assign to from_enc itself: the previous code wrote to a misspelled
    # local (form_enc), so the receiver's encoding was silently dropped.
    from_enc = self.encoding if !from_enc && self.encoding != Encoding.list[0]
    Kconv::kconv(self, to_enc, from_enc)
  end

  #
  # to Encoding
  #

  # call-seq:
  #    String#tojis   => string
  #
  # Convert <code>self</code> to ISO-2022-JP (see Kconv.tojis).
  def tojis; Kconv.tojis(self) end

  # call-seq:
  #    String#toeuc   => string
  #
  # Convert <code>self</code> to EUC-JP (see Kconv.toeuc).
  def toeuc; Kconv.toeuc(self) end

  # call-seq:
  #    String#tosjis   => string
  #
  # Convert <code>self</code> to Shift_JIS (see Kconv.tosjis).
  def tosjis; Kconv.tosjis(self) end

  # call-seq:
  #    String#toutf8   => string
  #
  # Convert <code>self</code> to UTF-8 (see Kconv.toutf8).
  def toutf8; Kconv.toutf8(self) end

  # call-seq:
  #    String#toutf16   => string
  #
  # Convert <code>self</code> to UTF-16 (see Kconv.toutf16).
  def toutf16; Kconv.toutf16(self) end

  # call-seq:
  #    String#toutf32   => string
  #
  # Convert <code>self</code> to UTF-32 (see Kconv.toutf32).
  def toutf32; Kconv.toutf32(self) end

  # call-seq:
  #    String#tolocale   => string
  #
  # Convert <code>self</code> to the locale encoding (see Kconv.tolocale).
  def tolocale; Kconv.tolocale(self) end

  #
  # is Encoding
  #

  # call-seq:
  #    String#iseuc   => true or false
  #
  # Returns the result of Kconv.iseuc for <code>self</code>, i.e. whether
  # <code>self</code> is judged to be EUC-JP or not.
  def iseuc;	Kconv.iseuc(self) end

  # call-seq:
  #    String#issjis   => true or false
  #
  # Returns the result of Kconv.issjis for <code>self</code>, i.e. whether
  # <code>self</code> is judged to be Shift_JIS or not.
  def issjis;	Kconv.issjis(self) end

  # call-seq:
  #    String#isjis   => true or false
  #
  # Returns the result of Kconv.isjis for <code>self</code>, i.e. whether
  # <code>self</code> is judged to be ISO-2022-JP or not.
  def isjis;	Kconv.isjis(self) end

  # call-seq:
  #    String#isutf8   => true or false
  #
  # Returns the result of Kconv.isutf8 for <code>self</code>, i.e. whether
  # <code>self</code> is judged to be UTF-8 or not.
  def isutf8;	Kconv.isutf8(self) end
end
|