mirror of
https://github.com/ruby/ruby.git
synced 2022-11-09 12:17:21 -05:00
* ext/nkf/nkf-8/nkf.c: imported nkf 2.0.8 rev.110.
* Fix: check_bom cuts \xfe\xff\xXX\xXX of UTF-32. * Add support --ic=UTF-32. * Fix: can't guess UTF-16 and UTF-32. * Fix: can't decode beyond BMP of UTF-16LE. * ext/nkf/nkf.c (guess): Support UTF-32. * ext/nkf/lib/kconv.rb (kconv): Support UTF-32. * ext/nkf/lib/kconv.rb (to_utf32): new method. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@10938 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
This commit is contained in:
parent
976b4e5f8b
commit
5300eecfb3
4 changed files with 131 additions and 31 deletions
14
ChangeLog
14
ChangeLog
|
|
@ -1,3 +1,17 @@
|
||||||
|
Fri Sep 15 20:22:15 2006 NARUSE, Yui <naruse@ruby-lang.org>
|
||||||
|
|
||||||
|
* ext/nkf/nkf-8/nkf.c: imported nkf 2.0.8 rev.110.
|
||||||
|
* Fix: check_bom cuts \xfe\xff\xXX\xXX of UTF-32LE.
|
||||||
|
* Add support --ic=UTF-32.
|
||||||
|
* Fix: can't guess UTF-16 and UTF-32.
|
||||||
|
* Fix: can't decode beyond BMP of UTF-16LE.
|
||||||
|
|
||||||
|
* ext/nkf/nkf.c (guess): Support UTF-32.
|
||||||
|
|
||||||
|
* ext/nkf/lib/kconv.rb (kconv): Support UTF-32.
|
||||||
|
|
||||||
|
* ext/nkf/lib/kconv.rb (to_utf32): new method.
|
||||||
|
|
||||||
Fri Sep 15 05:23:24 2006 NARUSE, Yui <naruse@ruby-lang.org>
|
Fri Sep 15 05:23:24 2006 NARUSE, Yui <naruse@ruby-lang.org>
|
||||||
|
|
||||||
* ext/nkf/nkf-8/nkf.c: imported nkf 2.0.8 2006-09-15.
|
* ext/nkf/nkf-8/nkf.c: imported nkf 2.0.8 2006-09-15.
|
||||||
|
|
|
||||||
|
|
@ -105,6 +105,8 @@ module Kconv
|
||||||
opt << 'W'
|
opt << 'W'
|
||||||
when ::NKF::UTF16
|
when ::NKF::UTF16
|
||||||
opt << 'W16'
|
opt << 'W16'
|
||||||
|
when ::NKF::UTF32
|
||||||
|
opt << 'W32'
|
||||||
end
|
end
|
||||||
|
|
||||||
case out_code
|
case out_code
|
||||||
|
|
@ -118,6 +120,8 @@ module Kconv
|
||||||
opt << 'w'
|
opt << 'w'
|
||||||
when ::NKF::UTF16
|
when ::NKF::UTF16
|
||||||
opt << 'w16'
|
opt << 'w16'
|
||||||
|
when ::NKF::UTF32
|
||||||
|
opt << 'w32'
|
||||||
when ::NKF::NOCONV
|
when ::NKF::NOCONV
|
||||||
return str
|
return str
|
||||||
end
|
end
|
||||||
|
|
@ -202,6 +206,20 @@ module Kconv
|
||||||
end
|
end
|
||||||
module_function :toutf16
|
module_function :toutf16
|
||||||
|
|
||||||
|
# call-seq:
|
||||||
|
# Kconv.toutf32(str) -> string
|
||||||
|
#
|
||||||
|
# Convert <code>str</code> to UTF-32
|
||||||
|
#
|
||||||
|
# *Note*
|
||||||
|
# This method decodes MIME-encoded strings and
|
||||||
|
# converts halfwidth katakana to fullwidth katakana.
|
||||||
|
# If you don't want it, use NKF.nkf('-w32xm0', str).
|
||||||
|
def toutf32(str)
|
||||||
|
::NKF::nkf('-w32m', str)
|
||||||
|
end
|
||||||
|
module_function :toutf32
|
||||||
|
|
||||||
#
|
#
|
||||||
# guess
|
# guess
|
||||||
#
|
#
|
||||||
|
|
@ -337,6 +355,17 @@ class String
|
||||||
# If you don't want it, use NKF.nkf('-w16xm0', str).
|
# If you don't want it, use NKF.nkf('-w16xm0', str).
|
||||||
def toutf16; Kconv.toutf16(self) end
|
def toutf16; Kconv.toutf16(self) end
|
||||||
|
|
||||||
|
# call-seq:
|
||||||
|
# String#toutf32 -> string
|
||||||
|
#
|
||||||
|
# Convert <code>self</code> to UTF-32
|
||||||
|
#
|
||||||
|
# *Note*
|
||||||
|
# This method decodes MIME-encoded strings and
|
||||||
|
# converts halfwidth katakana to fullwidth katakana.
|
||||||
|
# If you don't want it, use NKF.nkf('-w32xm0', str).
|
||||||
|
def toutf32; Kconv.toutf32(self) end
|
||||||
|
|
||||||
#
|
#
|
||||||
# is Encoding
|
# is Encoding
|
||||||
#
|
#
|
||||||
|
|
|
||||||
|
|
@ -581,6 +581,8 @@ struct input_code input_code_list[] = {
|
||||||
{"Shift_JIS", 0, 0, 0, {0, 0, 0}, s_status, s_iconv, 0},
|
{"Shift_JIS", 0, 0, 0, {0, 0, 0}, s_status, s_iconv, 0},
|
||||||
#ifdef UTF8_INPUT_ENABLE
|
#ifdef UTF8_INPUT_ENABLE
|
||||||
{"UTF-8", 0, 0, 0, {0, 0, 0}, w_status, w_iconv, 0},
|
{"UTF-8", 0, 0, 0, {0, 0, 0}, w_status, w_iconv, 0},
|
||||||
|
{"UTF-16", 0, 0, 0, {0, 0, 0}, NULL, w_iconv16, 0},
|
||||||
|
{"UTF-32", 0, 0, 0, {0, 0, 0}, NULL, w_iconv32, 0},
|
||||||
#endif
|
#endif
|
||||||
{0}
|
{0}
|
||||||
};
|
};
|
||||||
|
|
@ -1293,6 +1295,15 @@ void options(unsigned char *cp)
|
||||||
strcmp(codeset, "UTF-16LE-BOM") == 0){
|
strcmp(codeset, "UTF-16LE-BOM") == 0){
|
||||||
input_f = UTF16_INPUT;
|
input_f = UTF16_INPUT;
|
||||||
input_endian = ENDIAN_LITTLE;
|
input_endian = ENDIAN_LITTLE;
|
||||||
|
}else if(strcmp(codeset, "UTF-32") == 0 ||
|
||||||
|
strcmp(codeset, "UTF-32BE") == 0 ||
|
||||||
|
strcmp(codeset, "UTF-32BE-BOM") == 0){
|
||||||
|
input_f = UTF32_INPUT;
|
||||||
|
input_endian = ENDIAN_BIG;
|
||||||
|
}else if(strcmp(codeset, "UTF-32LE") == 0 ||
|
||||||
|
strcmp(codeset, "UTF-32LE-BOM") == 0){
|
||||||
|
input_f = UTF32_INPUT;
|
||||||
|
input_endian = ENDIAN_LITTLE;
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
continue;
|
continue;
|
||||||
|
|
@ -1901,12 +1912,7 @@ void options(unsigned char *cp)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef ANSI_C_PROTOTYPE
|
|
||||||
struct input_code * find_inputcode_byfunc(nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
|
struct input_code * find_inputcode_byfunc(nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
|
||||||
#else
|
|
||||||
struct input_code * find_inputcode_byfunc(iconv_func)
|
|
||||||
nkf_char (*iconv_func)();
|
|
||||||
#endif
|
|
||||||
{
|
{
|
||||||
if (iconv_func){
|
if (iconv_func){
|
||||||
struct input_code *p = input_code_list;
|
struct input_code *p = input_code_list;
|
||||||
|
|
@ -2227,6 +2233,12 @@ void code_status(nkf_char c)
|
||||||
struct input_code *result = 0;
|
struct input_code *result = 0;
|
||||||
struct input_code *p = input_code_list;
|
struct input_code *p = input_code_list;
|
||||||
while (p->name){
|
while (p->name){
|
||||||
|
if (!p->status_func) {
|
||||||
|
++p;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (!p->status_func)
|
||||||
|
continue;
|
||||||
(p->status_func)(p, c);
|
(p->status_func)(p, c);
|
||||||
if (p->stat > 0){
|
if (p->stat > 0){
|
||||||
action_flag = 0;
|
action_flag = 0;
|
||||||
|
|
@ -2407,8 +2419,11 @@ void check_bom(FILE *f)
|
||||||
if(!input_f){
|
if(!input_f){
|
||||||
set_iconv(TRUE, w_iconv32);
|
set_iconv(TRUE, w_iconv32);
|
||||||
}
|
}
|
||||||
input_endian = ENDIAN_BIG;
|
if (iconv == w_iconv32) {
|
||||||
return;
|
input_endian = ENDIAN_BIG;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
(*i_ungetc)(0xFF,f);
|
||||||
}else (*i_ungetc)(c2,f);
|
}else (*i_ungetc)(c2,f);
|
||||||
(*i_ungetc)(0xFE,f);
|
(*i_ungetc)(0xFE,f);
|
||||||
}else if(c2 == 0xFF){
|
}else if(c2 == 0xFF){
|
||||||
|
|
@ -2416,8 +2431,11 @@ void check_bom(FILE *f)
|
||||||
if(!input_f){
|
if(!input_f){
|
||||||
set_iconv(TRUE, w_iconv32);
|
set_iconv(TRUE, w_iconv32);
|
||||||
}
|
}
|
||||||
input_endian = ENDIAN_2143;
|
if (iconv == w_iconv32) {
|
||||||
return;
|
input_endian = ENDIAN_2143;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
(*i_ungetc)(0xFF,f);
|
||||||
}else (*i_ungetc)(c2,f);
|
}else (*i_ungetc)(c2,f);
|
||||||
(*i_ungetc)(0xFF,f);
|
(*i_ungetc)(0xFF,f);
|
||||||
}else (*i_ungetc)(c2,f);
|
}else (*i_ungetc)(c2,f);
|
||||||
|
|
@ -2431,7 +2449,10 @@ void check_bom(FILE *f)
|
||||||
if(!input_f){
|
if(!input_f){
|
||||||
set_iconv(TRUE, w_iconv);
|
set_iconv(TRUE, w_iconv);
|
||||||
}
|
}
|
||||||
return;
|
if (iconv == w_iconv) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
(*i_ungetc)(0xBF,f);
|
||||||
}else (*i_ungetc)(c2,f);
|
}else (*i_ungetc)(c2,f);
|
||||||
(*i_ungetc)(0xBB,f);
|
(*i_ungetc)(0xBB,f);
|
||||||
}else (*i_ungetc)(c2,f);
|
}else (*i_ungetc)(c2,f);
|
||||||
|
|
@ -2444,16 +2465,22 @@ void check_bom(FILE *f)
|
||||||
if(!input_f){
|
if(!input_f){
|
||||||
set_iconv(TRUE, w_iconv32);
|
set_iconv(TRUE, w_iconv32);
|
||||||
}
|
}
|
||||||
input_endian = ENDIAN_3412;
|
if (iconv == w_iconv32) {
|
||||||
return;
|
input_endian = ENDIAN_3412;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
(*i_ungetc)(0x00,f);
|
||||||
}else (*i_ungetc)(c2,f);
|
}else (*i_ungetc)(c2,f);
|
||||||
(*i_ungetc)(0x00,f);
|
(*i_ungetc)(0x00,f);
|
||||||
}else (*i_ungetc)(c2,f);
|
}else (*i_ungetc)(c2,f);
|
||||||
if(!input_f){
|
if(!input_f){
|
||||||
set_iconv(TRUE, w_iconv16);
|
set_iconv(TRUE, w_iconv16);
|
||||||
}
|
}
|
||||||
input_endian = ENDIAN_BIG;
|
if (iconv == w_iconv16) {
|
||||||
return;
|
input_endian = ENDIAN_BIG;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
(*i_ungetc)(0xFF,f);
|
||||||
}else (*i_ungetc)(c2,f);
|
}else (*i_ungetc)(c2,f);
|
||||||
(*i_ungetc)(0xFE,f);
|
(*i_ungetc)(0xFE,f);
|
||||||
break;
|
break;
|
||||||
|
|
@ -2464,16 +2491,22 @@ void check_bom(FILE *f)
|
||||||
if(!input_f){
|
if(!input_f){
|
||||||
set_iconv(TRUE, w_iconv32);
|
set_iconv(TRUE, w_iconv32);
|
||||||
}
|
}
|
||||||
input_endian = ENDIAN_LITTLE;
|
if (iconv == w_iconv32) {
|
||||||
return;
|
input_endian = ENDIAN_LITTLE;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
(*i_ungetc)(0x00,f);
|
||||||
}else (*i_ungetc)(c2,f);
|
}else (*i_ungetc)(c2,f);
|
||||||
(*i_ungetc)(0x00,f);
|
(*i_ungetc)(0x00,f);
|
||||||
}else (*i_ungetc)(c2,f);
|
}else (*i_ungetc)(c2,f);
|
||||||
if(!input_f){
|
if(!input_f){
|
||||||
set_iconv(TRUE, w_iconv16);
|
set_iconv(TRUE, w_iconv16);
|
||||||
}
|
}
|
||||||
input_endian = ENDIAN_LITTLE;
|
if (iconv == w_iconv16) {
|
||||||
return;
|
input_endian = ENDIAN_LITTLE;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
(*i_ungetc)(0xFE,f);
|
||||||
}else (*i_ungetc)(c2,f);
|
}else (*i_ungetc)(c2,f);
|
||||||
(*i_ungetc)(0xFF,f);
|
(*i_ungetc)(0xFF,f);
|
||||||
break;
|
break;
|
||||||
|
|
@ -2557,21 +2590,21 @@ nkf_char kanji_convert(FILE *f)
|
||||||
c0 <<= 8;
|
c0 <<= 8;
|
||||||
if ((c3 = (*i_getc)(f)) != EOF) {
|
if ((c3 = (*i_getc)(f)) != EOF) {
|
||||||
c0 |= c3;
|
c0 |= c3;
|
||||||
} else c1 = EOF;
|
} else c2 = EOF;
|
||||||
} else c1 = EOF;
|
} else c2 = EOF;
|
||||||
}
|
}
|
||||||
}
|
} else c2 = EOF;
|
||||||
} else {
|
} else {
|
||||||
if ((c2 = (*i_getc)(f)) != EOF) {
|
if ((c2 = (*i_getc)(f)) != EOF) {
|
||||||
if (0xD8 <= c2 && c2 <= 0xDB) {
|
if (0xD8 <= c2 && c2 <= 0xDB) {
|
||||||
if ((c3 = (*i_getc)(f)) != EOF) {
|
if ((c3 = (*i_getc)(f)) != EOF) {
|
||||||
c3 <<= 8;
|
|
||||||
if ((c0 = (*i_getc)(f)) != EOF) {
|
if ((c0 = (*i_getc)(f)) != EOF) {
|
||||||
|
c0 <<= 8;
|
||||||
c0 |= c3;
|
c0 |= c3;
|
||||||
} else c1 = EOF;
|
} else c2 = EOF;
|
||||||
} else c1 = EOF;
|
} else c2 = EOF;
|
||||||
}
|
}
|
||||||
} else c1 = EOF;
|
} else c2 = EOF;
|
||||||
}
|
}
|
||||||
SEND;
|
SEND;
|
||||||
} else if(iconv == w_iconv32){
|
} else if(iconv == w_iconv32){
|
||||||
|
|
@ -2595,7 +2628,7 @@ nkf_char kanji_convert(FILE *f)
|
||||||
}
|
}
|
||||||
c2 = 0;
|
c2 = 0;
|
||||||
}else{
|
}else{
|
||||||
c1 = EOF;
|
c2 = EOF;
|
||||||
}
|
}
|
||||||
SEND;
|
SEND;
|
||||||
} else
|
} else
|
||||||
|
|
|
||||||
|
|
@ -306,6 +306,8 @@ rb_nkf_guess1(VALUE obj, VALUE src)
|
||||||
* "UTF-8"
|
* "UTF-8"
|
||||||
* when NKF::UTF16
|
* when NKF::UTF16
|
||||||
* "UTF-16"
|
* "UTF-16"
|
||||||
|
* when NKF::UTF32
|
||||||
|
* "UTF-32"
|
||||||
* when NKF::UNKNOWN
|
* when NKF::UNKNOWN
|
||||||
* "UNKNOWN"
|
* "UNKNOWN"
|
||||||
* when NKF::BINARY
|
* when NKF::BINARY
|
||||||
|
|
@ -345,6 +347,8 @@ rb_nkf_guess2(VALUE obj, VALUE src)
|
||||||
code = _UTF8;
|
code = _UTF8;
|
||||||
} else if (strcmp(input_codename, "UTF-16") == 0) {
|
} else if (strcmp(input_codename, "UTF-16") == 0) {
|
||||||
code = _UTF16;
|
code = _UTF16;
|
||||||
|
} else if (strcmp(input_codename, "UTF-32") == 0) {
|
||||||
|
code = _UTF32;
|
||||||
} else if (strlen(input_codename) > 0) {
|
} else if (strlen(input_codename) > 0) {
|
||||||
code = _UNKNOWN;
|
code = _UNKNOWN;
|
||||||
}
|
}
|
||||||
|
|
@ -382,16 +386,16 @@ rb_nkf_guess2(VALUE obj, VALUE src)
|
||||||
*
|
*
|
||||||
* Output is buffered (DEFAULT), Output is unbuffered.
|
* Output is buffered (DEFAULT), Output is unbuffered.
|
||||||
*
|
*
|
||||||
* === -j -s -e -w -w16
|
* === -j -s -e -w -w16 -w32
|
||||||
*
|
*
|
||||||
* Output code is ISO-2022-JP (7bit JIS), Shift_JIS, EUC-JP,
|
* Output code is ISO-2022-JP (7bit JIS), Shift_JIS, EUC-JP,
|
||||||
* UTF-8N, UTF-16BE.
|
* UTF-8N, UTF-16BE, UTF-32BE.
|
||||||
* Without this option and compile option, ISO-2022-JP is assumed.
|
* Without this option and compile option, ISO-2022-JP is assumed.
|
||||||
*
|
*
|
||||||
* === -J -S -E -W -W16
|
* === -J -S -E -W -W16 -W32
|
||||||
*
|
*
|
||||||
* Input assumption is JIS 7 bit, Shift_JIS, EUC-JP,
|
* Input assumption is JIS 7 bit, Shift_JIS, EUC-JP,
|
||||||
* UTF-8, UTF-16LE.
|
* UTF-8, UTF-16, UTF-32.
|
||||||
*
|
*
|
||||||
* ==== -J
|
* ==== -J
|
||||||
*
|
*
|
||||||
|
|
@ -574,6 +578,16 @@ rb_nkf_guess2(VALUE obj, VALUE src)
|
||||||
*
|
*
|
||||||
* [UTF-16LE-BOM] UTF-16 Little Endian with BOM
|
* [UTF-16LE-BOM] UTF-16 Little Endian with BOM
|
||||||
*
|
*
|
||||||
|
* [UTF-32] same as UTF-32BE
|
||||||
|
*
|
||||||
|
* [UTF-32BE] UTF-32 Big Endian without BOM
|
||||||
|
*
|
||||||
|
* [UTF-32BE-BOM] UTF-32 Big Endian with BOM
|
||||||
|
*
|
||||||
|
* [UTF-32LE] UTF-32 Little Endian without BOM
|
||||||
|
*
|
||||||
|
* [UTF-32LE-BOM] UTF-32 Little Endian with BOM
|
||||||
|
*
|
||||||
* [UTF8-MAC] NKDed UTF-8, a.k.a. UTF8-NFD (input only)
|
* [UTF8-MAC] NKDed UTF-8, a.k.a. UTF8-NFD (input only)
|
||||||
*
|
*
|
||||||
* === --fb-{skip, html, xml, perl, java, subchar}
|
* === --fb-{skip, html, xml, perl, java, subchar}
|
||||||
|
|
@ -587,10 +601,20 @@ rb_nkf_guess2(VALUE obj, VALUE src)
|
||||||
* nkf adds a specified escape character to specified 2nd byte of Shift_JIS characters.
|
* nkf adds a specified escape character to specified 2nd byte of Shift_JIS characters.
|
||||||
* 1st byte of argument is the escape character and following bytes are target characters.
|
* 1st byte of argument is the escape character and following bytes are target characters.
|
||||||
*
|
*
|
||||||
* === --disable-cp932ext
|
* === --no-cp932ext
|
||||||
*
|
*
|
||||||
* Handle the characters extended in CP932 as unassigned characters.
|
* Handle the characters extended in CP932 as unassigned characters.
|
||||||
*
|
*
|
||||||
|
* === --no-best-fit-chars
|
||||||
|
*
|
||||||
|
* When converting from Unicode to an encoded byte sequence,
|
||||||
|
* don't convert characters which are not round-trip safe.
|
||||||
|
* When converting from Unicode to Unicode,
|
||||||
|
* with this and the -x option, nkf can be used as a UTF converter.
|
||||||
|
* (In other words, without this and the -x option, nkf doesn't preserve some characters.)
|
||||||
|
*
|
||||||
|
* When nkf converts strings related to paths, you should use this option.
|
||||||
|
*
|
||||||
* === --cap-input
|
* === --cap-input
|
||||||
*
|
*
|
||||||
* Decode hex encoded characters.
|
* Decode hex encoded characters.
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue