1
0
Fork 0
mirror of https://github.com/ruby/ruby.git synced 2022-11-09 12:17:21 -05:00

* ext/nkf/nkf-utf8/{nkf.c, utf8tbl.c}: Update nkf.

* ext/nkf/nkf.c: fix documents.

* ext/nkf/lib/kconv.rb: fix documents.
  (Kconv.is*): use valid_encoding?.
  (Kconv.isjis): defined.


git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@14833 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
This commit is contained in:
naruse 2008-01-01 15:22:25 +00:00
parent 5f41f87d2b
commit 7d847f7c37
6 changed files with 801 additions and 867 deletions

View file

@ -1,3 +1,13 @@
Wed Jan 2 00:14:41 2008 NARUSE, Yui <naruse@ruby-lang.org>
* ext/nkf/nkf-utf8/{nkf.c, utf8tbl.c}: Update nkf.
* ext/nkf/nkf.c: fix documents.
* ext/nkf/lib/kconv.rb: fix documents.
(Kconv.is*): use valid_encoding?.
(Kconv.isjis): defined.
Tue Jan 1 23:17:03 2008 Tanaka Akira <akr@fsij.org>
* common.mk: dependency updated.

View file

@ -44,38 +44,6 @@ module Kconv
# UNKNOWN
UNKNOWN = NKF::UNKNOWN
#
#
# Private Constants
#
#Regexp of Encoding
# Regexp of Shift_JIS string (private constant)
RegexpShiftjis = /\A(?:
[\x00-\x7f\xa1-\xdf] |
[\x81-\x9f\xe0-\xfc][\x40-\x7e\x80-\xfc]
)*\z/nx
# Regexp of EUC-JP string (private constant)
RegexpEucjp = /\A(?:
[\x00-\x7f] |
\x8e [\xa1-\xdf] |
\x8f [\xa1-\xfe] [\xa1-\xfe] |
[\xa1-\xfe] [\xa1-\xfe]
)*\z/nx
# Regexp of UTF-8 string (private constant)
RegexpUtf8 = /\A(?:
[\x00-\x7f] |
[\xc2-\xdf] [\x80-\xbf] |
\xe0 [\xa0-\xbf] [\x80-\xbf] |
[\xe1-\xef] [\x80-\xbf] [\x80-\xbf] |
\xf0 [\x90-\xbf] [\x80-\xbf] [\x80-\xbf] |
[\xf1-\xf3] [\x80-\xbf] [\x80-\xbf] [\x80-\xbf] |
\xf4 [\x80-\x8f] [\x80-\xbf] [\x80-\xbf]
)*\z/nx
#
# Public Methods
#
@ -99,7 +67,7 @@ module Kconv
#
# call-seq:
# Kconv.tojis(str) -> string
# Kconv.tojis(str) => string
#
# Convert <code>str</code> to ISO-2022-JP
def tojis(str)
@ -108,7 +76,7 @@ module Kconv
module_function :tojis
# call-seq:
# Kconv.toeuc(str) -> string
# Kconv.toeuc(str) => string
#
# Convert <code>str</code> to EUC-JP
def toeuc(str)
@ -117,7 +85,7 @@ module Kconv
module_function :toeuc
# call-seq:
# Kconv.tosjis(str) -> string
# Kconv.tosjis(str) => string
#
# Convert <code>str</code> to Shift_JIS
def tosjis(str)
@ -126,7 +94,7 @@ module Kconv
module_function :tosjis
# call-seq:
# Kconv.toutf8(str) -> string
# Kconv.toutf8(str) => string
#
# Convert <code>str</code> to UTF-8
def toutf8(str)
@ -135,7 +103,7 @@ module Kconv
module_function :toutf8
# call-seq:
# Kconv.toutf16(str) -> string
# Kconv.toutf16(str) => string
#
# Convert <code>str</code> to UTF-16
def toutf16(str)
@ -144,7 +112,7 @@ module Kconv
module_function :toutf16
# call-seq:
# Kconv.toutf32(str) -> string
# Kconv.toutf32(str) => string
#
# Convert <code>str</code> to UTF-32
def toutf32(str)
@ -152,12 +120,21 @@ module Kconv
end
module_function :toutf32
# call-seq:
#    Kconv.tolocale(str) => string
#
# Convert <code>str</code> to the locale encoding, i.e. the encoding named by
# <code>Encoding.locale_charmap</code>, by delegating to +kconv+.
#
# The string to convert must be passed explicitly: String#tolocale calls
# <code>Kconv.tolocale(self)</code>, so a parameterless definition would
# raise ArgumentError there.
def tolocale(str)
  kconv(str, Encoding.locale_charmap)
end
module_function :tolocale
#
# guess
#
# call-seq:
# Kconv.guess(str) -> integer
# Kconv.guess(str) => encoding
#
# Guess input encoding by NKF.guess
def guess(str)
@ -170,38 +147,52 @@ module Kconv
#
# call-seq:
# Kconv.iseuc(str) -> obj or nil
# Kconv.iseuc(str) => true or false
#
# Returns whether input encoding is EUC-JP or not.
#
# *Note* don't expect this return value to be a MatchData.
def iseuc(str)
RegexpEucjp.match( str )
str.dup.force_encoding(EUC).valid_encoding?
end
module_function :iseuc
# call-seq:
# Kconv.issjis(str) -> obj or nil
# Kconv.issjis(str) => true or false
#
# Returns whether input encoding is Shift_JIS or not.
#
# *Note* don't expect this return value to be a MatchData.
def issjis(str)
RegexpShiftjis.match( str )
str.dup.force_encoding(SJIS).valid_encoding?
end
module_function :issjis
# call-seq:
# Kconv.isutf8(str) -> obj or nil
# Kconv.isjis(str) => true or false
#
# Returns whether input encoding is ISO-2022-JP or not.
def isjis(str)
  # The pattern is an ASCII-8BIT (/n) regexp and must be matched byte-wise,
  # so a copy of +str+ is retagged as binary first (the caller's string is
  # left untouched). Encoding::BINARY is used instead of +nil+ because
  # String#force_encoding rejects nil with a TypeError.
  #
  # Accepted shape: plain ASCII text, optionally interleaved with runs that
  # start with an ISO-2022-JP designation escape (ESC ( I, ESC ( J, ESC $ @,
  # ESC $ B, ESC $ ( D) and are closed by a return to ASCII (ESC ( B).
  /\A [\t\n\r\x20-\x7E]*
    (?:
      (?:\x1b \x28 I [\x21-\x7E]*
      |\x1b \x28 J [\x21-\x7E]*
      |\x1b \x24 @ (?:[\x21-\x7E]{2})*
      |\x1b \x24 B (?:[\x21-\x7E]{2})*
      |\x1b \x24 \x28 D (?:[\x21-\x7E]{2})*
      )*
      \x1b \x28 B [\t\n\r\x20-\x7E]*
    )*
  \z/nox =~ str.dup.force_encoding(Encoding::BINARY) ? true : false
end
module_function :isjis
# call-seq:
# Kconv.isutf8(str) => true or false
#
# Returns whether input encoding is UTF-8 or not.
#
# *Note* don't expect this return value to be a MatchData.
def isutf8(str)
RegexpUtf8.match( str )
str.dup.force_encoding(UTF8).valid_encoding?
end
module_function :isutf8
end
class String
@ -220,66 +211,72 @@ class String
#
# call-seq:
# String#tojis -> string
# String#tojis => string
#
# Convert <code>self</code> to ISO-2022-JP
def tojis; Kconv.tojis(self) end
# call-seq:
# String#toeuc -> string
# String#toeuc => string
#
# Convert <code>self</code> to EUC-JP
def toeuc; Kconv.toeuc(self) end
# call-seq:
# String#tosjis -> string
# String#tosjis => string
#
# Convert <code>self</code> to Shift_JIS
def tosjis; Kconv.tosjis(self) end
# call-seq:
# String#toutf8 -> string
# String#toutf8 => string
#
# Convert <code>self</code> to UTF-8
def toutf8; Kconv.toutf8(self) end
# call-seq:
# String#toutf16 -> string
# String#toutf16 => string
#
# Convert <code>self</code> to UTF-16
def toutf16; Kconv.toutf16(self) end
# call-seq:
# String#toutf32 -> string
# String#toutf32 => string
#
# Convert <code>self</code> to UTF-32
def toutf32; Kconv.toutf32(self) end
# call-seq:
# String#tolocale => string
#
# Convert <code>self</code> to locale encoding
def tolocale; Kconv.tolocale(self) end
#
# is Encoding
#
# call-seq:
# String#iseuc -> obj or nil
# String#iseuc => true or false
#
# Returns whether <code>self</code>'s encoding is EUC-JP or not.
#
# *Note* don't expect this return value to be a MatchData.
def iseuc; Kconv.iseuc(self) end
# call-seq:
# String#issjis -> obj or nil
# String#issjis => true or false
#
# Returns whether <code>self</code>'s encoding is Shift_JIS or not.
#
# *Note* don't expect this return value to be a MatchData.
def issjis; Kconv.issjis(self) end
# call-seq:
# String#isutf8 -> obj or nil
# String#isjis => true or false
#
# Returns whether <code>self</code>'s encoding is ISO-2022-JP or not.
def isjis; Kconv.isjis(self) end
# call-seq:
# String#isutf8 => true or false
#
# Returns whether <code>self</code>'s encoding is UTF-8 or not.
#
# *Note* don't expect this return value to be a MatchData.
def isutf8; Kconv.isutf8(self) end
end

View file

@ -32,7 +32,7 @@
***********************************************************************/
/* $Id$ */
#define NKF_VERSION "2.0.8"
#define NKF_RELEASE_DATE "2007-12-23"
#define NKF_RELEASE_DATE "2007-01-02"
#define COPY_RIGHT \
"Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa),2000 S. Kono, COW\n" \
"Copyright (C) 2002-2007 Kono, Furukawa, Naruse, mastodon"
@ -205,11 +205,12 @@ void djgpp_setbinmode(FILE *fp)
/* byte order */
#define ENDIAN_BIG 1234
#define ENDIAN_LITTLE 4321
#define ENDIAN_2143 2143
#define ENDIAN_3412 3412
enum byte_order {
ENDIAN_BIG = 1,
ENDIAN_LITTLE = 2,
ENDIAN_2143 = 3,
ENDIAN_3412 = 4
};
/* ASCII CODE */
@ -266,10 +267,10 @@ enum nkf_encodings {
UTF_32LE,
UTF_32LE_BOM,
JIS_X_0201=0x1000,
JIS_X_0208,
JIS_X_0212,
JIS_X_0213_1,
JIS_X_0213_2,
JIS_X_0208=0x1001,
JIS_X_0212=0x1002,
JIS_X_0213_1=0x1003,
JIS_X_0213_2=0x1004,
BINARY
};
@ -286,9 +287,9 @@ void w_oconv16(nkf_char c2, nkf_char c1);
void w_oconv32(nkf_char c2, nkf_char c1);
typedef struct {
char *name;
nkf_char (*iconv_func)(nkf_char c2, nkf_char c1, nkf_char c0);
void (*oconv_func)(nkf_char c2, nkf_char c1);
const char *name;
nkf_char (*iconv)(nkf_char c2, nkf_char c1, nkf_char c0);
void (*oconv)(nkf_char c2, nkf_char c1);
} nkf_native_encoding;
nkf_native_encoding NkfEncodingASCII = { "US_ASCII", e_iconv, e_oconv };
@ -300,21 +301,21 @@ nkf_native_encoding NkfEncodingUTF_16 = { "UTF-16", w_iconv16, w_oconv16 };
nkf_native_encoding NkfEncodingUTF_32 = { "UTF-32", w_iconv32, w_oconv32 };
typedef struct {
int id;
char *name;
nkf_native_encoding *based_encoding;
const int id;
const char *name;
const nkf_native_encoding *base_encoding;
} nkf_encoding;
nkf_encoding nkf_encoding_table[] = {
{ASCII, "ASCII", &NkfEncodingASCII},
{ISO_8859_1, "ISO-8859-1", &NkfEncodingASCII},
{ISO_2022_JP, "ISO-2022-JP", &NkfEncodingASCII},
{ISO_2022_JP, "ISO-2022-JP", &NkfEncodingISO_2022_JP},
{CP50220, "CP50220", &NkfEncodingISO_2022_JP},
{CP50221, "CP50221", &NkfEncodingISO_2022_JP},
{CP50222, "CP50222", &NkfEncodingISO_2022_JP},
{ISO_2022_JP_1, "ISO-2022-JP-1", &NkfEncodingISO_2022_JP},
{ISO_2022_JP_3, "ISO-2022-JP-3", &NkfEncodingISO_2022_JP},
{SHIFT_JIS, "Shift_JIS", &NkfEncodingShift_JIS},
{WINDOWS_31J, "WINDOWS-31J", &NkfEncodingShift_JIS},
{WINDOWS_31J, "Windows-31J", &NkfEncodingShift_JIS},
{CP10001, "CP10001", &NkfEncodingShift_JIS},
{EUC_JP, "EUC-JP", &NkfEncodingEUC_JP},
{CP51932, "CP51932", &NkfEncodingEUC_JP},
@ -476,7 +477,8 @@ struct input_code{
};
static char *input_codename = NULL; /* NULL: unestablished, "": BINARY */
static nkf_encoding *output_encoding;
static nkf_encoding *input_encoding = NULL;
static nkf_encoding *output_encoding = NULL;
#if !defined(PERL_XS) && !defined(WIN32DLL)
static nkf_char noconvert(FILE *f);
@ -601,7 +603,6 @@ static int nop_f = FALSE;
static int binmode_f = TRUE; /* binary mode */
static int rot_f = FALSE; /* rot14/43 mode */
static int hira_f = FALSE; /* hira/kata henkan */
static int input_f = FALSE; /* non fixed input code */
static int alpha_f = FALSE; /* convert JIx0208 alphbet to ASCII */
static int mime_f = MIME_DECODE_DEFAULT; /* convert MIME B base64 or Q */
static int mime_decode_f = FALSE; /* mime decode is explicitly on */
@ -753,11 +754,8 @@ static int fold_margin = FOLD_MARGIN;
#endif
/* process default */
static void (*output_conv)(nkf_char c2,nkf_char c1) = DEFAULT_CONV;
static void (*oconv)(nkf_char c2,nkf_char c1) = no_connection;
/* s_iconv or oconv */
static nkf_char (*iconv)(nkf_char c2,nkf_char c1,nkf_char c0) = no_connection2;
static void (*oconv)(nkf_char c2,nkf_char c1) = no_connection;
static void (*o_zconv)(nkf_char c2,nkf_char c1) = no_connection;
static void (*o_fconv)(nkf_char c2,nkf_char c1) = no_connection;
@ -948,7 +946,20 @@ static nkf_encoding *nkf_enc_find(const char *name)
#define nkf_enc_name(enc) (enc)->name
#define nkf_enc_to_index(enc) (enc)->id
#define nkf_enc_to_base_encoding(enc) (enc)->based_encoding
#define nkf_enc_to_base_encoding(enc) (enc)->base_encoding
#define nkf_enc_to_iconv(enc) nkf_enc_to_base_encoding(enc)->iconv
#define nkf_enc_to_oconv(enc) nkf_enc_to_base_encoding(enc)->oconv
#define nkf_enc_asciicompat(enc) (\
nkf_enc_to_base_encoding(enc) == &NkfEncodingASCII ||\
nkf_enc_to_base_encoding(enc) == &NkfEncodingISO_2022_JP)
#define nkf_enc_unicode_p(enc) (\
nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_8 ||\
nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_16 ||\
nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_32)
#define nkf_enc_cp5022x_p(enc) (\
nkf_enc_to_index(enc) == CP50220 ||\
nkf_enc_to_index(enc) == CP50221 ||\
nkf_enc_to_index(enc) == CP50222)
#ifdef WIN32DLL
#include "nkf32dll.c"
@ -1294,7 +1305,7 @@ static const struct {
{"katakana","h2"},
{"katakana-hiragana","h3"},
{"guess=", ""},
{"guess", "g1"},
{"guess", "g2"},
{"cp932", ""},
{"no-cp932", ""},
#ifdef X0212_ENABLE
@ -1358,7 +1369,6 @@ void options(unsigned char *cp)
char codeset[32];
nkf_encoding *enc;
if (!output_encoding) output_encoding = nkf_enc_from_index(DEFAULT_ENCODING);
if (option_mode==1)
return;
while(*cp && *cp++!='-');
@ -1396,14 +1406,12 @@ void options(unsigned char *cp)
if (strcmp(long_option[i].name, "ic=") == 0){
nkf_str_upcase((char *)p, codeset, 32);
enc = nkf_enc_find(codeset);
switch (nkf_enc_to_index(enc)) {
case ISO_2022_JP:
input_f = JIS_INPUT;
break;
if (!enc) continue;
input_encoding = enc;
switch (nkf_enc_to_index(input_encoding)) {
case CP50220:
case CP50221:
case CP50222:
input_f = JIS_INPUT;
#ifdef SHIFTJIS_CP932
cp51932_f = TRUE;
#endif
@ -1412,23 +1420,17 @@ void options(unsigned char *cp)
#endif
break;
case ISO_2022_JP_1:
input_f = JIS_INPUT;
#ifdef X0212_ENABLE
x0212_f = TRUE;
#endif
break;
case ISO_2022_JP_3:
input_f = JIS_INPUT;
#ifdef X0212_ENABLE
x0212_f = TRUE;
#endif
x0213_f = TRUE;
break;
case SHIFT_JIS:
input_f = SJIS_INPUT;
break;
case WINDOWS_31J:
input_f = SJIS_INPUT;
#ifdef SHIFTJIS_CP932
cp51932_f = TRUE;
#endif
@ -1437,7 +1439,6 @@ void options(unsigned char *cp)
#endif
break;
case CP10001:
input_f = SJIS_INPUT;
#ifdef SHIFTJIS_CP932
cp51932_f = TRUE;
#endif
@ -1445,11 +1446,7 @@ void options(unsigned char *cp)
ms_ucs_map_f = UCS_MAP_CP10001;
#endif
break;
case EUC_JP:
input_f = EUC_INPUT;
break;
case CP51932:
input_f = EUC_INPUT;
#ifdef SHIFTJIS_CP932
cp51932_f = TRUE;
#endif
@ -1458,7 +1455,6 @@ void options(unsigned char *cp)
#endif
break;
case EUCJP_MS:
input_f = EUC_INPUT;
#ifdef SHIFTJIS_CP932
cp51932_f = FALSE;
#endif
@ -1467,7 +1463,6 @@ void options(unsigned char *cp)
#endif
break;
case EUCJP_ASCII:
input_f = EUC_INPUT;
#ifdef SHIFTJIS_CP932
cp51932_f = FALSE;
#endif
@ -1477,7 +1472,6 @@ void options(unsigned char *cp)
break;
case SHIFT_JISX0213:
case SHIFT_JIS_2004:
input_f = SJIS_INPUT;
x0213_f = TRUE;
#ifdef SHIFTJIS_CP932
cp51932_f = FALSE;
@ -1485,50 +1479,36 @@ void options(unsigned char *cp)
break;
case EUC_JISX0213:
case EUC_JIS_2004:
input_f = EUC_INPUT;
x0213_f = TRUE;
#ifdef SHIFTJIS_CP932
cp51932_f = FALSE;
#endif
break;
#ifdef UTF8_INPUT_ENABLE
case UTF_8:
case UTF_8N:
case UTF_8_BOM:
input_f = UTF8_INPUT;
break;
#ifdef UNICODE_NORMALIZATION
case UTF8_MAC:
input_f = UTF8_INPUT;
nfc_f = TRUE;
break;
#endif
case UTF_16:
case UTF_16BE:
case UTF_16BE_BOM:
input_f = UTF16_INPUT;
input_endian = ENDIAN_BIG;
break;
case UTF_16LE:
case UTF_16LE_BOM:
input_f = UTF16_INPUT;
input_endian = ENDIAN_LITTLE;
break;
case UTF_32:
case UTF_32BE:
case UTF_32BE_BOM:
input_f = UTF32_INPUT;
input_endian = ENDIAN_BIG;
break;
case UTF_32LE:
case UTF_32LE_BOM:
input_f = UTF32_INPUT;
input_endian = ENDIAN_LITTLE;
break;
#endif
default:
fprintf(stderr, "unknown input encoding: %s\n", codeset);
break;
}
continue;
}
@ -1539,11 +1519,7 @@ void options(unsigned char *cp)
if (enc <= 0) continue;
output_encoding = enc;
switch (nkf_enc_to_index(output_encoding)) {
case ISO_2022_JP:
output_conv = j_oconv;
break;
case CP50220:
output_conv = j_oconv;
x0201_f = TRUE;
#ifdef SHIFTJIS_CP932
cp932inv_f = FALSE;
@ -1553,7 +1529,6 @@ void options(unsigned char *cp)
#endif
break;
case CP50221:
output_conv = j_oconv;
#ifdef SHIFTJIS_CP932
cp932inv_f = FALSE;
#endif
@ -1562,7 +1537,6 @@ void options(unsigned char *cp)
#endif
break;
case ISO_2022_JP_1:
output_conv = j_oconv;
#ifdef X0212_ENABLE
x0212_f = TRUE;
#endif
@ -1571,7 +1545,6 @@ void options(unsigned char *cp)
#endif
break;
case ISO_2022_JP_3:
output_conv = j_oconv;
#ifdef X0212_ENABLE
x0212_f = TRUE;
#endif
@ -1580,26 +1553,17 @@ void options(unsigned char *cp)
cp932inv_f = FALSE;
#endif
break;
case SHIFT_JIS:
output_conv = s_oconv;
break;
case WINDOWS_31J:
output_conv = s_oconv;
#ifdef UTF8_OUTPUT_ENABLE
ms_ucs_map_f = UCS_MAP_CP932;
#endif
break;
case CP10001:
output_conv = s_oconv;
#ifdef UTF8_OUTPUT_ENABLE
ms_ucs_map_f = UCS_MAP_CP10001;
#endif
break;
case EUC_JP:
output_conv = e_oconv;
break;
case CP51932:
output_conv = e_oconv;
#ifdef SHIFTJIS_CP932
cp932inv_f = FALSE;
#endif
@ -1608,7 +1572,6 @@ void options(unsigned char *cp)
#endif
break;
case EUCJP_MS:
output_conv = e_oconv;
#ifdef X0212_ENABLE
x0212_f = TRUE;
#endif
@ -1617,7 +1580,6 @@ void options(unsigned char *cp)
#endif
break;
case EUCJP_ASCII:
output_conv = e_oconv;
#ifdef X0212_ENABLE
x0212_f = TRUE;
#endif
@ -1627,7 +1589,6 @@ void options(unsigned char *cp)
break;
case SHIFT_JISX0213:
case SHIFT_JIS_2004:
output_conv = s_oconv;
x0213_f = TRUE;
#ifdef SHIFTJIS_CP932
cp932inv_f = FALSE;
@ -1635,7 +1596,6 @@ void options(unsigned char *cp)
break;
case EUC_JISX0213:
case EUC_JIS_2004:
output_conv = e_oconv;
#ifdef X0212_ENABLE
x0212_f = TRUE;
#endif
@ -1645,60 +1605,41 @@ void options(unsigned char *cp)
#endif
break;
#ifdef UTF8_OUTPUT_ENABLE
case UTF_8:
case UTF_8N:
output_conv = w_oconv;
break;
case UTF_8_BOM:
output_conv = w_oconv;
output_bom_f = TRUE;
break;
case UTF_16BE:
output_conv = w_oconv16;
break;
case UTF_16:
case UTF_16BE_BOM:
output_conv = w_oconv16;
output_bom_f = TRUE;
break;
case UTF_16LE:
output_conv = w_oconv16;
output_endian = ENDIAN_LITTLE;
output_bom_f = FALSE;
break;
case UTF_16LE_BOM:
output_conv = w_oconv16;
output_endian = ENDIAN_LITTLE;
output_bom_f = TRUE;
break;
case UTF_32:
case UTF_32BE:
output_conv = w_oconv32;
break;
case UTF_32BE_BOM:
output_conv = w_oconv32;
output_bom_f = TRUE;
break;
case UTF_32LE:
output_conv = w_oconv32;
output_endian = ENDIAN_LITTLE;
output_bom_f = FALSE;
break;
case UTF_32LE_BOM:
output_conv = w_oconv32;
output_endian = ENDIAN_LITTLE;
output_bom_f = TRUE;
break;
#endif
default:
fprintf(stderr, "unknown output encoding: %s\n", codeset);
break;
}
continue;
}
if (strcmp(long_option[i].name, "guess=") == 0){
if (p[0] == '1') {
guess_f = 2;
} else {
if (p[0] == '0' || p[0] == '1') {
guess_f = 1;
} else {
guess_f = 2;
}
continue;
}
@ -1872,7 +1813,6 @@ void options(unsigned char *cp)
#endif
#ifdef UNICODE_NORMALIZATION
if (strcmp(long_option[i].name, "utf8mac-input") == 0){
input_f = UTF8_INPUT;
nfc_f = TRUE;
continue;
}
@ -1912,21 +1852,18 @@ void options(unsigned char *cp)
continue;
case 'j': /* JIS output */
case 'n':
output_conv = j_oconv;
output_encoding = nkf_enc_from_index(ISO_2022_JP);
continue;
case 'e': /* AT&T EUC output */
output_conv = e_oconv;
cp932inv_f = FALSE;
output_encoding = nkf_enc_from_index(EUC_JP);
continue;
case 's': /* SJIS output */
output_conv = s_oconv;
output_encoding = nkf_enc_from_index(SHIFT_JIS);
output_encoding = nkf_enc_from_index(WINDOWS_31J);
continue;
case 'l': /* ISO8859 Latin-1 support, no conversion */
iso8859_f = TRUE; /* Only compatible with ISO-2022-JP */
input_f = LATIN1_INPUT;
input_encoding = nkf_enc_from_index(ISO_8859_1);
continue;
case 'i': /* Kanji IN ESC-$-@/B */
if (*cp=='@'||*cp=='B')
@ -1967,7 +1904,7 @@ void options(unsigned char *cp)
#ifdef UTF8_OUTPUT_ENABLE
case 'w': /* UTF-8 output */
if (cp[0] == '8') {
output_conv = w_oconv; cp++;
cp++;
if (cp[0] == '0'){
cp++;
output_encoding = nkf_enc_from_index(UTF_8N);
@ -1978,13 +1915,12 @@ void options(unsigned char *cp)
} else {
int enc_idx;
if ('1'== cp[0] && '6'==cp[1]) {
output_conv = w_oconv16; cp+=2;
cp += 2;
enc_idx = UTF_16;
} else if ('3'== cp[0] && '2'==cp[1]) {
output_conv = w_oconv32; cp+=2;
cp += 2;
enc_idx = UTF_32;
} else {
output_conv = w_oconv;
output_encoding = nkf_enc_from_index(UTF_8);
continue;
}
@ -2016,18 +1952,19 @@ void options(unsigned char *cp)
case 'W': /* UTF input */
if (cp[0] == '8') {
cp++;
input_f = UTF8_INPUT;
input_encoding = nkf_enc_from_index(UTF_8);
}else{
int enc_idx;
if ('1'== cp[0] && '6'==cp[1]) {
cp += 2;
input_f = UTF16_INPUT;
input_endian = ENDIAN_BIG;
enc_idx = UTF_16;
} else if ('3'== cp[0] && '2'==cp[1]) {
cp += 2;
input_f = UTF32_INPUT;
input_endian = ENDIAN_BIG;
enc_idx = UTF_32;
} else {
input_f = UTF8_INPUT;
input_encoding = nkf_enc_from_index(UTF_8);
continue;
}
if (cp[0]=='L') {
@ -2035,19 +1972,24 @@ void options(unsigned char *cp)
input_endian = ENDIAN_LITTLE;
} else if (cp[0] == 'B') {
cp++;
input_endian = ENDIAN_BIG;
}
enc_idx = enc_idx == UTF_16
? (output_endian == ENDIAN_LITTLE ? UTF_16LE : UTF_16BE)
: (output_endian == ENDIAN_LITTLE ? UTF_32LE : UTF_32BE);
input_encoding = nkf_enc_from_index(enc_idx);
}
continue;
#endif
/* Input code assumption */
case 'J': /* JIS input */
input_f = JIS_INPUT;
case 'J': /* ISO-2022-JP input */
input_encoding = nkf_enc_from_index(ISO_2022_JP);
continue;
case 'E': /* AT&T EUC input */
input_f = EUC_INPUT;
case 'E': /* EUC-JP input */
input_encoding = nkf_enc_from_index(EUC_JP);
continue;
case 'S': /* MS Kanji input */
input_f = SJIS_INPUT;
case 'S': /* Windows-31J input */
input_encoding = nkf_enc_from_index(WINDOWS_31J);
continue;
case 'Z': /* Convert X0208 alphabet to asii */
/* alpha_f
@ -2160,10 +2102,10 @@ void options(unsigned char *cp)
continue;
#ifndef PERL_XS
case 'g':
if (*cp == '1') {
if ('2' <= *cp && *cp <= '9') {
guess_f = 2;
cp++;
} else if (*cp == '0') {
} else if (*cp == '0' || *cp == '1') {
guess_f = 1;
cp++;
} else {
@ -2200,7 +2142,7 @@ struct input_code * find_inputcode_byfunc(nkf_char (*iconv_func)(nkf_char c2,nkf
void set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
{
#ifdef INPUT_CODE_FIX
if (f || !input_f)
if (f || !input_encoding)
#endif
if (estab_f != f){
estab_f = f;
@ -2208,7 +2150,7 @@ void set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_ch
if (iconv_func
#ifdef INPUT_CODE_FIX
&& (f == -TRUE || !input_f) /* -TRUE means "FORCE" */
&& (f == -TRUE || !input_encoding) /* -TRUE means "FORCE" */
#endif
){
iconv = iconv_func;
@ -2588,7 +2530,8 @@ nkf_char noconvert(FILE *f)
void module_connection(void)
{
oconv = output_conv;
if (!output_encoding) output_encoding = nkf_enc_from_index(DEFAULT_ENCODING);
oconv = nkf_enc_to_oconv(output_encoding);
o_putc = std_putc;
/* replace continuation module, from output side */
@ -2648,7 +2591,7 @@ void module_connection(void)
}
#endif
#ifdef UNICODE_NORMALIZATION
if (nfc_f && input_f == UTF8_INPUT){
if (nfc_f){
i_nfc_getc = i_getc; i_getc = nfc_getc;
i_nfc_ungetc = i_ungetc; i_ungetc= nfc_ungetc;
}
@ -2661,18 +2604,8 @@ void module_connection(void)
i_bgetc = i_getc; i_getc = broken_getc;
i_bungetc = i_ungetc; i_ungetc = broken_ungetc;
}
if (input_f == JIS_INPUT || input_f == EUC_INPUT || input_f == LATIN1_INPUT) {
set_iconv(-TRUE, e_iconv);
} else if (input_f == SJIS_INPUT) {
set_iconv(-TRUE, s_iconv);
#ifdef UTF8_INPUT_ENABLE
} else if (input_f == UTF8_INPUT) {
set_iconv(-TRUE, w_iconv);
} else if (input_f == UTF16_INPUT) {
set_iconv(-TRUE, w_iconv16);
} else if (input_f == UTF32_INPUT) {
set_iconv(-TRUE, w_iconv32);
#endif
if (input_encoding) {
set_iconv(-TRUE, nkf_enc_to_iconv(input_encoding));
} else {
set_iconv(FALSE, e_iconv);
}
@ -2696,7 +2629,7 @@ void check_bom(FILE *f)
if((c2 = (*i_getc)(f)) == 0x00){
if((c2 = (*i_getc)(f)) == 0xFE){
if((c2 = (*i_getc)(f)) == 0xFF){
if(!input_f){
if(!input_encoding){
set_iconv(TRUE, w_iconv32);
}
if (iconv == w_iconv32) {
@ -2708,7 +2641,7 @@ void check_bom(FILE *f)
(*i_ungetc)(0xFE,f);
}else if(c2 == 0xFF){
if((c2 = (*i_getc)(f)) == 0xFE){
if(!input_f){
if(!input_encoding){
set_iconv(TRUE, w_iconv32);
}
if (iconv == w_iconv32) {
@ -2726,7 +2659,7 @@ void check_bom(FILE *f)
case 0xEF:
if((c2 = (*i_getc)(f)) == 0xBB){
if((c2 = (*i_getc)(f)) == 0xBF){
if(!input_f){
if(!input_encoding){
set_iconv(TRUE, w_iconv);
}
if (iconv == w_iconv) {
@ -2742,7 +2675,7 @@ void check_bom(FILE *f)
if((c2 = (*i_getc)(f)) == 0xFF){
if((c2 = (*i_getc)(f)) == 0x00){
if((c2 = (*i_getc)(f)) == 0x00){
if(!input_f){
if(!input_encoding){
set_iconv(TRUE, w_iconv32);
}
if (iconv == w_iconv32) {
@ -2753,7 +2686,7 @@ void check_bom(FILE *f)
}else (*i_ungetc)(c2,f);
(*i_ungetc)(0x00,f);
}else (*i_ungetc)(c2,f);
if(!input_f){
if(!input_encoding){
set_iconv(TRUE, w_iconv16);
}
if (iconv == w_iconv16) {
@ -2768,7 +2701,7 @@ void check_bom(FILE *f)
if((c2 = (*i_getc)(f)) == 0xFE){
if((c2 = (*i_getc)(f)) == 0x00){
if((c2 = (*i_getc)(f)) == 0x00){
if(!input_f){
if(!input_encoding){
set_iconv(TRUE, w_iconv32);
}
if (iconv == w_iconv32) {
@ -2779,7 +2712,7 @@ void check_bom(FILE *f)
}else (*i_ungetc)(c2,f);
(*i_ungetc)(0x00,f);
}else (*i_ungetc)(c2,f);
if(!input_f){
if(!input_encoding){
set_iconv(TRUE, w_iconv16);
}
if (iconv == w_iconv16) {
@ -2805,11 +2738,7 @@ nkf_char kanji_convert(FILE *f)
nkf_char c3, c2=0, c1, c0=0;
int is_8bit = FALSE;
if(input_f == SJIS_INPUT || input_f == EUC_INPUT
#ifdef UTF8_INPUT_ENABLE
|| input_f == UTF8_INPUT || input_f == UTF16_INPUT
#endif
){
if (input_encoding && !nkf_enc_asciicompat(input_encoding)) {
is_8bit = TRUE;
}
@ -2826,12 +2755,12 @@ nkf_char kanji_convert(FILE *f)
while ((c1 = (*i_getc)(f)) != EOF) {
#ifdef INPUT_CODE_FIX
if (!input_f)
if (!input_encoding)
#endif
code_status(c1);
if (c2) {
/* second byte */
if (c2 > ((input_f == JIS_INPUT && ms_ucs_map_f) ? 0x92 : DEL)) {
if (c2 > ((input_encoding && nkf_enc_cp5022x_p(input_encoding)) ? 0x92 : DEL)) {
/* in case of 8th bit is on */
if (!estab_f&&!mime_decode_mode) {
/* in case of not established yet */
@ -2920,7 +2849,7 @@ nkf_char kanji_convert(FILE *f)
SEND;
} else
#endif
if (c1 > ((input_f == JIS_INPUT && ms_ucs_map_f) ? 0x92 : DEL)) {
if (c1 > ((input_encoding && nkf_enc_cp5022x_p(input_encoding)) ? 0x92 : DEL)) {
/* 8 bit code */
if (!estab_f && !iso8859_f) {
/* not established yet */
@ -5019,7 +4948,7 @@ void hira_conv(nkf_char c2, nkf_char c1)
c2 = 0x24;
(*o_hira_conv)(c2,c1);
return;
} else if (c1 == 0x74 && (output_conv == w_oconv || output_conv == w_oconv16)) {
} else if (c1 == 0x74 && nkf_enc_unicode_p(output_encoding)) {
c2 = 0;
c1 = CLASS_UNICODE | 0x3094;
(*o_hira_conv)(c2,c1);
@ -6252,7 +6181,6 @@ void reinit(void)
binmode_f = TRUE;
rot_f = FALSE;
hira_f = FALSE;
input_f = FALSE;
alpha_f = FALSE;
mime_f = MIME_DECODE_DEFAULT;
mime_decode_f = FALSE;
@ -6318,7 +6246,6 @@ void reinit(void)
kanji_intro = DEFAULT_J;
ascii_intro = DEFAULT_R;
fold_margin = FOLD_MARGIN;
output_conv = DEFAULT_CONV;
oconv = DEFAULT_CONV;
o_zconv = no_connection;
o_fconv = no_connection;
@ -6353,6 +6280,7 @@ void reinit(void)
iconv_for_check = 0;
#endif
input_codename = NULL;
input_encoding = NULL;
output_encoding = nkf_enc_from_index(DEFAULT_ENCODING);
#ifdef WIN32DLL
reinitdll();

File diff suppressed because it is too large Load diff

View file

@ -66,8 +66,7 @@ rb_encoding* rb_nkf_enc_get(const char *name)
int idx = rb_enc_find_index(name);
if (idx < 0) {
nkf_encoding *nkf_enc = nkf_enc_find(name);
nkf_native_encoding * nkf_base_enc = nkf_enc_to_base_encoding(nkf_enc);
idx = rb_enc_find_index(nkf_enc_name(nkf_base_enc));
idx = rb_enc_find_index(nkf_enc_name(nkf_enc_to_base_encoding(nkf_enc)));
if (idx < 0) {
idx = rb_define_dummy_encoding(name);
} else {
@ -127,7 +126,7 @@ int nkf_split_options(const char *arg)
/*
* call-seq:
* NKF.nkf(opt, str) -> string
* NKF.nkf(opt, str) => string
*
* Convert _str_ and return converted result.
* Conversion details are specified by _opt_ as String.
@ -176,7 +175,7 @@ rb_nkf_convert(VALUE obj, VALUE opt, VALUE src)
/*
* call-seq:
* NKF.guess(str) -> encoding
* NKF.guess(str) => encoding
*
* Returns guessed encoding of _str_ by nkf routine.
*
@ -402,7 +401,7 @@ rb_nkf_guess(VALUE obj, VALUE src)
*
* [Shift_JIS] SJIS, MS-Kanji
*
* [CP932] a.k.a. Windows-31J
* [Windows-31J] a.k.a. CP932
*
* [UTF-8] same as UTF-8N
*

View file

@ -25,7 +25,7 @@ class TestKconv < Test::Unit::TestCase
def test_eucjp
assert(@euc_str.dup.force_encoding(nil).iseuc)
assert(@euc_str.iseuc)
assert_equal(::Kconv::EUC, Kconv.guess(@euc_str))
assert_equal(@euc_str, @euc_str.toeuc)
assert_equal(@euc_str, @sjis_str.toeuc)
@ -37,7 +37,7 @@ class TestKconv < Test::Unit::TestCase
assert_equal(@euc_str, @jis_str.kconv(::NKF::EUC))
end
def test_shiftjis
assert(@sjis_str.dup.force_encoding(nil).issjis)
assert(@sjis_str.issjis)
assert_equal(::Kconv::SJIS, Kconv.guess(@sjis_str))
assert_equal(@sjis_str, @euc_str.tosjis)
assert_equal(@sjis_str, @sjis_str.tosjis)
@ -49,7 +49,7 @@ class TestKconv < Test::Unit::TestCase
assert_equal(@sjis_str, @jis_str.kconv(::NKF::SJIS))
end
def test_utf8
assert(@utf8_str.dup.force_encoding(nil).isutf8)
assert(@utf8_str.isutf8)
assert_equal(::Kconv::UTF8, Kconv.guess(@utf8_str))
assert_equal(@utf8_str, @euc_str.toutf8)
assert_equal(@utf8_str, @sjis_str.toutf8)