mirror of
https://github.com/ruby/ruby.git
synced 2022-11-09 12:17:21 -05:00
* Merge Onigmo-5.13.1. [ruby-dev:45057] [Feature #5820]
https://github.com/k-takata/Onigmo cp reg{comp,enc,error,exec,parse,syntax}.c reg{enc,int,parse}.h cp oniguruma.h cp tool/enc-unicode.rb cp -r enc/ git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@34663 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
This commit is contained in:
parent
5362e7fcdd
commit
0424e152c6
54 changed files with 16729 additions and 8086 deletions
14
ChangeLog
14
ChangeLog
|
@ -1,3 +1,17 @@
|
|||
Fri Feb 17 15:38:53 2012 NARUSE, Yui <naruse@ruby-lang.org>
|
||||
|
||||
* Merge Onigmo-5.13.1. [ruby-dev:45057] [Feature #5820]
|
||||
https://github.com/k-takata/Onigmo
|
||||
cp reg{comp,enc,error,exec,parse,syntax}.c reg{enc,int,parse}.h
|
||||
cp oniguruma.h
|
||||
cp tool/enc-unicode.rb
|
||||
cp -r enc/
|
||||
|
||||
Fri Feb 17 12:35:55 2012 NARUSE, Yui <naruse@ruby-lang.org>
|
||||
|
||||
* tool/merger.rb: remove borders from the commit message which is used
|
||||
when the commit doesn't change ChangeLog.
|
||||
|
||||
Fri Feb 17 15:20:30 2012 Nobuyoshi Nakada <nobu@ruby-lang.org>
|
||||
|
||||
* enum.c (enum_each_slice): arrays to be yielded can be newly
|
||||
|
|
3
NEWS
3
NEWS
|
@ -72,6 +72,9 @@ with all sufficient information, see the ChangeLog file.
|
|||
|
||||
See above.
|
||||
|
||||
* Merge Onigmo.
|
||||
https://github.com/k-takata/Onigmo
|
||||
|
||||
* incompatible changes:
|
||||
The :close_others option is true by default for system() and exec().
|
||||
Also, the close-on-exec flag is set by default for all new file descriptors.
|
||||
|
|
|
@ -1,8 +1,9 @@
|
|||
/**********************************************************************
|
||||
ascii.c - Oniguruma (regular expression library)
|
||||
ascii.c - Onigmo (Oniguruma-mod) (regular expression library)
|
||||
**********************************************************************/
|
||||
/*-
|
||||
* Copyright (c) 2002-2006 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
|
||||
* Copyright (c) 2011 K.Takata <kentkt AT csc DOT jp>
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
|
@ -45,7 +46,9 @@ OnigEncodingDefine(ascii, ASCII) = {
|
|||
onigenc_ascii_is_code_ctype,
|
||||
onigenc_not_support_get_ctype_code_range,
|
||||
onigenc_single_byte_left_adjust_char_head,
|
||||
onigenc_always_true_is_allowed_reverse_match
|
||||
onigenc_always_true_is_allowed_reverse_match,
|
||||
0,
|
||||
ONIGENC_FLAG_NONE,
|
||||
};
|
||||
ENC_ALIAS("BINARY", "ASCII-8BIT")
|
||||
ENC_REPLICATE("IBM437", "ASCII-8BIT")
|
||||
|
|
12
enc/big5.c
12
enc/big5.c
|
@ -299,7 +299,9 @@ OnigEncodingDefine(big5, BIG5) = {
|
|||
big5_is_code_ctype,
|
||||
onigenc_not_support_get_ctype_code_range,
|
||||
big5_left_adjust_char_head,
|
||||
big5_is_allowed_reverse_match
|
||||
big5_is_allowed_reverse_match,
|
||||
0,
|
||||
ONIGENC_FLAG_NONE,
|
||||
};
|
||||
|
||||
/*
|
||||
|
@ -331,7 +333,9 @@ OnigEncodingDefine(big5_hkscs, BIG5_HKSCS) = {
|
|||
big5_is_code_ctype,
|
||||
onigenc_not_support_get_ctype_code_range,
|
||||
big5_left_adjust_char_head,
|
||||
big5_is_allowed_reverse_match
|
||||
big5_is_allowed_reverse_match,
|
||||
0,
|
||||
ONIGENC_FLAG_NONE,
|
||||
};
|
||||
ENC_ALIAS("Big5-HKSCS:2008", "Big5-HKSCS")
|
||||
|
||||
|
@ -363,5 +367,7 @@ OnigEncodingDefine(big5_uao, BIG5_UAO) = {
|
|||
big5_is_code_ctype,
|
||||
onigenc_not_support_get_ctype_code_range,
|
||||
big5_left_adjust_char_head,
|
||||
big5_is_allowed_reverse_match
|
||||
big5_is_allowed_reverse_match,
|
||||
0,
|
||||
ONIGENC_FLAG_NONE,
|
||||
};
|
||||
|
|
|
@ -210,7 +210,9 @@ OnigEncodingDefine(cp949, CP949) = {
|
|||
cp949_is_code_ctype,
|
||||
onigenc_not_support_get_ctype_code_range,
|
||||
cp949_left_adjust_char_head,
|
||||
cp949_is_allowed_reverse_match
|
||||
cp949_is_allowed_reverse_match,
|
||||
0,
|
||||
ONIGENC_FLAG_NONE,
|
||||
};
|
||||
/*
|
||||
* Name: CP949
|
||||
|
|
|
@ -334,7 +334,8 @@ OnigEncodingDefine(emacs_mule, Emacs_Mule) = {
|
|||
onigenc_not_support_get_ctype_code_range,
|
||||
left_adjust_char_head,
|
||||
onigenc_always_true_is_allowed_reverse_match,
|
||||
0
|
||||
0,
|
||||
ONIGENC_FLAG_NONE,
|
||||
};
|
||||
|
||||
ENC_REPLICATE("stateless-ISO-2022-JP", "Emacs-Mule")
|
||||
|
|
285
enc/euc_jp.c
285
enc/euc_jp.c
|
@ -1,8 +1,9 @@
|
|||
/**********************************************************************
|
||||
euc_jp.c - Oniguruma (regular expression library)
|
||||
euc_jp.c - Onigmo (Oniguruma-mod) (regular expression library)
|
||||
**********************************************************************/
|
||||
/*-
|
||||
* Copyright (c) 2002-2007 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
|
||||
* Copyright (c) 2002-2008 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
|
||||
* Copyright (c) 2011 K.Takata <kentkt AT csc DOT jp>
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
|
@ -29,7 +30,6 @@
|
|||
|
||||
#include "regint.h"
|
||||
|
||||
|
||||
#define eucjp_islead(c) ((UChar )((c) - 0xa1) > 0xfe - 0xa1)
|
||||
|
||||
static const int EncLen_EUCJP[] = {
|
||||
|
@ -114,6 +114,97 @@ static const signed char trans[][0x100] = {
|
|||
#undef A
|
||||
#undef F
|
||||
|
||||
static const OnigPairCaseFoldCodes CaseFoldMap[] = {
|
||||
/* Fullwidth Alphabet */
|
||||
{ 0xa3c1, 0xa3e1 },
|
||||
{ 0xa3c2, 0xa3e2 },
|
||||
{ 0xa3c3, 0xa3e3 },
|
||||
{ 0xa3c4, 0xa3e4 },
|
||||
{ 0xa3c5, 0xa3e5 },
|
||||
{ 0xa3c6, 0xa3e6 },
|
||||
{ 0xa3c7, 0xa3e7 },
|
||||
{ 0xa3c8, 0xa3e8 },
|
||||
{ 0xa3c9, 0xa3e9 },
|
||||
{ 0xa3ca, 0xa3ea },
|
||||
{ 0xa3cb, 0xa3eb },
|
||||
{ 0xa3cc, 0xa3ec },
|
||||
{ 0xa3cd, 0xa3ed },
|
||||
{ 0xa3ce, 0xa3ee },
|
||||
{ 0xa3cf, 0xa3ef },
|
||||
{ 0xa3d0, 0xa3f0 },
|
||||
{ 0xa3d1, 0xa3f1 },
|
||||
{ 0xa3d2, 0xa3f2 },
|
||||
{ 0xa3d3, 0xa3f3 },
|
||||
{ 0xa3d4, 0xa3f4 },
|
||||
{ 0xa3d5, 0xa3f5 },
|
||||
{ 0xa3d6, 0xa3f6 },
|
||||
{ 0xa3d7, 0xa3f7 },
|
||||
{ 0xa3d8, 0xa3f8 },
|
||||
{ 0xa3d9, 0xa3f9 },
|
||||
{ 0xa3da, 0xa3fa },
|
||||
|
||||
/* Greek */
|
||||
{ 0xa6a1, 0xa6c1 },
|
||||
{ 0xa6a2, 0xa6c2 },
|
||||
{ 0xa6a3, 0xa6c3 },
|
||||
{ 0xa6a4, 0xa6c4 },
|
||||
{ 0xa6a5, 0xa6c5 },
|
||||
{ 0xa6a6, 0xa6c6 },
|
||||
{ 0xa6a7, 0xa6c7 },
|
||||
{ 0xa6a8, 0xa6c8 },
|
||||
{ 0xa6a9, 0xa6c9 },
|
||||
{ 0xa6aa, 0xa6ca },
|
||||
{ 0xa6ab, 0xa6cb },
|
||||
{ 0xa6ac, 0xa6cc },
|
||||
{ 0xa6ad, 0xa6cd },
|
||||
{ 0xa6ae, 0xa6ce },
|
||||
{ 0xa6af, 0xa6cf },
|
||||
{ 0xa6b0, 0xa6d0 },
|
||||
{ 0xa6b1, 0xa6d1 },
|
||||
{ 0xa6b2, 0xa6d2 },
|
||||
{ 0xa6b3, 0xa6d3 },
|
||||
{ 0xa6b4, 0xa6d4 },
|
||||
{ 0xa6b5, 0xa6d5 },
|
||||
{ 0xa6b6, 0xa6d6 },
|
||||
{ 0xa6b7, 0xa6d7 },
|
||||
{ 0xa6b8, 0xa6d8 },
|
||||
|
||||
/* Cyrillic */
|
||||
{ 0xa7a1, 0xa7d1 },
|
||||
{ 0xa7a2, 0xa7d2 },
|
||||
{ 0xa7a3, 0xa7d3 },
|
||||
{ 0xa7a4, 0xa7d4 },
|
||||
{ 0xa7a5, 0xa7d5 },
|
||||
{ 0xa7a6, 0xa7d6 },
|
||||
{ 0xa7a7, 0xa7d7 },
|
||||
{ 0xa7a8, 0xa7d8 },
|
||||
{ 0xa7a9, 0xa7d9 },
|
||||
{ 0xa7aa, 0xa7da },
|
||||
{ 0xa7ab, 0xa7db },
|
||||
{ 0xa7ac, 0xa7dc },
|
||||
{ 0xa7ad, 0xa7dd },
|
||||
{ 0xa7ae, 0xa7de },
|
||||
{ 0xa7af, 0xa7df },
|
||||
{ 0xa7b0, 0xa7e0 },
|
||||
{ 0xa7b1, 0xa7e1 },
|
||||
{ 0xa7b2, 0xa7e2 },
|
||||
{ 0xa7b3, 0xa7e3 },
|
||||
{ 0xa7b4, 0xa7e4 },
|
||||
{ 0xa7b5, 0xa7e5 },
|
||||
{ 0xa7b6, 0xa7e6 },
|
||||
{ 0xa7b7, 0xa7e7 },
|
||||
{ 0xa7b8, 0xa7e8 },
|
||||
{ 0xa7b9, 0xa7e9 },
|
||||
{ 0xa7ba, 0xa7ea },
|
||||
{ 0xa7bb, 0xa7eb },
|
||||
{ 0xa7bc, 0xa7ec },
|
||||
{ 0xa7bd, 0xa7ed },
|
||||
{ 0xa7be, 0xa7ee },
|
||||
{ 0xa7bf, 0xa7ef },
|
||||
{ 0xa7c0, 0xa7f0 },
|
||||
{ 0xa7c1, 0xa7f1 },
|
||||
};
|
||||
|
||||
static int
|
||||
mbc_enc_len(const UChar* p, const UChar* e, OnigEncoding enc ARG_UNUSED)
|
||||
{
|
||||
|
@ -138,7 +229,7 @@ mbc_to_code(const UChar* p, const UChar* end, OnigEncoding enc)
|
|||
int c, i, len;
|
||||
OnigCodePoint n;
|
||||
|
||||
len = enclen(enc, p, end);
|
||||
len = mbc_enc_len(p, end, enc);
|
||||
n = (OnigCodePoint )*p++;
|
||||
if (len == 1) return n;
|
||||
|
||||
|
@ -154,10 +245,10 @@ static int
|
|||
code_to_mbclen(OnigCodePoint code, OnigEncoding enc ARG_UNUSED)
|
||||
{
|
||||
if (ONIGENC_IS_CODE_ASCII(code)) return 1;
|
||||
else if (code > 0xffffff)
|
||||
return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
|
||||
else if (code & 0x800000) return 3;
|
||||
else if (code & 0x8000) return 2;
|
||||
else if (code > 0x00ffffff)
|
||||
return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
|
||||
else if ((code & 0xff808080) == 0x00808080) return 3;
|
||||
else if ((code & 0xffff8080) == 0x00008080) return 2;
|
||||
else
|
||||
return ONIGERR_INVALID_CODE_POINT_VALUE;
|
||||
}
|
||||
|
@ -191,10 +282,87 @@ code_to_mbc(OnigCodePoint code, UChar *buf, OnigEncoding enc)
|
|||
*p++ = (UChar )(code & 0xff);
|
||||
|
||||
#if 1
|
||||
if (enclen(enc, buf, p) != (p - buf))
|
||||
if (mbc_enc_len(buf, p, enc) != (p - buf))
|
||||
return ONIGERR_INVALID_CODE_POINT_VALUE;
|
||||
#endif
|
||||
return (int)(p - buf);
|
||||
return (int )(p - buf);
|
||||
}
|
||||
|
||||
static int
|
||||
apply_all_case_fold(OnigCaseFoldType flag,
|
||||
OnigApplyAllCaseFoldFunc f, void* arg, OnigEncoding enc)
|
||||
{
|
||||
return onigenc_apply_all_case_fold_with_map(
|
||||
sizeof(CaseFoldMap)/sizeof(OnigPairCaseFoldCodes), CaseFoldMap, 0,
|
||||
flag, f, arg);
|
||||
}
|
||||
|
||||
static OnigCodePoint
|
||||
get_lower_case(OnigCodePoint code)
|
||||
{
|
||||
if (ONIGENC_IS_IN_RANGE(code, 0xa3c1, 0xa3da)) {
|
||||
/* Fullwidth Alphabet */
|
||||
return (OnigCodePoint )(code + 0x0020);
|
||||
}
|
||||
else if (ONIGENC_IS_IN_RANGE(code, 0xa6a1, 0xa6b8)) {
|
||||
/* Greek */
|
||||
return (OnigCodePoint )(code + 0x0020);
|
||||
}
|
||||
else if (ONIGENC_IS_IN_RANGE(code, 0xa7a1, 0xa7c1)) {
|
||||
/* Cyrillic */
|
||||
return (OnigCodePoint )(code + 0x0030);
|
||||
}
|
||||
return code;
|
||||
}
|
||||
|
||||
static OnigCodePoint
|
||||
get_upper_case(OnigCodePoint code)
|
||||
{
|
||||
if (ONIGENC_IS_IN_RANGE(code, 0xa3e1, 0xa3fa)) {
|
||||
/* Fullwidth Alphabet */
|
||||
return (OnigCodePoint )(code - 0x0020);
|
||||
}
|
||||
else if (ONIGENC_IS_IN_RANGE(code, 0xa6c1, 0xa6d8)) {
|
||||
/* Greek */
|
||||
return (OnigCodePoint )(code - 0x0020);
|
||||
}
|
||||
else if (ONIGENC_IS_IN_RANGE(code, 0xa7d1, 0xa7f1)) {
|
||||
/* Cyrillic */
|
||||
return (OnigCodePoint )(code - 0x0030);
|
||||
}
|
||||
return code;
|
||||
}
|
||||
|
||||
static int
|
||||
get_case_fold_codes_by_str(OnigCaseFoldType flag,
|
||||
const OnigUChar* p, const OnigUChar* end,
|
||||
OnigCaseFoldCodeItem items[], OnigEncoding enc)
|
||||
{
|
||||
int len;
|
||||
OnigCodePoint code, code_lo, code_up;
|
||||
|
||||
code = mbc_to_code(p, end, enc);
|
||||
if (ONIGENC_IS_ASCII_CODE(code))
|
||||
return onigenc_ascii_get_case_fold_codes_by_str(flag, p, end, items, enc);
|
||||
|
||||
len = mbc_enc_len(p, end, enc);
|
||||
code_lo = get_lower_case(code);
|
||||
code_up = get_upper_case(code);
|
||||
|
||||
if (code != code_lo) {
|
||||
items[0].byte_len = len;
|
||||
items[0].code_len = 1;
|
||||
items[0].code[0] = code_lo;
|
||||
return 1;
|
||||
}
|
||||
else if (code != code_up) {
|
||||
items[0].byte_len = len;
|
||||
items[0].code_len = 1;
|
||||
items[0].code[0] = code_up;
|
||||
return 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int
|
||||
|
@ -202,7 +370,6 @@ mbc_case_fold(OnigCaseFoldType flag,
|
|||
const UChar** pp, const UChar* end, UChar* lower,
|
||||
OnigEncoding enc)
|
||||
{
|
||||
int len;
|
||||
const UChar* p = *pp;
|
||||
|
||||
if (ONIGENC_IS_MBC_ASCII(p)) {
|
||||
|
@ -211,12 +378,11 @@ mbc_case_fold(OnigCaseFoldType flag,
|
|||
return 1;
|
||||
}
|
||||
else {
|
||||
int i;
|
||||
OnigCodePoint code;
|
||||
int len;
|
||||
|
||||
len = enclen(enc, p, end);
|
||||
for (i = 0; i < len; i++) {
|
||||
*lower++ = *p++;
|
||||
}
|
||||
code = get_lower_case(mbc_to_code(p, end, enc));
|
||||
len = code_to_mbc(code, lower, enc);
|
||||
(*pp) += len;
|
||||
return len; /* return byte length of converted char to lower */
|
||||
}
|
||||
|
@ -235,7 +401,7 @@ left_adjust_char_head(const UChar* start, const UChar* s, const UChar* end, Onig
|
|||
p = s;
|
||||
|
||||
while (!eucjp_islead(*p) && p > start) p--;
|
||||
len = enclen(enc, p, end);
|
||||
len = mbc_enc_len(p, end, enc);
|
||||
if (p + len > s) return (UChar* )p;
|
||||
p += len;
|
||||
return (UChar* )(p + ((s - p) & ~1));
|
||||
|
@ -260,15 +426,83 @@ static hash_table_type* PropertyNameTable;
|
|||
|
||||
static const OnigCodePoint CR_Hiragana[] = {
|
||||
1,
|
||||
#ifdef ENC_EUC_JIS_2004
|
||||
0xa4a1, 0xa4fb
|
||||
#else
|
||||
0xa4a1, 0xa4f3
|
||||
#endif
|
||||
}; /* CR_Hiragana */
|
||||
|
||||
#ifdef ENC_EUC_JIS_2004
|
||||
static const OnigCodePoint CR_Katakana[] = {
|
||||
5,
|
||||
0x8ea6, 0x8eaf, /* JIS X 0201 Katakana */
|
||||
0x8eb1, 0x8edd, /* JIS X 0201 Katakana */
|
||||
0xa5a1, 0xa5fe,
|
||||
0xa6ee, 0xa6fe,
|
||||
0xa7f2, 0xa7f5,
|
||||
}; /* CR_Katakana */
|
||||
#else
|
||||
static const OnigCodePoint CR_Katakana[] = {
|
||||
3,
|
||||
0x8ea6, 0x8eaf, /* JIS X 0201 Katakana */
|
||||
0x8eb1, 0x8edd, /* JIS X 0201 Katakana */
|
||||
0xa5a1, 0xa5f6,
|
||||
0xaaa6, 0xaaaf,
|
||||
0xaab1, 0xaadd
|
||||
}; /* CR_Katakana */
|
||||
#endif
|
||||
|
||||
#ifdef ENC_EUC_JIS_2004
|
||||
static const OnigCodePoint CR_Han[] = {
|
||||
/* EUC-JIS-2004 (JIS X 0213:2004) */
|
||||
7,
|
||||
/* plane 1 */
|
||||
0xa1b8, 0xa1b8,
|
||||
0xaea1, 0xfefe, /* Kanji level 1, 2 and 3 */
|
||||
/* plane 2 */
|
||||
0x8fa1a1, 0x8fa1fe, /* row 1 */
|
||||
0x8fa3a1, 0x8fa5fe, /* row 3 .. 5 */
|
||||
0x8fa8a1, 0x8fa8fe, /* row 8 */
|
||||
0x8faca1, 0x8faffe, /* row 12 .. 15 */
|
||||
0x8feea1, 0x8ffef6, /* row 78 .. 94 */
|
||||
}; /* CR_Han */
|
||||
#else
|
||||
static const OnigCodePoint CR_Han[] = {
|
||||
/* EUC-JP (JIS X 0208 based) */
|
||||
4,
|
||||
0xa1b8, 0xa1b8,
|
||||
0xb0a1, 0xcfd3, /* Kanji level 1 */
|
||||
0xd0a1, 0xf4a6, /* Kanji level 2 */
|
||||
0x8fb0a1, 0x8fedf3 /* JIS X 0212 Supplemental Kanji (row 16 .. 77) */
|
||||
}; /* CR_Han */
|
||||
#endif
|
||||
|
||||
static const OnigCodePoint CR_Latin[] = {
|
||||
4,
|
||||
0x0041, 0x005a,
|
||||
0x0061, 0x007a,
|
||||
0xa3c1, 0xa3da,
|
||||
0xa3e1, 0xa3fa,
|
||||
/* TODO: add row 8 .. 11 to support EUC-JIS-2004 */
|
||||
/* TODO: add JIS X 0212 row 9 .. 11 */
|
||||
}; /* CR_Latin */
|
||||
|
||||
static const OnigCodePoint CR_Greek[] = {
|
||||
2,
|
||||
0xa6a1, 0xa6b8,
|
||||
#ifdef ENC_EUC_JIS_2004
|
||||
0xa6c1, 0xa6d9,
|
||||
#else
|
||||
0xa6c1, 0xa6d8,
|
||||
/* TODO: add JIS X 0212 row 6 */
|
||||
#endif
|
||||
}; /* CR_Greek */
|
||||
|
||||
static const OnigCodePoint CR_Cyrillic[] = {
|
||||
2,
|
||||
0xa7a1, 0xa7c1,
|
||||
0xa7d1, 0xa7f1,
|
||||
/* TODO: add JIS X 0212 row 7 */
|
||||
}; /* CR_Cyrillic */
|
||||
|
||||
static int
|
||||
init_property_list(void)
|
||||
|
@ -277,6 +511,10 @@ init_property_list(void)
|
|||
|
||||
PROPERTY_LIST_ADD_PROP("hiragana", CR_Hiragana);
|
||||
PROPERTY_LIST_ADD_PROP("katakana", CR_Katakana);
|
||||
PROPERTY_LIST_ADD_PROP("han", CR_Han);
|
||||
PROPERTY_LIST_ADD_PROP("latin", CR_Latin);
|
||||
PROPERTY_LIST_ADD_PROP("greek", CR_Greek);
|
||||
PROPERTY_LIST_ADD_PROP("cyrillic", CR_Cyrillic);
|
||||
PropertyInited = 1;
|
||||
|
||||
end:
|
||||
|
@ -300,7 +538,7 @@ property_name_to_ctype(OnigEncoding enc, UChar* p, UChar* end)
|
|||
return onigenc_minimum_property_name_to_ctype(enc, s, e);
|
||||
}
|
||||
|
||||
return (int)ctype;
|
||||
return (int )ctype;
|
||||
}
|
||||
|
||||
static int
|
||||
|
@ -360,14 +598,15 @@ OnigEncodingDefine(euc_jp, EUC_JP) = {
|
|||
code_to_mbclen,
|
||||
code_to_mbc,
|
||||
mbc_case_fold,
|
||||
onigenc_ascii_apply_all_case_fold,
|
||||
onigenc_ascii_get_case_fold_codes_by_str,
|
||||
apply_all_case_fold,
|
||||
get_case_fold_codes_by_str,
|
||||
property_name_to_ctype,
|
||||
is_code_ctype,
|
||||
get_ctype_code_range,
|
||||
left_adjust_char_head,
|
||||
is_allowed_reverse_match,
|
||||
0
|
||||
0,
|
||||
ONIGENC_FLAG_NONE,
|
||||
};
|
||||
/*
|
||||
* Name: EUC-JP
|
||||
|
|
|
@ -187,6 +187,8 @@ OnigEncodingDefine(euc_kr, EUC_KR) = {
|
|||
euckr_is_code_ctype,
|
||||
onigenc_not_support_get_ctype_code_range,
|
||||
euckr_left_adjust_char_head,
|
||||
euckr_is_allowed_reverse_match
|
||||
euckr_is_allowed_reverse_match,
|
||||
0,
|
||||
ONIGENC_FLAG_NONE,
|
||||
};
|
||||
ENC_ALIAS("eucKR", "EUC-KR")
|
||||
|
|
|
@ -220,6 +220,8 @@ OnigEncodingDefine(euc_tw, EUC_TW) = {
|
|||
euctw_is_code_ctype,
|
||||
onigenc_not_support_get_ctype_code_range,
|
||||
euctw_left_adjust_char_head,
|
||||
euctw_is_allowed_reverse_match
|
||||
euctw_is_allowed_reverse_match,
|
||||
0,
|
||||
ONIGENC_FLAG_NONE,
|
||||
};
|
||||
ENC_ALIAS("eucTW", "EUC-TW")
|
||||
|
|
|
@ -596,6 +596,8 @@ OnigEncodingDefine(gb18030, GB18030) = {
|
|||
gb18030_is_code_ctype,
|
||||
onigenc_not_support_get_ctype_code_range,
|
||||
gb18030_left_adjust_char_head,
|
||||
gb18030_is_allowed_reverse_match
|
||||
gb18030_is_allowed_reverse_match,
|
||||
0,
|
||||
ONIGENC_FLAG_NONE,
|
||||
};
|
||||
|
||||
|
|
|
@ -210,7 +210,9 @@ OnigEncodingDefine(gbk, GBK) = {
|
|||
gbk_is_code_ctype,
|
||||
onigenc_not_support_get_ctype_code_range,
|
||||
gbk_left_adjust_char_head,
|
||||
gbk_is_allowed_reverse_match
|
||||
gbk_is_allowed_reverse_match,
|
||||
0,
|
||||
ONIGENC_FLAG_NONE,
|
||||
};
|
||||
/*
|
||||
* Name: GBK
|
||||
|
|
|
@ -272,7 +272,9 @@ OnigEncodingDefine(iso_8859_1, ISO_8859_1) = {
|
|||
is_code_ctype,
|
||||
onigenc_not_support_get_ctype_code_range,
|
||||
onigenc_single_byte_left_adjust_char_head,
|
||||
onigenc_always_true_is_allowed_reverse_match
|
||||
onigenc_always_true_is_allowed_reverse_match,
|
||||
0,
|
||||
ONIGENC_FLAG_NONE,
|
||||
};
|
||||
ENC_ALIAS("ISO8859-1", "ISO-8859-1")
|
||||
|
||||
|
|
|
@ -239,6 +239,8 @@ OnigEncodingDefine(iso_8859_10, ISO_8859_10) = {
|
|||
is_code_ctype,
|
||||
onigenc_not_support_get_ctype_code_range,
|
||||
onigenc_single_byte_left_adjust_char_head,
|
||||
onigenc_always_true_is_allowed_reverse_match
|
||||
onigenc_always_true_is_allowed_reverse_match,
|
||||
0,
|
||||
ONIGENC_FLAG_NONE,
|
||||
};
|
||||
ENC_ALIAS("ISO8859-10", "ISO-8859-10")
|
||||
|
|
|
@ -92,7 +92,9 @@ OnigEncodingDefine(iso_8859_11, ISO_8859_11) = {
|
|||
is_code_ctype,
|
||||
onigenc_not_support_get_ctype_code_range,
|
||||
onigenc_single_byte_left_adjust_char_head,
|
||||
onigenc_always_true_is_allowed_reverse_match
|
||||
onigenc_always_true_is_allowed_reverse_match,
|
||||
0,
|
||||
ONIGENC_FLAG_NONE,
|
||||
};
|
||||
ENC_ALIAS("ISO8859-11", "ISO-8859-11")
|
||||
|
||||
|
|
|
@ -228,7 +228,9 @@ OnigEncodingDefine(iso_8859_13, ISO_8859_13) = {
|
|||
is_code_ctype,
|
||||
onigenc_not_support_get_ctype_code_range,
|
||||
onigenc_single_byte_left_adjust_char_head,
|
||||
onigenc_always_true_is_allowed_reverse_match
|
||||
onigenc_always_true_is_allowed_reverse_match,
|
||||
0,
|
||||
ONIGENC_FLAG_NONE,
|
||||
};
|
||||
ENC_ALIAS("ISO8859-13", "ISO-8859-13")
|
||||
|
||||
|
|
|
@ -241,6 +241,8 @@ OnigEncodingDefine(iso_8859_14, ISO_8859_14) = {
|
|||
is_code_ctype,
|
||||
onigenc_not_support_get_ctype_code_range,
|
||||
onigenc_single_byte_left_adjust_char_head,
|
||||
onigenc_always_true_is_allowed_reverse_match
|
||||
onigenc_always_true_is_allowed_reverse_match,
|
||||
0,
|
||||
ONIGENC_FLAG_NONE,
|
||||
};
|
||||
ENC_ALIAS("ISO8859-14", "ISO-8859-14")
|
||||
|
|
|
@ -235,6 +235,8 @@ OnigEncodingDefine(iso_8859_15, ISO_8859_15) = {
|
|||
is_code_ctype,
|
||||
onigenc_not_support_get_ctype_code_range,
|
||||
onigenc_single_byte_left_adjust_char_head,
|
||||
onigenc_always_true_is_allowed_reverse_match
|
||||
onigenc_always_true_is_allowed_reverse_match,
|
||||
0,
|
||||
ONIGENC_FLAG_NONE,
|
||||
};
|
||||
ENC_ALIAS("ISO8859-15", "ISO-8859-15")
|
||||
|
|
|
@ -237,6 +237,8 @@ OnigEncodingDefine(iso_8859_16, ISO_8859_16) = {
|
|||
is_code_ctype,
|
||||
onigenc_not_support_get_ctype_code_range,
|
||||
onigenc_single_byte_left_adjust_char_head,
|
||||
onigenc_always_true_is_allowed_reverse_match
|
||||
onigenc_always_true_is_allowed_reverse_match,
|
||||
0,
|
||||
ONIGENC_FLAG_NONE,
|
||||
};
|
||||
ENC_ALIAS("ISO8859-16", "ISO-8859-16")
|
||||
|
|
|
@ -237,7 +237,9 @@ OnigEncodingDefine(iso_8859_2, ISO_8859_2) = {
|
|||
is_code_ctype,
|
||||
onigenc_not_support_get_ctype_code_range,
|
||||
onigenc_single_byte_left_adjust_char_head,
|
||||
onigenc_always_true_is_allowed_reverse_match
|
||||
onigenc_always_true_is_allowed_reverse_match,
|
||||
0,
|
||||
ONIGENC_FLAG_NONE,
|
||||
};
|
||||
ENC_ALIAS("ISO8859-2", "ISO-8859-2")
|
||||
|
||||
|
|
|
@ -235,6 +235,8 @@ OnigEncodingDefine(iso_8859_3, ISO_8859_3) = {
|
|||
is_code_ctype,
|
||||
onigenc_not_support_get_ctype_code_range,
|
||||
onigenc_single_byte_left_adjust_char_head,
|
||||
onigenc_always_true_is_allowed_reverse_match
|
||||
onigenc_always_true_is_allowed_reverse_match,
|
||||
0,
|
||||
ONIGENC_FLAG_NONE,
|
||||
};
|
||||
ENC_ALIAS("ISO8859-3", "ISO-8859-3")
|
||||
|
|
|
@ -237,6 +237,8 @@ OnigEncodingDefine(iso_8859_4, ISO_8859_4) = {
|
|||
is_code_ctype,
|
||||
onigenc_not_support_get_ctype_code_range,
|
||||
onigenc_single_byte_left_adjust_char_head,
|
||||
onigenc_always_true_is_allowed_reverse_match
|
||||
onigenc_always_true_is_allowed_reverse_match,
|
||||
0,
|
||||
ONIGENC_FLAG_NONE,
|
||||
};
|
||||
ENC_ALIAS("ISO8859-4", "ISO-8859-4")
|
||||
|
|
|
@ -225,6 +225,8 @@ OnigEncodingDefine(iso_8859_5, ISO_8859_5) = {
|
|||
is_code_ctype,
|
||||
onigenc_not_support_get_ctype_code_range,
|
||||
onigenc_single_byte_left_adjust_char_head,
|
||||
onigenc_always_true_is_allowed_reverse_match
|
||||
onigenc_always_true_is_allowed_reverse_match,
|
||||
0,
|
||||
ONIGENC_FLAG_NONE,
|
||||
};
|
||||
ENC_ALIAS("ISO8859-5", "ISO-8859-5")
|
||||
|
|
|
@ -92,7 +92,9 @@ OnigEncodingDefine(iso_8859_6, ISO_8859_6) = {
|
|||
is_code_ctype,
|
||||
onigenc_not_support_get_ctype_code_range,
|
||||
onigenc_single_byte_left_adjust_char_head,
|
||||
onigenc_always_true_is_allowed_reverse_match
|
||||
onigenc_always_true_is_allowed_reverse_match,
|
||||
0,
|
||||
ONIGENC_FLAG_NONE,
|
||||
};
|
||||
ENC_ALIAS("ISO8859-6", "ISO-8859-6")
|
||||
|
||||
|
|
|
@ -222,7 +222,9 @@ OnigEncodingDefine(iso_8859_7, ISO_8859_7) = {
|
|||
is_code_ctype,
|
||||
onigenc_not_support_get_ctype_code_range,
|
||||
onigenc_single_byte_left_adjust_char_head,
|
||||
onigenc_always_true_is_allowed_reverse_match
|
||||
onigenc_always_true_is_allowed_reverse_match,
|
||||
0,
|
||||
ONIGENC_FLAG_NONE,
|
||||
};
|
||||
ENC_ALIAS("ISO8859-7", "ISO-8859-7")
|
||||
|
||||
|
|
|
@ -92,7 +92,9 @@ OnigEncodingDefine(iso_8859_8, ISO_8859_8) = {
|
|||
is_code_ctype,
|
||||
onigenc_not_support_get_ctype_code_range,
|
||||
onigenc_single_byte_left_adjust_char_head,
|
||||
onigenc_always_true_is_allowed_reverse_match
|
||||
onigenc_always_true_is_allowed_reverse_match,
|
||||
0,
|
||||
ONIGENC_FLAG_NONE,
|
||||
};
|
||||
ENC_ALIAS("ISO8859-8", "ISO-8859-8")
|
||||
|
||||
|
|
|
@ -228,7 +228,9 @@ OnigEncodingDefine(iso_8859_9, ISO_8859_9) = {
|
|||
is_code_ctype,
|
||||
onigenc_not_support_get_ctype_code_range,
|
||||
onigenc_single_byte_left_adjust_char_head,
|
||||
onigenc_always_true_is_allowed_reverse_match
|
||||
onigenc_always_true_is_allowed_reverse_match,
|
||||
0,
|
||||
ONIGENC_FLAG_NONE,
|
||||
};
|
||||
ENC_ALIAS("ISO8859-9", "ISO-8859-9")
|
||||
|
||||
|
|
|
@ -213,7 +213,9 @@ OnigEncodingDefine(koi8_r, KOI8_R) = {
|
|||
koi8_r_is_code_ctype,
|
||||
onigenc_not_support_get_ctype_code_range,
|
||||
onigenc_single_byte_left_adjust_char_head,
|
||||
onigenc_always_true_is_allowed_reverse_match
|
||||
onigenc_always_true_is_allowed_reverse_match,
|
||||
0,
|
||||
ONIGENC_FLAG_NONE,
|
||||
};
|
||||
ENC_ALIAS("CP878", "KOI8-R")
|
||||
|
||||
|
|
|
@ -217,5 +217,7 @@ OnigEncodingDefine(koi8_u, KOI8_U) = {
|
|||
koi8_u_is_code_ctype,
|
||||
onigenc_not_support_get_ctype_code_range,
|
||||
onigenc_single_byte_left_adjust_char_head,
|
||||
onigenc_always_true_is_allowed_reverse_match
|
||||
onigenc_always_true_is_allowed_reverse_match,
|
||||
0,
|
||||
ONIGENC_FLAG_NONE,
|
||||
};
|
||||
|
|
1162
enc/mktable.c
Normal file
1162
enc/mktable.c
Normal file
File diff suppressed because it is too large
Load diff
280
enc/shift_jis.c
280
enc/shift_jis.c
|
@ -1,8 +1,9 @@
|
|||
/**********************************************************************
|
||||
sjis.c - Oniguruma (regular expression library)
|
||||
sjis.c - Onigmo (Oniguruma-mod) (regular expression library)
|
||||
**********************************************************************/
|
||||
/*-
|
||||
* Copyright (c) 2002-2008 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
|
||||
* Copyright (c) 2011 K.Takata <kentkt AT csc DOT jp>
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
|
@ -67,6 +68,97 @@ static const char SJIS_CAN_BE_TRAIL_TABLE[256] = {
|
|||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0
|
||||
};
|
||||
|
||||
static const OnigPairCaseFoldCodes CaseFoldMap[] = {
|
||||
/* Fullwidth Alphabet */
|
||||
{ 0x8260, 0x8281 },
|
||||
{ 0x8261, 0x8282 },
|
||||
{ 0x8262, 0x8283 },
|
||||
{ 0x8263, 0x8284 },
|
||||
{ 0x8264, 0x8285 },
|
||||
{ 0x8265, 0x8286 },
|
||||
{ 0x8266, 0x8287 },
|
||||
{ 0x8267, 0x8288 },
|
||||
{ 0x8268, 0x8289 },
|
||||
{ 0x8269, 0x828a },
|
||||
{ 0x826a, 0x828b },
|
||||
{ 0x826b, 0x828c },
|
||||
{ 0x826c, 0x828d },
|
||||
{ 0x826d, 0x828e },
|
||||
{ 0x826e, 0x828f },
|
||||
{ 0x826f, 0x8290 },
|
||||
{ 0x8270, 0x8291 },
|
||||
{ 0x8271, 0x8292 },
|
||||
{ 0x8272, 0x8293 },
|
||||
{ 0x8273, 0x8294 },
|
||||
{ 0x8274, 0x8295 },
|
||||
{ 0x8275, 0x8296 },
|
||||
{ 0x8276, 0x8297 },
|
||||
{ 0x8277, 0x8298 },
|
||||
{ 0x8278, 0x8299 },
|
||||
{ 0x8279, 0x829a },
|
||||
|
||||
/* Greek */
|
||||
{ 0x839f, 0x83bf },
|
||||
{ 0x83a0, 0x83c0 },
|
||||
{ 0x83a1, 0x83c1 },
|
||||
{ 0x83a2, 0x83c2 },
|
||||
{ 0x83a3, 0x83c3 },
|
||||
{ 0x83a4, 0x83c4 },
|
||||
{ 0x83a5, 0x83c5 },
|
||||
{ 0x83a6, 0x83c6 },
|
||||
{ 0x83a7, 0x83c7 },
|
||||
{ 0x83a8, 0x83c8 },
|
||||
{ 0x83a9, 0x83c9 },
|
||||
{ 0x83aa, 0x83ca },
|
||||
{ 0x83ab, 0x83cb },
|
||||
{ 0x83ac, 0x83cc },
|
||||
{ 0x83ad, 0x83cd },
|
||||
{ 0x83ae, 0x83ce },
|
||||
{ 0x83af, 0x83cf },
|
||||
{ 0x83b0, 0x83d0 },
|
||||
{ 0x83b1, 0x83d1 },
|
||||
{ 0x83b2, 0x83d2 },
|
||||
{ 0x83b3, 0x83d3 },
|
||||
{ 0x83b4, 0x83d4 },
|
||||
{ 0x83b5, 0x83d5 },
|
||||
{ 0x83b6, 0x83d6 },
|
||||
|
||||
/* Cyrillic */
|
||||
{ 0x8440, 0x8470 },
|
||||
{ 0x8441, 0x8471 },
|
||||
{ 0x8442, 0x8472 },
|
||||
{ 0x8443, 0x8473 },
|
||||
{ 0x8444, 0x8474 },
|
||||
{ 0x8445, 0x8475 },
|
||||
{ 0x8446, 0x8476 },
|
||||
{ 0x8447, 0x8477 },
|
||||
{ 0x8448, 0x8478 },
|
||||
{ 0x8449, 0x8479 },
|
||||
{ 0x844a, 0x847a },
|
||||
{ 0x844b, 0x847b },
|
||||
{ 0x844c, 0x847c },
|
||||
{ 0x844d, 0x847d },
|
||||
{ 0x844e, 0x847e },
|
||||
{ 0x844f, 0x8480 },
|
||||
{ 0x8450, 0x8481 },
|
||||
{ 0x8451, 0x8482 },
|
||||
{ 0x8452, 0x8483 },
|
||||
{ 0x8453, 0x8484 },
|
||||
{ 0x8454, 0x8485 },
|
||||
{ 0x8455, 0x8486 },
|
||||
{ 0x8456, 0x8487 },
|
||||
{ 0x8457, 0x8488 },
|
||||
{ 0x8458, 0x8489 },
|
||||
{ 0x8459, 0x848a },
|
||||
{ 0x845a, 0x848b },
|
||||
{ 0x845b, 0x848c },
|
||||
{ 0x845c, 0x848d },
|
||||
{ 0x845d, 0x848e },
|
||||
{ 0x845e, 0x848f },
|
||||
{ 0x845f, 0x8490 },
|
||||
{ 0x8460, 0x8491 },
|
||||
};
|
||||
|
||||
#define SJIS_ISMB_FIRST(byte) (EncLen_SJIS[byte] > 1)
|
||||
#define SJIS_ISMB_TRAIL(byte) SJIS_CAN_BE_TRAIL_TABLE[(byte)]
|
||||
|
||||
|
@ -150,7 +242,7 @@ mbc_to_code(const UChar* p, const UChar* end, OnigEncoding enc)
|
|||
int c, i, len;
|
||||
OnigCodePoint n;
|
||||
|
||||
len = enclen(enc, p, end);
|
||||
len = mbc_enc_len(p, end, enc);
|
||||
c = *p++;
|
||||
n = c;
|
||||
if (len == 1) return n;
|
||||
|
@ -172,10 +264,90 @@ code_to_mbc(OnigCodePoint code, UChar *buf, OnigEncoding enc)
|
|||
*p++ = (UChar )(code & 0xff);
|
||||
|
||||
#if 0
|
||||
if (enclen(enc, buf) != (p - buf))
|
||||
if (mbc_enc_len(buf, p, enc) != (p - buf))
|
||||
return REGERR_INVALID_CODE_POINT_VALUE;
|
||||
#endif
|
||||
return (int)(p - buf);
|
||||
return (int )(p - buf);
|
||||
}
|
||||
|
||||
static int
|
||||
apply_all_case_fold(OnigCaseFoldType flag,
|
||||
OnigApplyAllCaseFoldFunc f, void* arg, OnigEncoding enc)
|
||||
{
|
||||
return onigenc_apply_all_case_fold_with_map(
|
||||
sizeof(CaseFoldMap)/sizeof(OnigPairCaseFoldCodes), CaseFoldMap, 0,
|
||||
flag, f, arg);
|
||||
}
|
||||
|
||||
static OnigCodePoint
|
||||
get_lower_case(OnigCodePoint code)
|
||||
{
|
||||
if (ONIGENC_IS_IN_RANGE(code, 0x8260, 0x8279)) {
|
||||
/* Fullwidth Alphabet */
|
||||
return (OnigCodePoint )(code + 0x0021);
|
||||
}
|
||||
else if (ONIGENC_IS_IN_RANGE(code, 0x839f, 0x83b6)) {
|
||||
/* Greek */
|
||||
return (OnigCodePoint )(code + 0x0020);
|
||||
}
|
||||
else if (ONIGENC_IS_IN_RANGE(code, 0x8440, 0x8460)) {
|
||||
/* Cyrillic */
|
||||
int d = (code >= 0x844f) ? 1 : 0;
|
||||
return (OnigCodePoint )(code + (0x0030 + d));
|
||||
}
|
||||
return code;
|
||||
}
|
||||
|
||||
static OnigCodePoint
|
||||
get_upper_case(OnigCodePoint code)
|
||||
{
|
||||
if (ONIGENC_IS_IN_RANGE(code, 0x8281, 0x829a)) {
|
||||
/* Fullwidth Alphabet */
|
||||
return (OnigCodePoint )(code - 0x0021);
|
||||
}
|
||||
else if (ONIGENC_IS_IN_RANGE(code, 0x83bf, 0x83d6)) {
|
||||
/* Greek */
|
||||
return (OnigCodePoint )(code - 0x0020);
|
||||
}
|
||||
else if (ONIGENC_IS_IN_RANGE(code, 0x8470, 0x847e) ||
|
||||
ONIGENC_IS_IN_RANGE(code, 0x8480, 0x8491)) {
|
||||
/* Cyrillic */
|
||||
int d = (code >= 0x8480) ? 1 : 0;
|
||||
return (OnigCodePoint )(code - (0x0030 - d));
|
||||
}
|
||||
return code;
|
||||
}
|
||||
|
||||
static int
|
||||
get_case_fold_codes_by_str(OnigCaseFoldType flag,
|
||||
const OnigUChar* p, const OnigUChar* end,
|
||||
OnigCaseFoldCodeItem items[], OnigEncoding enc)
|
||||
{
|
||||
int len;
|
||||
OnigCodePoint code, code_lo, code_up;
|
||||
|
||||
code = mbc_to_code(p, end, enc);
|
||||
if (ONIGENC_IS_ASCII_CODE(code))
|
||||
return onigenc_ascii_get_case_fold_codes_by_str(flag, p, end, items, enc);
|
||||
|
||||
len = mbc_enc_len(p, end, enc);
|
||||
code_lo = get_lower_case(code);
|
||||
code_up = get_upper_case(code);
|
||||
|
||||
if (code != code_lo) {
|
||||
items[0].byte_len = len;
|
||||
items[0].code_len = 1;
|
||||
items[0].code[0] = code_lo;
|
||||
return 1;
|
||||
}
|
||||
else if (code != code_up) {
|
||||
items[0].byte_len = len;
|
||||
items[0].code_len = 1;
|
||||
items[0].code[0] = code_up;
|
||||
return 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int
|
||||
|
@ -191,12 +363,11 @@ mbc_case_fold(OnigCaseFoldType flag,
|
|||
return 1;
|
||||
}
|
||||
else {
|
||||
int i;
|
||||
int len = enclen(enc, p, end);
|
||||
OnigCodePoint code;
|
||||
int len;
|
||||
|
||||
for (i = 0; i < len; i++) {
|
||||
*lower++ = *p++;
|
||||
}
|
||||
code = get_lower_case(mbc_to_code(p, end, enc));
|
||||
len = code_to_mbc(code, lower, enc);
|
||||
(*pp) += len;
|
||||
return len; /* return byte length of converted char to lower */
|
||||
}
|
||||
|
@ -245,7 +416,7 @@ left_adjust_char_head(const UChar* start, const UChar* s, const UChar* end, Onig
|
|||
}
|
||||
}
|
||||
}
|
||||
len = enclen(enc, p, end);
|
||||
len = mbc_enc_len(p, end, enc);
|
||||
if (p + len > s) return (UChar* )p;
|
||||
p += len;
|
||||
return (UChar* )(p + ((s - p) & ~1));
|
||||
|
@ -278,6 +449,47 @@ static const OnigCodePoint CR_Katakana[] = {
|
|||
0x8380, 0x8396,
|
||||
}; /* CR_Katakana */
|
||||
|
||||
/*
 * Code point ranges registered under the property name "han".
 * Layout: the first element is the number of ranges, followed by that many
 * (first, last) pairs -- presumably inclusive bounds; confirm against the
 * range lookup code.
 * When ENC_CP932 is defined (windows_31j.c defines it before including
 * shift_jis.c) two extra ranges for the NEC-selected/IBM extended characters
 * are included.
 */
#ifdef ENC_CP932
|
||||
static const OnigCodePoint CR_Han[] = {
|
||||
6, /* number of (first, last) pairs that follow */
|
||||
0x8157, 0x8157,
|
||||
0x889f, 0x9872, /* Kanji level 1 */
|
||||
0x989f, 0x9ffc, /* Kanji level 2 */
|
||||
0xe040, 0xeaa4, /* Kanji level 2 */
|
||||
0xed40, 0xeeec, /* NEC-selected IBM extended characters (without symbols) */
|
||||
0xfa5c, 0xfc4b, /* IBM extended characters (without symbols) */
|
||||
}; /* CR_Han */
|
||||
#else
|
||||
static const OnigCodePoint CR_Han[] = {
|
||||
4, /* number of (first, last) pairs that follow */
|
||||
0x8157, 0x8157,
|
||||
0x889f, 0x9872, /* Kanji level 1 */
|
||||
0x989f, 0x9ffc, /* Kanji level 2 */
|
||||
0xe040, 0xeaa4, /* Kanji level 2 */
|
||||
}; /* CR_Han */
|
||||
#endif
|
||||
|
||||
/*
 * Code point ranges registered under the property name "latin".
 * Layout: range count first, then that many (first, last) pairs.
 */
static const OnigCodePoint CR_Latin[] = {
|
||||
4, /* number of (first, last) pairs that follow */
|
||||
0x0041, 0x005a, /* ASCII 'A'..'Z' */
|
||||
0x0061, 0x007a, /* ASCII 'a'..'z' */
|
||||
0x8260, 0x8279, /* presumably the double-byte (full-width) capital letters -- confirm against the JIS X 0208 table */
|
||||
0x8281, 0x829a, /* presumably the double-byte (full-width) small letters -- confirm */
|
||||
}; /* CR_Latin */
|
||||
|
||||
/*
 * Code point ranges registered under the property name "greek".
 * Layout: range count first, then that many (first, last) pairs.
 * NOTE(review): the two ranges look like the capital and small Greek
 * letters of the double-byte set -- confirm against the JIS X 0208 table.
 */
static const OnigCodePoint CR_Greek[] = {
|
||||
2, /* number of (first, last) pairs that follow */
|
||||
0x839f, 0x83b6,
|
||||
0x83bf, 0x83d6,
|
||||
}; /* CR_Greek */
|
||||
|
||||
/*
 * Code point ranges registered under the property name "cyrillic".
 * Layout: range count first, then that many (first, last) pairs.
 * NOTE(review): the second and third ranges are adjacent (…0x847f / 0x8480…);
 * presumably they are kept separate on purpose -- confirm before merging.
 */
static const OnigCodePoint CR_Cyrillic[] = {
|
||||
3, /* number of (first, last) pairs that follow */
|
||||
0x8440, 0x8460,
|
||||
0x8470, 0x847f,
|
||||
0x8480, 0x8491,
|
||||
}; /* CR_Cyrillic */
|
||||
|
||||
static int
|
||||
init_property_list(void)
|
||||
{
|
||||
|
@ -285,6 +497,10 @@ init_property_list(void)
|
|||
|
||||
PROPERTY_LIST_ADD_PROP("hiragana", CR_Hiragana);
|
||||
PROPERTY_LIST_ADD_PROP("katakana", CR_Katakana);
|
||||
PROPERTY_LIST_ADD_PROP("han", CR_Han);
|
||||
PROPERTY_LIST_ADD_PROP("latin", CR_Latin);
|
||||
PROPERTY_LIST_ADD_PROP("greek", CR_Greek);
|
||||
PROPERTY_LIST_ADD_PROP("cyrillic", CR_Cyrillic);
|
||||
PropertyInited = 1;
|
||||
|
||||
end:
|
||||
|
@ -308,7 +524,7 @@ property_name_to_ctype(OnigEncoding enc, UChar* p, UChar* end)
|
|||
return onigenc_minimum_property_name_to_ctype(enc, s, e);
|
||||
}
|
||||
|
||||
return (int)ctype;
|
||||
return (int )ctype;
|
||||
}
|
||||
|
||||
static int
|
||||
|
@ -357,6 +573,7 @@ get_ctype_code_range(OnigCtype ctype, OnigCodePoint* sb_out,
|
|||
}
|
||||
}
|
||||
|
||||
#ifndef ENC_CP932
|
||||
OnigEncodingDefine(shift_jis, Shift_JIS) = {
|
||||
mbc_enc_len,
|
||||
"Shift_JIS", /* name */
|
||||
|
@ -367,55 +584,23 @@ OnigEncodingDefine(shift_jis, Shift_JIS) = {
|
|||
code_to_mbclen,
|
||||
code_to_mbc,
|
||||
mbc_case_fold,
|
||||
onigenc_ascii_apply_all_case_fold,
|
||||
onigenc_ascii_get_case_fold_codes_by_str,
|
||||
apply_all_case_fold,
|
||||
get_case_fold_codes_by_str,
|
||||
property_name_to_ctype,
|
||||
is_code_ctype,
|
||||
get_ctype_code_range,
|
||||
left_adjust_char_head,
|
||||
is_allowed_reverse_match,
|
||||
0
|
||||
0,
|
||||
ONIGENC_FLAG_NONE,
|
||||
};
|
||||
/*
|
||||
* Name: Shift_JIS
|
||||
* MIBenum: 17
|
||||
* Link: http://www.iana.org/assignments/character-sets
|
||||
* Link: http://ja.wikipedia.org/wiki/Shift_JIS
|
||||
*
|
||||
* Note that this Shift_JIS's 7bit part is US-ASCII not JIS X 0201
|
||||
* because Shift_JIS must be ASCII compatible encoding.
|
||||
* See also the conversion table (enc/trans/japanese_sjis.trans).
|
||||
*/
|
||||
|
||||
/*
|
||||
* Name: Windows-31J
|
||||
* MIBenum: 2024
|
||||
* Link: http://www.iana.org/assignments/character-sets
|
||||
* Link: http://www.microsoft.com/globaldev/reference/dbcs/932.mspx
|
||||
* Link: http://ja.wikipedia.org/wiki/Windows-31J
|
||||
* Link: http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/windows-932-2000.ucm
|
||||
*
|
||||
* Windows Standard Character Set and its mapping to Unicode by Microsoft.
|
||||
* Since 1.9.3, SJIS is the alias of Windows-31J because its character
|
||||
* set is usually this one even if its mapping may differ.
|
||||
*/
|
||||
ENC_REPLICATE("Windows-31J", "Shift_JIS")
|
||||
ENC_ALIAS("CP932", "Windows-31J")
|
||||
ENC_ALIAS("csWindows31J", "Windows-31J") /* IANA. IE6 doesn't accept Windows-31J but accepts csWindows31J. */
|
||||
ENC_ALIAS("SJIS", "Windows-31J")
|
||||
|
||||
/*
|
||||
* Name: PCK
|
||||
* Link: http://download.oracle.com/docs/cd/E19253-01/819-0606/x-2chn0/index.html
|
||||
* Link: http://download.oracle.com/docs/cd/E19253-01/819-0606/appb-pckwarn-1/index.html
|
||||
*
|
||||
* Solaris's SJIS variant. Its set is Windows Standard Character Set; it
|
||||
* consists of JIS X 0201 Latin (US-ASCII), JIS X 0201 Katakana, JIS X 0208, NEC
|
||||
* special characters, NEC-selected IBM extended characters, and IBM extended
|
||||
* characters. Solaris's iconv seems to use SJIS-open.
|
||||
*/
|
||||
ENC_ALIAS("PCK", "Windows-31J")
|
||||
|
||||
/*
|
||||
* Name: MacJapanese
|
||||
* Link: http://unicode.org/Public/MAPPINGS/VENDORS/APPLE/JAPANESE.TXT
|
||||
|
@ -423,3 +608,4 @@ ENC_ALIAS("PCK", "Windows-31J")
|
|||
*/
|
||||
ENC_REPLICATE("MacJapanese", "Shift_JIS")
|
||||
ENC_ALIAS("MacJapan", "MacJapanese")
|
||||
#endif
|
||||
|
|
1941
enc/unicode.c
1941
enc/unicode.c
File diff suppressed because it is too large
Load diff
2230
enc/unicode/casefold.h
Normal file
2230
enc/unicode/casefold.h
Normal file
File diff suppressed because it is too large
Load diff
15788
enc/unicode/name2ctype.h
15788
enc/unicode/name2ctype.h
File diff suppressed because it is too large
Load diff
|
@ -24,7 +24,9 @@ OnigEncodingDefine(us_ascii, US_ASCII) = {
|
|||
onigenc_ascii_is_code_ctype,
|
||||
onigenc_not_support_get_ctype_code_range,
|
||||
onigenc_single_byte_left_adjust_char_head,
|
||||
onigenc_always_true_is_allowed_reverse_match
|
||||
onigenc_always_true_is_allowed_reverse_match,
|
||||
0,
|
||||
ONIGENC_FLAG_NONE,
|
||||
};
|
||||
ENC_ALIAS("ASCII", "US-ASCII")
|
||||
ENC_ALIAS("ANSI_X3.4-1968", "US-ASCII")
|
||||
|
|
|
@ -88,11 +88,8 @@ utf16be_is_mbc_newline(const UChar* p, const UChar* end,
|
|||
if (*(p+1) == 0x0a && *p == 0x00)
|
||||
return 1;
|
||||
#ifdef USE_UNICODE_ALL_LINE_TERMINATORS
|
||||
if ((
|
||||
#ifndef USE_CRNL_AS_LINE_TERMINATOR
|
||||
*(p+1) == 0x0d ||
|
||||
#endif
|
||||
*(p+1) == 0x85) && *p == 0x00)
|
||||
if ((*(p+1) == 0x0b || *(p+1) == 0x0c || *(p+1) == 0x0d || *(p+1) == 0x85)
|
||||
&& *p == 0x00)
|
||||
return 1;
|
||||
if (*p == 0x20 && (*(p+1) == 0x29 || *(p+1) == 0x28))
|
||||
return 1;
|
||||
|
@ -252,6 +249,8 @@ OnigEncodingDefine(utf_16be, UTF_16BE) = {
|
|||
onigenc_unicode_is_code_ctype,
|
||||
onigenc_utf16_32_get_ctype_code_range,
|
||||
utf16be_left_adjust_char_head,
|
||||
onigenc_always_false_is_allowed_reverse_match
|
||||
onigenc_always_false_is_allowed_reverse_match,
|
||||
0,
|
||||
ONIGENC_FLAG_UNICODE,
|
||||
};
|
||||
ENC_ALIAS("UCS-2BE", "UTF-16BE")
|
||||
|
|
|
@ -81,11 +81,8 @@ utf16le_is_mbc_newline(const UChar* p, const UChar* end,
|
|||
if (*p == 0x0a && *(p+1) == 0x00)
|
||||
return 1;
|
||||
#ifdef USE_UNICODE_ALL_LINE_TERMINATORS
|
||||
if ((
|
||||
#ifndef USE_CRNL_AS_LINE_TERMINATOR
|
||||
*p == 0x0d ||
|
||||
#endif
|
||||
*p == 0x85) && *(p+1) == 0x00)
|
||||
if ((*p == 0x0b || *p == 0x0c || *p == 0x0d || *p == 0x85)
|
||||
&& *(p+1) == 0x00)
|
||||
return 1;
|
||||
if (*(p+1) == 0x20 && (*p == 0x29 || *p == 0x28))
|
||||
return 1;
|
||||
|
@ -245,5 +242,7 @@ OnigEncodingDefine(utf_16le, UTF_16LE) = {
|
|||
onigenc_unicode_is_code_ctype,
|
||||
onigenc_utf16_32_get_ctype_code_range,
|
||||
utf16le_left_adjust_char_head,
|
||||
onigenc_always_false_is_allowed_reverse_match
|
||||
onigenc_always_false_is_allowed_reverse_match,
|
||||
0,
|
||||
ONIGENC_FLAG_UNICODE,
|
||||
};
|
||||
|
|
|
@ -44,11 +44,7 @@ utf32be_is_mbc_newline(const UChar* p, const UChar* end,
|
|||
if (*(p+3) == 0x0a && *(p+2) == 0 && *(p+1) == 0 && *p == 0)
|
||||
return 1;
|
||||
#ifdef USE_UNICODE_ALL_LINE_TERMINATORS
|
||||
if ((
|
||||
#ifndef USE_CRNL_AS_LINE_TERMINATOR
|
||||
*(p+3) == 0x0d ||
|
||||
#endif
|
||||
*(p+3) == 0x85)
|
||||
if ((*(p+3) == 0x0b || *(p+3) == 0x0c || *(p+3) == 0x0d || *(p+3) == 0x85)
|
||||
&& *(p+2) == 0 && *(p+1) == 0 && *p == 0x00)
|
||||
return 1;
|
||||
if (*(p+2) == 0x20 && (*(p+3) == 0x29 || *(p+3) == 0x28)
|
||||
|
@ -159,7 +155,7 @@ utf32be_left_adjust_char_head(const UChar* start, const UChar* s, const UChar* e
|
|||
|
||||
if (s <= start) return (UChar* )s;
|
||||
|
||||
rem = (s - start) % 4;
|
||||
rem = (int )((s - start) % 4);
|
||||
return (UChar* )(s - rem);
|
||||
}
|
||||
|
||||
|
@ -189,7 +185,9 @@ OnigEncodingDefine(utf_32be, UTF_32BE) = {
|
|||
onigenc_unicode_is_code_ctype,
|
||||
onigenc_utf16_32_get_ctype_code_range,
|
||||
utf32be_left_adjust_char_head,
|
||||
onigenc_always_false_is_allowed_reverse_match
|
||||
onigenc_always_false_is_allowed_reverse_match,
|
||||
0,
|
||||
ONIGENC_FLAG_UNICODE,
|
||||
};
|
||||
ENC_ALIAS("UCS-4BE", "UTF-32BE")
|
||||
|
||||
|
|
|
@ -44,11 +44,7 @@ utf32le_is_mbc_newline(const UChar* p, const UChar* end,
|
|||
if (*p == 0x0a && *(p+1) == 0 && *(p+2) == 0 && *(p+3) == 0)
|
||||
return 1;
|
||||
#ifdef USE_UNICODE_ALL_LINE_TERMINATORS
|
||||
if ((
|
||||
#ifndef USE_CRNL_AS_LINE_TERMINATOR
|
||||
*p == 0x0d ||
|
||||
#endif
|
||||
*p == 0x85)
|
||||
/*
 * VT (U+000B), FF (U+000C), CR (U+000D) and NEL (U+0085) as line terminators.
 * In UTF-32LE the least significant byte comes first, so p[0] holds the code
 * and p[1]..p[3] must all be zero.  Every byte has to be dereferenced:
 * comparing the pointer (p+2) itself against 0 is always false for a valid
 * pointer and made this branch unreachable.
 */
if ((*p == 0x0b || *p == 0x0c || *p == 0x0d || *p == 0x85)
    && *(p+1) == 0x00 && *(p+2) == 0x00 && *(p+3) == 0x00)
  return 1;
|
||||
if (*(p+1) == 0x20 && (*p == 0x29 || *p == 0x28)
|
||||
|
@ -159,7 +155,7 @@ utf32le_left_adjust_char_head(const UChar* start, const UChar* s, const UChar* e
|
|||
|
||||
if (s <= start) return (UChar* )s;
|
||||
|
||||
rem = (s - start) % 4;
|
||||
rem = (int )((s - start) % 4);
|
||||
return (UChar* )(s - rem);
|
||||
}
|
||||
|
||||
|
@ -189,6 +185,8 @@ OnigEncodingDefine(utf_32le, UTF_32LE) = {
|
|||
onigenc_unicode_is_code_ctype,
|
||||
onigenc_utf16_32_get_ctype_code_range,
|
||||
utf32le_left_adjust_char_head,
|
||||
onigenc_always_false_is_allowed_reverse_match
|
||||
onigenc_always_false_is_allowed_reverse_match,
|
||||
0,
|
||||
ONIGENC_FLAG_UNICODE,
|
||||
};
|
||||
ENC_ALIAS("UCS-4LE", "UTF-32LE")
|
||||
|
|
12
enc/utf_8.c
12
enc/utf_8.c
|
@ -248,9 +248,7 @@ is_mbc_newline(const UChar* p, const UChar* end, OnigEncoding enc)
|
|||
if (*p == 0x0a) return 1;
|
||||
|
||||
#ifdef USE_UNICODE_ALL_LINE_TERMINATORS
|
||||
#ifndef USE_CRNL_AS_LINE_TERMINATOR
|
||||
if (*p == 0x0d) return 1;
|
||||
#endif
|
||||
if (*p == 0x0b || *p == 0x0c || *p == 0x0d) return 1;
|
||||
if (p + 1 < end) {
|
||||
if (*(p+1) == 0x85 && *p == 0xc2) /* U+0085 */
|
||||
return 1;
|
||||
|
@ -272,7 +270,7 @@ mbc_to_code(const UChar* p, const UChar* end, OnigEncoding enc)
|
|||
int c, len;
|
||||
OnigCodePoint n;
|
||||
|
||||
len = enclen(enc, p, end);
|
||||
len = mbc_enc_len(p, end, enc);
|
||||
c = *p++;
|
||||
if (len > 1) {
|
||||
len--;
|
||||
|
@ -363,7 +361,7 @@ code_to_mbc(OnigCodePoint code, UChar *buf, OnigEncoding enc ARG_UNUSED)
|
|||
}
|
||||
|
||||
*p++ = UTF8_TRAIL0(code);
|
||||
return (int)(p - buf);
|
||||
return (int )(p - buf);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -440,7 +438,9 @@ OnigEncodingDefine(utf_8, UTF_8) = {
|
|||
onigenc_unicode_is_code_ctype,
|
||||
get_ctype_code_range,
|
||||
left_adjust_char_head,
|
||||
onigenc_always_true_is_allowed_reverse_match
|
||||
onigenc_always_true_is_allowed_reverse_match,
|
||||
0,
|
||||
ONIGENC_FLAG_UNICODE,
|
||||
};
|
||||
ENC_ALIAS("CP65001", "UTF-8")
|
||||
|
||||
|
|
|
@ -196,7 +196,9 @@ OnigEncodingDefine(windows_1251, Windows_1251) = {
|
|||
cp1251_is_code_ctype,
|
||||
onigenc_not_support_get_ctype_code_range,
|
||||
onigenc_single_byte_left_adjust_char_head,
|
||||
onigenc_always_true_is_allowed_reverse_match
|
||||
onigenc_always_true_is_allowed_reverse_match,
|
||||
0,
|
||||
ONIGENC_FLAG_NONE,
|
||||
};
|
||||
/*
|
||||
* Name: windows-1251
|
||||
|
|
80
enc/windows_31j.c
Normal file
80
enc/windows_31j.c
Normal file
|
@ -0,0 +1,80 @@
|
|||
/**********************************************************************
|
||||
windows_31j.c - Onigmo (Oniguruma-mod) (regular expression library)
|
||||
**********************************************************************/
|
||||
/*-
|
||||
* Copyright (c) 2002-2009 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
|
||||
* Copyright (c) 2011 K.Takata <kentkt AT csc DOT jp>
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#define ENC_CP932
|
||||
#include "shift_jis.c"
|
||||
|
||||
/*
 * Encoding descriptor for Windows-31J.  The initializer is positional, so the
 * entries follow the member order of OnigEncodingType; the function entries
 * are presumably the static helpers of shift_jis.c, which is included above
 * with ENC_CP932 defined -- confirm against that file.
 */
OnigEncodingDefine(windows_31j, Windows_31J) = {
|
||||
mbc_enc_len,
|
||||
"Windows-31J", /* name */
|
||||
2, /* max byte length */
|
||||
1, /* min byte length */
|
||||
onigenc_is_mbc_newline_0x0a,
|
||||
mbc_to_code,
|
||||
code_to_mbclen,
|
||||
code_to_mbc,
|
||||
mbc_case_fold,
|
||||
apply_all_case_fold,
|
||||
get_case_fold_codes_by_str,
|
||||
property_name_to_ctype,
|
||||
is_code_ctype,
|
||||
get_ctype_code_range,
|
||||
left_adjust_char_head,
|
||||
is_allowed_reverse_match,
|
||||
0, /* ruby_encoding_index */
|
||||
ONIGENC_FLAG_NONE, /* flags: ONIGENC_FLAG_UNICODE is not set */
|
||||
};
|
||||
/*
|
||||
* Name: Windows-31J
|
||||
* MIBenum: 2024
|
||||
* Link: http://www.iana.org/assignments/character-sets
|
||||
* Link: http://www.microsoft.com/globaldev/reference/dbcs/932.mspx
|
||||
* Link: http://ja.wikipedia.org/wiki/Windows-31J
|
||||
* Link: http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/windows-932-2000.ucm
|
||||
*
|
||||
* Windows Standard Character Set and its mapping to Unicode by Microsoft.
|
||||
* Since 1.9.3, SJIS is the alias of Windows-31J because its character
|
||||
* set is usually this one even if its mapping may differ.
|
||||
*/
|
||||
ENC_ALIAS("CP932", "Windows-31J")
|
||||
ENC_ALIAS("csWindows31J", "Windows-31J") /* IANA. IE6 doesn't accept Windows-31J but accepts csWindows31J. */
|
||||
ENC_ALIAS("SJIS", "Windows-31J")
|
||||
|
||||
/*
|
||||
* Name: PCK
|
||||
* Link: http://download.oracle.com/docs/cd/E19253-01/819-0606/x-2chn0/index.html
|
||||
* Link: http://download.oracle.com/docs/cd/E19253-01/819-0606/appb-pckwarn-1/index.html
|
||||
*
|
||||
* Solaris's SJIS variant. Its set is Windows Standard Character Set; it
|
||||
* consists of JIS X 0201 Latin (US-ASCII), JIS X 0201 Katakana, JIS X 0208, NEC
|
||||
* special characters, NEC-selected IBM extended characters, and IBM extended
|
||||
* characters. Solaris's iconv seems to use SJIS-open.
|
||||
*/
|
||||
ENC_ALIAS("PCK", "Windows-31J")
|
|
@ -1,10 +1,11 @@
|
|||
#ifndef ONIGURUMA_H
|
||||
#define ONIGURUMA_H
|
||||
/**********************************************************************
|
||||
oniguruma.h - Oniguruma (regular expression library)
|
||||
oniguruma.h - Onigmo (Oniguruma-mod) (regular expression library)
|
||||
**********************************************************************/
|
||||
/*-
|
||||
* Copyright (c) 2002-2008 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
|
||||
* Copyright (c) 2002-2009 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
|
||||
* Copyright (c) 2011-2012 K.Takata <kentkt AT csc DOT jp>
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
|
@ -38,8 +39,8 @@ extern "C" {
|
|||
|
||||
#define ONIGURUMA
|
||||
#define ONIGURUMA_VERSION_MAJOR 5
|
||||
#define ONIGURUMA_VERSION_MINOR 9
|
||||
#define ONIGURUMA_VERSION_TEENY 2
|
||||
#define ONIGURUMA_VERSION_MINOR 13
|
||||
#define ONIGURUMA_VERSION_TEENY 1
|
||||
|
||||
#ifdef __cplusplus
|
||||
# ifndef HAVE_PROTOTYPES
|
||||
|
@ -101,6 +102,8 @@ extern "C" {
|
|||
#pragma GCC visibility push(default)
|
||||
#endif
|
||||
|
||||
#include <stddef.h> /* for size_t */
|
||||
|
||||
/* PART: character encoding */
|
||||
|
||||
#ifndef ONIG_ESCAPE_UCHAR_COLLISION
|
||||
|
@ -108,9 +111,10 @@ extern "C" {
|
|||
#endif
|
||||
|
||||
typedef unsigned char OnigUChar;
|
||||
typedef unsigned int OnigCodePoint;
|
||||
typedef unsigned int OnigCodePoint;
|
||||
typedef unsigned int OnigCtype;
|
||||
typedef size_t OnigDistance;
|
||||
typedef ptrdiff_t OnigPosition;
|
||||
|
||||
#define ONIG_INFINITE_DISTANCE ~((OnigDistance )0)
|
||||
|
||||
|
@ -171,6 +175,7 @@ typedef struct OnigEncodingTypeST {
|
|||
OnigUChar* (*left_adjust_char_head)(const OnigUChar* start, const OnigUChar* p, const OnigUChar* end, struct OnigEncodingTypeST* enc);
|
||||
int (*is_allowed_reverse_match)(const OnigUChar* p, const OnigUChar* end, struct OnigEncodingTypeST* enc);
|
||||
int ruby_encoding_index;
|
||||
unsigned int flags;
|
||||
} OnigEncodingType;
|
||||
|
||||
typedef OnigEncodingType* OnigEncoding;
|
||||
|
@ -204,17 +209,12 @@ ONIG_EXTERN OnigEncodingType OnigEncodingASCII;
|
|||
#define ONIGENC_CTYPE_ALNUM 13 /* alpha || digit */
|
||||
#define ONIGENC_CTYPE_ASCII 14
|
||||
#define ONIGENC_MAX_STD_CTYPE ONIGENC_CTYPE_ASCII
|
||||
#define ONIGENC_CTYPE_SPECIAL_MASK 256
|
||||
#define ONIGENC_CTYPE_S /* [\t\n\v\f\r\s] */ \
|
||||
ONIGENC_CTYPE_SPECIAL_MASK | ONIGENC_CTYPE_SPACE
|
||||
#define ONIGENC_CTYPE_D /* [0-9] */ \
|
||||
ONIGENC_CTYPE_SPECIAL_MASK | ONIGENC_CTYPE_DIGIT
|
||||
#define ONIGENC_CTYPE_W /* [0-9A-Za-z_] */ \
|
||||
ONIGENC_CTYPE_SPECIAL_MASK | ONIGENC_CTYPE_WORD
|
||||
#define ONIGENC_CTYPE_SPECIAL_P(ctype) ((ctype) & ONIGENC_CTYPE_SPECIAL_MASK)
|
||||
|
||||
/* flags */
|
||||
#define ONIGENC_FLAG_NONE 0U
|
||||
#define ONIGENC_FLAG_UNICODE 1U
|
||||
|
||||
#define onig_enc_len(enc,p,e) ONIGENC_MBC_ENC_LEN(enc, p, e)
|
||||
#define onig_enc_len(enc,p,e) ONIGENC_MBC_ENC_LEN(enc, p, e)
|
||||
|
||||
#define ONIGENC_IS_UNDEF(enc) ((enc) == ONIG_ENCODING_UNDEF)
|
||||
#define ONIGENC_IS_SINGLEBYTE(enc) (ONIGENC_MBC_MAXLEN(enc) == 1)
|
||||
|
@ -223,6 +223,10 @@ ONIG_EXTERN OnigEncodingType OnigEncodingASCII;
|
|||
#define ONIGENC_IS_CODE_ASCII(code) ((code) < 128)
|
||||
#define ONIGENC_IS_MBC_WORD(enc,s,end) \
|
||||
ONIGENC_IS_CODE_WORD(enc,ONIGENC_MBC_TO_CODE(enc,s,end))
|
||||
#define ONIGENC_IS_MBC_ASCII_WORD(enc,s,end) \
|
||||
onigenc_ascii_is_code_ctype( \
|
||||
ONIGENC_MBC_TO_CODE(enc,s,end),ONIGENC_CTYPE_WORD,enc)
|
||||
#define ONIGENC_IS_UNICODE(enc) ((enc)->flags & ONIGENC_FLAG_UNICODE)
|
||||
|
||||
|
||||
#define ONIGENC_NAME(enc) ((enc)->name)
|
||||
|
@ -350,6 +354,7 @@ typedef unsigned int OnigOptionType;
|
|||
#define ONIG_OPTION_IGNORECASE 1U
|
||||
#define ONIG_OPTION_EXTEND (ONIG_OPTION_IGNORECASE << 1)
|
||||
#define ONIG_OPTION_MULTILINE (ONIG_OPTION_EXTEND << 1)
|
||||
#define ONIG_OPTION_DOTALL ONIG_OPTION_MULTILINE
|
||||
#define ONIG_OPTION_SINGLELINE (ONIG_OPTION_MULTILINE << 1)
|
||||
#define ONIG_OPTION_FIND_LONGEST (ONIG_OPTION_SINGLELINE << 1)
|
||||
#define ONIG_OPTION_FIND_NOT_EMPTY (ONIG_OPTION_FIND_LONGEST << 1)
|
||||
|
@ -360,7 +365,13 @@ typedef unsigned int OnigOptionType;
|
|||
#define ONIG_OPTION_NOTBOL (ONIG_OPTION_CAPTURE_GROUP << 1)
|
||||
#define ONIG_OPTION_NOTEOL (ONIG_OPTION_NOTBOL << 1)
|
||||
#define ONIG_OPTION_POSIX_REGION (ONIG_OPTION_NOTEOL << 1)
|
||||
#define ONIG_OPTION_MAXBIT ONIG_OPTION_POSIX_REGION /* limit */
|
||||
/* options (ctype range) */
|
||||
#define ONIG_OPTION_ASCII_RANGE (ONIG_OPTION_POSIX_REGION << 1)
|
||||
#define ONIG_OPTION_POSIX_BRACKET_ALL_RANGE (ONIG_OPTION_ASCII_RANGE << 1)
|
||||
#define ONIG_OPTION_WORD_BOUND_ALL_RANGE (ONIG_OPTION_POSIX_BRACKET_ALL_RANGE << 1)
|
||||
/* options (newline) */
|
||||
#define ONIG_OPTION_NEWLINE_CRLF (ONIG_OPTION_WORD_BOUND_ALL_RANGE << 1)
|
||||
#define ONIG_OPTION_MAXBIT ONIG_OPTION_NEWLINE_CRLF /* limit */
|
||||
|
||||
#define ONIG_OPTION_ON(options,regopt) ((options) |= (regopt))
|
||||
#define ONIG_OPTION_OFF(options,regopt) ((options) &= ~(regopt))
|
||||
|
@ -382,9 +393,11 @@ ONIG_EXTERN const OnigSyntaxType OnigSyntaxEmacs;
|
|||
ONIG_EXTERN const OnigSyntaxType OnigSyntaxGrep;
|
||||
ONIG_EXTERN const OnigSyntaxType OnigSyntaxGnuRegex;
|
||||
ONIG_EXTERN const OnigSyntaxType OnigSyntaxJava;
|
||||
ONIG_EXTERN const OnigSyntaxType OnigSyntaxPerl58;
|
||||
ONIG_EXTERN const OnigSyntaxType OnigSyntaxPerl58_NG;
|
||||
ONIG_EXTERN const OnigSyntaxType OnigSyntaxPerl;
|
||||
ONIG_EXTERN const OnigSyntaxType OnigSyntaxPerl_NG;
|
||||
ONIG_EXTERN const OnigSyntaxType OnigSyntaxRuby;
|
||||
ONIG_EXTERN const OnigSyntaxType OnigSyntaxPython;
|
||||
|
||||
/* predefined syntaxes (see regsyntax.c) */
|
||||
#define ONIG_SYNTAX_ASIS (&OnigSyntaxASIS)
|
||||
|
@ -394,9 +407,11 @@ ONIG_EXTERN const OnigSyntaxType OnigSyntaxRuby;
|
|||
#define ONIG_SYNTAX_GREP (&OnigSyntaxGrep)
|
||||
#define ONIG_SYNTAX_GNU_REGEX (&OnigSyntaxGnuRegex)
|
||||
#define ONIG_SYNTAX_JAVA (&OnigSyntaxJava)
|
||||
#define ONIG_SYNTAX_PERL58 (&OnigSyntaxPerl58)
|
||||
#define ONIG_SYNTAX_PERL58_NG (&OnigSyntaxPerl58_NG)
|
||||
#define ONIG_SYNTAX_PERL (&OnigSyntaxPerl)
|
||||
#define ONIG_SYNTAX_PERL_NG (&OnigSyntaxPerl_NG)
|
||||
#define ONIG_SYNTAX_RUBY (&OnigSyntaxRuby)
|
||||
#define ONIG_SYNTAX_PYTHON (&OnigSyntaxPython)
|
||||
|
||||
/* default syntax */
|
||||
ONIG_EXTERN const OnigSyntaxType* OnigDefaultSyntax;
|
||||
|
@ -434,11 +449,12 @@ ONIG_EXTERN const OnigSyntaxType* OnigDefaultSyntax;
|
|||
#define ONIG_SYN_OP_ESC_OCTAL3 (1U<<28) /* \OOO */
|
||||
#define ONIG_SYN_OP_ESC_X_HEX2 (1U<<29) /* \xHH */
|
||||
#define ONIG_SYN_OP_ESC_X_BRACE_HEX8 (1U<<30) /* \x{7HHHHHHH} */
|
||||
#define ONIG_SYN_OP_ESC_O_BRACE_OCTAL (1U<<31) /* \o{OOO} */ /* NOTIMPL */
|
||||
|
||||
#define ONIG_SYN_OP2_ESC_CAPITAL_Q_QUOTE (1U<<0) /* \Q...\E */
|
||||
#define ONIG_SYN_OP2_QMARK_GROUP_EFFECT (1U<<1) /* (?...) */
|
||||
#define ONIG_SYN_OP2_OPTION_PERL (1U<<2) /* (?imsx),(?-imsx) */
|
||||
#define ONIG_SYN_OP2_OPTION_RUBY (1U<<3) /* (?imx), (?-imx) */
|
||||
#define ONIG_SYN_OP2_OPTION_PERL (1U<<2) /* (?imsxadlu), (?-imsx), (?^imsxalu) */
|
||||
#define ONIG_SYN_OP2_OPTION_RUBY (1U<<3) /* (?imxadu), (?-imx) */
|
||||
#define ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT (1U<<4) /* ?+,*+,++ */
|
||||
#define ONIG_SYN_OP2_PLUS_POSSESSIVE_INTERVAL (1U<<5) /* {n,m}+ */
|
||||
#define ONIG_SYN_OP2_CCLASS_SET_OP (1U<<6) /* [...&&..[..]..] */
|
||||
|
@ -456,6 +472,17 @@ ONIG_EXTERN const OnigSyntaxType* OnigDefaultSyntax;
|
|||
/* #define ONIG_SYN_OP2_CHAR_PROPERTY_PREFIX_IS (1U<<18) */
|
||||
#define ONIG_SYN_OP2_ESC_H_XDIGIT (1U<<19) /* \h, \H */
|
||||
#define ONIG_SYN_OP2_INEFFECTIVE_ESCAPE (1U<<20) /* \ */
|
||||
#define ONIG_SYN_OP2_ESC_CAPITAL_R_LINEBREAK (1U<<21) /* \R as (?>\x0D\x0A|[\x0A-\x0D\x{85}\x{2028}\x{2029}]) */
|
||||
#define ONIG_SYN_OP2_ESC_CAPITAL_X_EXTENDED_GRAPHEME_CLUSTER (1U<<22) /* \X as (?>\P{M}\p{M}*) */
|
||||
#define ONIG_SYN_OP2_ESC_V_VERTICAL_WHITESPACE (1U<<23) /* \v, \V -- Perl */ /* NOTIMPL */
|
||||
#define ONIG_SYN_OP2_ESC_H_HORIZONTAL_WHITESPACE (1U<<24) /* \h, \H -- Perl */ /* NOTIMPL */
|
||||
#define ONIG_SYN_OP2_ESC_CAPITAL_K_KEEP (1U<<25) /* \K */
|
||||
#define ONIG_SYN_OP2_ESC_G_BRACE_BACKREF (1U<<26) /* \g{name}, \g{n} */
|
||||
#define ONIG_SYN_OP2_QMARK_SUBEXP_CALL (1U<<27) /* (?&name), (?n), (?R), (?0) */
|
||||
#define ONIG_SYN_OP2_QMARK_VBAR_BRANCH_RESET (1U<<28) /* (?|...) */ /* NOTIMPL */
|
||||
#define ONIG_SYN_OP2_QMARK_LPAREN_CONDITION (1U<<29) /* (?(cond)yes...|no...) */
|
||||
#define ONIG_SYN_OP2_QMARK_CAPITAL_P_NAMED_GROUP (1U<<30) /* (?P<name>...), (?P=name), (?P>name) -- Python/PCRE */
|
||||
#define ONIG_SYN_OP2_OPTION_JAVA (1U<<31) /* (?idmsux), (?-idmsux) */ /* NOTIMPL */
|
||||
|
||||
/* syntax (behavior) */
|
||||
#define ONIG_SYN_CONTEXT_INDEP_ANCHORS (1U<<31) /* not implemented */
|
||||
|
@ -469,6 +496,7 @@ ONIG_EXTERN const OnigSyntaxType* OnigDefaultSyntax;
|
|||
#define ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP (1U<<7) /* see doc/RE */
|
||||
#define ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME (1U<<8) /* (?<x>)(?<x>) */
|
||||
#define ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY (1U<<9) /* a{n}?=(?:a{n})? */
|
||||
#define ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME_CALL (1U<<10) /* (?<x>)(?<x>)(?&x) */
|
||||
|
||||
/* syntax (behavior) in char class [...] */
|
||||
#define ONIG_SYN_NOT_NEWLINE_IN_NEGATIVE_CC (1U<<20) /* [^...] */
|
||||
|
@ -505,7 +533,7 @@ ONIG_EXTERN const OnigSyntaxType* OnigDefaultSyntax;
|
|||
#define ONIGERR_UNDEFINED_BYTECODE -13
|
||||
#define ONIGERR_UNEXPECTED_BYTECODE -14
|
||||
#define ONIGERR_MATCH_STACK_LIMIT_OVER -15
|
||||
#define ONIGERR_DEFAULT_ENCODING_IS_NOT_SETTED -21
|
||||
#define ONIGERR_DEFAULT_ENCODING_IS_NOT_SET -21
|
||||
#define ONIGERR_SPECIFIED_ENCODING_CANT_CONVERT_TO_WIDE_CHAR -22
|
||||
/* general error */
|
||||
#define ONIGERR_INVALID_ARGUMENT -30
|
||||
|
@ -532,6 +560,7 @@ ONIG_EXTERN const OnigSyntaxType* OnigDefaultSyntax;
|
|||
#define ONIGERR_INVALID_POSIX_BRACKET_TYPE -121
|
||||
#define ONIGERR_INVALID_LOOK_BEHIND_PATTERN -122
|
||||
#define ONIGERR_INVALID_REPEAT_RANGE_PATTERN -123
|
||||
#define ONIGERR_INVALID_CONDITION_PATTERN -124
|
||||
/* values error (syntax error) */
|
||||
#define ONIGERR_TOO_BIG_NUMBER -200
|
||||
#define ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE -201
|
||||
|
@ -543,6 +572,7 @@ ONIG_EXTERN const OnigSyntaxType* OnigDefaultSyntax;
|
|||
#define ONIGERR_TOO_BIG_BACKREF_NUMBER -207
|
||||
#define ONIGERR_INVALID_BACKREF -208
|
||||
#define ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED -209
|
||||
#define ONIGERR_TOO_SHORT_DIGITS -210
|
||||
#define ONIGERR_TOO_LONG_WIDE_CHAR_VALUE -212
|
||||
#define ONIGERR_EMPTY_GROUP_NAME -214
|
||||
#define ONIGERR_INVALID_GROUP_NAME -215
|
||||
|
@ -571,8 +601,8 @@ ONIG_EXTERN const OnigSyntaxType* OnigDefaultSyntax;
|
|||
|
||||
typedef struct OnigCaptureTreeNodeStruct {
|
||||
int group; /* group number */
|
||||
int beg;
|
||||
int end;
|
||||
OnigPosition beg;
|
||||
OnigPosition end;
|
||||
int allocated;
|
||||
int num_childs;
|
||||
struct OnigCaptureTreeNodeStruct** childs;
|
||||
|
@ -582,8 +612,8 @@ typedef struct OnigCaptureTreeNodeStruct {
|
|||
struct re_registers {
|
||||
int allocated;
|
||||
int num_regs;
|
||||
int* beg;
|
||||
int* end;
|
||||
OnigPosition* beg;
|
||||
OnigPosition* end;
|
||||
/* extended */
|
||||
OnigCaptureTreeNode* history_root; /* capture history tree root */
|
||||
};
|
||||
|
@ -689,7 +719,7 @@ typedef struct {
|
|||
ONIG_EXTERN
|
||||
int onig_init P_((void));
|
||||
ONIG_EXTERN
|
||||
int onig_error_code_to_str PV_((OnigUChar* s, int err_code, ...));
|
||||
int onig_error_code_to_str PV_((OnigUChar* s, OnigPosition err_code, ...));
|
||||
ONIG_EXTERN
|
||||
void onig_set_warn_func P_((OnigWarnFunc f));
|
||||
ONIG_EXTERN
|
||||
|
@ -697,7 +727,7 @@ void onig_set_verb_warn_func P_((OnigWarnFunc f));
|
|||
ONIG_EXTERN
|
||||
int onig_new P_((OnigRegex*, const OnigUChar* pattern, const OnigUChar* pattern_end, OnigOptionType option, OnigEncoding enc, const OnigSyntaxType* syntax, OnigErrorInfo* einfo));
|
||||
ONIG_EXTERN
|
||||
int onig_reg_init P_((regex_t* reg, OnigOptionType option, OnigCaseFoldType case_fold_flag, OnigEncoding enc, const OnigSyntaxType* syntax));
|
||||
int onig_reg_init P_((OnigRegex reg, OnigOptionType option, OnigCaseFoldType case_fold_flag, OnigEncoding enc, const OnigSyntaxType* syntax));
|
||||
ONIG_EXTERN
|
||||
int onig_new_without_alloc P_((OnigRegex, const OnigUChar* pattern, const OnigUChar* pattern_end, OnigOptionType option, OnigEncoding enc, OnigSyntaxType* syntax, OnigErrorInfo* einfo));
|
||||
ONIG_EXTERN
|
||||
|
@ -711,9 +741,11 @@ int onig_recompile P_((OnigRegex, const OnigUChar* pattern, const OnigUChar* pat
|
|||
ONIG_EXTERN
|
||||
int onig_recompile_deluxe P_((OnigRegex reg, const OnigUChar* pattern, const OnigUChar* pattern_end, OnigCompileInfo* ci, OnigErrorInfo* einfo));
|
||||
ONIG_EXTERN
|
||||
long onig_search P_((OnigRegex, const OnigUChar* str, const OnigUChar* end, const OnigUChar* start, const OnigUChar* range, OnigRegion* region, OnigOptionType option));
|
||||
OnigPosition onig_search P_((OnigRegex, const OnigUChar* str, const OnigUChar* end, const OnigUChar* start, const OnigUChar* range, OnigRegion* region, OnigOptionType option));
|
||||
ONIG_EXTERN
|
||||
long onig_match P_((OnigRegex, const OnigUChar* str, const OnigUChar* end, const OnigUChar* at, OnigRegion* region, OnigOptionType option));
|
||||
OnigPosition onig_search_gpos P_((OnigRegex, const OnigUChar* str, const OnigUChar* end, const OnigUChar* global_pos, const OnigUChar* start, const OnigUChar* range, OnigRegion* region, OnigOptionType option));
|
||||
ONIG_EXTERN
|
||||
OnigPosition onig_match P_((OnigRegex, const OnigUChar* str, const OnigUChar* end, const OnigUChar* at, OnigRegion* region, OnigOptionType option));
|
||||
ONIG_EXTERN
|
||||
OnigRegion* onig_region_new P_((void));
|
||||
ONIG_EXTERN
|
||||
|
@ -743,7 +775,7 @@ int onig_number_of_capture_histories P_((OnigRegex reg));
|
|||
ONIG_EXTERN
|
||||
OnigCaptureTreeNode* onig_get_capture_tree P_((OnigRegion* region));
|
||||
ONIG_EXTERN
|
||||
int onig_capture_tree_traverse P_((OnigRegion* region, int at, int(*callback_func)(int,int,int,int,int,void*), void* arg));
|
||||
int onig_capture_tree_traverse P_((OnigRegion* region, int at, int(*callback_func)(int,OnigPosition,OnigPosition,int,int,void*), void* arg));
|
||||
ONIG_EXTERN
|
||||
int onig_noname_group_capture_is_active P_((OnigRegex reg));
|
||||
ONIG_EXTERN
|
||||
|
|
32
regenc.c
32
regenc.c
|
@ -1,8 +1,9 @@
|
|||
/**********************************************************************
|
||||
regenc.c - Oniguruma (regular expression library)
|
||||
regenc.c - Onigmo (Oniguruma-mod) (regular expression library)
|
||||
**********************************************************************/
|
||||
/*-
|
||||
* Copyright (c) 2002-2007 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
|
||||
* Copyright (c) 2011 K.Takata <kentkt AT csc DOT jp>
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
|
@ -66,7 +67,7 @@ onigenc_get_right_adjust_char_head(OnigEncoding enc, const UChar* start, const U
|
|||
{
|
||||
UChar* p = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s, end);
|
||||
if (p < s) {
|
||||
p += enclen(enc, p, end);
|
||||
p += enclen(enc, p, end);
|
||||
}
|
||||
return p;
|
||||
}
|
||||
|
@ -760,7 +761,7 @@ onigenc_mb2_code_to_mbc(OnigEncoding enc, OnigCodePoint code, UChar *buf)
|
|||
if (enclen(enc, buf, p) != (p - buf))
|
||||
return ONIGERR_INVALID_CODE_POINT_VALUE;
|
||||
#endif
|
||||
return (int)(p - buf);
|
||||
return (int )(p - buf);
|
||||
}
|
||||
|
||||
extern int
|
||||
|
@ -783,7 +784,7 @@ onigenc_mb4_code_to_mbc(OnigEncoding enc, OnigCodePoint code, UChar *buf)
|
|||
if (enclen(enc, buf, p) != (p - buf))
|
||||
return ONIGERR_INVALID_CODE_POINT_VALUE;
|
||||
#endif
|
||||
return (int)(p - buf);
|
||||
return (int )(p - buf);
|
||||
}
|
||||
|
||||
extern int
|
||||
|
@ -812,7 +813,7 @@ onigenc_minimum_property_name_to_ctype(OnigEncoding enc, UChar* p, UChar* end)
|
|||
len = onigenc_strlen(enc, p, end);
|
||||
for (pbe = (pb = PBS) + sizeof(PBS)/sizeof(PBS[0]); pb < pbe; ++pb) {
|
||||
if (len == pb->len &&
|
||||
STRNCASECMP((char *)p, (char *)pb->name, len) == 0)
|
||||
onigenc_with_ascii_strnicmp(enc, p, end, pb->name, pb->len) == 0)
|
||||
return pb->ctype;
|
||||
}
|
||||
|
||||
|
@ -868,6 +869,27 @@ onigenc_with_ascii_strncmp(OnigEncoding enc, const UChar* p, const UChar* end,
|
|||
return 0;
|
||||
}
|
||||
|
||||
extern int
|
||||
onigenc_with_ascii_strnicmp(OnigEncoding enc, const UChar* p, const UChar* end,
|
||||
const UChar* sascii /* ascii */, int n)
|
||||
{
|
||||
int x, c;
|
||||
|
||||
while (n-- > 0) {
|
||||
if (p >= end) return (int )(*sascii);
|
||||
|
||||
c = (int )ONIGENC_MBC_TO_CODE(enc, p, end);
|
||||
if (ONIGENC_IS_ASCII_CODE(c))
|
||||
c = ONIGENC_ASCII_CODE_TO_LOWER_CASE(c);
|
||||
x = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*sascii) - c;
|
||||
if (x) return x;
|
||||
|
||||
sascii++;
|
||||
p += enclen(enc, p, end);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Property management */
|
||||
static int
|
||||
resize_property_list(int new_size, const OnigCodePoint*** plist, int* psize)
|
||||
|
|
14
regenc.h
14
regenc.h
|
@ -1,10 +1,11 @@
|
|||
#ifndef ONIGURUMA_REGENC_H
|
||||
#define ONIGURUMA_REGENC_H
|
||||
/**********************************************************************
|
||||
regenc.h - Oniguruma (regular expression library)
|
||||
regenc.h - Onigmo (Oniguruma-mod) (regular expression library)
|
||||
**********************************************************************/
|
||||
/*-
|
||||
* Copyright (c) 2002-2008 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
|
||||
* Copyright (c) 2011 K.Takata <kentkt AT csc DOT jp>
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
|
@ -107,10 +108,10 @@ typedef struct {
|
|||
|
||||
#define PosixBracketEntryInit(name, ctype) {(const UChar *)name, ctype, (short int)(sizeof(name) - 1)}
|
||||
|
||||
/* #define USE_CRNL_AS_LINE_TERMINATOR */
|
||||
#define USE_CRNL_AS_LINE_TERMINATOR
|
||||
#define USE_UNICODE_PROPERTIES
|
||||
/* #define USE_UNICODE_CASE_FOLD_TURKISH_AZERI */
|
||||
/* #define USE_UNICODE_ALL_LINE_TERMINATORS */ /* see Unicode.org UTF#18 */
|
||||
/* #define USE_UNICODE_ALL_LINE_TERMINATORS */ /* see Unicode.org UTS #18 */
|
||||
|
||||
|
||||
#define ONIG_ENCODING_INIT_DEFAULT ONIG_ENCODING_ASCII
|
||||
|
@ -170,6 +171,8 @@ ONIG_EXTERN const UChar OnigEncISO_8859_1_ToUpperCaseTable[];
|
|||
|
||||
ONIG_EXTERN int
|
||||
onigenc_with_ascii_strncmp P_((OnigEncoding enc, const UChar* p, const UChar* end, const UChar* sascii /* ascii */, int n));
|
||||
ONIG_EXTERN int
|
||||
onigenc_with_ascii_strnicmp P_((OnigEncoding enc, const UChar* p, const UChar* end, const UChar* sascii /* ascii */, int n));
|
||||
ONIG_EXTERN UChar*
|
||||
onigenc_step P_((OnigEncoding enc, const UChar* p, const UChar* end, int n));
|
||||
|
||||
|
@ -190,6 +193,11 @@ ONIG_EXTERN const unsigned short OnigEncAsciiCtypeTable[];
|
|||
(ONIGENC_IS_ASCII_CODE_CTYPE(code, ONIGENC_CTYPE_UPPER) ||\
|
||||
ONIGENC_IS_ASCII_CODE_CTYPE(code, ONIGENC_CTYPE_LOWER))
|
||||
|
||||
/* Check if the code is in the range. (from <= code && code <= to) */
|
||||
#define ONIGENC_IS_IN_RANGE(code, from, to) \
|
||||
((OnigCodePoint )((code) - (from)) <= (OnigCodePoint )((to) - (from)))
|
||||
|
||||
|
||||
#ifdef ONIG_ENC_REGISTER
|
||||
extern int ONIG_ENC_REGISTER(const char *, OnigEncodingType*);
|
||||
#define OnigEncodingName(n) encoding_##n
|
||||
|
|
23
regerror.c
23
regerror.c
|
@ -1,8 +1,9 @@
|
|||
/**********************************************************************
|
||||
regerror.c - Oniguruma (regular expression library)
|
||||
regerror.c - Onigmo (Oniguruma-mod) (regular expression library)
|
||||
**********************************************************************/
|
||||
/*-
|
||||
* Copyright (c) 2002-2007 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
|
||||
* Copyright (c) 2011 K.Takata <kentkt AT csc DOT jp>
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
|
@ -39,7 +40,7 @@
|
|||
#endif
|
||||
|
||||
extern UChar*
|
||||
onig_error_code_to_format(int code)
|
||||
onig_error_code_to_format(OnigPosition code)
|
||||
{
|
||||
const char *p;
|
||||
|
||||
|
@ -64,8 +65,8 @@ onig_error_code_to_format(int code)
|
|||
p = "undefined bytecode (bug)"; break;
|
||||
case ONIGERR_UNEXPECTED_BYTECODE:
|
||||
p = "unexpected bytecode (bug)"; break;
|
||||
case ONIGERR_DEFAULT_ENCODING_IS_NOT_SETTED:
|
||||
p = "default multibyte-encoding is not setted"; break;
|
||||
case ONIGERR_DEFAULT_ENCODING_IS_NOT_SET:
|
||||
p = "default multibyte-encoding is not set"; break;
|
||||
case ONIGERR_SPECIFIED_ENCODING_CANT_CONVERT_TO_WIDE_CHAR:
|
||||
p = "can't convert to wide-char on specified multibyte-encoding"; break;
|
||||
case ONIGERR_INVALID_ARGUMENT:
|
||||
|
@ -114,6 +115,8 @@ onig_error_code_to_format(int code)
|
|||
p = "invalid pattern in look-behind"; break;
|
||||
case ONIGERR_INVALID_REPEAT_RANGE_PATTERN:
|
||||
p = "invalid repeat range {lower,upper}"; break;
|
||||
case ONIGERR_INVALID_CONDITION_PATTERN:
|
||||
p = "invalid condition pattern"; break;
|
||||
case ONIGERR_TOO_BIG_NUMBER:
|
||||
p = "too big number"; break;
|
||||
case ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE:
|
||||
|
@ -140,6 +143,8 @@ onig_error_code_to_format(int code)
|
|||
p = "numbered backref/call is not allowed. (use name)"; break;
|
||||
case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
|
||||
p = "too big wide-char value"; break;
|
||||
case ONIGERR_TOO_SHORT_DIGITS:
|
||||
p = "too short digits"; break;
|
||||
case ONIGERR_TOO_LONG_WIDE_CHAR_VALUE:
|
||||
p = "too long wide-char value"; break;
|
||||
case ONIGERR_INVALID_CODE_POINT_VALUE:
|
||||
|
@ -232,7 +237,7 @@ static int to_ascii(OnigEncoding enc, UChar *s, UChar *end,
|
|||
*is_over = ((p < end) ? 1 : 0);
|
||||
}
|
||||
else {
|
||||
len = (int)MIN((end - s), buf_size);
|
||||
len = (int )MIN((end - s), buf_size);
|
||||
xmemcpy(buf, s, (size_t )len);
|
||||
*is_over = ((buf_size < (end - s)) ? 1 : 0);
|
||||
}
|
||||
|
@ -246,11 +251,11 @@ static int to_ascii(OnigEncoding enc, UChar *s, UChar *end,
|
|||
|
||||
extern int
|
||||
#ifdef HAVE_STDARG_PROTOTYPES
|
||||
onig_error_code_to_str(UChar* s, int code, ...)
|
||||
onig_error_code_to_str(UChar* s, OnigPosition code, ...)
|
||||
#else
|
||||
onig_error_code_to_str(s, code, va_alist)
|
||||
UChar* s;
|
||||
int code;
|
||||
OnigPosition code;
|
||||
va_dcl
|
||||
#endif
|
||||
{
|
||||
|
@ -309,7 +314,7 @@ onig_error_code_to_str(s, code, va_alist)
|
|||
}
|
||||
|
||||
va_end(vargs);
|
||||
return (int)len;
|
||||
return (int )len;
|
||||
}
|
||||
|
||||
void
|
||||
|
@ -325,7 +330,7 @@ onig_vsnprintf_with_pattern(UChar buf[], int bufsize, OnigEncoding enc,
|
|||
|
||||
need = (pat_end - pat) * 4 + 4;
|
||||
|
||||
if (n + need < (size_t)bufsize) {
|
||||
if (n + need < (size_t )bufsize) {
|
||||
strcat((char* )buf, ": /");
|
||||
s = buf + onigenc_str_bytelen_null(ONIG_ENCODING_ASCII, buf);
|
||||
|
||||
|
|
101
regint.h
101
regint.h
|
@ -1,10 +1,11 @@
|
|||
#ifndef ONIGURUMA_REGINT_H
|
||||
#define ONIGURUMA_REGINT_H
|
||||
/**********************************************************************
|
||||
regint.h - Oniguruma (regular expression library)
|
||||
regint.h - Onigmo (Oniguruma-mod) (regular expression library)
|
||||
**********************************************************************/
|
||||
/*-
|
||||
* Copyright (c) 2002-2008 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
|
||||
* Copyright (c) 2011-2012 K.Takata <kentkt AT csc DOT jp>
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
|
@ -57,12 +58,15 @@
|
|||
/* spec. config */
|
||||
#define USE_NAMED_GROUP
|
||||
#define USE_SUBEXP_CALL
|
||||
#define USE_PERL_SUBEXP_CALL
|
||||
#define USE_CAPITAL_P_NAMED_GROUP
|
||||
#define USE_BACKREF_WITH_LEVEL /* \k<name+n>, \k<name-n> */
|
||||
#define USE_MONOMANIAC_CHECK_CAPTURES_IN_ENDLESS_REPEAT /* /(?:()|())*\2/ */
|
||||
#define USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE /* /\n$/ =~ "\n" */
|
||||
#define USE_WARNING_REDUNDANT_NESTED_REPEAT_OPERATOR
|
||||
/* #define USE_RECOMPILE_API */
|
||||
/* !!! moved to regenc.h. */ /* #define USE_CRNL_AS_LINE_TERMINATOR */
|
||||
#define USE_NO_INVALID_QUANTIFIER
|
||||
|
||||
/* internal config */
|
||||
#define USE_PARSE_TREE_NODE_RECYCLE
|
||||
|
@ -70,10 +74,18 @@
|
|||
#define USE_QTFR_PEEK_NEXT
|
||||
#define USE_ST_LIBRARY
|
||||
#define USE_SHARED_CCLASS_TABLE
|
||||
#define USE_SUNDAY_QUICK_SEARCH
|
||||
|
||||
#define INIT_MATCH_STACK_SIZE 160
|
||||
#define DEFAULT_MATCH_STACK_LIMIT_SIZE 0 /* unlimited */
|
||||
|
||||
/* check config */
|
||||
#if defined(USE_PERL_SUBEXP_CALL) || defined(USE_CAPITAL_P_NAMED_GROUP)
|
||||
#if !defined(USE_NAMED_GROUP) || !defined(USE_SUBEXP_CALL)
|
||||
#error USE_NAMED_GROUP and USE_SUBEXP_CALL must be defined.
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(__GNUC__)
|
||||
# define ARG_UNUSED __attribute__ ((unused))
|
||||
#else
|
||||
|
@ -92,13 +104,14 @@
|
|||
#ifdef ONIG_ESCAPE_UCHAR_COLLISION
|
||||
#undef ONIG_ESCAPE_UCHAR_COLLISION
|
||||
#endif
|
||||
#define USE_WORD_BEGIN_END /* "\<": word-begin, "\>": word-end */
|
||||
#undef USE_MATCH_RANGE_IS_COMPLETE_RANGE
|
||||
#undef USE_CAPTURE_HISTORY
|
||||
#define USE_VARIABLE_META_CHARS
|
||||
#define USE_WORD_BEGIN_END /* "\<": word-begin, "\>": word-end */
|
||||
#define USE_POSIX_REGION_OPTION /* needed for POSIX API support */
|
||||
#define USE_POSIX_API_REGION_OPTION /* needed for POSIX API support */
|
||||
#define USE_FIND_LONGEST_SEARCH_ALL_OF_RANGE
|
||||
/* #define USE_COMBINATION_EXPLOSION_CHECK */ /* (X*)* */
|
||||
|
||||
/* #define USE_MULTI_THREAD_SYSTEM */
|
||||
#define THREAD_SYSTEM_INIT /* depend on thread system */
|
||||
#define THREAD_SYSTEM_END /* depend on thread system */
|
||||
|
@ -218,10 +231,35 @@
|
|||
#include <sys/types.h>
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_STDINT_H
|
||||
# include <stdint.h>
|
||||
#endif
|
||||
|
||||
#ifdef STDC_HEADERS
|
||||
# include <stddef.h>
|
||||
#endif
|
||||
|
||||
#ifdef __BORLANDC__
|
||||
#include <malloc.h>
|
||||
#endif
|
||||
|
||||
#ifdef ONIG_DEBUG
|
||||
# include <stdio.h>
|
||||
#endif
|
||||
|
||||
#ifdef _WIN32
|
||||
#if defined(_MSC_VER) && (_MSC_VER < 1300)
|
||||
#ifndef _INTPTR_T_DEFINED
|
||||
#define _INTPTR_T_DEFINED
|
||||
typedef int intptr_t;
|
||||
#endif
|
||||
#ifndef _UINTPTR_T_DEFINED
|
||||
#define _UINTPTR_T_DEFINED
|
||||
typedef unsigned int uintptr_t;
|
||||
#endif
|
||||
#endif
|
||||
#endif /* _WIN32 */
|
||||
|
||||
#include "regenc.h"
|
||||
|
||||
#if defined __GNUC__ && __GNUC__ >= 4
|
||||
|
@ -243,6 +281,8 @@
|
|||
#define CHECK_NULL_RETURN_MEMERR(p) if (IS_NULL(p)) return ONIGERR_MEMORY
|
||||
#define NULL_UCHARP ((UChar* )0)
|
||||
|
||||
#define ONIG_LAST_CODE_POINT (~((OnigCodePoint )0))
|
||||
|
||||
#ifdef PLATFORM_UNALIGNED_WORD_ACCESS
|
||||
|
||||
#define PLATFORM_GET_INC(val,p,type) do{\
|
||||
|
@ -282,9 +322,11 @@
|
|||
#define ONIG_OPTIMIZE_NONE 0
|
||||
#define ONIG_OPTIMIZE_EXACT 1 /* Slow Search */
|
||||
#define ONIG_OPTIMIZE_EXACT_BM 2 /* Boyer Moore Search */
|
||||
#define ONIG_OPTIMIZE_EXACT_BM_NOT_REV 3 /* BM (but not simple match) */
|
||||
#define ONIG_OPTIMIZE_EXACT_BM_NOT_REV 3 /* BM (applied to a multibyte string) */
|
||||
#define ONIG_OPTIMIZE_EXACT_IC 4 /* Slow Search (ignore case) */
|
||||
#define ONIG_OPTIMIZE_MAP 5 /* char map */
|
||||
#define ONIG_OPTIMIZE_EXACT_BM_IC 6 /* BM (ignore case) */
|
||||
#define ONIG_OPTIMIZE_EXACT_BM_NOT_REV_IC 7 /* BM (applied to a multibyte string) (ignore case) */
|
||||
|
||||
/* bit status */
|
||||
typedef unsigned int BitStatusType;
|
||||
|
@ -327,6 +369,10 @@ typedef unsigned int BitStatusType;
|
|||
#define IS_NOTBOL(option) ((option) & ONIG_OPTION_NOTBOL)
|
||||
#define IS_NOTEOL(option) ((option) & ONIG_OPTION_NOTEOL)
|
||||
#define IS_POSIX_REGION(option) ((option) & ONIG_OPTION_POSIX_REGION)
|
||||
#define IS_ASCII_RANGE(option) ((option) & ONIG_OPTION_ASCII_RANGE)
|
||||
#define IS_POSIX_BRACKET_ALL_RANGE(option) ((option) & ONIG_OPTION_POSIX_BRACKET_ALL_RANGE)
|
||||
#define IS_WORD_BOUND_ALL_RANGE(option) ((option) & ONIG_OPTION_WORD_BOUND_ALL_RANGE)
|
||||
#define IS_NEWLINE_CRLF(option) ((option) & ONIG_OPTION_NEWLINE_CRLF)
|
||||
|
||||
/* OP_SET_OPTION is required for these options.
|
||||
#define IS_DYNAMIC_OPTION(option) \
|
||||
|
@ -355,7 +401,7 @@ typedef unsigned char Bits;
|
|||
typedef Bits BitSet[BITSET_SIZE];
|
||||
typedef Bits* BitSetRef;
|
||||
|
||||
#define SIZE_BITSET (int)sizeof(BitSet)
|
||||
#define SIZE_BITSET (int )sizeof(BitSet)
|
||||
|
||||
#define BITSET_CLEAR(bs) do {\
|
||||
int i;\
|
||||
|
@ -402,7 +448,7 @@ typedef struct _BBuf {
|
|||
} while (0)
|
||||
|
||||
#define BBUF_WRITE(buf,pos,bytes,n) do{\
|
||||
int used = (pos) + (int)(n);\
|
||||
int used = (pos) + (int )(n);\
|
||||
if ((buf)->alloc < (unsigned int )used) BBUF_EXPAND((buf),used);\
|
||||
xmemcpy((buf)->p + (pos), (bytes), (n));\
|
||||
if ((buf)->used < (unsigned int )used) (buf)->used = used;\
|
||||
|
@ -470,6 +516,8 @@ typedef struct _BBuf {
|
|||
#define ANCHOR_ANYCHAR_STAR (1<<14) /* ".*" optimize info */
|
||||
#define ANCHOR_ANYCHAR_STAR_ML (1<<15) /* ".*" optimize info (multi-line) */
|
||||
|
||||
#define ANCHOR_KEEP (1<<16)
|
||||
|
||||
/* operation code */
|
||||
enum OpCode {
|
||||
OP_FINISH = 0, /* matching process terminator (no more alternative) */
|
||||
|
@ -513,12 +561,20 @@ enum OpCode {
|
|||
OP_WORD_BEGIN,
|
||||
OP_WORD_END,
|
||||
|
||||
OP_ASCII_WORD,
|
||||
OP_NOT_ASCII_WORD,
|
||||
OP_ASCII_WORD_BOUND,
|
||||
OP_NOT_ASCII_WORD_BOUND,
|
||||
OP_ASCII_WORD_BEGIN,
|
||||
OP_ASCII_WORD_END,
|
||||
|
||||
OP_BEGIN_BUF,
|
||||
OP_END_BUF,
|
||||
OP_BEGIN_LINE,
|
||||
OP_END_LINE,
|
||||
OP_SEMI_END_BUF,
|
||||
OP_BEGIN_POSITION,
|
||||
OP_BEGIN_POS_OR_LINE, /* used for implicit anchor optimization */
|
||||
|
||||
OP_BACKREF1,
|
||||
OP_BACKREF2,
|
||||
|
@ -535,6 +591,8 @@ enum OpCode {
|
|||
OP_MEMORY_END,
|
||||
OP_MEMORY_END_REC, /* push marker to stack */
|
||||
|
||||
OP_KEEP,
|
||||
|
||||
OP_FAIL, /* pop stack and move */
|
||||
OP_JUMP,
|
||||
OP_PUSH,
|
||||
|
@ -565,6 +623,8 @@ enum OpCode {
|
|||
OP_CALL, /* \g<name> */
|
||||
OP_RETURN,
|
||||
|
||||
OP_CONDITION,
|
||||
|
||||
OP_STATE_CHECK_PUSH, /* combination explosion check and push */
|
||||
OP_STATE_CHECK_PUSH_OR_JUMP, /* check ok -> push, else jump */
|
||||
OP_STATE_CHECK, /* check only */
|
||||
|
@ -585,15 +645,15 @@ typedef short int StateCheckNumType;
|
|||
typedef void* PointerType;
|
||||
|
||||
#define SIZE_OPCODE 1
|
||||
#define SIZE_RELADDR (int)sizeof(RelAddrType)
|
||||
#define SIZE_ABSADDR (int)sizeof(AbsAddrType)
|
||||
#define SIZE_LENGTH (int)sizeof(LengthType)
|
||||
#define SIZE_MEMNUM (int)sizeof(MemNumType)
|
||||
#define SIZE_STATE_CHECK_NUM (int)sizeof(StateCheckNumType)
|
||||
#define SIZE_REPEATNUM (int)sizeof(RepeatNumType)
|
||||
#define SIZE_OPTION (int)sizeof(OnigOptionType)
|
||||
#define SIZE_CODE_POINT (int)sizeof(OnigCodePoint)
|
||||
#define SIZE_POINTER (int)sizeof(PointerType)
|
||||
#define SIZE_RELADDR (int )sizeof(RelAddrType)
|
||||
#define SIZE_ABSADDR (int )sizeof(AbsAddrType)
|
||||
#define SIZE_LENGTH (int )sizeof(LengthType)
|
||||
#define SIZE_MEMNUM (int )sizeof(MemNumType)
|
||||
#define SIZE_STATE_CHECK_NUM (int )sizeof(StateCheckNumType)
|
||||
#define SIZE_REPEATNUM (int )sizeof(RepeatNumType)
|
||||
#define SIZE_OPTION (int )sizeof(OnigOptionType)
|
||||
#define SIZE_CODE_POINT (int )sizeof(OnigCodePoint)
|
||||
#define SIZE_POINTER (int )sizeof(PointerType)
|
||||
|
||||
|
||||
#define GET_RELADDR_INC(addr,p) PLATFORM_GET_INC(addr, p, RelAddrType)
|
||||
|
@ -645,6 +705,7 @@ typedef void* PointerType;
|
|||
#define SIZE_OP_FAIL_LOOK_BEHIND_NOT SIZE_OPCODE
|
||||
#define SIZE_OP_CALL (SIZE_OPCODE + SIZE_ABSADDR)
|
||||
#define SIZE_OP_RETURN SIZE_OPCODE
|
||||
#define SIZE_OP_CONDITION (SIZE_OPCODE + SIZE_MEMNUM + SIZE_RELADDR)
|
||||
|
||||
#ifdef USE_COMBINATION_EXPLOSION_CHECK
|
||||
#define SIZE_OP_STATE_CHECK (SIZE_OPCODE + SIZE_STATE_CHECK_NUM)
|
||||
|
@ -731,6 +792,7 @@ typedef struct _OnigStackType {
|
|||
#ifdef USE_COMBINATION_EXPLOSION_CHECK
|
||||
unsigned int state_check;
|
||||
#endif
|
||||
UChar *pkeep; /* keep pattern position */
|
||||
} state;
|
||||
struct {
|
||||
int count; /* for OP_REPEAT_INC, OP_REPEAT_INC_NG */
|
||||
|
@ -766,9 +828,10 @@ typedef struct {
|
|||
size_t stack_n;
|
||||
OnigOptionType options;
|
||||
OnigRegion* region;
|
||||
const UChar* start; /* search start position (for \G: BEGIN_POSITION) */
|
||||
const UChar* start; /* search start position */
|
||||
const UChar* gpos; /* global position (for \G: BEGIN_POSITION) */
|
||||
#ifdef USE_FIND_LONGEST_SEARCH_ALL_OF_RANGE
|
||||
int best_len; /* for ONIG_OPTION_FIND_LONGEST */
|
||||
OnigPosition best_len; /* for ONIG_OPTION_FIND_LONGEST */
|
||||
UChar* best_s;
|
||||
#endif
|
||||
#ifdef USE_COMBINATION_EXPLOSION_CHECK
|
||||
|
@ -799,7 +862,7 @@ extern void onig_print_statistics P_((FILE* f));
|
|||
#endif
|
||||
#endif
|
||||
|
||||
extern UChar* onig_error_code_to_format P_((int code));
|
||||
extern UChar* onig_error_code_to_format P_((OnigPosition code));
|
||||
extern void onig_snprintf_with_pattern PV_((UChar buf[], int bufsize, OnigEncoding enc, UChar* pat, UChar* pat_end, const UChar *fmt, ...));
|
||||
extern int onig_bbuf_init P_((BBuf* buf, OnigDistance size));
|
||||
extern int onig_compile P_((regex_t* reg, const UChar* pattern, const UChar* pattern_end, OnigErrorInfo* einfo, const char *sourcefile, int sourceline));
|
||||
|
@ -815,7 +878,7 @@ typedef void hash_table_type;
|
|||
#include "ruby/st.h"
|
||||
typedef st_data_t hash_data_type;
|
||||
#else
|
||||
typedef unsigned long hash_data_type;
|
||||
typedef uintptr_t hash_data_type;
|
||||
#endif
|
||||
|
||||
extern hash_table_type* onig_st_init_strend_table_with_size P_((st_index_t size));
|
||||
|
|
1055
regparse.c
1055
regparse.c
File diff suppressed because it is too large
Load diff
|
@ -1,10 +1,11 @@
|
|||
#ifndef ONIGURUMA_REGPARSE_H
|
||||
#define ONIGURUMA_REGPARSE_H
|
||||
/**********************************************************************
|
||||
regparse.h - Oniguruma (regular expression library)
|
||||
regparse.h - Onigmo (Oniguruma-mod) (regular expression library)
|
||||
**********************************************************************/
|
||||
/*-
|
||||
* Copyright (c) 2002-2007 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
|
||||
* Copyright (c) 2011 K.Takata <kentkt AT csc DOT jp>
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
|
@ -91,6 +92,7 @@
|
|||
#define ENCLOSE_MEMORY (1<<0)
|
||||
#define ENCLOSE_OPTION (1<<1)
|
||||
#define ENCLOSE_STOP_BACKTRACK (1<<2)
|
||||
#define ENCLOSE_CONDITION (1<<3)
|
||||
|
||||
#define NODE_STR_MARGIN 16
|
||||
#define NODE_STR_BUF_SIZE 24 /* sizeof(CClassNode) - sizeof(int)*4 */
|
||||
|
@ -100,7 +102,7 @@
|
|||
#define NSTR_AMBIG (1<<1)
|
||||
#define NSTR_DONT_GET_OPT_INFO (1<<2)
|
||||
|
||||
#define NSTRING_LEN(node) (OnigDistance)((node)->u.str.end - (node)->u.str.s)
|
||||
#define NSTRING_LEN(node) (OnigDistance )((node)->u.str.end - (node)->u.str.s)
|
||||
#define NSTRING_SET_RAW(node) (node)->u.str.flag |= NSTR_RAW
|
||||
#define NSTRING_CLEAR_RAW(node) (node)->u.str.flag &= ~NSTR_RAW
|
||||
#define NSTRING_SET_AMBIG(node) (node)->u.str.flag |= NSTR_AMBIG
|
||||
|
@ -150,6 +152,7 @@
|
|||
#define IS_ENCLOSE_STOP_BT_SIMPLE_REPEAT(en) \
|
||||
(((en)->state & NST_STOP_BT_SIMPLE_REPEAT) != 0)
|
||||
#define IS_ENCLOSE_NAMED_GROUP(en) (((en)->state & NST_NAMED_GROUP) != 0)
|
||||
#define IS_ENCLOSE_NAME_REF(en) (((en)->state & NST_NAME_REF) != 0)
|
||||
|
||||
#define SET_CALL_RECURSION(node) (node)->u.call.state |= NST_RECURSION
|
||||
#define IS_CALL_RECURSION(cn) (((cn)->state & NST_RECURSION) != 0)
|
||||
|
@ -240,6 +243,7 @@ typedef struct {
|
|||
int type;
|
||||
struct _Node* target;
|
||||
int char_len;
|
||||
int ascii_range;
|
||||
} AnchorNode;
|
||||
|
||||
typedef struct {
|
||||
|
@ -252,6 +256,7 @@ typedef struct {
|
|||
NodeBase base;
|
||||
int ctype;
|
||||
int not;
|
||||
int ascii_range;
|
||||
} CtypeNode;
|
||||
|
||||
typedef struct _Node {
|
||||
|
|
84
regsyntax.c
84
regsyntax.c
|
@ -1,8 +1,9 @@
|
|||
/**********************************************************************
|
||||
regsyntax.c - Oniguruma (regular expression library)
|
||||
regsyntax.c - Onigmo (Oniguruma-mod) (regular expression library)
|
||||
**********************************************************************/
|
||||
/*-
|
||||
* Copyright (c) 2002-2006 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
|
||||
* Copyright (c) 2011-2012 K.Takata <kentkt AT csc DOT jp>
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
|
@ -153,7 +154,8 @@ const OnigSyntaxType OnigSyntaxJava = {
|
|||
ONIG_SYN_OP2_ESC_V_VTAB | ONIG_SYN_OP2_ESC_U_HEX4 |
|
||||
ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY )
|
||||
, ( SYN_GNU_REGEX_BV | ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND )
|
||||
, ONIG_OPTION_SINGLELINE
|
||||
, ( ONIG_OPTION_SINGLELINE | ONIG_OPTION_ASCII_RANGE |
|
||||
ONIG_OPTION_WORD_BOUND_ALL_RANGE )
|
||||
,
|
||||
{
|
||||
(OnigCodePoint )'\\' /* esc */
|
||||
|
@ -165,7 +167,8 @@ const OnigSyntaxType OnigSyntaxJava = {
|
|||
}
|
||||
};
|
||||
|
||||
const OnigSyntaxType OnigSyntaxPerl = {
|
||||
/* Perl 5.8 */
|
||||
const OnigSyntaxType OnigSyntaxPerl58 = {
|
||||
(( SYN_GNU_REGEX_OP | ONIG_SYN_OP_QMARK_NON_GREEDY |
|
||||
ONIG_SYN_OP_ESC_OCTAL3 | ONIG_SYN_OP_ESC_X_HEX2 |
|
||||
ONIG_SYN_OP_ESC_X_BRACE_HEX8 | ONIG_SYN_OP_ESC_CONTROL_CHARS |
|
||||
|
@ -174,7 +177,9 @@ const OnigSyntaxType OnigSyntaxPerl = {
|
|||
, ( ONIG_SYN_OP2_ESC_CAPITAL_Q_QUOTE |
|
||||
ONIG_SYN_OP2_QMARK_GROUP_EFFECT | ONIG_SYN_OP2_OPTION_PERL |
|
||||
ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY |
|
||||
ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT )
|
||||
ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT |
|
||||
ONIG_SYN_OP2_ESC_CAPITAL_X_EXTENDED_GRAPHEME_CLUSTER |
|
||||
ONIG_SYN_OP2_QMARK_LPAREN_CONDITION)
|
||||
, SYN_GNU_REGEX_BV
|
||||
, ONIG_OPTION_SINGLELINE
|
||||
,
|
||||
|
@ -188,8 +193,8 @@ const OnigSyntaxType OnigSyntaxPerl = {
|
|||
}
|
||||
};
|
||||
|
||||
/* Perl + named group */
|
||||
const OnigSyntaxType OnigSyntaxPerl_NG = {
|
||||
/* Perl 5.8 + named group */
|
||||
const OnigSyntaxType OnigSyntaxPerl58_NG = {
|
||||
(( SYN_GNU_REGEX_OP | ONIG_SYN_OP_QMARK_NON_GREEDY |
|
||||
ONIG_SYN_OP_ESC_OCTAL3 | ONIG_SYN_OP_ESC_X_HEX2 |
|
||||
ONIG_SYN_OP_ESC_X_BRACE_HEX8 | ONIG_SYN_OP_ESC_CONTROL_CHARS |
|
||||
|
@ -199,6 +204,8 @@ const OnigSyntaxType OnigSyntaxPerl_NG = {
|
|||
ONIG_SYN_OP2_QMARK_GROUP_EFFECT | ONIG_SYN_OP2_OPTION_PERL |
|
||||
ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY |
|
||||
ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT |
|
||||
ONIG_SYN_OP2_ESC_CAPITAL_X_EXTENDED_GRAPHEME_CLUSTER |
|
||||
ONIG_SYN_OP2_QMARK_LPAREN_CONDITION |
|
||||
ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP |
|
||||
ONIG_SYN_OP2_ESC_K_NAMED_BACKREF |
|
||||
ONIG_SYN_OP2_ESC_G_SUBEXP_CALL )
|
||||
|
@ -217,6 +224,71 @@ const OnigSyntaxType OnigSyntaxPerl_NG = {
|
|||
}
|
||||
};
|
||||
|
||||
/* Perl 5.10+ */
|
||||
const OnigSyntaxType OnigSyntaxPerl = {
|
||||
(( SYN_GNU_REGEX_OP | ONIG_SYN_OP_QMARK_NON_GREEDY |
|
||||
ONIG_SYN_OP_ESC_OCTAL3 | ONIG_SYN_OP_ESC_X_HEX2 |
|
||||
ONIG_SYN_OP_ESC_X_BRACE_HEX8 | ONIG_SYN_OP_ESC_CONTROL_CHARS |
|
||||
ONIG_SYN_OP_ESC_C_CONTROL )
|
||||
& ~ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END )
|
||||
, ( ONIG_SYN_OP2_ESC_CAPITAL_Q_QUOTE |
|
||||
ONIG_SYN_OP2_QMARK_GROUP_EFFECT | ONIG_SYN_OP2_OPTION_PERL |
|
||||
ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY |
|
||||
ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT |
|
||||
ONIG_SYN_OP2_ESC_CAPITAL_X_EXTENDED_GRAPHEME_CLUSTER |
|
||||
ONIG_SYN_OP2_QMARK_LPAREN_CONDITION |
|
||||
ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT |
|
||||
ONIG_SYN_OP2_PLUS_POSSESSIVE_INTERVAL |
|
||||
ONIG_SYN_OP2_ESC_CAPITAL_R_LINEBREAK |
|
||||
ONIG_SYN_OP2_ESC_CAPITAL_K_KEEP |
|
||||
ONIG_SYN_OP2_QMARK_SUBEXP_CALL |
|
||||
ONIG_SYN_OP2_ESC_G_BRACE_BACKREF |
|
||||
ONIG_SYN_OP2_QMARK_CAPITAL_P_NAMED_GROUP |
|
||||
ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP |
|
||||
ONIG_SYN_OP2_ESC_K_NAMED_BACKREF )
|
||||
, ( SYN_GNU_REGEX_BV |
|
||||
ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME |
|
||||
ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME_CALL )
|
||||
, ( ONIG_OPTION_SINGLELINE | ONIG_OPTION_CAPTURE_GROUP )
|
||||
,
|
||||
{
|
||||
(OnigCodePoint )'\\' /* esc */
|
||||
, (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */
|
||||
, (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */
|
||||
, (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */
|
||||
, (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */
|
||||
, (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */
|
||||
}
|
||||
};
|
||||
|
||||
const OnigSyntaxType OnigSyntaxPython = {
|
||||
(( SYN_GNU_REGEX_OP | ONIG_SYN_OP_QMARK_NON_GREEDY |
|
||||
ONIG_SYN_OP_ESC_OCTAL3 | ONIG_SYN_OP_ESC_X_HEX2 |
|
||||
ONIG_SYN_OP_ESC_X_BRACE_HEX8 | ONIG_SYN_OP_ESC_CONTROL_CHARS |
|
||||
ONIG_SYN_OP_ESC_C_CONTROL )
|
||||
& ~ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END )
|
||||
, ( ONIG_SYN_OP2_QMARK_GROUP_EFFECT | ONIG_SYN_OP2_OPTION_PERL |
|
||||
ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY |
|
||||
ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT |
|
||||
ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT |
|
||||
ONIG_SYN_OP2_ESC_V_VTAB |
|
||||
ONIG_SYN_OP2_ESC_U_HEX4 |
|
||||
ONIG_SYN_OP2_QMARK_LPAREN_CONDITION |
|
||||
ONIG_SYN_OP2_QMARK_CAPITAL_P_NAMED_GROUP )
|
||||
, ( SYN_GNU_REGEX_BV |
|
||||
ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV )
|
||||
, ( ONIG_OPTION_SINGLELINE | ONIG_OPTION_ASCII_RANGE )
|
||||
,
|
||||
{
|
||||
(OnigCodePoint )'\\' /* esc */
|
||||
, (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */
|
||||
, (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */
|
||||
, (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */
|
||||
, (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */
|
||||
, (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
|
||||
extern int
|
||||
|
|
|
@ -5,7 +5,7 @@ require "test/unit"
|
|||
class TestEUC_JP < Test::Unit::TestCase
|
||||
def test_mbc_case_fold
|
||||
assert_match(/(£á)(a)\1\2/i, "£áa£áA")
|
||||
assert_no_match(/(£á)(a)\1\2/i, "£áa£ÁA")
|
||||
assert_match(/(£á)(a)\1\2/i, "£áa£ÁA")
|
||||
end
|
||||
|
||||
def test_property
|
||||
|
|
|
@ -5,7 +5,7 @@ require "test/unit"
|
|||
class TestShiftJIS < Test::Unit::TestCase
|
||||
def test_mbc_case_fold
|
||||
assert_match(/(‚<>)(a)\1\2/i, "‚<EFBFBD>a‚<EFBFBD>A")
|
||||
assert_no_match(/(‚<>)(a)\1\2/i, "‚<EFBFBD>a‚`A")
|
||||
assert_match(/(‚<>)(a)\1\2/i, "‚<EFBFBD>a‚`A")
|
||||
end
|
||||
|
||||
def test_property
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
#
|
||||
# To use this, get UnicodeData.txt, Scripts.txt, PropList.txt,
|
||||
# PropertyAliases.txt, PropertyValueAliases.txt, DerivedCoreProperties.txt,
|
||||
# and DerivedAge.txt from unicode.org.
|
||||
# DerivedAge.txt and Blocks.txt from unicode.org.
|
||||
# (http://unicode.org/Public/UNIDATA/) Then run the following command.
|
||||
# ruby1.9 tool/enc-unicode.rb data_dir > enc/unicode/name2ctype.kwd
|
||||
# This produces the source file for gperf. After that, simply make ruby.
|
||||
|
@ -90,7 +90,10 @@ def parse_unicode_data(file)
|
|||
# codepoints to Cn and C
|
||||
cn_remainder = (last_cp.next..0x10ffff).to_a
|
||||
data['Cn'] += cn_remainder
|
||||
data['C'] += cn_remainder
|
||||
data['C'] += data['Cn']
|
||||
|
||||
# Special case for LC (Cased_Letter). LC = Ll + Lt + Lu
|
||||
data['LC'] = data['Ll'] + data['Lt'] + data['Lu']
|
||||
|
||||
# Define General Category properties
|
||||
gcps = data.keys.sort - POSIX_NAMES
|
||||
|
@ -112,16 +115,15 @@ def define_posix_props(data)
|
|||
(0x0061..0x0066).to_a
|
||||
data['Alnum'] = data['Alpha'] + data['Digit']
|
||||
data['Space'] = data['White_Space']
|
||||
data['Blank'] = data['White_Space'] - [0x0A, 0x0B, 0x0C, 0x0D, 0x85] -
|
||||
data['Line_Separator'] - data['Paragraph_Separator']
|
||||
data['Blank'] = data['Space_Separator'] + [0x0009]
|
||||
data['Cntrl'] = data['Cc']
|
||||
data['Word'] = data['Alpha'] + data['Mark'] + data['Digit'] + data['Connector_Punctuation']
|
||||
data['Graph'] = data['Any'] - data['Space'] - data['Cntrl'] -
|
||||
data['Surrogate'] - data['Unassigned']
|
||||
data['Print'] = data['Graph'] + data['Blank'] - data['Cntrl']
|
||||
data['Print'] = data['Graph'] + data['Space_Separator']
|
||||
end
|
||||
|
||||
def parse_scripts(data)
|
||||
def parse_scripts(data, categories)
|
||||
files = [
|
||||
{fn: 'DerivedCoreProperties.txt', title: 'Derived Property'},
|
||||
{fn: 'Scripts.txt', title: 'Script'},
|
||||
|
@ -134,7 +136,7 @@ def parse_scripts(data)
|
|||
IO.foreach(get_file(file[:fn])) do |line|
|
||||
if /^# Total code points: / =~ line
|
||||
data[current] = cps
|
||||
make_const(current, cps, file[:title])
|
||||
categories[current] = file[:title]
|
||||
(names[file[:title]] ||= []) << current
|
||||
cps = []
|
||||
elsif /^([0-9a-fA-F]+)(?:..([0-9a-fA-F]+))?\s*;\s*(\w+)/ =~ line
|
||||
|
@ -146,7 +148,7 @@ def parse_scripts(data)
|
|||
# All code points not explicitly listed for Script
|
||||
# have the value Unknown (Zzzz).
|
||||
data['Unknown'] = (0..0x10ffff).to_a - data.values_at(*names['Script']).flatten
|
||||
make_const('Unknown', data['Unknown'], 'Script')
|
||||
categories['Unknown'] = 'Script'
|
||||
names.values.flatten << 'Unknown'
|
||||
end
|
||||
|
||||
|
@ -200,6 +202,29 @@ def parse_age(data)
|
|||
ages
|
||||
end
|
||||
|
||||
# Parses Blocks.txt and emits one code-range constant per Unicode block
# through make_const.
#
# data:: Hash mapping a property name to its Array of code points. The
#        code points of every block listed in Blocks.txt are stored in it
#        under the block's constant name (see constantize_blockname).
#
# Returns the Array of block constant names, in file order, with the
# synthetic No_Block entry appended last.
def parse_block(data)
  blocks = []
  IO.foreach(get_file('Blocks.txt')) do |line|
    next unless /^([0-9a-fA-F]+)\.\.([0-9a-fA-F]+);\s*(.*)/ =~ line
    first, last, blockname = $1, $2, $3
    cps = (first.to_i(16)..last.to_i(16)).to_a
    # Strip trailing whitespace (e.g. "\r" from a CRLF file) so that it is
    # not converted into a trailing underscore in the constant name.
    constname = constantize_blockname(blockname.strip)
    data[constname] = cps
    make_const(constname, cps, "Block")
    blocks << constname
  end

  # All code points not belonging to any of the named blocks
  # have the value No_Block.
  no_block = (0..0x10ffff).to_a - data.values_at(*blocks).flatten
  constname = constantize_blockname("No_Block")
  make_const(constname, no_block, "Block")
  blocks << constname
end
|
||||
|
||||
$const_cache = {}
|
||||
# make_const(property, pairs, name): Prints a 'static const' structure for a
|
||||
# given property, group of paired codepoints, and a human-friendly name for
|
||||
|
@ -232,6 +257,10 @@ def constantize_agename(name)
|
|||
"Age_#{name.sub(/\./, '_')}"
|
||||
end
|
||||
|
||||
# Builds the constant name for a Unicode block: every non-word character
# of the block name is replaced by "_" and the result is prefixed with
# "In_" (e.g. "Basic Latin" becomes "In_Basic_Latin").
def constantize_blockname(name)
  sanitized = name.gsub(/\W/, '_')
  'In_' + sanitized
end
|
||||
|
||||
def get_file(name)
|
||||
File.join(ARGV[0], name)
|
||||
end
|
||||
|
@ -241,9 +270,16 @@ end
|
|||
puts '%{'
|
||||
puts '#define long size_t'
|
||||
props, data = parse_unicode_data(get_file('UnicodeData.txt'))
|
||||
categories = {}
|
||||
props.concat parse_scripts(data, categories)
|
||||
aliases = parse_aliases(data)
|
||||
define_posix_props(data)
|
||||
POSIX_NAMES.each do |name|
|
||||
make_const(name, data[name], "[[:#{name}:]]")
|
||||
end
|
||||
print "\n#ifdef USE_UNICODE_PROPERTIES"
|
||||
props.each do |name|
|
||||
category =
|
||||
category = categories[name] ||
|
||||
case name.size
|
||||
when 1 then 'Major Category'
|
||||
when 2 then 'General Category'
|
||||
|
@ -251,22 +287,18 @@ props.each do |name|
|
|||
end
|
||||
make_const(name, data[name], category)
|
||||
end
|
||||
props.concat parse_scripts(data)
|
||||
puts '#endif /* USE_UNICODE_PROPERTIES */'
|
||||
aliases = parse_aliases(data)
|
||||
ages = parse_age(data)
|
||||
define_posix_props(data)
|
||||
POSIX_NAMES.each do |name|
|
||||
make_const(name, data[name], "[[:#{name}:]]")
|
||||
end
|
||||
blocks = parse_block(data)
|
||||
puts '#endif /* USE_UNICODE_PROPERTIES */'
|
||||
puts(<<'__HEREDOC')
|
||||
|
||||
static const OnigCodePoint* const CodeRanges[] = {
|
||||
__HEREDOC
|
||||
POSIX_NAMES.each{|name|puts" CR_#{name},"}
|
||||
puts "#ifdef USE_UNICODE_PROPERTIES"
|
||||
props.each{|name|puts" CR_#{name},"}
|
||||
ages.each{|name| puts" CR_#{constantize_agename(name)},"}
|
||||
props.each{|name| puts" CR_#{name},"}
|
||||
ages.each{|name| puts" CR_#{constantize_agename(name)},"}
|
||||
blocks.each{|name|puts" CR_#{name},"}
|
||||
|
||||
puts(<<'__HEREDOC')
|
||||
#endif /* USE_UNICODE_PROPERTIES */
|
||||
|
@ -284,6 +316,7 @@ i = -1
|
|||
name_to_index = {}
|
||||
POSIX_NAMES.each do |name|
|
||||
i += 1
|
||||
next if name == 'NEWLINE'
|
||||
name = normalize_propname(name)
|
||||
name_to_index[name] = i
|
||||
puts"%-40s %3d" % [name + ',', i]
|
||||
|
@ -306,6 +339,12 @@ ages.each do |name|
|
|||
name_to_index[name] = i
|
||||
puts "%-40s %3d" % [name + ',', i]
|
||||
end
|
||||
blocks.each do |name|
|
||||
i += 1
|
||||
name = normalize_propname(name)
|
||||
name_to_index[name] = i
|
||||
puts "%-40s %3d" % [name + ',', i]
|
||||
end
|
||||
puts(<<'__HEREDOC')
|
||||
#endif /* USE_UNICODE_PROPERTIES */
|
||||
%%
|
||||
|
|
Loading…
Reference in a new issue