mirror of
https://github.com/ruby/ruby.git
synced 2022-11-09 12:17:21 -05:00
* include/ruby/intern.h (rb_uv_to_utf8): declared.
* re.c (rb_reg_preprocess): new function for dynamic regexp with \u{} such as Regexp.new("\\u{6666}"). (rb_reg_prepare_re): preprocess regexp for recompiling. (read_escaped_byte): new function. (unescape_escaped_nonascii): new function. (append_utf8): new function. (unescape_unicode_list): new function. (unescape_unicode_bmp): new function. (unescape_nonascii): new function. (rb_reg_initialize): preprocess regexp. * pack.c (rb_uv_to_utf8): renamed from uv_to_utf8. * parse.y (STR_NEW3): take func instead of has8 and hasmb. (parser_str_new): use default coderange mechanism except for regexp. (parser_tokadd_utf8): copy regexp source as-is. (parser_read_escape): UTF-8 stuff removed. (parser_tokadd_escape): has8bit and hasmb removed. (parser_tokadd_string): fix 8-bit single byte character with \u. (parser_parse_string): has8bit and hasmb removed. (parser_here_document): has8bit and hasmb removed. (parser_yylex): call parser_tokadd_utf8 instead of read_escape for UTF-8 character. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@14072 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
This commit is contained in:
parent
d92b461dd9
commit
7ff702406a
7 changed files with 637 additions and 176 deletions
28
ChangeLog
28
ChangeLog
|
@ -1,3 +1,31 @@
|
|||
Sun Dec 2 01:39:51 2007 Tanaka Akira <akr@fsij.org>
|
||||
|
||||
* include/ruby/intern.h (rb_uv_to_utf8): declared.
|
||||
|
||||
* re.c (rb_reg_preprocess): new function for dynamic regexp with
|
||||
\u{} such as Regexp.new("\\u{6666}").
|
||||
(rb_reg_prepare_re): preprocess regexp for recompiling.
|
||||
(read_escaped_byte): new function.
|
||||
(unescape_escaped_nonascii): new function.
|
||||
(append_utf8): new function.
|
||||
(unescape_unicode_list): new function.
|
||||
(unescape_unicode_bmp): new function.
|
||||
(unescape_nonascii): new function.
|
||||
(rb_reg_initialize): preprocess regexp.
|
||||
|
||||
* pack.c (rb_uv_to_utf8): renamed from uv_to_utf8.
|
||||
|
||||
* parse.y (STR_NEW3): take func instead of has8 and hasmb.
|
||||
(parser_str_new): use default coderange mechanism except for regexp.
|
||||
(parser_tokadd_utf8): copy regexp source as-is.
|
||||
(parser_read_escape): UTF-8 stuff removed.
|
||||
(parser_tokadd_escape): has8bit and hasmb removed.
|
||||
(parser_tokadd_string): fix 8-bit single byte character with \u.
|
||||
(parser_parse_string): has8bit and hasmb removed.
|
||||
(parser_here_document): has8bit and hasmb removed.
|
||||
(parser_yylex): call parser_tokadd_utf8 instead of read_escape for
|
||||
UTF-8 character.
|
||||
|
||||
Wed Dec 2 01:00:07 2007 James Edward Gray II <jeg2@ruby-lang.org>
|
||||
|
||||
* lib/xmlrpc/server.rb (XMLRPC::Server#server): Improve signal handling so
|
||||
|
|
|
@ -101,6 +101,7 @@ unsigned LONG_LONG rb_big2ull(VALUE);
|
|||
#endif /* HAVE_LONG_LONG */
|
||||
void rb_quad_pack(char*,VALUE);
|
||||
VALUE rb_quad_unpack(const char*,int);
|
||||
int rb_uv_to_utf8(char[6],unsigned long);
|
||||
VALUE rb_dbl2big(double);
|
||||
double rb_big2dbl(VALUE);
|
||||
VALUE rb_big_cmp(VALUE, VALUE);
|
||||
|
|
7
pack.c
7
pack.c
|
@ -365,7 +365,6 @@ static const char toofew[] = "too few arguments";
|
|||
static void encodes(VALUE,const char*,long,int);
|
||||
static void qpencode(VALUE,VALUE,long);
|
||||
|
||||
static int uv_to_utf8(char*,unsigned long);
|
||||
static unsigned long utf8_to_uv(const char*,long*);
|
||||
|
||||
/*
|
||||
|
@ -872,7 +871,7 @@ pack_pack(VALUE ary, VALUE fmt)
|
|||
if (l < 0) {
|
||||
rb_raise(rb_eRangeError, "pack(U): value out of range");
|
||||
}
|
||||
le = uv_to_utf8(buf, l);
|
||||
le = rb_uv_to_utf8(buf, l);
|
||||
rb_str_buf_cat(res, (char*)buf, le);
|
||||
}
|
||||
break;
|
||||
|
@ -1991,8 +1990,8 @@ pack_unpack(VALUE str, VALUE fmt)
|
|||
|
||||
#define BYTEWIDTH 8
|
||||
|
||||
static int
|
||||
uv_to_utf8(char *buf, unsigned long uv)
|
||||
int
|
||||
rb_uv_to_utf8(char buf[6], unsigned long uv)
|
||||
{
|
||||
if (uv <= 0x7f) {
|
||||
buf[0] = (char)uv;
|
||||
|
|
213
parse.y
213
parse.y
|
@ -269,7 +269,7 @@ struct parser_params {
|
|||
#define STR_NEW(p,n) rb_enc_str_new((p),(n),parser->enc)
|
||||
#define STR_NEW0() rb_str_new(0,0)
|
||||
#define STR_NEW2(p) rb_enc_str_new((p),strlen(p),parser->enc)
|
||||
#define STR_NEW3(p,n,e,has8,hasmb) parser_str_new2((p),(n),(e),(has8),(hasmb))
|
||||
#define STR_NEW3(p,n,e,func) parser_str_new((p),(n),(e),(func))
|
||||
#define STR_ENC(m) ((m)?parser->enc:rb_enc_from_index(0))
|
||||
#define ENC_SINGLE(cr) ((cr)==ENC_CODERANGE_7BIT)
|
||||
#define TOK_INTERN(mb) rb_intern3(tok(), toklen(), STR_ENC(mb))
|
||||
|
@ -4488,7 +4488,7 @@ none : /* none */
|
|||
# define yylval (*((YYSTYPE*)(parser->parser_yylval)))
|
||||
|
||||
static int parser_regx_options(struct parser_params*);
|
||||
static int parser_tokadd_string(struct parser_params*,int,int,int,long*,int*,int*,rb_encoding**);
|
||||
static int parser_tokadd_string(struct parser_params*,int,int,int,long*,rb_encoding**);
|
||||
static void parser_tokaddmbc(struct parser_params *parser, int c, rb_encoding *enc);
|
||||
static int parser_parse_string(struct parser_params*,NODE*);
|
||||
static int parser_here_document(struct parser_params*,NODE*);
|
||||
|
@ -4500,11 +4500,10 @@ static int parser_here_document(struct parser_params*,NODE*);
|
|||
# define tokspace(n) parser_tokspace(parser, n)
|
||||
# define tokadd(c) parser_tokadd(parser, c)
|
||||
# define tok_hex(numlen) parser_tok_hex(parser, numlen)
|
||||
# define tok_utf8(numlen,e) parser_tok_utf8(parser, numlen, e)
|
||||
# define read_escape(flags,has8,hasmb,e) parser_read_escape(parser, flags, has8, hasmb, e)
|
||||
# define tokadd_escape(t,has8,hasmb,e) parser_tokadd_escape(parser, t, has8,hasmb, e)
|
||||
# define read_escape(flags,e) parser_read_escape(parser, flags, e)
|
||||
# define tokadd_escape(t,e) parser_tokadd_escape(parser, t, e)
|
||||
# define regx_options() parser_regx_options(parser)
|
||||
# define tokadd_string(f,t,p,n,has8bit,hasmb,e) parser_tokadd_string(parser,f,t,p,n,has8bit,hasmb,e)
|
||||
# define tokadd_string(f,t,p,n,e) parser_tokadd_string(parser,f,t,p,n,e)
|
||||
# define parse_string(n) parser_parse_string(parser,n)
|
||||
# define tokaddmbc(c, enc) parser_tokaddmbc(parser, c, enc)
|
||||
# define here_document(n) parser_here_document(parser,n)
|
||||
|
@ -4821,37 +4820,39 @@ rb_parser_compile_file(volatile VALUE vparser, const char *f, VALUE file, int st
|
|||
}
|
||||
#endif /* !RIPPER */
|
||||
|
||||
#define STR_FUNC_ESCAPE 0x01
|
||||
#define STR_FUNC_EXPAND 0x02
|
||||
#define STR_FUNC_REGEXP 0x04
|
||||
#define STR_FUNC_QWORDS 0x08
|
||||
#define STR_FUNC_SYMBOL 0x10
|
||||
#define STR_FUNC_INDENT 0x20
|
||||
|
||||
enum string_type {
|
||||
str_squote = (0),
|
||||
str_dquote = (STR_FUNC_EXPAND),
|
||||
str_xquote = (STR_FUNC_EXPAND),
|
||||
str_regexp = (STR_FUNC_REGEXP|STR_FUNC_ESCAPE|STR_FUNC_EXPAND),
|
||||
str_sword = (STR_FUNC_QWORDS),
|
||||
str_dword = (STR_FUNC_QWORDS|STR_FUNC_EXPAND),
|
||||
str_ssym = (STR_FUNC_SYMBOL),
|
||||
str_dsym = (STR_FUNC_SYMBOL|STR_FUNC_EXPAND),
|
||||
};
|
||||
|
||||
static VALUE
|
||||
parser_str_new(const char *p, long n, rb_encoding *enc, int coderange)
|
||||
parser_str_new(const char *p, long n, rb_encoding *enc, int func)
|
||||
{
|
||||
VALUE str = rb_enc_str_new(p, n, enc);
|
||||
ENC_CODERANGE_SET(str, coderange);
|
||||
VALUE str;
|
||||
|
||||
str = rb_enc_str_new(p, n, enc);
|
||||
if (!(func & STR_FUNC_REGEXP) &&
|
||||
rb_enc_asciicompat(enc) &&
|
||||
rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT) {
|
||||
rb_enc_associate(str, rb_default_encoding());
|
||||
}
|
||||
|
||||
return str;
|
||||
}
|
||||
|
||||
static VALUE
|
||||
parser_str_new2(const char *p, long n, rb_encoding *enc, int has8bit,int hasmb)
|
||||
{
|
||||
/*
|
||||
* Set coderange bit flags based on the presence of 8-bit and
|
||||
* multi-byte characters in the string
|
||||
*/
|
||||
int coderange = ENC_CODERANGE_7BIT;
|
||||
if (hasmb) coderange = ENC_CODERANGE_8BIT;
|
||||
else if (has8bit) coderange = ENC_CODERANGE_UNKNOWN;
|
||||
|
||||
/*
|
||||
* If it is all single byte characters with the 8th bit clear,
|
||||
* and if the specified encoding is ASCII-compatible, then this
|
||||
* string is in the ASCII subset, and we just use the ASCII encoding
|
||||
* instead.
|
||||
*/
|
||||
if ((coderange == ENC_CODERANGE_7BIT) && rb_enc_asciicompat(enc))
|
||||
enc = rb_default_encoding();
|
||||
|
||||
return parser_str_new(p, n, enc, coderange);
|
||||
}
|
||||
|
||||
static inline int
|
||||
parser_nextc(struct parser_params *parser)
|
||||
{
|
||||
|
@ -4979,9 +4980,11 @@ parser_tok_hex(struct parser_params *parser, int *numlen)
|
|||
return c;
|
||||
}
|
||||
|
||||
#define tokcopy(n) memcpy(tokspace(n), lex_p - (n), (n))
|
||||
|
||||
static int
|
||||
parser_tokadd_utf8(struct parser_params *parser, int *hasmb,
|
||||
rb_encoding **encp, int string_literal, int symbol_literal)
|
||||
parser_tokadd_utf8(struct parser_params *parser, rb_encoding **encp,
|
||||
int string_literal, int symbol_literal, int regexp_literal)
|
||||
{
|
||||
/*
|
||||
* If string_literal is true, then we allow multiple codepoints
|
||||
|
@ -4993,8 +4996,11 @@ parser_tokadd_utf8(struct parser_params *parser, int *hasmb,
|
|||
int codepoint;
|
||||
int numlen;
|
||||
|
||||
if (regexp_literal) { tokadd('\\'); tokadd('u'); }
|
||||
|
||||
if (peek('{')) { /* handle \u{...} form */
|
||||
do {
|
||||
if (regexp_literal) { tokadd(*lex_p); }
|
||||
nextc();
|
||||
codepoint = scan_hex(lex_p, 6, &numlen);
|
||||
if (numlen == 0) {
|
||||
|
@ -5006,8 +5012,10 @@ parser_tokadd_utf8(struct parser_params *parser, int *hasmb,
|
|||
return 0;
|
||||
}
|
||||
lex_p += numlen;
|
||||
if (codepoint >= 0x80) {
|
||||
*hasmb = 1;
|
||||
if (regexp_literal) {
|
||||
tokcopy(numlen);
|
||||
}
|
||||
else if (codepoint >= 0x80) {
|
||||
*encp = UTF8_ENC();
|
||||
if (string_literal) tokaddmbc(codepoint, *encp);
|
||||
}
|
||||
|
@ -5026,6 +5034,7 @@ parser_tokadd_utf8(struct parser_params *parser, int *hasmb,
|
|||
return 0;
|
||||
}
|
||||
|
||||
if (regexp_literal) { tokadd('}'); }
|
||||
nextc();
|
||||
}
|
||||
else { /* handle \uxxxx form */
|
||||
|
@ -5035,8 +5044,10 @@ parser_tokadd_utf8(struct parser_params *parser, int *hasmb,
|
|||
return 0;
|
||||
}
|
||||
lex_p += 4;
|
||||
if (codepoint >= 0x80) {
|
||||
*hasmb = 1;
|
||||
if (regexp_literal) {
|
||||
tokcopy(4);
|
||||
}
|
||||
else if (codepoint >= 0x80) {
|
||||
*encp = UTF8_ENC();
|
||||
if (string_literal) tokaddmbc(codepoint, *encp);
|
||||
}
|
||||
|
@ -5058,7 +5069,7 @@ parser_tokadd_utf8(struct parser_params *parser, int *hasmb,
|
|||
|
||||
static int
|
||||
parser_read_escape(struct parser_params *parser, int flags,
|
||||
int *has8bit, int *hasmb, rb_encoding **encp)
|
||||
rb_encoding **encp)
|
||||
{
|
||||
int c;
|
||||
int numlen;
|
||||
|
@ -5098,19 +5109,12 @@ parser_read_escape(struct parser_params *parser, int flags,
|
|||
c = scan_oct(lex_p, 3, &numlen);
|
||||
lex_p += numlen;
|
||||
}
|
||||
if (c >= 0200) *has8bit = 1;
|
||||
return c;
|
||||
|
||||
case 'x': /* hex constant */
|
||||
if (flags & (ESCAPE_CONTROL|ESCAPE_META)) goto eof;
|
||||
c = tok_hex(&numlen);
|
||||
if (numlen == 0) return 0;
|
||||
if (c >= 0x80) *has8bit = 1;
|
||||
return c;
|
||||
|
||||
case 'u': /* unicode constant: here only for char literal */
|
||||
if (flags & (ESCAPE_CONTROL|ESCAPE_META)) goto eof;
|
||||
c = parser_tokadd_utf8(parser, hasmb, encp, 0, 0);
|
||||
return c;
|
||||
|
||||
case 'b': /* backspace */
|
||||
|
@ -5126,13 +5130,10 @@ parser_read_escape(struct parser_params *parser, int flags,
|
|||
goto eof;
|
||||
}
|
||||
if ((c = nextc()) == '\\') {
|
||||
int tmp;
|
||||
*has8bit = 1;
|
||||
return read_escape(flags|ESCAPE_META, &tmp, &tmp, encp) | 0x80;
|
||||
return read_escape(flags|ESCAPE_META, encp) | 0x80;
|
||||
}
|
||||
else if (c == -1 || !ISASCII(c)) goto eof;
|
||||
else {
|
||||
*has8bit = 1;
|
||||
return ((c & 0xff) | 0x80);
|
||||
}
|
||||
|
||||
|
@ -5144,8 +5145,7 @@ parser_read_escape(struct parser_params *parser, int flags,
|
|||
case 'c':
|
||||
if (flags & ESCAPE_CONTROL) goto eof;
|
||||
if ((c = nextc())== '\\') {
|
||||
int tmp;
|
||||
c = read_escape(flags|ESCAPE_CONTROL, has8bit, &tmp, encp);
|
||||
c = read_escape(flags|ESCAPE_CONTROL, encp);
|
||||
}
|
||||
else if (c == '?')
|
||||
return 0177;
|
||||
|
@ -5162,8 +5162,6 @@ parser_read_escape(struct parser_params *parser, int flags,
|
|||
}
|
||||
}
|
||||
|
||||
#define tokcopy(n) memcpy(tokspace(n), lex_p - (n), (n))
|
||||
|
||||
static void
|
||||
parser_tokaddmbc(struct parser_params *parser, int c, rb_encoding *enc)
|
||||
{
|
||||
|
@ -5173,7 +5171,7 @@ parser_tokaddmbc(struct parser_params *parser, int c, rb_encoding *enc)
|
|||
|
||||
static int
|
||||
parser_tokadd_escape(struct parser_params *parser, int term,
|
||||
int *has8bit, int *hasmb, rb_encoding **encp)
|
||||
rb_encoding **encp)
|
||||
{
|
||||
int c;
|
||||
int flags = 0;
|
||||
|
@ -5194,7 +5192,6 @@ parser_tokadd_escape(struct parser_params *parser, int term,
|
|||
if (numlen == 0) goto eof;
|
||||
lex_p += numlen;
|
||||
tokcopy(numlen + 1);
|
||||
if (oct >= 0200) *has8bit = 1;
|
||||
}
|
||||
return 0;
|
||||
|
||||
|
@ -5207,7 +5204,6 @@ parser_tokadd_escape(struct parser_params *parser, int term,
|
|||
hex = tok_hex(&numlen);
|
||||
if (numlen == 0) goto eof;
|
||||
tokcopy(numlen + 2);
|
||||
if (hex >= 0x80) *has8bit = 1;
|
||||
}
|
||||
return 0;
|
||||
|
||||
|
@ -5218,7 +5214,6 @@ parser_tokadd_escape(struct parser_params *parser, int term,
|
|||
goto eof;
|
||||
}
|
||||
tokcopy(3);
|
||||
*has8bit = 1;
|
||||
flags |= ESCAPE_META;
|
||||
goto escaped;
|
||||
|
||||
|
@ -5287,24 +5282,6 @@ parser_regx_options(struct parser_params *parser)
|
|||
return options | RE_OPTION_ENCODING(kcode);
|
||||
}
|
||||
|
||||
#define STR_FUNC_ESCAPE 0x01
|
||||
#define STR_FUNC_EXPAND 0x02
|
||||
#define STR_FUNC_REGEXP 0x04
|
||||
#define STR_FUNC_QWORDS 0x08
|
||||
#define STR_FUNC_SYMBOL 0x10
|
||||
#define STR_FUNC_INDENT 0x20
|
||||
|
||||
enum string_type {
|
||||
str_squote = (0),
|
||||
str_dquote = (STR_FUNC_EXPAND),
|
||||
str_xquote = (STR_FUNC_EXPAND),
|
||||
str_regexp = (STR_FUNC_REGEXP|STR_FUNC_ESCAPE|STR_FUNC_EXPAND),
|
||||
str_sword = (STR_FUNC_QWORDS),
|
||||
str_dword = (STR_FUNC_QWORDS|STR_FUNC_EXPAND),
|
||||
str_ssym = (STR_FUNC_SYMBOL),
|
||||
str_dsym = (STR_FUNC_SYMBOL|STR_FUNC_EXPAND),
|
||||
};
|
||||
|
||||
static void
|
||||
dispose_string(VALUE str)
|
||||
{
|
||||
|
@ -5328,10 +5305,10 @@ parser_tokadd_mbchar(struct parser_params *parser, int c)
|
|||
static int
|
||||
parser_tokadd_string(struct parser_params *parser,
|
||||
int func, int term, int paren, long *nest,
|
||||
int *has8bit, int *hasmb, rb_encoding **encp)
|
||||
rb_encoding **encp)
|
||||
{
|
||||
int c;
|
||||
int has_mb = 0;
|
||||
int has_nonascii = 0;
|
||||
rb_encoding *enc = *encp;
|
||||
char *errbuf = 0;
|
||||
static const char mixed_msg[] = "%s mixed within %s source";
|
||||
|
@ -5390,9 +5367,10 @@ parser_tokadd_string(struct parser_params *parser,
|
|||
tokadd('\\');
|
||||
break;
|
||||
}
|
||||
parser_tokadd_utf8(parser, hasmb, &enc, 1,
|
||||
func & STR_FUNC_SYMBOL);
|
||||
if (has_mb && enc != *encp) {
|
||||
parser_tokadd_utf8(parser, &enc, 1,
|
||||
func & STR_FUNC_SYMBOL,
|
||||
func & STR_FUNC_REGEXP);
|
||||
if (has_nonascii && enc != *encp) {
|
||||
mixed_escape(beg, enc, *encp);
|
||||
}
|
||||
continue;
|
||||
|
@ -5400,28 +5378,17 @@ parser_tokadd_string(struct parser_params *parser,
|
|||
default:
|
||||
if (func & STR_FUNC_REGEXP) {
|
||||
pushback(c);
|
||||
if ((c = tokadd_escape(term, has8bit, hasmb, &enc)) < 0)
|
||||
if ((c = tokadd_escape(term, &enc)) < 0)
|
||||
return -1;
|
||||
if (has_mb && enc != *encp) {
|
||||
if (has_nonascii && enc != *encp) {
|
||||
mixed_escape(beg, enc, *encp);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
else if (func & STR_FUNC_EXPAND) {
|
||||
int tmb = 0;
|
||||
pushback(c);
|
||||
if (func & STR_FUNC_ESCAPE) tokadd('\\');
|
||||
c = read_escape(0, has8bit, &tmb, &enc);
|
||||
if (tmb) {
|
||||
*hasmb = tmb;
|
||||
if (has_mb && enc != *encp) {
|
||||
mixed_escape(beg, enc, *encp);
|
||||
}
|
||||
else {
|
||||
tokaddmbc(c, enc);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
c = read_escape(0, &enc);
|
||||
}
|
||||
else if ((func & STR_FUNC_QWORDS) && ISSPACE(c)) {
|
||||
/* ignore backslashed spaces in %w */
|
||||
|
@ -5432,13 +5399,12 @@ parser_tokadd_string(struct parser_params *parser,
|
|||
}
|
||||
}
|
||||
else if (parser_ismbchar()) {
|
||||
has_mb = 1;
|
||||
has_nonascii = 1;
|
||||
if (enc != *encp) {
|
||||
mixed_error(enc, *encp);
|
||||
continue;
|
||||
}
|
||||
tokadd_mbchar(c);
|
||||
if (hasmb) *hasmb = 1;
|
||||
continue;
|
||||
}
|
||||
else if ((func & STR_FUNC_QWORDS) && ISSPACE(c)) {
|
||||
|
@ -5450,6 +5416,13 @@ parser_tokadd_string(struct parser_params *parser,
|
|||
compile_error(PARSER_ARG "symbol cannot contain '\\0'");
|
||||
continue;
|
||||
}
|
||||
if (c & 0x80) {
|
||||
has_nonascii = 1;
|
||||
if (enc != *encp) {
|
||||
mixed_error(enc, *encp);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
tokadd(c);
|
||||
}
|
||||
*encp = enc;
|
||||
|
@ -5465,7 +5438,7 @@ parser_parse_string(struct parser_params *parser, NODE *quote)
|
|||
int func = quote->nd_func;
|
||||
int term = nd_term(quote);
|
||||
int paren = nd_paren(quote);
|
||||
int c, space = 0, has8bit=0, hasmb=0;
|
||||
int c, space = 0;
|
||||
rb_encoding *enc = parser->enc;
|
||||
|
||||
if (func == -1) return tSTRING_END;
|
||||
|
@ -5501,7 +5474,7 @@ parser_parse_string(struct parser_params *parser, NODE *quote)
|
|||
}
|
||||
pushback(c);
|
||||
if (tokadd_string(func, term, paren, "e->nd_nest,
|
||||
&has8bit, &hasmb, &enc) == -1) {
|
||||
&enc) == -1) {
|
||||
ruby_sourceline = nd_line(quote);
|
||||
if (func & STR_FUNC_REGEXP) {
|
||||
compile_error(PARSER_ARG "unterminated regexp meets end of file");
|
||||
|
@ -5514,7 +5487,7 @@ parser_parse_string(struct parser_params *parser, NODE *quote)
|
|||
}
|
||||
|
||||
tokfix();
|
||||
set_yylval_str(STR_NEW3(tok(), toklen(), enc, has8bit, hasmb));
|
||||
set_yylval_str(STR_NEW3(tok(), toklen(), enc, func));
|
||||
return tSTRING_CONTENT;
|
||||
}
|
||||
|
||||
|
@ -5678,7 +5651,6 @@ parser_here_document(struct parser_params *parser, NODE *here)
|
|||
}
|
||||
else {
|
||||
/* int mb = ENC_CODERANGE_7BIT, *mbp = &mb;*/
|
||||
int has8bit=0, hasmb=0;
|
||||
rb_encoding *enc = parser->enc;
|
||||
newtok();
|
||||
if (c == '#') {
|
||||
|
@ -5695,16 +5667,16 @@ parser_here_document(struct parser_params *parser, NODE *here)
|
|||
do {
|
||||
pushback(c);
|
||||
if ((c = tokadd_string(func, '\n', 0, NULL,
|
||||
&has8bit, &hasmb, &enc)) == -1) goto error;
|
||||
&enc)) == -1) goto error;
|
||||
if (c != '\n') {
|
||||
set_yylval_str(STR_NEW3(tok(), toklen(), enc, has8bit,hasmb));
|
||||
set_yylval_str(STR_NEW3(tok(), toklen(), enc, func));
|
||||
return tSTRING_CONTENT;
|
||||
}
|
||||
tokadd(nextc());
|
||||
/* if (mbp && mb == ENC_CODERANGE_UNKNOWN) mbp = 0;*/
|
||||
if ((c = nextc()) == -1) goto error;
|
||||
} while (!whole_match_p(eos, len, indent));
|
||||
str = STR_NEW3(tok(), toklen(), enc, has8bit,hasmb);
|
||||
str = STR_NEW3(tok(), toklen(), enc, func);
|
||||
}
|
||||
heredoc_restore(lex_strterm);
|
||||
lex_strterm = NEW_STRTERM(-1, 0, 0);
|
||||
|
@ -5966,7 +5938,6 @@ parser_yylex(struct parser_params *parser)
|
|||
int cmd_state;
|
||||
enum lex_state_e last_state;
|
||||
rb_encoding *enc;
|
||||
int has8bit = 0, hasmb = 0;
|
||||
int mb;
|
||||
#ifdef RIPPER
|
||||
int fallthru = Qfalse;
|
||||
|
@ -6317,26 +6288,33 @@ parser_yylex(struct parser_params *parser)
|
|||
newtok();
|
||||
enc = parser->enc;
|
||||
if (parser_ismbchar()) {
|
||||
hasmb = 1;
|
||||
tokadd_mbchar(c);
|
||||
}
|
||||
else if ((rb_enc_isalnum(c, parser->enc) || c == '_') &&
|
||||
lex_p < lex_pend && is_identchar(lex_p, lex_pend, parser->enc)) {
|
||||
goto ternary;
|
||||
}
|
||||
else if (c == '\\' && (c = read_escape(0, &has8bit, &hasmb, &enc)) >= 0x80) {
|
||||
if (hasmb) {
|
||||
tokaddmbc(c, enc);
|
||||
}
|
||||
else {
|
||||
tokadd(c);
|
||||
}
|
||||
}
|
||||
else {
|
||||
else if (c == '\\') {
|
||||
if (peek('u')) {
|
||||
nextc();
|
||||
c = parser_tokadd_utf8(parser, &enc, 0, 0, 0);
|
||||
if (0x80 <= c) {
|
||||
tokaddmbc(c, enc);
|
||||
}
|
||||
else {
|
||||
tokadd(c);
|
||||
}
|
||||
}
|
||||
else {
|
||||
c = read_escape(0, &enc);
|
||||
tokadd(c);
|
||||
}
|
||||
}
|
||||
else {
|
||||
tokadd(c);
|
||||
}
|
||||
}
|
||||
tokfix();
|
||||
set_yylval_str(STR_NEW3(tok(), toklen(), enc, has8bit, hasmb));
|
||||
set_yylval_str(STR_NEW3(tok(), toklen(), enc, 0));
|
||||
lex_state = EXPR_ENDARG;
|
||||
return tCHAR;
|
||||
|
||||
|
@ -8481,7 +8459,6 @@ reg_compile_gen(struct parser_params* parser, VALUE str, int options)
|
|||
compile_error(PARSER_ARG "%s", RSTRING_PTR(re));
|
||||
return Qnil;
|
||||
}
|
||||
if (str) rb_enc_copy(re, str);
|
||||
return re;
|
||||
}
|
||||
|
||||
|
|
446
re.c
446
re.c
|
@ -12,6 +12,7 @@
|
|||
#include "ruby/ruby.h"
|
||||
#include "ruby/re.h"
|
||||
#include "ruby/encoding.h"
|
||||
#include "ruby/util.h"
|
||||
#include "regint.h"
|
||||
#include <ctype.h>
|
||||
|
||||
|
@ -715,6 +716,10 @@ rb_reg_fixed_encoding_p(VALUE re)
|
|||
return Qfalse;
|
||||
}
|
||||
|
||||
static VALUE
|
||||
rb_reg_preprocess(const char *p, const char *end, rb_encoding *enc,
|
||||
rb_encoding **fixed_enc, onig_errmsg_buffer err);
|
||||
|
||||
static void
|
||||
rb_reg_prepare_re(VALUE re, VALUE str)
|
||||
{
|
||||
|
@ -740,13 +745,19 @@ rb_reg_prepare_re(VALUE re, VALUE str)
|
|||
OnigErrorInfo einfo;
|
||||
regex_t *reg, *reg2;
|
||||
UChar *pattern;
|
||||
VALUE unescaped;
|
||||
rb_encoding *fixed_enc = 0;
|
||||
|
||||
rb_reg_check(re);
|
||||
reg = RREGEXP(re)->ptr;
|
||||
pattern = ((UChar*)RREGEXP(re)->str);
|
||||
|
||||
r = onig_new(®2, (UChar* )pattern,
|
||||
(UChar* )(pattern + RREGEXP(re)->len),
|
||||
unescaped = rb_reg_preprocess(
|
||||
RREGEXP(re)->str, RREGEXP(re)->str + RREGEXP(re)->len, enc,
|
||||
&fixed_enc, err);
|
||||
|
||||
r = onig_new(®2, (UChar* )RSTRING_PTR(unescaped),
|
||||
(UChar* )(RSTRING_PTR(unescaped) + RSTRING_LEN(unescaped)),
|
||||
reg->options, enc,
|
||||
OnigDefaultSyntax, &einfo);
|
||||
if (r) {
|
||||
|
@ -756,6 +767,7 @@ rb_reg_prepare_re(VALUE re, VALUE str)
|
|||
|
||||
RREGEXP(re)->ptr = reg2;
|
||||
onig_free(reg);
|
||||
RB_GC_GUARD(unescaped);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1235,13 +1247,408 @@ match_inspect(VALUE match)
|
|||
|
||||
VALUE rb_cRegexp;
|
||||
|
||||
static int
|
||||
read_escaped_byte(const char **pp, const char *end, onig_errmsg_buffer err)
|
||||
{
|
||||
const char *p = *pp;
|
||||
int code;
|
||||
int meta_prefix = 0, ctrl_prefix = 0;
|
||||
int len;
|
||||
int retbyte;
|
||||
|
||||
retbyte = -1;
|
||||
if (p == end || *p++ != '\\') {
|
||||
strcpy(err, "too short escaped multibyte character");
|
||||
return -1;
|
||||
}
|
||||
|
||||
again:
|
||||
if (p == end) {
|
||||
strcpy(err, "too short escape sequence");
|
||||
return -1;
|
||||
}
|
||||
switch (*p++) {
|
||||
case '\\': code = '\\'; break;
|
||||
case 'n': code = '\n'; break;
|
||||
case 't': code = '\t'; break;
|
||||
case 'r': code = '\r'; break;
|
||||
case 'f': code = '\f'; break;
|
||||
case 'v': code = '\013'; break;
|
||||
case 'a': code = '\007'; break;
|
||||
case 'e': code = '\033'; break;
|
||||
|
||||
/* \OOO */
|
||||
case '0': case '1': case '2': case '3':
|
||||
case '4': case '5': case '6': case '7':
|
||||
p--;
|
||||
code = ruby_scan_oct(p, end < p+3 ? end-p : 3, &len);
|
||||
p += len;
|
||||
break;
|
||||
|
||||
case 'x': /* \xHH */
|
||||
code = ruby_scan_hex(p, end < p+2 ? end-p : 2, &len);
|
||||
if (len < 1) {
|
||||
strcpy(err, "invalid hex escape");
|
||||
return -1;
|
||||
}
|
||||
p += len;
|
||||
break;
|
||||
|
||||
case 'M': /* \M-X, \M-\C-X, \M-\cX */
|
||||
if (meta_prefix) {
|
||||
strcpy(err, "duplicate meta escape");
|
||||
return -1;
|
||||
}
|
||||
meta_prefix = 1;
|
||||
if (p+1 < end && *p++ == '-' && (*p & 0x80) == 0) {
|
||||
if (*p == '\\') {
|
||||
p++;
|
||||
goto again;
|
||||
}
|
||||
else {
|
||||
code = *p++;
|
||||
break;
|
||||
}
|
||||
}
|
||||
strcpy(err, "too short meta escape");
|
||||
return -1;
|
||||
|
||||
case 'C': /* \C-X, \C-\M-X */
|
||||
if (p == end || *p++ != '-') {
|
||||
strcpy(err, "too short control escape");
|
||||
return -1;
|
||||
}
|
||||
case 'c': /* \cX, \c\M-X */
|
||||
if (ctrl_prefix) {
|
||||
strcpy(err, "duplicate control escape");
|
||||
return -1;
|
||||
}
|
||||
ctrl_prefix = 1;
|
||||
if (p < end && (*p & 0x80) == 0) {
|
||||
if (*p == '\\') {
|
||||
p++;
|
||||
goto again;
|
||||
}
|
||||
else {
|
||||
code = *p++;
|
||||
break;
|
||||
}
|
||||
}
|
||||
strcpy(err, "too short control escape");
|
||||
return -1;
|
||||
|
||||
default:
|
||||
strcpy(err, "unexpected escape sequence");
|
||||
return -1;
|
||||
}
|
||||
if (code < 0 || 0xff < code) {
|
||||
strcpy(err, "invalid escape code");
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (ctrl_prefix)
|
||||
code &= 0x1f;
|
||||
if (meta_prefix)
|
||||
code |= 0x80;
|
||||
|
||||
*pp = p;
|
||||
return code;
|
||||
}
|
||||
|
||||
static int
|
||||
unescape_escaped_nonascii(const char **pp, const char *end, rb_encoding *enc,
|
||||
VALUE buf, rb_encoding **encp, onig_errmsg_buffer err)
|
||||
{
|
||||
const char *p = *pp;
|
||||
int chmaxlen = rb_enc_mbmaxlen(enc);
|
||||
char *chbuf = ALLOCA_N(char, chmaxlen);
|
||||
int chlen = 0;
|
||||
int byte;
|
||||
|
||||
memset(chbuf, 0, chmaxlen);
|
||||
|
||||
byte = read_escaped_byte(&p, end, err);
|
||||
if (byte == -1) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
chbuf[chlen++] = byte;
|
||||
while (chlen < chmaxlen && chlen != mbclen(chbuf, chbuf+chmaxlen, enc)) {
|
||||
byte = read_escaped_byte(&p, end, err);
|
||||
if (byte == -1) {
|
||||
return -1;
|
||||
}
|
||||
chbuf[chlen++] = byte;
|
||||
}
|
||||
|
||||
if (chlen != mbclen(chbuf, chbuf+chmaxlen, enc)) {
|
||||
strcpy(err, "invalid multibyte escape");
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (1 < chlen || (chbuf[0] & 0x80)) {
|
||||
rb_str_buf_cat(buf, chbuf, chlen);
|
||||
|
||||
if (*encp == 0)
|
||||
*encp = enc;
|
||||
else if (*encp != enc) {
|
||||
strcpy(err, "character encodings differ");
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
else {
|
||||
char escbuf[5];
|
||||
snprintf(escbuf, sizeof(escbuf), "\\x%02x", chbuf[0]&0xff);
|
||||
rb_str_buf_cat(buf, escbuf, 4);
|
||||
}
|
||||
*pp = p;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int
|
||||
append_utf8(unsigned long uv,
|
||||
VALUE buf, rb_encoding **encp, onig_errmsg_buffer err)
|
||||
{
|
||||
if (uv < 0x80) {
|
||||
char escbuf[5];
|
||||
snprintf(escbuf, sizeof(escbuf), "\\x%02x", (int)uv);
|
||||
rb_str_buf_cat(buf, escbuf, 4);
|
||||
}
|
||||
else {
|
||||
int len;
|
||||
char utf8buf[6];
|
||||
len = rb_uv_to_utf8(utf8buf, uv);
|
||||
rb_str_buf_cat(buf, utf8buf, len);
|
||||
|
||||
if (*encp == 0)
|
||||
*encp = rb_enc_find("utf-8");
|
||||
else if (*encp != rb_enc_find("utf-8")) {
|
||||
strcpy(err, "character encodings differ");
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int
|
||||
unescape_unicode_list(const char **pp, const char *end,
|
||||
VALUE buf, rb_encoding **encp, onig_errmsg_buffer err)
|
||||
{
|
||||
const char *p = *pp;
|
||||
int has_unicode = 0;
|
||||
unsigned long code;
|
||||
int len;
|
||||
|
||||
while (p < end && ISSPACE(*p)) p++;
|
||||
|
||||
while (1) {
|
||||
code = ruby_scan_hex(p, end-p, &len);
|
||||
if (len == 0)
|
||||
break;
|
||||
if (6 < len) { /* max 10FFFF */
|
||||
strcpy(err, "invalid unicode range");
|
||||
return -1;
|
||||
}
|
||||
if (0x10ffff < code) {
|
||||
strcpy(err, "invalid unicode range");
|
||||
return -1;
|
||||
}
|
||||
p += len;
|
||||
if (append_utf8(code, buf, encp, err) != 0)
|
||||
return -1;
|
||||
has_unicode = 1;
|
||||
|
||||
while (p < end && ISSPACE(*p)) p++;
|
||||
}
|
||||
|
||||
if (has_unicode == 0) {
|
||||
strcpy(err, "invalid unicode list");
|
||||
return -1;
|
||||
}
|
||||
|
||||
*pp = p;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int
|
||||
unescape_unicode_bmp(const char **pp, const char *end,
|
||||
VALUE buf, rb_encoding **encp, onig_errmsg_buffer err)
|
||||
{
|
||||
const char *p = *pp;
|
||||
int len;
|
||||
unsigned long code;
|
||||
|
||||
if (end < p+4) {
|
||||
strcpy(err, "invalid unicode escape");
|
||||
return -1;
|
||||
}
|
||||
code = ruby_scan_hex(p, 4, &len);
|
||||
if (len != 4) {
|
||||
strcpy(err, "invalid unicode escape");
|
||||
return -1;
|
||||
}
|
||||
if (append_utf8(code, buf, encp, err) != 0)
|
||||
return -1;
|
||||
*pp = p + 4;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int
|
||||
unescape_nonascii(const char *p, const char *end, rb_encoding *enc,
|
||||
VALUE buf, rb_encoding **encp, onig_errmsg_buffer err)
|
||||
{
|
||||
char c;
|
||||
char smallbuf[2];
|
||||
|
||||
while (p < end) {
|
||||
int chlen = mbclen(p, end, enc);
|
||||
if (1 < chlen || (*p & 0x80)) {
|
||||
if (end < p + chlen) {
|
||||
strcpy(err, "too short multibyte character");
|
||||
return -1;
|
||||
}
|
||||
/* xxx: validate the non-ascii character */
|
||||
rb_str_buf_cat(buf, p, chlen);
|
||||
p += chlen;
|
||||
if (*encp == 0)
|
||||
*encp = enc;
|
||||
else if (*encp != enc) {
|
||||
strcpy(err, "character encodings differ");
|
||||
return -1;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
switch (c = *p++) {
|
||||
case '\\':
|
||||
if (p == end) {
|
||||
strcpy(err, "too short escape sequence");
|
||||
return -1;
|
||||
}
|
||||
switch (c = *p++) {
|
||||
case '1': case '2': case '3':
|
||||
case '4': case '5': case '6': case '7': /* \O, \OO, \OOO or backref */
|
||||
{
|
||||
int octlen;
|
||||
if (ruby_scan_oct(p-1, end-(p-1), &octlen) <= 0177) {
|
||||
/* backref or 7bit octal.
|
||||
no need to unescape anyway.
|
||||
re-escaping may break backref */
|
||||
goto escape_asis;
|
||||
}
|
||||
}
|
||||
/* xxx: How about more than 199 subexpressions? */
|
||||
|
||||
case '0': /* \0, \0O, \0OO */
|
||||
|
||||
case 'x': /* \xHH */
|
||||
case 'c': /* \cX, \c\M-X */
|
||||
case 'C': /* \C-X, \C-\M-X */
|
||||
case 'M': /* \M-X, \M-\C-X, \M-\cX */
|
||||
p = p-2;
|
||||
if (unescape_escaped_nonascii(&p, end, enc, buf, encp, err) != 0)
|
||||
return -1;
|
||||
break;
|
||||
|
||||
case 'u':
|
||||
if (p == end) {
|
||||
strcpy(err, "too short escape sequence");
|
||||
return -1;
|
||||
}
|
||||
if (*p == '{') {
|
||||
/* \u{H HH HHH HHHH HHHHH HHHHHH ...} */
|
||||
p++;
|
||||
if (unescape_unicode_list(&p, end, buf, encp, err) != 0)
|
||||
return -1;
|
||||
if (p == end || *p++ != '}') {
|
||||
strcpy(err, "invalid unicode list");
|
||||
return -1;
|
||||
}
|
||||
break;
|
||||
}
|
||||
else {
|
||||
/* \uHHHH */
|
||||
if (unescape_unicode_bmp(&p, end, buf, encp, err) != 0)
|
||||
return -1;
|
||||
break;
|
||||
}
|
||||
|
||||
default: /* \n, \\, \d, \9, etc. */
|
||||
escape_asis:
|
||||
smallbuf[0] = '\\';
|
||||
smallbuf[1] = c;
|
||||
rb_str_buf_cat(buf, smallbuf, 2);
|
||||
break;
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
rb_str_buf_cat(buf, &c, 1);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static VALUE
|
||||
rb_reg_preprocess(const char *p, const char *end, rb_encoding *enc,
|
||||
rb_encoding **fixed_enc, onig_errmsg_buffer err)
|
||||
{
|
||||
VALUE buf;
|
||||
|
||||
buf = rb_str_buf_new(0);
|
||||
|
||||
*fixed_enc = 0;
|
||||
if (unescape_nonascii(p, end, enc, buf, fixed_enc, err) != 0)
|
||||
return Qnil;
|
||||
|
||||
if (fixed_enc) {
|
||||
rb_enc_associate(buf, *fixed_enc);
|
||||
}
|
||||
|
||||
return buf;
|
||||
}
|
||||
|
||||
#if 0
|
||||
static VALUE
|
||||
rb_reg_preprocess_obj(VALUE str,
|
||||
rb_encoding **fixed_enc, onig_errmsg_buffer err)
|
||||
{
|
||||
VALUE buf;
|
||||
char *p, *end;
|
||||
rb_encoding *enc;
|
||||
|
||||
StringValue(str);
|
||||
p = RSTRING_PTR(str);
|
||||
end = p + RSTRING_LEN(str);
|
||||
enc = rb_enc_get(str);
|
||||
|
||||
buf = rb_reg_preprocess(p, end, enc, fixed_enc, err);
|
||||
RB_GC_GUARD(str);
|
||||
return buf;
|
||||
}
|
||||
|
||||
static VALUE
|
||||
rb_reg_preprocess_m(VALUE klass, VALUE obj)
|
||||
{
|
||||
rb_encoding *fixed_enc = 0;
|
||||
onig_errmsg_buffer err;
|
||||
VALUE str = rb_reg_preprocess_obj(obj, &fixed_enc, err);
|
||||
if (str == Qnil)
|
||||
rb_raise(rb_eArgError, "%s", err);
|
||||
return rb_assoc_new(str, fixed_enc ? Qtrue : Qfalse);
|
||||
}
|
||||
#endif
|
||||
|
||||
static int
|
||||
rb_reg_initialize(VALUE obj, const char *s, int len, rb_encoding *enc,
|
||||
int options, onig_errmsg_buffer err)
|
||||
{
|
||||
struct RRegexp *re = RREGEXP(obj);
|
||||
int raw8bit;
|
||||
long i;
|
||||
VALUE unescaped;
|
||||
rb_encoding *fixed_enc = 0;
|
||||
|
||||
if (!OBJ_TAINTED(obj) && rb_safe_level() >= 4)
|
||||
rb_raise(rb_eSecurityError, "Insecure: can't modify regexp");
|
||||
|
@ -1253,33 +1660,38 @@ rb_reg_initialize(VALUE obj, const char *s, int len, rb_encoding *enc,
|
|||
re->ptr = 0;
|
||||
re->str = 0;
|
||||
|
||||
raw8bit = 0;
|
||||
for (i = 0; i < len; i++) {
|
||||
if (s[i] & 0x80) {
|
||||
raw8bit = 1;
|
||||
break;
|
||||
}
|
||||
unescaped = rb_reg_preprocess(s, s+len, enc, &fixed_enc, err);
|
||||
if (unescaped == Qnil)
|
||||
return -1;
|
||||
|
||||
if (fixed_enc && (options & ARG_ENCODING_FIXED) && fixed_enc != enc) {
|
||||
strcpy(err, "character encodings differ");
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (fixed_enc)
|
||||
enc = fixed_enc;
|
||||
else if (!(options & ARG_ENCODING_FIXED))
|
||||
enc = rb_default_encoding();
|
||||
|
||||
rb_enc_associate((VALUE)re, enc);
|
||||
if (options & ARG_ENCODING_FIXED || raw8bit) {
|
||||
if ((options & ARG_ENCODING_FIXED) || fixed_enc) {
|
||||
re->basic.flags |= KCODE_FIXED;
|
||||
}
|
||||
re->ptr = make_regexp(s, len, enc, options & ARG_REG_OPTION_MASK, err);
|
||||
re->ptr = make_regexp(RSTRING_PTR(unescaped), RSTRING_LEN(unescaped), enc,
|
||||
options & ARG_REG_OPTION_MASK, err);
|
||||
if (!re->ptr) return -1;
|
||||
re->str = ALLOC_N(char, len+1);
|
||||
memcpy(re->str, s, len);
|
||||
re->str[len] = '\0';
|
||||
re->len = len;
|
||||
RB_GC_GUARD(unescaped);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int
|
||||
rb_reg_initialize_str(VALUE obj, VALUE str, int options, onig_errmsg_buffer err)
|
||||
{
|
||||
if (!rb_enc_str_asciionly_p(str)) {
|
||||
options |= ARG_ENCODING_FIXED;
|
||||
}
|
||||
return rb_reg_initialize(obj, RSTRING_PTR(str), RSTRING_LEN(str), rb_enc_get(str),
|
||||
options, err);
|
||||
}
|
||||
|
@ -2183,6 +2595,10 @@ Init_Regexp(void)
|
|||
rb_define_singleton_method(rb_cRegexp, "last_match", rb_reg_s_last_match, -1);
|
||||
rb_define_singleton_method(rb_cRegexp, "try_convert", rb_reg_s_try_convert, 1);
|
||||
|
||||
#if 0
|
||||
rb_define_singleton_method(rb_cRegexp, "preprocess", rb_reg_preprocess_m, 1);
|
||||
#endif
|
||||
|
||||
rb_define_method(rb_cRegexp, "initialize", rb_reg_initialize_m, -1);
|
||||
rb_define_method(rb_cRegexp, "initialize_copy", rb_reg_init_copy, 1);
|
||||
rb_define_method(rb_cRegexp, "hash", rb_reg_hash, 0);
|
||||
|
|
|
@ -25,6 +25,17 @@ class TestM17N < Test::Unit::TestCase
|
|||
assert_encoding("EUC-JP", eval(e(%{"\\x80"})).encoding)
|
||||
end
|
||||
|
||||
def test_string_mixed_unicode
  # A string literal mixing the raw bytes \xc0\xa0 with a \u{6666} escape,
  # in either order, must be rejected unless the source is tagged via u().
  # NOTE(review): a/e/s/u are helpers defined elsewhere in this test class;
  # presumably they tag the source as ASCII-8BIT/EUC-JP/Shift_JIS/UTF-8.
  [%{"\xc0\xa0\\u{6666}"}, %{"\\u{6666}\xc0\xa0"}].each do |src|
    assert_raise(SyntaxError) { eval(a(src)) }
    assert_raise(SyntaxError) { eval(e(src)) }
    assert_raise(SyntaxError) { eval(s(src)) }
    assert_nothing_raised { eval(u(src)) }
  end
end
|
||||
|
||||
def test_regexp_too_short_multibyte_character
|
||||
assert_raise(SyntaxError) { eval('/\xfe/e') }
|
||||
assert_raise(SyntaxError) { eval('/\x8e/e') }
|
||||
|
@ -38,11 +49,12 @@ class TestM17N < Test::Unit::TestCase
|
|||
assert_raise(SyntaxError) { eval('/\xfc\x80\x80\x80\x80/u') }
|
||||
|
||||
# raw 8bit
|
||||
#assert_raise(SyntaxError) { eval("/\xfe/e") }
|
||||
#assert_raise(SyntaxError) { eval("/\xc0/u") }
|
||||
assert_raise(SyntaxError) { eval("/\xfe/e") }
|
||||
assert_raise(SyntaxError) { eval("/\xc0/u") }
|
||||
|
||||
# invalid suffix
|
||||
#assert_raise(SyntaxError) { eval('/\xc0\xff/u') }
|
||||
assert_raise(SyntaxError) { eval('/\xc0\xff/u') }
|
||||
assert_raise(SyntaxError) { eval('/\xc0 /u') }
|
||||
#assert_raise(SyntaxError) { eval('/\xc0\x20/u') }
|
||||
end
|
||||
|
||||
|
@ -94,6 +106,9 @@ class TestM17N < Test::Unit::TestCase
|
|||
def test_regexp_generic
|
||||
assert_regexp_generic_ascii(/a/)
|
||||
assert_regexp_generic_ascii(Regexp.new(a("a")))
|
||||
assert_regexp_generic_ascii(Regexp.new(e("a")))
|
||||
assert_regexp_generic_ascii(Regexp.new(s("a")))
|
||||
assert_regexp_generic_ascii(Regexp.new(u("a")))
|
||||
|
||||
[/a/, Regexp.new(a("a"))].each {|r|
|
||||
assert_equal(0, r =~ a("a"))
|
||||
|
@ -112,7 +127,7 @@ class TestM17N < Test::Unit::TestCase
|
|||
assert_regexp_fixed_ascii8bit(/\xc0\xa1/n)
|
||||
assert_regexp_fixed_ascii8bit(eval(a(%{/\xc0\xa1/})))
|
||||
assert_regexp_fixed_ascii8bit(eval(a(%{/\xc0\xa1/n})))
|
||||
# assert_regexp_fixed_ascii8bit(eval(a(%q{/\xc0\xa1/})))
|
||||
assert_regexp_fixed_ascii8bit(eval(a(%q{/\xc0\xa1/})))
|
||||
|
||||
[/a/n].each {|r|
|
||||
assert_equal(0, r =~ a("a"))
|
||||
|
@ -139,12 +154,11 @@ class TestM17N < Test::Unit::TestCase
|
|||
|
||||
def test_regexp_euc
|
||||
assert_regexp_fixed_eucjp(/a/e)
|
||||
assert_regexp_fixed_eucjp(Regexp.new(e("a")))
|
||||
assert_regexp_fixed_eucjp(/\xc0\xa1/e)
|
||||
assert_regexp_fixed_eucjp(eval(e(%{/\xc0\xa1/})))
|
||||
assert_regexp_fixed_eucjp(eval(e(%q{/\xc0\xa1/})))
|
||||
|
||||
[/a/e, Regexp.new(e("a"))].each {|r|
|
||||
[/a/e].each {|r|
|
||||
assert_equal(0, r =~ a("a"))
|
||||
assert_equal(0, r =~ e("a"))
|
||||
assert_equal(0, r =~ s("a"))
|
||||
|
@ -169,7 +183,6 @@ class TestM17N < Test::Unit::TestCase
|
|||
|
||||
def test_regexp_sjis
|
||||
assert_regexp_fixed_sjis(/a/s)
|
||||
assert_regexp_fixed_sjis(Regexp.new(s("a")))
|
||||
assert_regexp_fixed_sjis(/\xc0\xa1/s)
|
||||
assert_regexp_fixed_sjis(eval(s(%{/\xc0\xa1/})))
|
||||
assert_regexp_fixed_sjis(eval(s(%q{/\xc0\xa1/})))
|
||||
|
|
|
@ -68,47 +68,74 @@ EOS
|
|||
def test_regexp
|
||||
|
||||
# Compare regexps to regexps
|
||||
assert_equal(/Yukihiro Matsumoto - 松本行弘/,
|
||||
assert_not_equal(/Yukihiro Matsumoto - 松本行弘/,
|
||||
/Yukihiro Matsumoto - \u677E\u672C\u884C\u5F18/)
|
||||
assert_equal(/Yukihiro Matsumoto - 松本行弘/,
|
||||
/Yukihiro Matsumoto - \u{677E 672C 884C 5F18}/)
|
||||
assert_equal(/Matz - まつもと ゆきひろ/,
|
||||
assert_not_equal(/Yukihiro Matsumoto - 松本行弘/,
|
||||
/Yukihiro Matsumoto - \u{677E 672C 884C 5F18}/)
|
||||
assert_not_equal(/Matz - まつもと ゆきひろ/,
|
||||
/Matz - \u307E\u3064\u3082\u3068 \u3086\u304D\u3072\u308D/)
|
||||
assert_equal(/Aoyama Gakuin University - 青山学院大学/,
|
||||
assert_not_equal(/Aoyama Gakuin University - 青山学院大学/,
|
||||
/Aoyama Gakuin University - \u9752\u5C71\u5B66\u9662\u5927\u5B66/)
|
||||
assert_equal(/青山学院大学/, /\u9752\u5C71\u5B66\u9662\u5927\u5B66/)
|
||||
assert_equal(/Martin Dürst/, /Martin D\u00FCrst/)
|
||||
assert_equal(/ü/, /\u00FC/)
|
||||
assert_equal(/Martin Dürst/, /Martin D\u{FC}rst/)
|
||||
assert_equal(/ü/, /\u{FC}/)
|
||||
assert_equal(/ü/, %r{\u{FC}})
|
||||
assert_equal(/ü/i, %r{\u00FC}i)
|
||||
assert_not_equal(/青山学院大学/, /\u9752\u5C71\u5B66\u9662\u5927\u5B66/)
|
||||
assert_not_equal(/Martin Dürst/, /Martin D\u00FCrst/)
|
||||
assert_not_equal(/ü/, /\u00FC/)
|
||||
assert_not_equal(/Martin Dürst/, /Martin D\u{FC}rst/)
|
||||
assert_not_equal(/ü/, /\u{FC}/)
|
||||
assert_not_equal(/ü/, %r{\u{FC}})
|
||||
assert_not_equal(/ü/i, %r{\u00FC}i)
|
||||
|
||||
assert_equal('Yukihiro Matsumoto - \u677E\u672C\u884C\u5F18',
|
||||
/Yukihiro Matsumoto - \u677E\u672C\u884C\u5F18/.source)
|
||||
assert_equal('Yukihiro Matsumoto - \u{677E 672C 884C 5F18}',
|
||||
/Yukihiro Matsumoto - \u{677E 672C 884C 5F18}/.source)
|
||||
assert_equal('Matz - \u307E\u3064\u3082\u3068 \u3086\u304D\u3072\u308D',
|
||||
/Matz - \u307E\u3064\u3082\u3068 \u3086\u304D\u3072\u308D/.source)
|
||||
assert_equal('Aoyama Gakuin University - \u9752\u5C71\u5B66\u9662\u5927\u5B66',
|
||||
/Aoyama Gakuin University - \u9752\u5C71\u5B66\u9662\u5927\u5B66/.source)
|
||||
assert_equal('\u9752\u5C71\u5B66\u9662\u5927\u5B66',
|
||||
/\u9752\u5C71\u5B66\u9662\u5927\u5B66/.source)
|
||||
assert_equal('Martin D\u00FCrst', /Martin D\u00FCrst/.source)
|
||||
assert_equal('\u00FC', /\u00FC/.source)
|
||||
assert_equal('Martin D\u{FC}rst', /Martin D\u{FC}rst/.source)
|
||||
assert_equal('\u{FC}', /\u{FC}/.source)
|
||||
assert_equal('\u{FC}', %r{\u{FC}}.source)
|
||||
assert_equal('\u00FC', %r{\u00FC}i.source)
|
||||
|
||||
# match strings to regexps
|
||||
assert_equal("Yukihiro Matsumoto - 松本行弘" =~ /Yukihiro Matsumoto - \u677E\u672C\u884C\u5F18/, 0)
|
||||
assert_equal("Yukihiro Matsumoto - \u677E\u672C\u884C\u5F18" =~ /Yukihiro Matsumoto - \u677E\u672C\u884C/, 0)
|
||||
assert_equal("Yukihiro Matsumoto - 松本行弘" =~ /Yukihiro Matsumoto - \u{677E 672C 884C 5F18}/, 0)
|
||||
assert_equal(%Q{Yukihiro Matsumoto - \u{677E 672C 884C 5F18}} =~ /Yukihiro Matsumoto - \u{677E 672C 884C 5F18}/, 0)
|
||||
assert_equal("Matz - まつもと ゆきひろ" =~ /Matz - \u307E\u3064\u3082\u3068 \u3086\u304D\u3072\u308D/, 0)
|
||||
assert_equal("Aoyama Gakuin University - 青山学院大学" =~ /Aoyama Gakuin University - \u9752\u5C71\u5B66\u9662\u5927\u5B66/, 0)
|
||||
assert_equal("青山学院大学" =~ /\u9752\u5C71\u5B66\u9662\u5927\u5B66/, 0)
|
||||
assert_equal("Martin Dürst" =~ /Martin D\u00FCrst/, 0)
|
||||
assert_equal("ü" =~ /\u00FC/, 0)
|
||||
assert_equal("Martin Dürst" =~ /Martin D\u{FC}rst/, 0)
|
||||
assert_equal("ü" =~ %r{\u{FC}}, 0)
|
||||
assert_equal("ü" =~ %r{\u00FC}i, 0)
|
||||
assert_equal(0, "Yukihiro Matsumoto - 松本行弘" =~ /Yukihiro Matsumoto - \u677E\u672C\u884C\u5F18/)
|
||||
assert_equal(0, "Yukihiro Matsumoto - \u677E\u672C\u884C\u5F18" =~ /Yukihiro Matsumoto - \u677E\u672C\u884C/)
|
||||
assert_equal(0, "Yukihiro Matsumoto - 松本行弘" =~ /Yukihiro Matsumoto - \u{677E 672C 884C 5F18}/)
|
||||
assert_equal(0, %Q{Yukihiro Matsumoto - \u{677E 672C 884C 5F18}} =~ /Yukihiro Matsumoto - \u{677E 672C 884C 5F18}/)
|
||||
assert_equal(0, "Matz - まつもと ゆきひろ" =~ /Matz - \u307E\u3064\u3082\u3068 \u3086\u304D\u3072\u308D/)
|
||||
assert_equal(0, "Aoyama Gakuin University - 青山学院大学" =~ /Aoyama Gakuin University - \u9752\u5C71\u5B66\u9662\u5927\u5B66/)
|
||||
assert_equal(0, "青山学院大学" =~ /\u9752\u5C71\u5B66\u9662\u5927\u5B66/)
|
||||
assert_equal(0, "Martin Dürst" =~ /Martin D\u00FCrst/)
|
||||
assert_equal(0, "ü" =~ /\u00FC/)
|
||||
assert_equal(0, "Martin Dürst" =~ /Martin D\u{FC}rst/)
|
||||
assert_equal(0, "ü" =~ %r{\u{FC}})
|
||||
assert_equal(0, "ü" =~ %r{\u00FC}i)
|
||||
|
||||
# Flip order of the two operands
|
||||
assert_equal(/Martin D\u00FCrst/ =~ "Martin Dürst", 0)
|
||||
assert_equal(/\u00FC/ =~ "testü", 4)
|
||||
assert_equal(/Martin D\u{FC}rst/ =~ "fooMartin Dürstbar", 3)
|
||||
assert_equal(%r{\u{FC}} =~ "fooübar", 3)
|
||||
assert_equal(0, /Martin D\u00FCrst/ =~ "Martin Dürst")
|
||||
assert_equal(4, /\u00FC/ =~ "testü")
|
||||
assert_equal(3, /Martin D\u{FC}rst/ =~ "fooMartin Dürstbar")
|
||||
assert_equal(3, %r{\u{FC}} =~ "fooübar")
|
||||
|
||||
# Put \u in strings, literal character in regexp
|
||||
assert_equal("Martin D\u00FCrst" =~ /Martin Dürst/, 0)
|
||||
assert_equal("test\u00FC" =~ /ü/, 4)
|
||||
assert_equal("fooMartin D\u{FC}rstbar" =~ /Martin Dürst/, 3)
|
||||
assert_equal(%Q{foo\u{FC}bar} =~ %r<ü>, 3)
|
||||
assert_equal(0, "Martin D\u00FCrst" =~ /Martin Dürst/)
|
||||
assert_equal(4, "test\u00FC" =~ /ü/)
|
||||
assert_equal(3, "fooMartin D\u{FC}rstbar" =~ /Martin Dürst/)
|
||||
assert_equal(3, %Q{foo\u{FC}bar} =~ %r<ü>)
|
||||
|
||||
assert_match(eval('/\u{2a}/'), "*")
|
||||
assert_raise(SyntaxError) { eval('/\u{6666}/n') }
|
||||
assert_raise(SyntaxError) { eval('/\u{6666}/e') }
|
||||
assert_raise(SyntaxError) { eval('/\u{6666}/s') }
|
||||
assert_nothing_raised { eval('/\u{6666}/u') }
|
||||
end
|
||||
|
||||
def test_dynamic_regexp
  # A regexp built at run time from a string containing a literal \u{FC}
  # escape must match the corresponding character.
  pattern = Regexp.new("Martin D\\u{FC}rst")
  assert_match(pattern, "Martin Dürst")
end
|
||||
|
||||
def test_syntax_variants
|
||||
|
|
Loading…
Reference in a new issue