diff --git a/ChangeLog b/ChangeLog index 84f9c1e43b..cce0ff4ccd 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,31 @@ +Sun Dec 2 01:39:51 2007 Tanaka Akira + + * include/ruby/intern.h (rb_uv_to_utf8): declared. + + * re.c (rb_reg_preprocess): new function for dynamic regexp with + \u{} such as Regexp.new("\\u{6666}"). + (rb_reg_prepare_re): preprocess regexp for recompiling. + (read_escaped_byte): new function. + (unescape_escaped_nonascii): new function. + (append_utf8): new function. + (unescape_unicode_list): new function. + (unescape_unicode_bmp): new function. + (unescape_nonascii): new function. + (rb_reg_initialize): preprocess regexp. + + * pack.c (rb_uv_to_utf8): renamed from uv_to_utf8. + + * parse.y (STR_NEW3): take func instead of has8 and hasmb. + (parser_str_new): use default coderange mechanism except for regexp. + (parser_tokadd_utf8): copy regexp source as-is. + (parser_read_escape): UTF-8 stuff removed. + (parser_tokadd_escape): has8bit and hasmb removed. + (parser_tokadd_string): fix 8-bit single byte character with \u. + (parser_parse_string): has8bit and hasmb removed. + (parser_here_document): has8bit and hasmb removed. + (parser_yylex): call parser_tokadd_utf8 instead of read_escape for + UTF-8 character. 
+ Wed Dec 2 01:00:07 2007 James Edward Gray II * lib/xmlrpc/server.rb (XMLRPC::Server#server): Improve signal handling so diff --git a/include/ruby/intern.h b/include/ruby/intern.h index a43d4d31ab..42f61a91bc 100644 --- a/include/ruby/intern.h +++ b/include/ruby/intern.h @@ -101,6 +101,7 @@ unsigned LONG_LONG rb_big2ull(VALUE); #endif /* HAVE_LONG_LONG */ void rb_quad_pack(char*,VALUE); VALUE rb_quad_unpack(const char*,int); +int rb_uv_to_utf8(char[6],unsigned long); VALUE rb_dbl2big(double); double rb_big2dbl(VALUE); VALUE rb_big_cmp(VALUE, VALUE); diff --git a/pack.c b/pack.c index 4cab476bd2..b1d7268850 100644 --- a/pack.c +++ b/pack.c @@ -365,7 +365,6 @@ static const char toofew[] = "too few arguments"; static void encodes(VALUE,const char*,long,int); static void qpencode(VALUE,VALUE,long); -static int uv_to_utf8(char*,unsigned long); static unsigned long utf8_to_uv(const char*,long*); /* @@ -872,7 +871,7 @@ pack_pack(VALUE ary, VALUE fmt) if (l < 0) { rb_raise(rb_eRangeError, "pack(U): value out of range"); } - le = uv_to_utf8(buf, l); + le = rb_uv_to_utf8(buf, l); rb_str_buf_cat(res, (char*)buf, le); } break; @@ -1991,8 +1990,8 @@ pack_unpack(VALUE str, VALUE fmt) #define BYTEWIDTH 8 -static int -uv_to_utf8(char *buf, unsigned long uv) +int +rb_uv_to_utf8(char buf[6], unsigned long uv) { if (uv <= 0x7f) { buf[0] = (char)uv; diff --git a/parse.y b/parse.y index 7f5af4e4e6..264a3d2d63 100644 --- a/parse.y +++ b/parse.y @@ -269,7 +269,7 @@ struct parser_params { #define STR_NEW(p,n) rb_enc_str_new((p),(n),parser->enc) #define STR_NEW0() rb_str_new(0,0) #define STR_NEW2(p) rb_enc_str_new((p),strlen(p),parser->enc) -#define STR_NEW3(p,n,e,has8,hasmb) parser_str_new2((p),(n),(e),(has8),(hasmb)) +#define STR_NEW3(p,n,e,func) parser_str_new((p),(n),(e),(func)) #define STR_ENC(m) ((m)?parser->enc:rb_enc_from_index(0)) #define ENC_SINGLE(cr) ((cr)==ENC_CODERANGE_7BIT) #define TOK_INTERN(mb) rb_intern3(tok(), toklen(), STR_ENC(mb)) @@ -4488,7 +4488,7 @@ none : /* none 
*/ # define yylval (*((YYSTYPE*)(parser->parser_yylval))) static int parser_regx_options(struct parser_params*); -static int parser_tokadd_string(struct parser_params*,int,int,int,long*,int*,int*,rb_encoding**); +static int parser_tokadd_string(struct parser_params*,int,int,int,long*,rb_encoding**); static void parser_tokaddmbc(struct parser_params *parser, int c, rb_encoding *enc); static int parser_parse_string(struct parser_params*,NODE*); static int parser_here_document(struct parser_params*,NODE*); @@ -4500,11 +4500,10 @@ static int parser_here_document(struct parser_params*,NODE*); # define tokspace(n) parser_tokspace(parser, n) # define tokadd(c) parser_tokadd(parser, c) # define tok_hex(numlen) parser_tok_hex(parser, numlen) -# define tok_utf8(numlen,e) parser_tok_utf8(parser, numlen, e) -# define read_escape(flags,has8,hasmb,e) parser_read_escape(parser, flags, has8, hasmb, e) -# define tokadd_escape(t,has8,hasmb,e) parser_tokadd_escape(parser, t, has8,hasmb, e) +# define read_escape(flags,e) parser_read_escape(parser, flags, e) +# define tokadd_escape(t,e) parser_tokadd_escape(parser, t, e) # define regx_options() parser_regx_options(parser) -# define tokadd_string(f,t,p,n,has8bit,hasmb,e) parser_tokadd_string(parser,f,t,p,n,has8bit,hasmb,e) +# define tokadd_string(f,t,p,n,e) parser_tokadd_string(parser,f,t,p,n,e) # define parse_string(n) parser_parse_string(parser,n) # define tokaddmbc(c, enc) parser_tokaddmbc(parser, c, enc) # define here_document(n) parser_here_document(parser,n) @@ -4821,37 +4820,39 @@ rb_parser_compile_file(volatile VALUE vparser, const char *f, VALUE file, int st } #endif /* !RIPPER */ +#define STR_FUNC_ESCAPE 0x01 +#define STR_FUNC_EXPAND 0x02 +#define STR_FUNC_REGEXP 0x04 +#define STR_FUNC_QWORDS 0x08 +#define STR_FUNC_SYMBOL 0x10 +#define STR_FUNC_INDENT 0x20 + +enum string_type { + str_squote = (0), + str_dquote = (STR_FUNC_EXPAND), + str_xquote = (STR_FUNC_EXPAND), + str_regexp = 
(STR_FUNC_REGEXP|STR_FUNC_ESCAPE|STR_FUNC_EXPAND), + str_sword = (STR_FUNC_QWORDS), + str_dword = (STR_FUNC_QWORDS|STR_FUNC_EXPAND), + str_ssym = (STR_FUNC_SYMBOL), + str_dsym = (STR_FUNC_SYMBOL|STR_FUNC_EXPAND), +}; + static VALUE -parser_str_new(const char *p, long n, rb_encoding *enc, int coderange) +parser_str_new(const char *p, long n, rb_encoding *enc, int func) { - VALUE str = rb_enc_str_new(p, n, enc); - ENC_CODERANGE_SET(str, coderange); + VALUE str; + + str = rb_enc_str_new(p, n, enc); + if (!(func & STR_FUNC_REGEXP) && + rb_enc_asciicompat(enc) && + rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT) { + rb_enc_associate(str, rb_default_encoding()); + } + return str; } -static VALUE -parser_str_new2(const char *p, long n, rb_encoding *enc, int has8bit,int hasmb) -{ - /* - * Set coderange bit flags based on the presence of 8-bit and - * multi-byte characters in the string - */ - int coderange = ENC_CODERANGE_7BIT; - if (hasmb) coderange = ENC_CODERANGE_8BIT; - else if (has8bit) coderange = ENC_CODERANGE_UNKNOWN; - - /* - * If it is all single byte characters with the 8th bit clear, - * and if the specified encoding is ASCII-compatible, then this - * string is in the ASCII subset, and we just use the ASCII encoding - * instead. 
- */ - if ((coderange == ENC_CODERANGE_7BIT) && rb_enc_asciicompat(enc)) - enc = rb_default_encoding(); - - return parser_str_new(p, n, enc, coderange); -} - static inline int parser_nextc(struct parser_params *parser) { @@ -4979,9 +4980,11 @@ parser_tok_hex(struct parser_params *parser, int *numlen) return c; } +#define tokcopy(n) memcpy(tokspace(n), lex_p - (n), (n)) + static int -parser_tokadd_utf8(struct parser_params *parser, int *hasmb, - rb_encoding **encp, int string_literal, int symbol_literal) +parser_tokadd_utf8(struct parser_params *parser, rb_encoding **encp, + int string_literal, int symbol_literal, int regexp_literal) { /* * If string_literal is true, then we allow multiple codepoints @@ -4993,8 +4996,11 @@ parser_tokadd_utf8(struct parser_params *parser, int *hasmb, int codepoint; int numlen; + if (regexp_literal) { tokadd('\\'); tokadd('u'); } + if (peek('{')) { /* handle \u{...} form */ do { + if (regexp_literal) { tokadd(*lex_p); } nextc(); codepoint = scan_hex(lex_p, 6, &numlen); if (numlen == 0) { @@ -5006,8 +5012,10 @@ parser_tokadd_utf8(struct parser_params *parser, int *hasmb, return 0; } lex_p += numlen; - if (codepoint >= 0x80) { - *hasmb = 1; + if (regexp_literal) { + tokcopy(numlen); + } + else if (codepoint >= 0x80) { *encp = UTF8_ENC(); if (string_literal) tokaddmbc(codepoint, *encp); } @@ -5026,6 +5034,7 @@ parser_tokadd_utf8(struct parser_params *parser, int *hasmb, return 0; } + if (regexp_literal) { tokadd('}'); } nextc(); } else { /* handle \uxxxx form */ @@ -5035,8 +5044,10 @@ parser_tokadd_utf8(struct parser_params *parser, int *hasmb, return 0; } lex_p += 4; - if (codepoint >= 0x80) { - *hasmb = 1; + if (regexp_literal) { + tokcopy(4); + } + else if (codepoint >= 0x80) { *encp = UTF8_ENC(); if (string_literal) tokaddmbc(codepoint, *encp); } @@ -5058,7 +5069,7 @@ parser_tokadd_utf8(struct parser_params *parser, int *hasmb, static int parser_read_escape(struct parser_params *parser, int flags, - int *has8bit, int *hasmb, 
rb_encoding **encp) + rb_encoding **encp) { int c; int numlen; @@ -5098,19 +5109,12 @@ parser_read_escape(struct parser_params *parser, int flags, c = scan_oct(lex_p, 3, &numlen); lex_p += numlen; } - if (c >= 0200) *has8bit = 1; return c; case 'x': /* hex constant */ if (flags & (ESCAPE_CONTROL|ESCAPE_META)) goto eof; c = tok_hex(&numlen); if (numlen == 0) return 0; - if (c >= 0x80) *has8bit = 1; - return c; - - case 'u': /* unicode constant: here only for char literal */ - if (flags & (ESCAPE_CONTROL|ESCAPE_META)) goto eof; - c = parser_tokadd_utf8(parser, hasmb, encp, 0, 0); return c; case 'b': /* backspace */ @@ -5126,13 +5130,10 @@ parser_read_escape(struct parser_params *parser, int flags, goto eof; } if ((c = nextc()) == '\\') { - int tmp; - *has8bit = 1; - return read_escape(flags|ESCAPE_META, &tmp, &tmp, encp) | 0x80; + return read_escape(flags|ESCAPE_META, encp) | 0x80; } else if (c == -1 || !ISASCII(c)) goto eof; else { - *has8bit = 1; return ((c & 0xff) | 0x80); } @@ -5144,8 +5145,7 @@ parser_read_escape(struct parser_params *parser, int flags, case 'c': if (flags & ESCAPE_CONTROL) goto eof; if ((c = nextc())== '\\') { - int tmp; - c = read_escape(flags|ESCAPE_CONTROL, has8bit, &tmp, encp); + c = read_escape(flags|ESCAPE_CONTROL, encp); } else if (c == '?') return 0177; @@ -5162,8 +5162,6 @@ parser_read_escape(struct parser_params *parser, int flags, } } -#define tokcopy(n) memcpy(tokspace(n), lex_p - (n), (n)) - static void parser_tokaddmbc(struct parser_params *parser, int c, rb_encoding *enc) { @@ -5173,7 +5171,7 @@ parser_tokaddmbc(struct parser_params *parser, int c, rb_encoding *enc) static int parser_tokadd_escape(struct parser_params *parser, int term, - int *has8bit, int *hasmb, rb_encoding **encp) + rb_encoding **encp) { int c; int flags = 0; @@ -5194,7 +5192,6 @@ parser_tokadd_escape(struct parser_params *parser, int term, if (numlen == 0) goto eof; lex_p += numlen; tokcopy(numlen + 1); - if (oct >= 0200) *has8bit = 1; } return 0; @@ -5207,7 
+5204,6 @@ parser_tokadd_escape(struct parser_params *parser, int term, hex = tok_hex(&numlen); if (numlen == 0) goto eof; tokcopy(numlen + 2); - if (hex >= 0x80) *has8bit = 1; } return 0; @@ -5218,7 +5214,6 @@ parser_tokadd_escape(struct parser_params *parser, int term, goto eof; } tokcopy(3); - *has8bit = 1; flags |= ESCAPE_META; goto escaped; @@ -5287,24 +5282,6 @@ parser_regx_options(struct parser_params *parser) return options | RE_OPTION_ENCODING(kcode); } -#define STR_FUNC_ESCAPE 0x01 -#define STR_FUNC_EXPAND 0x02 -#define STR_FUNC_REGEXP 0x04 -#define STR_FUNC_QWORDS 0x08 -#define STR_FUNC_SYMBOL 0x10 -#define STR_FUNC_INDENT 0x20 - -enum string_type { - str_squote = (0), - str_dquote = (STR_FUNC_EXPAND), - str_xquote = (STR_FUNC_EXPAND), - str_regexp = (STR_FUNC_REGEXP|STR_FUNC_ESCAPE|STR_FUNC_EXPAND), - str_sword = (STR_FUNC_QWORDS), - str_dword = (STR_FUNC_QWORDS|STR_FUNC_EXPAND), - str_ssym = (STR_FUNC_SYMBOL), - str_dsym = (STR_FUNC_SYMBOL|STR_FUNC_EXPAND), -}; - static void dispose_string(VALUE str) { @@ -5328,10 +5305,10 @@ parser_tokadd_mbchar(struct parser_params *parser, int c) static int parser_tokadd_string(struct parser_params *parser, int func, int term, int paren, long *nest, - int *has8bit, int *hasmb, rb_encoding **encp) + rb_encoding **encp) { int c; - int has_mb = 0; + int has_nonascii = 0; rb_encoding *enc = *encp; char *errbuf = 0; static const char mixed_msg[] = "%s mixed within %s source"; @@ -5390,9 +5367,10 @@ parser_tokadd_string(struct parser_params *parser, tokadd('\\'); break; } - parser_tokadd_utf8(parser, hasmb, &enc, 1, - func & STR_FUNC_SYMBOL); - if (has_mb && enc != *encp) { + parser_tokadd_utf8(parser, &enc, 1, + func & STR_FUNC_SYMBOL, + func & STR_FUNC_REGEXP); + if (has_nonascii && enc != *encp) { mixed_escape(beg, enc, *encp); } continue; @@ -5400,28 +5378,17 @@ parser_tokadd_string(struct parser_params *parser, default: if (func & STR_FUNC_REGEXP) { pushback(c); - if ((c = tokadd_escape(term, has8bit, hasmb, &enc)) < 
0) + if ((c = tokadd_escape(term, &enc)) < 0) return -1; - if (has_mb && enc != *encp) { + if (has_nonascii && enc != *encp) { mixed_escape(beg, enc, *encp); } continue; } else if (func & STR_FUNC_EXPAND) { - int tmb = 0; pushback(c); if (func & STR_FUNC_ESCAPE) tokadd('\\'); - c = read_escape(0, has8bit, &tmb, &enc); - if (tmb) { - *hasmb = tmb; - if (has_mb && enc != *encp) { - mixed_escape(beg, enc, *encp); - } - else { - tokaddmbc(c, enc); - } - continue; - } + c = read_escape(0, &enc); } else if ((func & STR_FUNC_QWORDS) && ISSPACE(c)) { /* ignore backslashed spaces in %w */ @@ -5432,13 +5399,12 @@ parser_tokadd_string(struct parser_params *parser, } } else if (parser_ismbchar()) { - has_mb = 1; + has_nonascii = 1; if (enc != *encp) { mixed_error(enc, *encp); continue; } tokadd_mbchar(c); - if (hasmb) *hasmb = 1; continue; } else if ((func & STR_FUNC_QWORDS) && ISSPACE(c)) { @@ -5450,6 +5416,13 @@ parser_tokadd_string(struct parser_params *parser, compile_error(PARSER_ARG "symbol cannot contain '\\0'"); continue; } + if (c & 0x80) { + has_nonascii = 1; + if (enc != *encp) { + mixed_error(enc, *encp); + continue; + } + } tokadd(c); } *encp = enc; @@ -5465,7 +5438,7 @@ parser_parse_string(struct parser_params *parser, NODE *quote) int func = quote->nd_func; int term = nd_term(quote); int paren = nd_paren(quote); - int c, space = 0, has8bit=0, hasmb=0; + int c, space = 0; rb_encoding *enc = parser->enc; if (func == -1) return tSTRING_END; @@ -5501,7 +5474,7 @@ parser_parse_string(struct parser_params *parser, NODE *quote) } pushback(c); if (tokadd_string(func, term, paren, &quote->nd_nest, - &has8bit, &hasmb, &enc) == -1) { + &enc) == -1) { ruby_sourceline = nd_line(quote); if (func & STR_FUNC_REGEXP) { compile_error(PARSER_ARG "unterminated regexp meets end of file"); @@ -5514,7 +5487,7 @@ parser_parse_string(struct parser_params *parser, NODE *quote) } tokfix(); - set_yylval_str(STR_NEW3(tok(), toklen(), enc, has8bit, hasmb)); + set_yylval_str(STR_NEW3(tok(), 
toklen(), enc, func)); return tSTRING_CONTENT; } @@ -5678,7 +5651,6 @@ parser_here_document(struct parser_params *parser, NODE *here) } else { /* int mb = ENC_CODERANGE_7BIT, *mbp = &mb;*/ - int has8bit=0, hasmb=0; rb_encoding *enc = parser->enc; newtok(); if (c == '#') { @@ -5695,16 +5667,16 @@ parser_here_document(struct parser_params *parser, NODE *here) do { pushback(c); if ((c = tokadd_string(func, '\n', 0, NULL, - &has8bit, &hasmb, &enc)) == -1) goto error; + &enc)) == -1) goto error; if (c != '\n') { - set_yylval_str(STR_NEW3(tok(), toklen(), enc, has8bit,hasmb)); + set_yylval_str(STR_NEW3(tok(), toklen(), enc, func)); return tSTRING_CONTENT; } tokadd(nextc()); /* if (mbp && mb == ENC_CODERANGE_UNKNOWN) mbp = 0;*/ if ((c = nextc()) == -1) goto error; } while (!whole_match_p(eos, len, indent)); - str = STR_NEW3(tok(), toklen(), enc, has8bit,hasmb); + str = STR_NEW3(tok(), toklen(), enc, func); } heredoc_restore(lex_strterm); lex_strterm = NEW_STRTERM(-1, 0, 0); @@ -5966,7 +5938,6 @@ parser_yylex(struct parser_params *parser) int cmd_state; enum lex_state_e last_state; rb_encoding *enc; - int has8bit = 0, hasmb = 0; int mb; #ifdef RIPPER int fallthru = Qfalse; @@ -6317,26 +6288,33 @@ parser_yylex(struct parser_params *parser) newtok(); enc = parser->enc; if (parser_ismbchar()) { - hasmb = 1; tokadd_mbchar(c); } else if ((rb_enc_isalnum(c, parser->enc) || c == '_') && lex_p < lex_pend && is_identchar(lex_p, lex_pend, parser->enc)) { goto ternary; } - else if (c == '\\' && (c = read_escape(0, &has8bit, &hasmb, &enc)) >= 0x80) { - if (hasmb) { - tokaddmbc(c, enc); - } - else { - tokadd(c); - } - } - else { + else if (c == '\\') { + if (peek('u')) { + nextc(); + c = parser_tokadd_utf8(parser, &enc, 0, 0, 0); + if (0x80 <= c) { + tokaddmbc(c, enc); + } + else { + tokadd(c); + } + } + else { + c = read_escape(0, &enc); + tokadd(c); + } + } + else { tokadd(c); - } + } tokfix(); - set_yylval_str(STR_NEW3(tok(), toklen(), enc, has8bit, hasmb)); + 
set_yylval_str(STR_NEW3(tok(), toklen(), enc, 0)); lex_state = EXPR_ENDARG; return tCHAR; @@ -8481,7 +8459,6 @@ reg_compile_gen(struct parser_params* parser, VALUE str, int options) compile_error(PARSER_ARG "%s", RSTRING_PTR(re)); return Qnil; } - if (str) rb_enc_copy(re, str); return re; } diff --git a/re.c b/re.c index 398e748f3a..78cfd018f5 100644 --- a/re.c +++ b/re.c @@ -12,6 +12,7 @@ #include "ruby/ruby.h" #include "ruby/re.h" #include "ruby/encoding.h" +#include "ruby/util.h" #include "regint.h" #include <ctype.h> @@ -715,6 +716,10 @@ rb_reg_fixed_encoding_p(VALUE re) return Qfalse; } +static VALUE +rb_reg_preprocess(const char *p, const char *end, rb_encoding *enc, + rb_encoding **fixed_enc, onig_errmsg_buffer err); + static void rb_reg_prepare_re(VALUE re, VALUE str) { @@ -740,13 +745,19 @@ rb_reg_prepare_re(VALUE re, VALUE str) OnigErrorInfo einfo; regex_t *reg, *reg2; UChar *pattern; + VALUE unescaped; + rb_encoding *fixed_enc = 0; rb_reg_check(re); reg = RREGEXP(re)->ptr; pattern = ((UChar*)RREGEXP(re)->str); - r = onig_new(&reg2, (UChar* )pattern, - (UChar* )(pattern + RREGEXP(re)->len), + unescaped = rb_reg_preprocess( + RREGEXP(re)->str, RREGEXP(re)->str + RREGEXP(re)->len, enc, + &fixed_enc, err); + + r = onig_new(&reg2, (UChar* )RSTRING_PTR(unescaped), + (UChar* )(RSTRING_PTR(unescaped) + RSTRING_LEN(unescaped)), reg->options, enc, OnigDefaultSyntax, &einfo); if (r) { @@ -756,6 +767,7 @@ rb_reg_prepare_re(VALUE re, VALUE str) RREGEXP(re)->ptr = reg2; onig_free(reg); + RB_GC_GUARD(unescaped); } } @@ -1235,13 +1247,408 @@ match_inspect(VALUE match) VALUE rb_cRegexp; +static int +read_escaped_byte(const char **pp, const char *end, onig_errmsg_buffer err) +{ + const char *p = *pp; + int code; + int meta_prefix = 0, ctrl_prefix = 0; + int len; + int retbyte; + + retbyte = -1; + if (p == end || *p++ != '\\') { + strcpy(err, "too short escaped multibyte character"); + return -1; + } + +again: + if (p == end) { + strcpy(err, "too short escape sequence"); + return -1; + } 
+ switch (*p++) { + case '\\': code = '\\'; break; + case 'n': code = '\n'; break; + case 't': code = '\t'; break; + case 'r': code = '\r'; break; + case 'f': code = '\f'; break; + case 'v': code = '\013'; break; + case 'a': code = '\007'; break; + case 'e': code = '\033'; break; + + /* \OOO */ + case '0': case '1': case '2': case '3': + case '4': case '5': case '6': case '7': + p--; + code = ruby_scan_oct(p, end < p+3 ? end-p : 3, &len); + p += len; + break; + + case 'x': /* \xHH */ + code = ruby_scan_hex(p, end < p+2 ? end-p : 2, &len); + if (len < 1) { + strcpy(err, "invalid hex escape"); + return -1; + } + p += len; + break; + + case 'M': /* \M-X, \M-\C-X, \M-\cX */ + if (meta_prefix) { + strcpy(err, "duplicate meta escape"); + return -1; + } + meta_prefix = 1; + if (p+1 < end && *p++ == '-' && (*p & 0x80) == 0) { + if (*p == '\\') { + p++; + goto again; + } + else { + code = *p++; + break; + } + } + strcpy(err, "too short meta escape"); + return -1; + + case 'C': /* \C-X, \C-\M-X */ + if (p == end || *p++ != '-') { + strcpy(err, "too short control escape"); + return -1; + } + case 'c': /* \cX, \c\M-X */ + if (ctrl_prefix) { + strcpy(err, "duplicate control escape"); + return -1; + } + ctrl_prefix = 1; + if (p < end && (*p & 0x80) == 0) { + if (*p == '\\') { + p++; + goto again; + } + else { + code = *p++; + break; + } + } + strcpy(err, "too short control escape"); + return -1; + + default: + strcpy(err, "unexpected escape sequence"); + return -1; + } + if (code < 0 || 0xff < code) { + strcpy(err, "invalid escape code"); + return -1; + } + + if (ctrl_prefix) + code &= 0x1f; + if (meta_prefix) + code |= 0x80; + + *pp = p; + return code; +} + +static int +unescape_escaped_nonascii(const char **pp, const char *end, rb_encoding *enc, + VALUE buf, rb_encoding **encp, onig_errmsg_buffer err) +{ + const char *p = *pp; + int chmaxlen = rb_enc_mbmaxlen(enc); + char *chbuf = ALLOCA_N(char, chmaxlen); + int chlen = 0; + int byte; + + memset(chbuf, 0, chmaxlen); + + byte = 
read_escaped_byte(&p, end, err); + if (byte == -1) { + return -1; + } + + chbuf[chlen++] = byte; + while (chlen < chmaxlen && chlen != mbclen(chbuf, chbuf+chmaxlen, enc)) { + byte = read_escaped_byte(&p, end, err); + if (byte == -1) { + return -1; + } + chbuf[chlen++] = byte; + } + + if (chlen != mbclen(chbuf, chbuf+chmaxlen, enc)) { + strcpy(err, "invalid multibyte escape"); + return -1; + } + + if (1 < chlen || (chbuf[0] & 0x80)) { + rb_str_buf_cat(buf, chbuf, chlen); + + if (*encp == 0) + *encp = enc; + else if (*encp != enc) { + strcpy(err, "character encodings differ"); + return -1; + } + } + else { + char escbuf[5]; + snprintf(escbuf, sizeof(escbuf), "\\x%02x", chbuf[0]&0xff); + rb_str_buf_cat(buf, escbuf, 4); + } + *pp = p; + return 0; +} + +static int +append_utf8(unsigned long uv, + VALUE buf, rb_encoding **encp, onig_errmsg_buffer err) +{ + if (uv < 0x80) { + char escbuf[5]; + snprintf(escbuf, sizeof(escbuf), "\\x%02x", (int)uv); + rb_str_buf_cat(buf, escbuf, 4); + } + else { + int len; + char utf8buf[6]; + len = rb_uv_to_utf8(utf8buf, uv); + rb_str_buf_cat(buf, utf8buf, len); + + if (*encp == 0) + *encp = rb_enc_find("utf-8"); + else if (*encp != rb_enc_find("utf-8")) { + strcpy(err, "character encodings differ"); + return -1; + } + } + return 0; +} + +static int +unescape_unicode_list(const char **pp, const char *end, + VALUE buf, rb_encoding **encp, onig_errmsg_buffer err) +{ + const char *p = *pp; + int has_unicode = 0; + unsigned long code; + int len; + + while (p < end && ISSPACE(*p)) p++; + + while (1) { + code = ruby_scan_hex(p, end-p, &len); + if (len == 0) + break; + if (6 < len) { /* max 10FFFF */ + strcpy(err, "invalid unicode range"); + return -1; + } + if (0x10ffff < code) { + strcpy(err, "invalid unicode range"); + return -1; + } + p += len; + if (append_utf8(code, buf, encp, err) != 0) + return -1; + has_unicode = 1; + + while (p < end && ISSPACE(*p)) p++; + } + + if (has_unicode == 0) { + strcpy(err, "invalid unicode list"); + return -1; 
+ } + + *pp = p; + + return 0; +} + +static int +unescape_unicode_bmp(const char **pp, const char *end, + VALUE buf, rb_encoding **encp, onig_errmsg_buffer err) +{ + const char *p = *pp; + int len; + unsigned long code; + + if (end < p+4) { + strcpy(err, "invalid unicode escape"); + return -1; + } + code = ruby_scan_hex(p, 4, &len); + if (len != 4) { + strcpy(err, "invalid unicode escape"); + return -1; + } + if (append_utf8(code, buf, encp, err) != 0) + return -1; + *pp = p + 4; + return 0; +} + +static int +unescape_nonascii(const char *p, const char *end, rb_encoding *enc, + VALUE buf, rb_encoding **encp, onig_errmsg_buffer err) +{ + char c; + char smallbuf[2]; + + while (p < end) { + int chlen = mbclen(p, end, enc); + if (1 < chlen || (*p & 0x80)) { + if (end < p + chlen) { + strcpy(err, "too short multibyte character"); + return -1; + } + /* xxx: validate the non-ascii character */ + rb_str_buf_cat(buf, p, chlen); + p += chlen; + if (*encp == 0) + *encp = enc; + else if (*encp != enc) { + strcpy(err, "character encodings differ"); + return -1; + } + continue; + } + + switch (c = *p++) { + case '\\': + if (p == end) { + strcpy(err, "too short escape sequence"); + return -1; + } + switch (c = *p++) { + case '1': case '2': case '3': + case '4': case '5': case '6': case '7': /* \O, \OO, \OOO or backref */ + { + int octlen; + if (ruby_scan_oct(p-1, end-(p-1), &octlen) <= 0177) { + /* backref or 7bit octal. + no need to unescape anyway. + re-escaping may break backref */ + goto escape_asis; + } + } + /* xxx: How about more than 199 subexpressions? 
*/ + + case '0': /* \0, \0O, \0OO */ + + case 'x': /* \xHH */ + case 'c': /* \cX, \c\M-X */ + case 'C': /* \C-X, \C-\M-X */ + case 'M': /* \M-X, \M-\C-X, \M-\cX */ + p = p-2; + if (unescape_escaped_nonascii(&p, end, enc, buf, encp, err) != 0) + return -1; + break; + + case 'u': + if (p == end) { + strcpy(err, "too short escape sequence"); + return -1; + } + if (*p == '{') { + /* \u{H HH HHH HHHH HHHHH HHHHHH ...} */ + p++; + if (unescape_unicode_list(&p, end, buf, encp, err) != 0) + return -1; + if (p == end || *p++ != '}') { + strcpy(err, "invalid unicode list"); + return -1; + } + break; + } + else { + /* \uHHHH */ + if (unescape_unicode_bmp(&p, end, buf, encp, err) != 0) + return -1; + break; + } + + default: /* \n, \\, \d, \9, etc. */ +escape_asis: + smallbuf[0] = '\\'; + smallbuf[1] = c; + rb_str_buf_cat(buf, smallbuf, 2); + break; + } + break; + + default: + rb_str_buf_cat(buf, &c, 1); + break; + } + } + + return 0; +} + +static VALUE +rb_reg_preprocess(const char *p, const char *end, rb_encoding *enc, + rb_encoding **fixed_enc, onig_errmsg_buffer err) +{ + VALUE buf; + + buf = rb_str_buf_new(0); + + *fixed_enc = 0; + if (unescape_nonascii(p, end, enc, buf, fixed_enc, err) != 0) + return Qnil; + + if (*fixed_enc) { /* fixed_enc itself is never NULL here (dereferenced above); tag buf only when unescaping actually fixed an encoding */ + rb_enc_associate(buf, *fixed_enc); + } + + return buf; +} + +#if 0 +static VALUE +rb_reg_preprocess_obj(VALUE str, + rb_encoding **fixed_enc, onig_errmsg_buffer err) +{ + VALUE buf; + char *p, *end; + rb_encoding *enc; + + StringValue(str); + p = RSTRING_PTR(str); + end = p + RSTRING_LEN(str); + enc = rb_enc_get(str); + + buf = rb_reg_preprocess(p, end, enc, fixed_enc, err); + RB_GC_GUARD(str); + return buf; +} + +static VALUE +rb_reg_preprocess_m(VALUE klass, VALUE obj) +{ + rb_encoding *fixed_enc = 0; + onig_errmsg_buffer err; + VALUE str = rb_reg_preprocess_obj(obj, &fixed_enc, err); + if (str == Qnil) + rb_raise(rb_eArgError, "%s", err); + return rb_assoc_new(str, fixed_enc ? 
Qtrue : Qfalse); +} +#endif + static int rb_reg_initialize(VALUE obj, const char *s, int len, rb_encoding *enc, int options, onig_errmsg_buffer err) { struct RRegexp *re = RREGEXP(obj); - int raw8bit; - long i; + VALUE unescaped; + rb_encoding *fixed_enc = 0; if (!OBJ_TAINTED(obj) && rb_safe_level() >= 4) rb_raise(rb_eSecurityError, "Insecure: can't modify regexp"); @@ -1253,33 +1660,38 @@ rb_reg_initialize(VALUE obj, const char *s, int len, rb_encoding *enc, re->ptr = 0; re->str = 0; - raw8bit = 0; - for (i = 0; i < len; i++) { - if (s[i] & 0x80) { - raw8bit = 1; - break; - } + unescaped = rb_reg_preprocess(s, s+len, enc, &fixed_enc, err); + if (unescaped == Qnil) + return -1; + + if (fixed_enc && (options & ARG_ENCODING_FIXED) && fixed_enc != enc) { + strcpy(err, "character encodings differ"); + return -1; } + if (fixed_enc) + enc = fixed_enc; + else if (!(options & ARG_ENCODING_FIXED)) + enc = rb_default_encoding(); + rb_enc_associate((VALUE)re, enc); - if (options & ARG_ENCODING_FIXED || raw8bit) { + if ((options & ARG_ENCODING_FIXED) || fixed_enc) { re->basic.flags |= KCODE_FIXED; } - re->ptr = make_regexp(s, len, enc, options & ARG_REG_OPTION_MASK, err); + re->ptr = make_regexp(RSTRING_PTR(unescaped), RSTRING_LEN(unescaped), enc, + options & ARG_REG_OPTION_MASK, err); if (!re->ptr) return -1; re->str = ALLOC_N(char, len+1); memcpy(re->str, s, len); re->str[len] = '\0'; re->len = len; + RB_GC_GUARD(unescaped); return 0; } static int rb_reg_initialize_str(VALUE obj, VALUE str, int options, onig_errmsg_buffer err) { - if (!rb_enc_str_asciionly_p(str)) { - options |= ARG_ENCODING_FIXED; - } return rb_reg_initialize(obj, RSTRING_PTR(str), RSTRING_LEN(str), rb_enc_get(str), options, err); } @@ -2183,6 +2595,10 @@ Init_Regexp(void) rb_define_singleton_method(rb_cRegexp, "last_match", rb_reg_s_last_match, -1); rb_define_singleton_method(rb_cRegexp, "try_convert", rb_reg_s_try_convert, 1); +#if 0 + rb_define_singleton_method(rb_cRegexp, "preprocess", 
rb_reg_preprocess_m, 1); +#endif + rb_define_method(rb_cRegexp, "initialize", rb_reg_initialize_m, -1); rb_define_method(rb_cRegexp, "initialize_copy", rb_reg_init_copy, 1); rb_define_method(rb_cRegexp, "hash", rb_reg_hash, 0); diff --git a/test/ruby/test_m17n.rb b/test/ruby/test_m17n.rb index fd183967a3..bb0a8a5010 100644 --- a/test/ruby/test_m17n.rb +++ b/test/ruby/test_m17n.rb @@ -25,6 +25,17 @@ class TestM17N < Test::Unit::TestCase assert_encoding("EUC-JP", eval(e(%{"\\x80"})).encoding) end + def test_string_mixed_unicode + assert_raise(SyntaxError) { eval(a(%{"\xc0\xa0\\u{6666}"})) } + assert_raise(SyntaxError) { eval(e(%{"\xc0\xa0\\u{6666}"})) } + assert_raise(SyntaxError) { eval(s(%{"\xc0\xa0\\u{6666}"})) } + assert_nothing_raised { eval(u(%{"\xc0\xa0\\u{6666}"})) } + assert_raise(SyntaxError) { eval(a(%{"\\u{6666}\xc0\xa0"})) } + assert_raise(SyntaxError) { eval(e(%{"\\u{6666}\xc0\xa0"})) } + assert_raise(SyntaxError) { eval(s(%{"\\u{6666}\xc0\xa0"})) } + assert_nothing_raised { eval(u(%{"\\u{6666}\xc0\xa0"})) } + end + def test_regexp_too_short_multibyte_character assert_raise(SyntaxError) { eval('/\xfe/e') } assert_raise(SyntaxError) { eval('/\x8e/e') } @@ -38,11 +49,12 @@ class TestM17N < Test::Unit::TestCase assert_raise(SyntaxError) { eval('/\xfc\x80\x80\x80\x80/u') } # raw 8bit - #assert_raise(SyntaxError) { eval("/\xfe/e") } - #assert_raise(SyntaxError) { eval("/\xc0/u") } + assert_raise(SyntaxError) { eval("/\xfe/e") } + assert_raise(SyntaxError) { eval("/\xc0/u") } # invalid suffix - #assert_raise(SyntaxError) { eval('/\xc0\xff/u') } + assert_raise(SyntaxError) { eval('/\xc0\xff/u') } + assert_raise(SyntaxError) { eval('/\xc0 /u') } #assert_raise(SyntaxError) { eval('/\xc0\x20/u') } end @@ -94,6 +106,9 @@ class TestM17N < Test::Unit::TestCase def test_regexp_generic assert_regexp_generic_ascii(/a/) assert_regexp_generic_ascii(Regexp.new(a("a"))) + assert_regexp_generic_ascii(Regexp.new(e("a"))) + assert_regexp_generic_ascii(Regexp.new(s("a"))) + 
assert_regexp_generic_ascii(Regexp.new(u("a"))) [/a/, Regexp.new(a("a"))].each {|r| assert_equal(0, r =~ a("a")) @@ -112,7 +127,7 @@ class TestM17N < Test::Unit::TestCase assert_regexp_fixed_ascii8bit(/\xc0\xa1/n) assert_regexp_fixed_ascii8bit(eval(a(%{/\xc0\xa1/}))) assert_regexp_fixed_ascii8bit(eval(a(%{/\xc0\xa1/n}))) - # assert_regexp_fixed_ascii8bit(eval(a(%q{/\xc0\xa1/}))) + assert_regexp_fixed_ascii8bit(eval(a(%q{/\xc0\xa1/}))) [/a/n].each {|r| assert_equal(0, r =~ a("a")) @@ -139,12 +154,11 @@ class TestM17N < Test::Unit::TestCase def test_regexp_euc assert_regexp_fixed_eucjp(/a/e) - assert_regexp_fixed_eucjp(Regexp.new(e("a"))) assert_regexp_fixed_eucjp(/\xc0\xa1/e) assert_regexp_fixed_eucjp(eval(e(%{/\xc0\xa1/}))) assert_regexp_fixed_eucjp(eval(e(%q{/\xc0\xa1/}))) - [/a/e, Regexp.new(e("a"))].each {|r| + [/a/e].each {|r| assert_equal(0, r =~ a("a")) assert_equal(0, r =~ e("a")) assert_equal(0, r =~ s("a")) @@ -169,7 +183,6 @@ class TestM17N < Test::Unit::TestCase def test_regexp_sjis assert_regexp_fixed_sjis(/a/s) - assert_regexp_fixed_sjis(Regexp.new(s("a"))) assert_regexp_fixed_sjis(/\xc0\xa1/s) assert_regexp_fixed_sjis(eval(s(%{/\xc0\xa1/}))) assert_regexp_fixed_sjis(eval(s(%q{/\xc0\xa1/}))) diff --git a/test/ruby/test_unicode_escape.rb b/test/ruby/test_unicode_escape.rb index 46413cdcdb..a1800c66e6 100644 --- a/test/ruby/test_unicode_escape.rb +++ b/test/ruby/test_unicode_escape.rb @@ -68,47 +68,74 @@ EOS def test_regexp # Compare regexps to regexps - assert_equal(/Yukihiro Matsumoto - 松本行弘/, + assert_not_equal(/Yukihiro Matsumoto - 松本行弘/, /Yukihiro Matsumoto - \u677E\u672C\u884C\u5F18/) - assert_equal(/Yukihiro Matsumoto - 松本行弘/, - /Yukihiro Matsumoto - \u{677E 672C 884C 5F18}/) - assert_equal(/Matz - まつもと ゆきひろ/, + assert_not_equal(/Yukihiro Matsumoto - 松本行弘/, + /Yukihiro Matsumoto - \u{677E 672C 884C 5F18}/) + assert_not_equal(/Matz - まつもと ゆきひろ/, /Matz - \u307E\u3064\u3082\u3068 \u3086\u304D\u3072\u308D/) - assert_equal(/Aoyama Gakuin University - 
青山学院大学/, + assert_not_equal(/Aoyama Gakuin University - 青山学院大学/, /Aoyama Gakuin University - \u9752\u5C71\u5B66\u9662\u5927\u5B66/) - assert_equal(/青山学院大学/, /\u9752\u5C71\u5B66\u9662\u5927\u5B66/) - assert_equal(/Martin Dürst/, /Martin D\u00FCrst/) - assert_equal(/ü/, /\u00FC/) - assert_equal(/Martin Dürst/, /Martin D\u{FC}rst/) - assert_equal(/ü/, /\u{FC}/) - assert_equal(/ü/, %r{\u{FC}}) - assert_equal(/ü/i, %r{\u00FC}i) + assert_not_equal(/青山学院大学/, /\u9752\u5C71\u5B66\u9662\u5927\u5B66/) + assert_not_equal(/Martin Dürst/, /Martin D\u00FCrst/) + assert_not_equal(/ü/, /\u00FC/) + assert_not_equal(/Martin Dürst/, /Martin D\u{FC}rst/) + assert_not_equal(/ü/, /\u{FC}/) + assert_not_equal(/ü/, %r{\u{FC}}) + assert_not_equal(/ü/i, %r{\u00FC}i) + + assert_equal('Yukihiro Matsumoto - \u677E\u672C\u884C\u5F18', + /Yukihiro Matsumoto - \u677E\u672C\u884C\u5F18/.source) + assert_equal('Yukihiro Matsumoto - \u{677E 672C 884C 5F18}', + /Yukihiro Matsumoto - \u{677E 672C 884C 5F18}/.source) + assert_equal('Matz - \u307E\u3064\u3082\u3068 \u3086\u304D\u3072\u308D', + /Matz - \u307E\u3064\u3082\u3068 \u3086\u304D\u3072\u308D/.source) + assert_equal('Aoyama Gakuin University - \u9752\u5C71\u5B66\u9662\u5927\u5B66', + /Aoyama Gakuin University - \u9752\u5C71\u5B66\u9662\u5927\u5B66/.source) + assert_equal('\u9752\u5C71\u5B66\u9662\u5927\u5B66', + /\u9752\u5C71\u5B66\u9662\u5927\u5B66/.source) + assert_equal('Martin D\u00FCrst', /Martin D\u00FCrst/.source) + assert_equal('\u00FC', /\u00FC/.source) + assert_equal('Martin D\u{FC}rst', /Martin D\u{FC}rst/.source) + assert_equal('\u{FC}', /\u{FC}/.source) + assert_equal('\u{FC}', %r{\u{FC}}.source) + assert_equal('\u00FC', %r{\u00FC}i.source) # match strings to regexps - assert_equal("Yukihiro Matsumoto - 松本行弘" =~ /Yukihiro Matsumoto - \u677E\u672C\u884C\u5F18/, 0) - assert_equal("Yukihiro Matsumoto - \u677E\u672C\u884C\u5F18" =~ /Yukihiro Matsumoto - \u677E\u672C\u884C/, 0) - assert_equal("Yukihiro Matsumoto - 松本行弘" =~ /Yukihiro 
Matsumoto - \u{677E 672C 884C 5F18}/, 0) - assert_equal(%Q{Yukihiro Matsumoto - \u{677E 672C 884C 5F18}} =~ /Yukihiro Matsumoto - \u{677E 672C 884C 5F18}/, 0) - assert_equal("Matz - まつもと ゆきひろ" =~ /Matz - \u307E\u3064\u3082\u3068 \u3086\u304D\u3072\u308D/, 0) - assert_equal("Aoyama Gakuin University - 青山学院大学" =~ /Aoyama Gakuin University - \u9752\u5C71\u5B66\u9662\u5927\u5B66/, 0) - assert_equal("青山学院大学" =~ /\u9752\u5C71\u5B66\u9662\u5927\u5B66/, 0) - assert_equal("Martin Dürst" =~ /Martin D\u00FCrst/, 0) - assert_equal("ü" =~ /\u00FC/, 0) - assert_equal("Martin Dürst" =~ /Martin D\u{FC}rst/, 0) - assert_equal("ü" =~ %r{\u{FC}}, 0) - assert_equal("ü" =~ %r{\u00FC}i, 0) + assert_equal(0, "Yukihiro Matsumoto - 松本行弘" =~ /Yukihiro Matsumoto - \u677E\u672C\u884C\u5F18/) + assert_equal(0, "Yukihiro Matsumoto - \u677E\u672C\u884C\u5F18" =~ /Yukihiro Matsumoto - \u677E\u672C\u884C/) + assert_equal(0, "Yukihiro Matsumoto - 松本行弘" =~ /Yukihiro Matsumoto - \u{677E 672C 884C 5F18}/) + assert_equal(0, %Q{Yukihiro Matsumoto - \u{677E 672C 884C 5F18}} =~ /Yukihiro Matsumoto - \u{677E 672C 884C 5F18}/) + assert_equal(0, "Matz - まつもと ゆきひろ" =~ /Matz - \u307E\u3064\u3082\u3068 \u3086\u304D\u3072\u308D/) + assert_equal(0, "Aoyama Gakuin University - 青山学院大学" =~ /Aoyama Gakuin University - \u9752\u5C71\u5B66\u9662\u5927\u5B66/) + assert_equal(0, "青山学院大学" =~ /\u9752\u5C71\u5B66\u9662\u5927\u5B66/) + assert_equal(0, "Martin Dürst" =~ /Martin D\u00FCrst/) + assert_equal(0, "ü" =~ /\u00FC/) + assert_equal(0, "Martin Dürst" =~ /Martin D\u{FC}rst/) + assert_equal(0, "ü" =~ %r{\u{FC}}) + assert_equal(0, "ü" =~ %r{\u00FC}i) # Flip order of the two operands - assert_equal(/Martin D\u00FCrst/ =~ "Martin Dürst", 0) - assert_equal(/\u00FC/ =~ "testü", 4) - assert_equal(/Martin D\u{FC}rst/ =~ "fooMartin Dürstbar", 3) - assert_equal(%r{\u{FC}} =~ "fooübar", 3) + assert_equal(0, /Martin D\u00FCrst/ =~ "Martin Dürst") + assert_equal(4, /\u00FC/ =~ "testü") + assert_equal(3, /Martin D\u{FC}rst/ =~ 
"fooMartin Dürstbar") + assert_equal(3, %r{\u{FC}} =~ "fooübar") # Put \u in strings, literal character in regexp - assert_equal("Martin D\u00FCrst" =~ /Martin Dürst/, 0) - assert_equal("test\u00FC" =~ /ü/, 4) - assert_equal("fooMartin D\u{FC}rstbar" =~ /Martin Dürst/, 3) - assert_equal(%Q{foo\u{FC}bar} =~ %r<ü>, 3) + assert_equal(0, "Martin D\u00FCrst" =~ /Martin Dürst/) + assert_equal(4, "test\u00FC" =~ /ü/) + assert_equal(3, "fooMartin D\u{FC}rstbar" =~ /Martin Dürst/) + assert_equal(3, %Q{foo\u{FC}bar} =~ %r<ü>) + + assert_match(eval('/\u{2a}/'), "*") + assert_raise(SyntaxError) { eval('/\u{6666}/n') } + assert_raise(SyntaxError) { eval('/\u{6666}/e') } + assert_raise(SyntaxError) { eval('/\u{6666}/s') } + assert_nothing_raised { eval('/\u{6666}/u') } + end + + def test_dynamic_regexp + assert_match(Regexp.new("Martin D\\u{FC}rst"), "Martin Dürst") end def test_syntax_variants