From c7697aba346c6928164ef53151f3e1090a1176e8 Mon Sep 17 00:00:00 2001 From: nobu Date: Fri, 19 Oct 2007 07:41:03 +0000 Subject: [PATCH] * parse.y (parser_regx_options, reg_compile_gen): relaxed encoding matching rule. * re.c (rb_reg_initialize): always set encoding of Regexp. * re.c (rb_reg_initialize_str): fix encoding for non 7bit-clean strings. * re.c (rb_reg_initialize_m): use ascii encoding for 'n' option. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@13743 b2dd03c8-39d4-4d8f-98ff-823fe69b080e --- ChangeLog | 12 ++++++++++++ parse.y | 40 ++++++++++++++++++++++++++------------- re.c | 56 +++++++++++++++++++++++++++++++------------------------ 3 files changed, 71 insertions(+), 37 deletions(-) diff --git a/ChangeLog b/ChangeLog index 53091f0c2a..8f2ea9cb4d 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,15 @@ +Fri Oct 19 16:41:00 2007 Nobuyoshi Nakada + + * parse.y (parser_regx_options, reg_compile_gen): relaxed encoding + matching rule. + + * re.c (rb_reg_initialize): always set encoding of Regexp. + + * re.c (rb_reg_initialize_str): fix encoding for non 7bit-clean + strings. + + * re.c (rb_reg_initialize_m): use ascii encoding for 'n' option. 
+ Fri Oct 19 11:09:56 2007 Nobuyoshi Nakada * ruby.c (process_options): set primary encoding from the parser diff --git a/parse.y b/parse.y index a6e8637f8e..8653fed1d8 100644 --- a/parse.y +++ b/parse.y @@ -261,7 +261,7 @@ struct parser_params { }; #define STR_NEW(p,n) rb_enc_str_new((p),(n),parser->enc) -#define STR_NEW0() rb_enc_str_new(0,0,rb_enc_from_index(0)) +#define STR_NEW0() rb_str_new(0,0) #define STR_NEW2(p) rb_enc_str_new((p),strlen(p),parser->enc) #define STR_NEW3(p,n,m) parser_str_new((p),(n),STR_ENC(!ENC_SINGLE(m)),(m)) #define STR_ENC(m) ((m)?parser->enc:rb_enc_from_index(0)) @@ -443,6 +443,10 @@ static int lvar_defined_gen(struct parser_params*, ID); #define lvar_defined(id) lvar_defined_gen(parser, id) #define RE_OPTION_ONCE (1<<16) +#define RE_OPTION_ENCODING_SHIFT 8 +#define RE_OPTION_ENCODING(e) (((e)&0xff)<<RE_OPTION_ENCODING_SHIFT) +#define RE_OPTION_ENCODING_IDX(o) (((o)>>RE_OPTION_ENCODING_SHIFT)&0xff) +#define RE_OPTION_MASK 0xff #define NODE_STRTERM NODE_ZARRAY /* nothing to gc */ #define NODE_HEREDOC NODE_ARRAY /* 1, 3 to gc */ @@ -3639,14 +3643,14 @@ regexp : tREGEXP_BEG xstring_contents tREGEXP_END int options = $3; NODE *node = $2; if (!node) { - node = NEW_LIT(reg_compile(0, options & ~RE_OPTION_ONCE)); + node = NEW_LIT(reg_compile(STR_NEW0(), options)); } else switch (nd_type(node)) { case NODE_STR: { VALUE src = node->nd_lit; nd_set_type(node, NODE_LIT); - node->nd_lit = reg_compile(src, options&~RE_OPTION_ONCE); + node->nd_lit = reg_compile(src, options); } break; default: @@ -3658,7 +3662,7 @@ regexp : tREGEXP_BEG xstring_contents tREGEXP_END else { nd_set_type(node, NODE_DREGX); } - node->nd_cflag = options & ~RE_OPTION_ONCE; + node->nd_cflag = options & RE_OPTION_MASK; break; } $$ = node; @@ -5110,11 +5114,12 @@ parser_tokadd_escape(struct parser_params *parser, int term, int *mb) return 0; } +extern int rb_char_to_option_kcode(int c, int *option, int *kcode); + static int parser_regx_options(struct parser_params *parser) { - extern int rb_char_to_option_kcode(int c, int *option, int *kcode); 
- + int kcode = 0; int options = 0; int c, opt, kc; @@ -5125,11 +5130,7 @@ parser_regx_options(struct parser_params *parser) } else if (rb_char_to_option_kcode(c, &opt, &kc)) { options |= opt; - if (kc != 0 && rb_enc_from_index(kc) != parser->enc) { - compile_error(PARSER_ARG - "regexp encoding option '%c' mismatch to %s", - c, rb_enc_name(parser->enc)); - } + if (kc >= 0) kcode = c; } else { tokadd(c); @@ -5141,7 +5142,7 @@ parser_regx_options(struct parser_params *parser) compile_error(PARSER_ARG "unknown regexp option%s - %s", toklen() > 1 ? "s" : "", tok()); } - return options; + return options | RE_OPTION_ENCODING(kcode); } #define STR_FUNC_ESCAPE 0x01 @@ -8212,8 +8213,21 @@ VALUE rb_reg_compile(VALUE str, int options); static VALUE reg_compile_gen(struct parser_params* parser, VALUE str, int options) { - VALUE re = rb_reg_compile(str, (options) & ~RE_OPTION_ONCE); + VALUE re; + int c = RE_OPTION_ENCODING_IDX(options); + if (c) { + int opt, idx; + rb_char_to_option_kcode(c, &opt, &idx); + if (idx != ENCODING_GET(str) && ENCODING_GET(str) && + rb_enc_str_coderange(str) != ENC_CODERANGE_SINGLE) { + compile_error(PARSER_ARG + "regexp encoding option '%c' differs from source encoding '%s'", + c, rb_enc_name(rb_enc_get(str))); + } + ENCODING_SET(str, idx); + } + re = rb_reg_compile(str, options & RE_OPTION_MASK); if (NIL_P(re)) { RB_GC_GUARD(re) = rb_obj_as_string(rb_errinfo()); compile_error(PARSER_ARG "%s", RSTRING_PTR(re)); diff --git a/re.c b/re.c index 9501595bfb..8acf438cb2 100644 --- a/re.c +++ b/re.c @@ -136,8 +136,11 @@ rb_memsearch(const void *x0, long m, const void *y0, long n) #define KCODE_FIXED FL_USER4 -#define ARG_REG_OPTION_MASK 0x0f -#define ARG_KCODE_NONE 0x10 +#define ARG_REG_OPTION_MASK \ + (ONIG_OPTION_IGNORECASE|ONIG_OPTION_MULTILINE|ONIG_OPTION_EXTEND) +#define ARG_ENCODING_FIXED 16 + +#define ARG_KCODE_NONE 0 #define ARG_KCODE_EUC 1 #define ARG_KCODE_SJIS 2 #define ARG_KCODE_UTF8 3 @@ -157,9 +160,6 @@ char_to_option(int c) case 'm': val = 
ONIG_OPTION_MULTILINE; break; - case 'n': - val = ARG_KCODE_NONE; - break; default: val = 0; break; @@ -184,19 +184,24 @@ rb_char_to_option_kcode(int c, int *option, int *kcode) *option = 0; switch (c) { + case 'n': + *kcode = ARG_KCODE_NONE; + break; case 'e': *kcode = ARG_KCODE_EUC; - return 1; + break; case 's': *kcode = ARG_KCODE_SJIS; - return 1; + break; case 'u': *kcode = ARG_KCODE_UTF8; - return 1; + break; default: - *kcode = 0; + *kcode = -1; return (*option = char_to_option(c)); } + *option = ARG_ENCODING_FIXED; + return 1; } static void @@ -1227,14 +1232,10 @@ rb_reg_initialize(VALUE obj, const char *s, int len, rb_encoding *enc, re->ptr = 0; re->str = 0; - if (options & ARG_KCODE_NONE) { - rb_enc_associate_index((VALUE)re, 0); - enc = rb_enc_from_index(0); + rb_enc_associate((VALUE)re, enc); + if (options & ARG_ENCODING_FIXED) { re->basic.flags |= KCODE_FIXED; } - else { - rb_enc_associate((VALUE)re, enc); - } re->ptr = make_regexp(s, len, enc, options & ARG_REG_OPTION_MASK, err); if (!re->ptr) return -1; re->str = ALLOC_N(char, len+1); @@ -1247,6 +1248,9 @@ rb_reg_initialize(VALUE obj, const char *s, int len, rb_encoding *enc, static int rb_reg_initialize_str(VALUE obj, VALUE str, int options, onig_errmsg_buffer err) { + if (rb_enc_str_coderange(str) != ENC_CODERANGE_SINGLE) { + options |= ARG_ENCODING_FIXED; + } return rb_reg_initialize(obj, RSTRING_PTR(str), RSTRING_LEN(str), rb_enc_get(str), options, err); } @@ -1573,21 +1577,21 @@ rb_reg_initialize_m(int argc, VALUE *argv, VALUE self) onig_errmsg_buffer err; int flags = 0; VALUE str; + rb_encoding *enc; + const char *ptr; + long len; if (argc == 0 || argc > 3) { rb_raise(rb_eArgError, "wrong number of arguments"); } if (TYPE(argv[0]) == T_REGEXP) { VALUE re = argv[0]; - const char *ptr; - long len; - rb_encoding *enc; if (argc > 1) { rb_warn("flags ignored"); } rb_reg_check(re); - flags = RREGEXP(argv[0])->ptr->options & ARG_REG_OPTION_MASK; + flags = rb_reg_options(re); ptr = RREGEXP(re)->str; 
len = RREGEXP(re)->len; enc = rb_enc_get(re); @@ -1601,18 +1605,22 @@ rb_reg_initialize_m(int argc, VALUE *argv, VALUE self) if (FIXNUM_P(argv[1])) flags = FIX2INT(argv[1]); else if (RTEST(argv[1])) flags = ONIG_OPTION_IGNORECASE; } + enc = 0; if (argc == 3 && !NIL_P(argv[2])) { char *kcode = StringValuePtr(argv[2]); if (kcode[0] == 'n' || kcode[1] == 'N') { - flags |= ARG_KCODE_NONE; + enc = rb_enc_from_index(0); + flags |= ARG_ENCODING_FIXED; } else { rb_warning("encoding option is obsolete - %s", kcode); } } str = argv[0]; - StringValueCStr(str); - if (rb_reg_initialize_str(self, str, flags, err)) { + ptr = StringValueCStr(str); + if (enc + ? rb_reg_initialize(self, ptr, RSTRING_LEN(str), enc, flags, err) + : rb_reg_initialize_str(self, str, flags, err)) { rb_reg_raise_str(str, flags, err); } } @@ -1731,8 +1739,8 @@ rb_reg_options(VALUE re) int options; rb_reg_check(re); - options = RREGEXP(re)->ptr->options & - (ONIG_OPTION_IGNORECASE|ONIG_OPTION_MULTILINE|ONIG_OPTION_EXTEND); + options = RREGEXP(re)->ptr->options & ARG_REG_OPTION_MASK; + if (RBASIC(re)->flags & KCODE_FIXED) options |= ARG_ENCODING_FIXED; return options; }