From 94a0db11e7f286b9d8432814dcc8b857ce129481 Mon Sep 17 00:00:00 2001 From: nobu Date: Wed, 26 Sep 2007 09:39:08 +0000 Subject: [PATCH] * encoding.c (rb_enc_check): check for ASCII-compatibilities. * parse.y (parser_tokadd_string, parser_parse_string, parser_here_document, parser_yylex): set encoding to US-ASCII. * parse.y (rb_enc_symname_p): check if valid with encoding. * parse.y (rb_intern3): let symbols have encoding. * string.c (rb_str_hash): add encoding index. * string.c (rb_str_comparable, rb_str_equal, rb_str_eql): check if compatible encoding. * string.c (sym_inspect): made encoding aware. * insns.def (opt_eq): compare with encoding. * include/ruby/encoding.h (rb_enc_asciicompat): check if ASCII compatible. * include/ruby/encoding.h (rb_enc_get_index): added prototype. * include/ruby/intern.h (rb_str_comparable, rb_str_equal): ditto. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@13518 b2dd03c8-39d4-4d8f-98ff-823fe69b080e --- ChangeLog | 27 +++++++++++++ encoding.c | 8 +--- include/ruby/encoding.h | 4 ++ include/ruby/intern.h | 2 + insns.def | 5 ++- parse.y | 86 ++++++++++++++++++++++------------------- string.c | 42 +++++++++++++++++--- vm.c | 1 + 8 files changed, 122 insertions(+), 53 deletions(-) diff --git a/ChangeLog b/ChangeLog index 9a722c5950..6813f34ed2 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,30 @@ +Wed Sep 26 18:38:41 2007 Nobuyoshi Nakada + + * encoding.c (rb_enc_check): check for ASCII-compatibilities. + + * parse.y (parser_tokadd_string, parser_parse_string, + parser_here_document, parser_yylex): set encoding to US-ASCII. + + * parse.y (rb_enc_symname_p): check if valid with encoding. + + * parse.y (rb_intern3): let symbols have encoding. + + * string.c (rb_str_hash): add encoding index. + + * string.c (rb_str_comparable, rb_str_equal, rb_str_eql): check if + compatible encoding. + + * string.c (sym_inspect): made encoding aware. + + * insns.def (opt_eq): compare with encoding. 
+ + * include/ruby/encoding.h (rb_enc_asciicompat): check if ASCII + compatible. + + * include/ruby/encoding.h (rb_enc_get_index): added prototype. + + * include/ruby/intern.h (rb_str_comparable, rb_str_equal): ditto. + Wed Sep 26 15:01:16 2007 Nobuyoshi Nakada * eval_method.ci (rb_get_alloc_func): cast to suppress a warning. diff --git a/encoding.c b/encoding.c index b1c8ba9349..a6ee890ce7 100644 --- a/encoding.c +++ b/encoding.c @@ -194,19 +194,15 @@ rb_enc_check(VALUE str1, VALUE str2) if (idx1 == 0) { enc = rb_enc_from_index(idx2); -#if 0 - if (m17n_asciicompat(enc)) { + if (rb_enc_asciicompat(enc)) { return enc; } -#endif } else if (idx2 == 0) { enc = rb_enc_from_index(idx1); -#if 0 - if (m17n_asciicompat(enc)) { + if (rb_enc_asciicompat(enc)) { return enc; } -#endif } rb_raise(rb_eArgError, "character encodings differ"); } diff --git a/include/ruby/encoding.h b/include/ruby/encoding.h index f43160a682..1b97b6ab64 100644 --- a/include/ruby/encoding.h +++ b/include/ruby/encoding.h @@ -27,6 +27,7 @@ typedef OnigEncodingType rb_encoding; int rb_enc_to_index(rb_encoding*); +int rb_enc_get_index(VALUE obj); rb_encoding* rb_enc_get(VALUE); rb_encoding* rb_enc_check(VALUE,VALUE); void rb_enc_associate(VALUE, rb_encoding*); @@ -73,8 +74,11 @@ int rb_enc_codelen(int, rb_encoding*); #define rb_enc_isspace(c,enc) ONIGENC_IS_CODE_SPACE(enc,c) #define rb_enc_isdigit(c,enc) ONIGENC_IS_CODE_DIGIT(enc,c) +#define rb_enc_asciicompat(enc) (rb_enc_mbminlen(enc)==1) + int rb_enc_toupper(int c, rb_encoding *enc); int rb_enc_tolower(int c, rb_encoding *enc); ID rb_intern3(const char*, long, rb_encoding*); +int rb_enc_symname_p(const char*, rb_encoding*); #endif /* RUBY_ENCODING_H */ diff --git a/include/ruby/intern.h b/include/ruby/intern.h index 7391845e2d..e06e36892f 100644 --- a/include/ruby/intern.h +++ b/include/ruby/intern.h @@ -516,7 +516,9 @@ VALUE rb_str_append(VALUE, VALUE); VALUE rb_str_concat(VALUE, VALUE); int rb_memhash(const void *ptr, long len); int 
rb_str_hash(VALUE); +int rb_str_comparable(VALUE, VALUE); int rb_str_cmp(VALUE, VALUE); +VALUE rb_str_equal(VALUE str1, VALUE str2); void rb_str_update(VALUE, long, long, VALUE); VALUE rb_str_inspect(VALUE); VALUE rb_str_dump(VALUE); diff --git a/insns.def b/insns.def index 1d9ef3d6f2..d31a9656bc 100644 --- a/insns.def +++ b/insns.def @@ -1700,13 +1700,14 @@ opt_eq if (str1 == str2) { val = Qtrue; } - else if (RSTRING_LEN(str1) == RSTRING_LEN(str2) && + else if (!ENCODING_GET(str1) && !ENCODING_GET(str2) && + RSTRING_LEN(str1) == RSTRING_LEN(str2) && rb_memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2), RSTRING_LEN(str1)) == 0) { val = Qtrue; } else { - val = Qfalse; + val = rb_str_equal(str1, str2); } } else { diff --git a/parse.y b/parse.y index 1b28727bea..8db9ce33cc 100644 --- a/parse.y +++ b/parse.y @@ -262,6 +262,8 @@ struct parser_params { #define STR_NEW(p,n) rb_enc_str_new((p),(n),parser->enc) #define STR_NEW2(p) rb_enc_str_new((p),strlen(p),parser->enc) +#define STR_NEW3(p,n,m) rb_enc_str_new((p),(n), STR_ENC(m)) +#define STR_ENC(m) ((m)?parser->enc:rb_enc_from_index(0)) #ifdef YYMALLOC void *rb_parser_malloc(struct parser_params *, size_t); @@ -3886,7 +3888,7 @@ dsym : tSYMBEG xstring_contents tSTRING_END yyerror("empty symbol literal"); break; } - $$->nd_lit = ID2SYM(rb_intern2(RSTRING_PTR(lit), RSTRING_LEN(lit))); + $$->nd_lit = ID2SYM(rb_intern_str(lit)); nd_set_type($$, NODE_LIT); break; default: @@ -4478,7 +4480,7 @@ none : /* none */ # define yylval (*((YYSTYPE*)(parser->parser_yylval))) static int parser_regx_options(struct parser_params*); -static int parser_tokadd_string(struct parser_params*,int,int,int,long*); +static int parser_tokadd_string(struct parser_params*,int,int,int,long*,int*); static int parser_parse_string(struct parser_params*,NODE*); static int parser_here_document(struct parser_params*,NODE*); @@ -4489,7 +4491,7 @@ static int parser_here_document(struct parser_params*,NODE*); # define read_escape() parser_read_escape(parser) # define 
tokadd_escape(t) parser_tokadd_escape(parser, t) # define regx_options() parser_regx_options(parser) -# define tokadd_string(f,t,p,n) parser_tokadd_string(parser,f,t,p,n) +# define tokadd_string(f,t,p,n,m) parser_tokadd_string(parser,f,t,p,n,m) # define parse_string(n) parser_parse_string(parser,n) # define here_document(n) parser_here_document(parser,n) # define heredoc_identifier() parser_heredoc_identifier(parser) @@ -5150,15 +5152,24 @@ dispose_string(VALUE str) rb_gc_force_recycle(str); } +static void +parser_tokadd_mbchar(struct parser_params *parser, int c) +{ + int len = parser_mbclen(); + do { + tokadd(c); + } while (--len > 0 && (c = nextc()) != -1); +} + +#define tokadd_mbchar(c) parser_tokadd_mbchar(parser, c) + static int parser_tokadd_string(struct parser_params *parser, - int func, int term, int paren, long *nest) + int func, int term, int paren, long *nest, int *mb) { int c; - unsigned char uc; while ((c = nextc()) != -1) { - uc = (unsigned char)c; if (paren && c == paren) { ++*nest; } @@ -5210,12 +5221,9 @@ parser_tokadd_string(struct parser_params *parser, } } else if (parser_ismbchar()) { - int i, len = parser_mbclen()-1; - - for (i = 0; i < len; i++) { - tokadd(c); - c = nextc(); - } + tokadd_mbchar(c); + if (mb) *mb = 1; + continue; } else if ((func & STR_FUNC_QWORDS) && ISSPACE(c)) { pushback(c); @@ -5240,7 +5248,7 @@ parser_parse_string(struct parser_params *parser, NODE *quote) int func = quote->nd_func; int term = nd_term(quote); int paren = nd_paren(quote); - int c, space = 0; + int c, space = 0, mb = 0; if (func == -1) return tSTRING_END; c = nextc(); @@ -5274,7 +5282,7 @@ parser_parse_string(struct parser_params *parser, NODE *quote) tokadd('#'); } pushback(c); - if (tokadd_string(func, term, paren, &quote->nd_nest) == -1) { + if (tokadd_string(func, term, paren, &quote->nd_nest, &mb) == -1) { if (func & STR_FUNC_REGEXP) { ruby_sourceline = nd_line(quote); compile_error(PARSER_ARG "unterminated regexp meets end of file"); @@ -5288,7 +5296,7 @@ 
parser_parse_string(struct parser_params *parser, NODE *quote) } tokfix(); - set_yylval_str(STR_NEW(tok(), toklen())); + set_yylval_str(STR_NEW3(tok(), toklen(), mb)); return tSTRING_CONTENT; } @@ -5451,6 +5459,7 @@ parser_here_document(struct parser_params *parser, NODE *here) } while (!whole_match_p(eos, len, indent)); } else { + int mb = 0; newtok(); if (c == '#') { switch (c = nextc()) { @@ -5465,15 +5474,15 @@ parser_here_document(struct parser_params *parser, NODE *here) } do { pushback(c); - if ((c = tokadd_string(func, '\n', 0, NULL)) == -1) goto error; + if ((c = tokadd_string(func, '\n', 0, NULL, &mb)) == -1) goto error; if (c != '\n') { - set_yylval_str(STR_NEW(tok(), toklen())); + set_yylval_str(STR_NEW3(tok(), toklen(), mb)); return tSTRING_CONTENT; } tokadd(nextc()); if ((c = nextc()) == -1) goto error; } while (!whole_match_p(eos, len, indent)); - str = STR_NEW(tok(), toklen()); + str = STR_NEW3(tok(), toklen(), mb); } heredoc_restore(lex_strterm); lex_strterm = NEW_STRTERM(-1, 0, 0); @@ -5687,6 +5696,7 @@ parser_yylex(struct parser_params *parser) int space_seen = 0; int cmd_state; enum lex_state_e last_state; + int mb; #ifdef RIPPER int fallthru = Qfalse; #endif @@ -6005,13 +6015,7 @@ parser_yylex(struct parser_params *parser) } newtok(); if (parser_ismbchar()) { - int i, len = parser_mbclen()-1; - - tokadd(c); - for (i = 0; i < len; i++) { - c = nextc(); - tokadd(c); - } + tokadd_mbchar(c); } else if ((rb_enc_isalnum(c, parser->enc) || c == '_') && lex_p < lex_pend && is_identchar(lex_p, lex_pend, parser->enc)) { @@ -6696,7 +6700,7 @@ parser_yylex(struct parser_params *parser) tokadd(c); c = nextc(); if (parser_is_identchar()) { - tokadd(c); + tokadd_mbchar(c); } else { pushback(c); @@ -6794,15 +6798,10 @@ parser_yylex(struct parser_params *parser) break; } + mb = 0; do { - int i, len; - tokadd(c); - - len = parser_mbclen()-1; - for (i = 0; i < len; i++) { - c = nextc(); - tokadd(c); - } + if (!ISASCII(c)) mb = 1; + tokadd_mbchar(c); c = nextc(); 
} while (parser_is_identchar()); if ((c == '!' || c == '?') && !peek('=')) { @@ -6854,7 +6853,7 @@ parser_yylex(struct parser_params *parser) } } - if (lex_state != EXPR_DOT) { + if (!mb && lex_state != EXPR_DOT) { const struct kwtable *kw; /* See if it is a reserved word. */ @@ -6896,7 +6895,7 @@ parser_yylex(struct parser_params *parser) if (peek(':') && !(lex_p + 1 < lex_pend && lex_p[1] == ':')) { lex_state = EXPR_BEG; nextc(); - set_yylval_id(rb_intern(tok())); + set_yylval_id(rb_intern3(tok(), toklen(), STR_ENC(mb))); return tLABEL; } } @@ -6915,7 +6914,7 @@ parser_yylex(struct parser_params *parser) } } { - ID ident = rb_intern(tok()); + ID ident = rb_intern3(tok(), toklen(), STR_ENC(mb)); set_yylval_id(ident); if (last_state != EXPR_DOT && is_local_id(ident) && lvar_defined(ident)) { @@ -8370,11 +8369,16 @@ is_special_global_name(const char *m, const char *e, rb_encoding *enc) int rb_symname_p(const char *name) +{ + return rb_enc_symname_p(name, rb_enc_from_index(0)); +} + +int +rb_enc_symname_p(const char *name, rb_encoding *enc) { const char *m = name; const char *e = m + strlen(m); int localid = Qfalse; - rb_encoding *enc = rb_enc_from_index(0); if (!m) return Qfalse; switch (*m) { @@ -8458,8 +8462,10 @@ rb_intern3(const char *name, long len, rb_encoding *enc) fake_str.as.heap.len = len; fake_str.as.heap.ptr = (char *)name; fake_str.as.heap.aux.capa = len; + str = (VALUE)&fake_str; + rb_enc_associate(str, enc); - if (st_lookup(global_symbols.sym_id, (st_data_t)&fake_str, (st_data_t *)&id)) + if (st_lookup(global_symbols.sym_id, str, (st_data_t *)&id)) return id; last = len-1; @@ -8520,7 +8526,7 @@ rb_intern3(const char *name, long len, rb_encoding *enc) new_id: id |= ++global_symbols.last_id << ID_SCOPE_SHIFT; id_register: - str = rb_str_new(name, len); + str = rb_enc_str_new(name, len, enc); OBJ_FREEZE(str); st_add_direct(global_symbols.sym_id, (st_data_t)str, id); st_add_direct(global_symbols.id_str, id, (st_data_t)str); diff --git a/string.c 
b/string.c index 1cad0f8bf5..eacaa35acf 100644 --- a/string.c +++ b/string.c @@ -1129,7 +1129,8 @@ rb_memhash(const void *ptr, long len) int rb_str_hash(VALUE str) { - return rb_memhash(RSTRING_PTR(str), RSTRING_LEN(str)); + return hash((const void *)RSTRING_PTR(str), RSTRING_LEN(str), + rb_enc_get_index(str)); } /* @@ -1148,6 +1149,32 @@ rb_str_hash_m(VALUE str) #define lesser(a,b) (((a)>(b))?(b):(a)) +static int +is_ascii_string(VALUE str) +{ + long i; + + for (i = 0; i < RSTRING_LEN(str); ++i) { + int c = (unsigned char)RSTRING_PTR(str)[i]; + if (!ISASCII(c)) return Qfalse; + } + return Qtrue; +} + +int +rb_str_comparable(VALUE str1, VALUE str2) +{ + int idx1 = rb_enc_get_index(str1); + int idx2 = rb_enc_get_index(str2); + + if (idx1 == idx2) return Qtrue; + if (!rb_enc_asciicompat(rb_enc_from_index(idx1))) return Qfalse; + if (!rb_enc_asciicompat(rb_enc_from_index(idx2))) return Qfalse; + if (!is_ascii_string(str1)) return Qfalse; + if (!is_ascii_string(str2)) return Qfalse; + return Qtrue; +} + int rb_str_cmp(VALUE str1, VALUE str2) { @@ -1176,7 +1203,7 @@ rb_str_cmp(VALUE str1, VALUE str2) * <=> obj returns zero. 
*/ -static VALUE +VALUE rb_str_equal(VALUE str1, VALUE str2) { if (str1 == str2) return Qtrue; @@ -1186,7 +1213,7 @@ rb_str_equal(VALUE str1, VALUE str2) } return rb_equal(str2, str1); } - rb_enc_check(str1, str2); /* need weak check */ + if (!rb_str_comparable(str1, str2)) return Qfalse; if (RSTRING_LEN(str1) == RSTRING_LEN(str2) && rb_str_cmp(str1, str2) == 0) { return Qtrue; @@ -1207,6 +1234,9 @@ rb_str_eql(VALUE str1, VALUE str2) if (TYPE(str2) != T_STRING || RSTRING_LEN(str1) != RSTRING_LEN(str2)) return Qfalse; + if (rb_enc_get_index(str1) != rb_enc_get_index(str2)) + return Qfalse; + if (memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2), lesser(RSTRING_LEN(str1), RSTRING_LEN(str2))) == 0) return Qtrue; @@ -5126,13 +5156,15 @@ sym_inspect(VALUE sym) { VALUE str, klass = Qundef; ID id = SYM2ID(sym); + rb_encoding *enc; sym = rb_id2str(id); - str = rb_str_new(0, RSTRING_LEN(sym)+1); + enc = rb_enc_get(sym); + str = rb_enc_str_new(0, RSTRING_LEN(sym)+1, enc); RSTRING_PTR(str)[0] = ':'; memcpy(RSTRING_PTR(str)+1, RSTRING_PTR(sym), RSTRING_LEN(sym)); if (RSTRING_LEN(sym) != strlen(RSTRING_PTR(sym)) || - !rb_symname_p(RSTRING_PTR(sym))) { + !rb_enc_symname_p(RSTRING_PTR(sym), enc)) { str = rb_str_dump(str); strncpy(RSTRING_PTR(str), ":\"", 2); } diff --git a/vm.c b/vm.c index e09497dd49..f2c96eaeb0 100644 --- a/vm.c +++ b/vm.c @@ -12,6 +12,7 @@ #include "ruby/ruby.h" #include "ruby/node.h" #include "ruby/st.h" +#include "ruby/encoding.h" #include "gc.h" #include "insnhelper.h"