1
0
Fork 0
mirror of https://github.com/ruby/ruby.git synced 2022-11-09 12:17:21 -05:00

* include/ruby/encoding.h (rb_enc_str_asciionly_p): declared.

(rb_enc_str_asciicompat_p): defined.

* re.c (rb_reg_initialize_str): use rb_enc_str_asciionly_p.
  (rb_reg_quote): return ascii-8bit string if the argument is
  ascii-only to generate encoding generic regexp if possible.
  (rb_reg_s_union): fix encoding handling.  [ruby-dev:32094]

* string.c (rb_enc_str_asciionly_p): defined.


git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@14013 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
This commit is contained in:
akr 2007-11-25 13:25:34 +00:00
parent 43090c9f50
commit b2e60b2ce7
5 changed files with 318 additions and 40 deletions

View file

@ -1,3 +1,15 @@
Sun Nov 25 22:21:35 2007 Tanaka Akira <akr@fsij.org>
* include/ruby/encoding.h (rb_enc_str_asciionly_p): declared.
(rb_enc_str_asciicompat_p): defined.
* re.c (rb_reg_initialize_str): use rb_enc_str_asciionly_p.
(rb_reg_quote): return ascii-8bit string if the argument is
ascii-only to generate encoding generic regexp if possible.
(rb_reg_s_union): fix encoding handling. [ruby-dev:32094]
* string.c (rb_enc_str_asciionly_p): defined.
Sun Nov 25 12:12:03 2007 Eric Hodel <drbrain@segment7.net>
* gem_prelude.rb: Import fast-loading gem_prelude.rb from RubyGems.

View file

@ -100,6 +100,8 @@ int rb_enc_tolower(int c, rb_encoding *enc);
ID rb_intern3(const char*, long, rb_encoding*);
int rb_enc_symname_p(const char*, rb_encoding*);
int rb_enc_str_coderange(VALUE);
int rb_enc_str_asciionly_p(VALUE);
#define rb_enc_str_asciicompat_p(str) rb_enc_asciicompat(rb_enc_get(str))
VALUE rb_enc_from_encoding(rb_encoding *enc);
rb_encoding *rb_enc_primary(void);
rb_encoding *rb_enc_default(void);

117
re.c
View file

@ -1268,7 +1268,7 @@ rb_reg_initialize(VALUE obj, const char *s, int len, rb_encoding *enc,
static int
rb_reg_initialize_str(VALUE obj, VALUE str, int options, onig_errmsg_buffer err)
{
if (rb_enc_str_coderange(str) != ENC_CODERANGE_SINGLE) {
if (!rb_enc_str_asciionly_p(str)) {
options |= ARG_ENCODING_FIXED;
}
return rb_reg_initialize(obj, RSTRING_PTR(str), RSTRING_LEN(str), rb_enc_get(str),
@ -1654,6 +1654,7 @@ rb_reg_quote(VALUE str)
char *s, *send, *t;
VALUE tmp;
int c;
int ascii_only = rb_enc_str_asciionly_p(str);
s = RSTRING_PTR(str);
send = s + RSTRING_LEN(str);
@ -1677,11 +1678,17 @@ rb_reg_quote(VALUE str)
goto meta_found;
}
}
if (ascii_only && rb_enc_get_index(str) != 0) {
str = rb_str_new3(str);
rb_enc_associate(str, rb_enc_from_index(0));
}
return str;
meta_found:
tmp = rb_str_new(0, RSTRING_LEN(str)*2);
rb_enc_copy(tmp, str);
if (!ascii_only) {
rb_enc_copy(tmp, str);
}
t = RSTRING_PTR(tmp);
/* copy up to the first metacharacter */
memcpy(t, RSTRING_PTR(str), s - RSTRING_PTR(str));
@ -1802,43 +1809,115 @@ rb_reg_s_union(VALUE self, VALUE args0)
return rb_class_new_instance(1, args, rb_cRegexp);
}
else if (argc == 1) {
VALUE v;
v = rb_check_regexp_type(rb_ary_entry(args0, 0));
if (!NIL_P(v))
return v;
VALUE arg = rb_ary_entry(args0, 0);
VALUE re = rb_check_regexp_type(arg);
if (!NIL_P(re))
return re;
else {
VALUE args[1];
args[0] = rb_reg_s_quote(Qnil, RARRAY_PTR(args0)[0]);
return rb_class_new_instance(1, args, rb_cRegexp);
VALUE quoted;
quoted = rb_reg_s_quote(Qnil, arg);
return rb_reg_new(quoted, 0);
}
}
else {
int i;
VALUE source = rb_str_buf_new(0);
int mbs = Qfalse;
rb_encoding *enc = 0;
rb_encoding *enc;
int has_asciionly_string = 0;
rb_encoding *has_ascii_compat_string = 0;
rb_encoding *has_ascii_incompat_string = 0;
int has_generic_regexp = 0;
rb_encoding *has_ascii_compat_fixed_regexp = 0;
rb_encoding *has_ascii_incompat_regexp = 0;
for (i = 0; i < argc; i++) {
volatile VALUE v;
VALUE e = rb_ary_entry(args0, i);
if (0 < i)
rb_str_buf_cat2(source, "|");
rb_str_buf_cat2(source, "|"); /* xxx: UTF-16 */
v = rb_check_regexp_type(e);
if (!NIL_P(v)) {
rb_encoding *enc0 = rb_enc_get(v);
if (!rb_enc_asciicompat(enc0)) {
if (!has_ascii_incompat_regexp) {
has_ascii_incompat_regexp = enc0;
}
else {
if (has_ascii_incompat_regexp != enc0)
rb_raise(rb_eArgError, "regexp encodings differ");
}
}
else if (ENCODING_GET(v) != 0 || FL_TEST(v, KCODE_FIXED)) {
if (!has_ascii_compat_fixed_regexp) {
has_ascii_compat_fixed_regexp = enc0;
}
else {
if (has_ascii_compat_fixed_regexp != enc0)
rb_raise(rb_eArgError, "regexp encodings differ");
}
}
else {
has_generic_regexp = 1;
}
v = rb_reg_to_s(v);
}
else {
StringValue(e);
if (!rb_enc_str_asciicompat_p(e)) {
rb_encoding *enc0 = rb_enc_get(e);
if (!has_ascii_incompat_string) {
has_ascii_incompat_string = enc0;
}
else {
if (has_ascii_incompat_string != enc0)
rb_raise(rb_eArgError, "regexp encodings differ");
}
}
else if (rb_enc_str_asciionly_p(e)) {
has_asciionly_string = 1;
}
else {
rb_encoding *enc0 = rb_enc_get(e);
if (!has_ascii_compat_string) {
has_ascii_compat_string = enc0;
}
else {
if (has_ascii_compat_string != enc0)
rb_raise(rb_eArgError, "regexp encodings differ");
}
}
v = rb_reg_s_quote(Qnil, e);
}
if (mbs || rb_enc_str_coderange(v) != ENC_CODERANGE_SINGLE) {
if (!enc) enc = rb_enc_get(v);
else if (mbs && enc != rb_enc_get(v)) {
rb_raise(rb_eArgError, "regexp encodings differ");
}
mbs = Qtrue;
}
rb_str_append(source, v);
}
if (has_ascii_incompat_string || has_ascii_incompat_regexp) {
if (has_asciionly_string || has_ascii_compat_string ||
has_generic_regexp || has_ascii_compat_fixed_regexp)
rb_raise(rb_eArgError, "regexp encodings differ");
if (has_ascii_incompat_string && has_ascii_incompat_regexp &&
has_ascii_incompat_string != has_ascii_incompat_regexp)
rb_raise(rb_eArgError, "regexp encodings differ");
enc = has_ascii_incompat_string;
if (enc == 0)
enc = has_ascii_incompat_regexp;
}
else if (has_ascii_compat_string || has_ascii_compat_fixed_regexp) {
if (has_ascii_compat_string && has_ascii_compat_fixed_regexp &&
has_ascii_compat_string != has_ascii_compat_fixed_regexp)
rb_raise(rb_eArgError, "regexp encodings differ");
enc = has_ascii_compat_string;
if (enc == 0)
enc = has_ascii_compat_fixed_regexp;
}
else {
enc = rb_enc_from_index(0);
}
rb_enc_associate(source, enc);
return rb_class_new_instance(1, &source, rb_cRegexp);
}
}

View file

@ -129,6 +129,23 @@ rb_enc_str_coderange(VALUE str)
return cr;
}
/*
 * Predicate: is +str+ tagged with an ASCII-compatible encoding and made up
 * of 7-bit bytes only?
 *
 * Returns 1 when the encoding of +str+ is ASCII compatible, its code range
 * is ENC_CODERANGE_SINGLE and no byte has the high bit set; returns 0 in
 * every other case.  An empty string that passes the first two checks
 * yields 1 because the scan below finds no offending byte.
 *
 * The result is a plain C truth value (0 or 1), matching the declared
 * `int` return type, rather than the VALUE constants Qtrue/Qfalse.
 */
int rb_enc_str_asciionly_p(VALUE str)
{
    rb_encoding *enc = rb_enc_get(str);

    if (rb_enc_asciicompat(enc) &&
        rb_enc_str_coderange(str) == ENC_CODERANGE_SINGLE) {
        const char *ptr = RSTRING_PTR(str);
        long len = RSTRING_LEN(str);
        long i;

        /* NOTE(review): the explicit scan suggests ENC_CODERANGE_SINGLE by
         * itself does not exclude bytes >= 0x80 -- confirm against
         * rb_enc_str_coderange. */
        for (i = 0; i < len; i++) {
            if (ptr[i] & 0x80)
                return 0;
        }
        return 1;
    }
    return 0;
}
static inline void
str_mod_check(VALUE s, char *p, long len)
{

View file

@ -46,30 +46,71 @@ class TestM17N < Test::Unit::TestCase
#assert_raise(SyntaxError) { eval('/\xc0\x20/u') }
end
# Asserts that matching +r+ against a non-ASCII string raises nothing,
# whichever of the four probed encodings the string is tagged with.
#
# r:: the Regexp under test.
def assert_regexp_generic_encoding(r)
  %w[ASCII-8BIT EUC-JP Shift_JIS UTF-8].each {|ename|
    # "\xc2\xa1" is a valid sequence for ASCII-8BIT, EUC-JP, Shift_JIS and
    # UTF-8.  ("\xc0\xa1" is not usable here: 0xC0 never starts a
    # well-formed UTF-8 sequence.)
    assert_nothing_raised { r =~ "\xc2\xa1".force_encoding(ename) }
  }
end
# Asserts that +r+ accepts a non-ASCII string only in its own encoding.
#
# For each of the four probed encodings the same two-byte sample is tagged
# with that encoding and matched against +r+: no exception is expected when
# the encoding equals r.encoding, and ArgumentError is expected otherwise.
#
# r:: the Regexp under test.
def assert_regexp_fixed_encoding(r)
  %w[ASCII-8BIT EUC-JP Shift_JIS UTF-8].each {|ename|
    enc = Encoding.find(ename)
    # "\xc2\xa1" is well-formed in all four encodings; "\xc0\xa1" is not,
    # because 0xC0 never starts a well-formed UTF-8 sequence.
    if enc == r.encoding
      assert_nothing_raised { r =~ "\xc2\xa1".force_encoding(enc) }
    else
      assert_raise(ArgumentError) { r =~ "\xc2\xa1".force_encoding(enc) }
    end
  }
end
# Checks that +r+ reports the ASCII-8BIT encoding and then hands it to
# assert_regexp_generic_encoding.
def assert_regexp_generic_ascii(r)
  reported = r.encoding
  assert_encoding("ASCII-8BIT", reported)
  assert_regexp_generic_encoding(r)
end
# Checks that +r+ reports the ASCII-8BIT encoding and then hands it to
# assert_regexp_fixed_encoding.
def assert_regexp_fixed_ascii8bit(r)
  reported = r.encoding
  assert_encoding("ASCII-8BIT", reported)
  assert_regexp_fixed_encoding(r)
end
# Checks that +r+ reports the EUC-JP encoding and then hands it to
# assert_regexp_fixed_encoding.
def assert_regexp_fixed_eucjp(r)
  reported = r.encoding
  assert_encoding("EUC-JP", reported)
  assert_regexp_fixed_encoding(r)
end
# Checks that +r+ reports the Shift_JIS encoding and then hands it to
# assert_regexp_fixed_encoding.
def assert_regexp_fixed_sjis(r)
  reported = r.encoding
  assert_encoding("Shift_JIS", reported)
  assert_regexp_fixed_encoding(r)
end
# Checks that +r+ reports the UTF-8 encoding and then hands it to
# assert_regexp_fixed_encoding.
def assert_regexp_fixed_utf8(r)
  reported = r.encoding
  assert_encoding("UTF-8", reported)
  assert_regexp_fixed_encoding(r)
end
def test_regexp_generic
r = /a/
assert_encoding("ASCII-8BIT", r.encoding)
assert_regexp_generic_ascii(r)
assert_equal(0, r =~ a("a"))
assert_equal(0, r =~ e("a"))
assert_equal(0, r =~ s("a"))
assert_equal(0, r =~ u("a"))
# "\xc0\xa1" is a valid sequence for ASCII-8BIT, EUC-JP, Shift_JIS and UTF-8.
assert_equal(nil, r =~ a("\xc0\xa1"))
assert_equal(nil, r =~ e("\xc0\xa1"))
assert_equal(nil, r =~ s("\xc0\xa1"))
assert_equal(nil, r =~ u("\xc0\xa1"))
r = eval(a(%{/\xc0\xa1/}))
assert_encoding("ASCII-8BIT", r.encoding)
assert_equal(nil, r =~ a("a"))
assert_equal(nil, r =~ e("a"))
assert_equal(nil, r =~ s("a"))
assert_equal(nil, r =~ u("a"))
assert_equal(0, r =~ a("\xc0\xa1"))
assert_raise(ArgumentError) { r =~ e("\xc0\xa1") }
assert_raise(ArgumentError) { r =~ s("\xc0\xa1") }
assert_raise(ArgumentError) { r =~ u("\xc0\xa1") }
r = Regexp.new("a".force_encoding("ASCII-8BIT"))
assert_regexp_generic_ascii(r)
assert_equal(0, r =~ a("a"))
assert_equal(0, r =~ e("a"))
assert_equal(0, r =~ s("a"))
assert_equal(0, r =~ u("a"))
assert_equal(nil, r =~ a("\xc0\xa1"))
assert_equal(nil, r =~ e("\xc0\xa1"))
assert_equal(nil, r =~ s("\xc0\xa1"))
assert_equal(nil, r =~ u("\xc0\xa1"))
# xxx: /\xc0\xa1/ should be restricted only for ASCII-8BIT?
# r = /\xc0\xa1/
@ -86,7 +127,7 @@ class TestM17N < Test::Unit::TestCase
def test_regexp_ascii
r = /a/n
assert_encoding("ASCII-8BIT", r.encoding)
assert_regexp_fixed_ascii8bit(r)
assert_equal(0, r =~ a("a"))
assert_equal(0, r =~ e("a"))
assert_equal(0, r =~ s("a"))
@ -97,7 +138,18 @@ class TestM17N < Test::Unit::TestCase
assert_raise(ArgumentError) { r =~ u("\xc0\xa1") }
r = /\xc0\xa1/n
assert_encoding("ASCII-8BIT", r.encoding)
assert_regexp_fixed_ascii8bit(r)
assert_equal(nil, r =~ a("a"))
assert_equal(nil, r =~ e("a"))
assert_equal(nil, r =~ s("a"))
assert_equal(nil, r =~ u("a"))
assert_equal(0, r =~ a("\xc0\xa1"))
assert_raise(ArgumentError) { r =~ e("\xc0\xa1") }
assert_raise(ArgumentError) { r =~ s("\xc0\xa1") }
assert_raise(ArgumentError) { r =~ u("\xc0\xa1") }
r = eval(a(%{/\xc0\xa1/}))
assert_regexp_fixed_ascii8bit(r)
assert_equal(nil, r =~ a("a"))
assert_equal(nil, r =~ e("a"))
assert_equal(nil, r =~ s("a"))
@ -108,7 +160,7 @@ class TestM17N < Test::Unit::TestCase
assert_raise(ArgumentError) { r =~ u("\xc0\xa1") }
r = eval(%{/\xc0\xa1/n}.force_encoding("ASCII-8BIT"))
assert_encoding("ASCII-8BIT", r.encoding)
assert_regexp_fixed_ascii8bit(r)
assert_equal(nil, r =~ a("a"))
assert_equal(nil, r =~ e("a"))
assert_equal(nil, r =~ s("a"))
@ -119,7 +171,9 @@ class TestM17N < Test::Unit::TestCase
assert_raise(ArgumentError) { r =~ u("\xc0\xa1") }
r = eval(%q{/\xc0\xa1/}.force_encoding("ASCII-8BIT"))
# assert_regexp_fixed_ascii8bit(r)
assert_encoding("ASCII-8BIT", r.encoding)
# assert_regexp_fixed_encoding(r)
assert_equal(nil, r =~ a("a"))
assert_equal(nil, r =~ e("a"))
assert_equal(nil, r =~ s("a"))
@ -128,12 +182,22 @@ class TestM17N < Test::Unit::TestCase
# assert_raise(ArgumentError) { r =~ e("\xc0\xa1") }
# assert_raise(ArgumentError) { r =~ s("\xc0\xa1") }
# assert_raise(ArgumentError) { r =~ u("\xc0\xa1") }
end
def test_regexp_euc
r = /a/e
assert_encoding("EUC-JP", r.encoding)
assert_regexp_fixed_eucjp(r)
assert_equal(0, r =~ a("a"))
assert_equal(0, r =~ e("a"))
assert_equal(0, r =~ s("a"))
assert_equal(0, r =~ u("a"))
assert_raise(ArgumentError) { r =~ a("\xc0\xa1") }
assert_equal(nil, r =~ e("\xc0\xa1"))
assert_raise(ArgumentError) { r =~ s("\xc0\xa1") }
assert_raise(ArgumentError) { r =~ u("\xc0\xa1") }
r = Regexp.new("a".force_encoding("EUC-JP"))
assert_regexp_fixed_eucjp(r)
assert_equal(0, r =~ a("a"))
assert_equal(0, r =~ e("a"))
assert_equal(0, r =~ s("a"))
@ -144,7 +208,7 @@ class TestM17N < Test::Unit::TestCase
assert_raise(ArgumentError) { r =~ u("\xc0\xa1") }
r = /\xc0\xa1/e
assert_encoding("EUC-JP", r.encoding)
assert_regexp_fixed_eucjp(r)
assert_equal(nil, r =~ a("a"))
assert_equal(nil, r =~ e("a"))
assert_equal(nil, r =~ s("a"))
@ -155,7 +219,7 @@ class TestM17N < Test::Unit::TestCase
assert_raise(ArgumentError) { r =~ u("\xc0\xa1") }
r = eval(%{/\xc0\xa1/}.force_encoding("EUC-JP"))
assert_encoding("EUC-JP", r.encoding)
assert_regexp_fixed_eucjp(r)
assert_equal(nil, r =~ a("a"))
assert_equal(nil, r =~ e("a"))
assert_equal(nil, r =~ s("a"))
@ -166,7 +230,7 @@ class TestM17N < Test::Unit::TestCase
assert_raise(ArgumentError) { r =~ u("\xc0\xa1") }
r = eval(%q{/\xc0\xa1/}.force_encoding("EUC-JP"))
assert_encoding("EUC-JP", r.encoding)
assert_regexp_fixed_eucjp(r)
assert_equal(nil, r =~ a("a"))
assert_equal(nil, r =~ e("a"))
assert_equal(nil, r =~ s("a"))
@ -175,6 +239,7 @@ class TestM17N < Test::Unit::TestCase
assert_equal(0, r =~ e("\xc0\xa1"))
assert_raise(ArgumentError) { r =~ s("\xc0\xa1") }
assert_raise(ArgumentError) { r =~ u("\xc0\xa1") }
end
def test_begin_end_offset
@ -198,4 +263,107 @@ class TestM17N < Test::Unit::TestCase
assert_equal([1,2], $~.offset(0))
end
# Exercises Regexp.quote's choice of result encoding.
#
# The a/e/s/u helpers are defined outside this view; judging by the
# expectations below they presumably tag a string as ASCII-8BIT, EUC-JP,
# Shift_JIS and UTF-8 respectively -- confirm against their definitions.
def test_quote
  # Quoted ASCII-only pieces of different encodings can be interpolated
  # into one regexp, and that regexp is still encoding generic.
  assert_regexp_generic_ascii(/#{Regexp.quote(a("a"))}#{Regexp.quote(e("e"))}/)

  # Regexp.quote returns ASCII-8BIT string for ASCII only string
  # to make generic regexp if possible.
  assert_encoding("ASCII-8BIT", Regexp.quote(a("")).encoding)
  assert_encoding("ASCII-8BIT", Regexp.quote(e("")).encoding)
  assert_encoding("ASCII-8BIT", Regexp.quote(s("")).encoding)
  assert_encoding("ASCII-8BIT", Regexp.quote(u("")).encoding)
  assert_encoding("ASCII-8BIT", Regexp.quote(a("a")).encoding)
  assert_encoding("ASCII-8BIT", Regexp.quote(e("a")).encoding)
  assert_encoding("ASCII-8BIT", Regexp.quote(s("a")).encoding)
  assert_encoding("ASCII-8BIT", Regexp.quote(u("a")).encoding)

  # A non-ASCII argument keeps its own encoding.  "\xc2\xa1" is used as the
  # sample because it is well-formed in every encoding listed, including
  # UTF-8 (0xC0 never starts a well-formed UTF-8 sequence).
  assert_encoding("ASCII-8BIT", Regexp.quote(a("\xc2\xa1")).encoding)
  assert_encoding("EUC-JP", Regexp.quote(e("\xc2\xa1")).encoding)
  assert_encoding("Shift_JIS", Regexp.quote(s("\xc2\xa1")).encoding)
  assert_encoding("UTF-8", Regexp.quote(u("\xc2\xa1")).encoding)
end
# Regexp.union with no arguments: the result must pass
# assert_regexp_generic_ascii and must not match the empty string produced
# by any of the a/e/s/u helpers (defined outside this view).
def test_union_0
  never = Regexp.union
  assert_regexp_generic_ascii(never)
  [:a, :e, :s, :u].each do |helper|
    assert(never !~ send(helper, ""))
  end
end
# Regexp.union with a single ASCII-only string ("", "a" or a tab) must pass
# assert_regexp_generic_ascii whichever of the a/e/s/u helpers (defined
# outside this view) produced the string.
def test_union_1_asciionly_string
  ["", "a", "\t"].each do |text|
    [:a, :e, :s, :u].each do |helper|
      assert_regexp_generic_ascii(Regexp.union(send(helper, text)))
    end
  end
end
# Regexp.union with a single non-ASCII string must yield a regexp fixed to
# that string's encoding.
#
# "\xc2\xa1" is used as the sample because it is well-formed in all four
# encodings; "\xc0\xa1" is not well-formed UTF-8 (0xC0 never starts a
# well-formed sequence).  The a/e/s/u helpers are defined outside this view.
def test_union_1_nonascii_string
  assert_regexp_fixed_ascii8bit(Regexp.union(a("\xc2\xa1")))
  assert_regexp_fixed_eucjp(Regexp.union(e("\xc2\xa1")))
  assert_regexp_fixed_sjis(Regexp.union(s("\xc2\xa1")))
  assert_regexp_fixed_utf8(Regexp.union(u("\xc2\xa1")))
end
# Regexp.union with a single regexp argument: an option-less empty regexp
# must pass the generic assertion, and the n/e/s/u variants must pass the
# assertion for the matching fixed encoding.
def test_union_1_regexp
  [
    [:assert_regexp_generic_ascii,   //],
    [:assert_regexp_fixed_ascii8bit, //n],
    [:assert_regexp_fixed_eucjp,     //e],
    [:assert_regexp_fixed_sjis,      //s],
    [:assert_regexp_fixed_utf8,      //u]
  ].each do |assertion, pattern|
    send(assertion, Regexp.union(pattern))
  end
end
# Regexp.union of two empty strings must pass assert_regexp_generic_ascii
# for every ordered pairing of the strings built by the a/e/s/u helpers
# (defined outside this view).
def test_union_2_asciionly_strings
  empties = [:a, :e, :s, :u].map {|helper| send(helper, "") }
  empties.each do |left|
    empties.each do |right|
      assert_regexp_generic_ascii(Regexp.union(left, right))
    end
  end
end
# Regexp.union of two strings, over every ordered pairing of an empty and a
# non-ASCII string in each of four encodings (built by the a/e/s/u helpers,
# which are defined outside this view).
#
# Expected outcome per pair:
# * both empty                      -> encoding-generic regexp
# * exactly one empty               -> regexp fixed to the other's encoding
# * both non-ASCII, same encoding   -> regexp fixed to that encoding
# * both non-ASCII, different ones  -> ArgumentError
#
# "\xc2\xa1" is the non-ASCII sample because it is well-formed in all four
# encodings; "\xc0\xa1" is not well-formed UTF-8.
def test_union_2_strings
  ary = [
    a(""), e(""), s(""), u(""),
    a("\xc2\xa1"), e("\xc2\xa1"), s("\xc2\xa1"), u("\xc2\xa1")
  ]
  ary.each {|s1|
    ary.each {|s2|
      if s1.empty? && s2.empty?
        assert_regexp_generic_ascii(Regexp.union(s1, s2))
      elsif s1.empty? || s2.empty? || s1.encoding == s2.encoding
        # The non-empty operand decides the encoding; when neither is empty
        # both operands share it, so s1 is as good as s2.
        decisive = s1.empty? ? s2 : s1
        r = Regexp.union(s1, s2)
        assert_regexp_fixed_encoding(r)
        assert_equal(decisive.encoding, r.encoding)
      else
        assert_raise(ArgumentError) { Regexp.union(s1, s2) }
      end
    }
  }
end
end