1
0
Fork 0
mirror of https://github.com/ruby/ruby.git synced 2022-11-09 12:17:21 -05:00

* encoding.c (rb_enc_mbclen): make it never fail.

(rb_enc_nth): don't check the return value of rb_enc_mbclen.
  (rb_enc_strlen): ditto.
  (rb_enc_precise_mbclen): return needmore(1) if e <= p.
  (rb_enc_get_ascii): new function for extracting ASCII character.

* include/ruby/encoding.h (rb_enc_get_ascii): declared.

* include/ruby/regex.h (ismbchar): removed.

* re.c (rb_reg_expr_str): use rb_enc_get_ascii.
  (unescape_escaped_nonascii): use rb_enc_precise_mbclen to determine
  the termination of escaped non-ASCII character.
  (unescape_nonascii): use rb_enc_precise_mbclen.
  (rb_reg_quote): use rb_enc_get_ascii.
  (rb_reg_regsub): use rb_enc_get_ascii.

* string.c (rb_str_reverse): don't check the return value of
  rb_enc_mbclen.
  (rb_str_split_m): don't call rb_enc_mbclen with e <= p.

* parse.y (is_identchar): use ISASCII.
  (parser_ismbchar): removed.
  (parser_precise_mbclen): new macro.
  (parser_isascii): new macro.
  (parser_tokadd_mbchar): use parser_precise_mbclen to check invalid
  character precisely.
  (parser_tokadd_string): use parser_isascii.
  (parser_yylex): ditto.
  (is_special_global_name): don't call is_identchar with e <= p.
  (rb_enc_symname_p): ditto.

  [ruby-dev:32455]

* ext/tk/sample/tkextlib/vu/canvSticker2.rb: remove coding cookie
  because the encoding is not UTF-8.  [ruby-dev:32475]


git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@14131 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
This commit is contained in:
akr 2007-12-08 02:50:43 +00:00
parent 990bec9702
commit f1b7e60cb9
10 changed files with 119 additions and 53 deletions

View file

@ -1,3 +1,42 @@
Sat Dec 8 11:06:29 2007 Tanaka Akira <akr@fsij.org>
* encoding.c (rb_enc_mbclen): make it never fail.
(rb_enc_nth): don't check the return value of rb_enc_mbclen.
(rb_enc_strlen): ditto.
(rb_enc_precise_mbclen): return needmore(1) if e <= p.
(rb_enc_get_ascii): new function for extracting ASCII character.
* include/ruby/encoding.h (rb_enc_get_ascii): declared.
* include/ruby/regex.h (ismbchar): removed.
* re.c (rb_reg_expr_str): use rb_enc_get_ascii.
(unescape_escaped_nonascii): use rb_enc_precise_mbclen to determine
the termination of escaped non-ASCII character.
(unescape_nonascii): use rb_enc_precise_mbclen.
(rb_reg_quote): use rb_enc_get_ascii.
(rb_reg_regsub): use rb_enc_get_ascii.
* string.c (rb_str_reverse): don't check the return value of
rb_enc_mbclen.
(rb_str_split_m): don't call rb_enc_mbclen with e <= p.
* parse.y (is_identchar): use ISASCII.
(parser_ismbchar): removed.
(parser_precise_mbclen): new macro.
(parser_isascii): new macro.
(parser_tokadd_mbchar): use parser_precise_mbclen to check invalid
character precisely.
(parser_tokadd_string): use parser_isascii.
(parser_yylex): ditto.
(is_special_global_name): don't call is_identchar with e <= p.
(rb_enc_symname_p): ditto.
[ruby-dev:32455]
* ext/tk/sample/tkextlib/vu/canvSticker2.rb: remove coding cookie
because the encoding is not UTF-8. [ruby-dev:32475]
Fri Dec 7 20:21:35 2007 GOTOU Yuuzou <gotoyuzo@notwork.org> Fri Dec 7 20:21:35 2007 GOTOU Yuuzou <gotoyuzo@notwork.org>
* ext/openssl/lib/net/ftptls.rb, ext/openssl/lib/net/telnets.rb: * ext/openssl/lib/net/ftptls.rb, ext/openssl/lib/net/telnets.rb:

View file

@ -459,7 +459,6 @@ rb_enc_nth(const char *p, const char *e, int nth, rb_encoding *enc)
for (c=0; p<e && nth--; c++) { for (c=0; p<e && nth--; c++) {
int n = rb_enc_mbclen(p, e, enc); int n = rb_enc_mbclen(p, e, enc);
if (n == 0) return 0;
p += n; p += n;
} }
} }
@ -478,7 +477,6 @@ rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
for (c=0; p<e; c++) { for (c=0; p<e; c++) {
int n = rb_enc_mbclen(p, e, enc); int n = rb_enc_mbclen(p, e, enc);
if (n == 0) return -1;
p += n; p += n;
} }
return c; return c;
@ -487,19 +485,39 @@ rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
int int
rb_enc_mbclen(const char *p, const char *e, rb_encoding *enc) rb_enc_mbclen(const char *p, const char *e, rb_encoding *enc)
{ {
int n = ONIGENC_MBC_ENC_LEN(enc, (UChar*)p, (UChar*)e); int n = ONIGENC_PRECISE_MBC_ENC_LEN(enc, (UChar*)p, (UChar*)e);
if (n == 0) { if (MBCLEN_CHARFOUND(n))
rb_raise(rb_eArgError, "invalid mbstring sequence"); return n;
} else
return n; return 1;
} }
int int
rb_enc_precise_mbclen(const char *p, const char *e, rb_encoding *enc) rb_enc_precise_mbclen(const char *p, const char *e, rb_encoding *enc)
{ {
if (e <= p)
return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(1);
return ONIGENC_PRECISE_MBC_ENC_LEN(enc, (UChar*)p, (UChar*)e); return ONIGENC_PRECISE_MBC_ENC_LEN(enc, (UChar*)p, (UChar*)e);
} }
/*
 * Extract an ASCII character from the start of the byte range [p, e),
 * interpreted in encoding enc.
 *
 * Returns the character code when the first character is ASCII.
 * Returns -1 when the range is empty, when rb_enc_precise_mbclen does not
 * report a found character, or when the first character is not ASCII.
 */
int rb_enc_get_ascii(const char *p, const char *e, rb_encoding *enc)
{
int c, l;
/* Empty (or inverted) range: there is no byte to look at. */
if (e <= p)
return -1;
/* ASCII-compatible encoding: only the first byte is inspected. */
if (rb_enc_asciicompat(enc)) {
c = (unsigned char)*p;
return ISASCII(c) ? c : -1;
}
/* Otherwise measure the first character and bail out unless one was found. */
l = rb_enc_precise_mbclen(p, e, enc);
if (!MBCLEN_CHARFOUND(l))
return -1;
/* Decode the character and return it only if the encoding treats it as ASCII. */
c = rb_enc_codepoint(p, e, enc);
if (rb_enc_isascii(c, enc))
return c;
return -1;
}
int int
rb_enc_codelen(int c, rb_encoding *enc) rb_enc_codelen(int c, rb_encoding *enc)
{ {

View file

@ -1,5 +1,4 @@
#!/usr/bin/env ruby #!/usr/bin/env ruby
# -*- coding: utf-8 -*-
require 'tk' require 'tk'
require 'tkextlib/vu/charts' require 'tkextlib/vu/charts'

View file

@ -77,6 +77,9 @@ int rb_enc_precise_mbclen(const char*, const char *, rb_encoding*);
#define MBCLEN_INVALID(ret) ONIGENC_MBCLEN_INVALID(ret) #define MBCLEN_INVALID(ret) ONIGENC_MBCLEN_INVALID(ret)
#define MBCLEN_NEEDMORE(ret) ONIGENC_MBCLEN_NEEDMORE(ret) #define MBCLEN_NEEDMORE(ret) ONIGENC_MBCLEN_NEEDMORE(ret)
/* ptr,endptr,encoding -> 0x00..0x7f, -1 */
int rb_enc_get_ascii(const char*, const char *, rb_encoding*);
/* code,encoding -> codelen */ /* code,encoding -> codelen */
int rb_enc_codelen(int, rb_encoding*); int rb_enc_codelen(int, rb_encoding*);

View file

@ -29,7 +29,6 @@ extern "C" {
ONIG_EXTERN OnigEncoding OnigEncDefaultCharEncoding; ONIG_EXTERN OnigEncoding OnigEncDefaultCharEncoding;
#define ismbchar(p, e, enc) (mbclen((p),(e),(enc)) != 1)
#define mbclen(p,e,enc) rb_enc_mbclen((p),(e),(enc)) #define mbclen(p,e,enc) rb_enc_mbclen((p),(e),(enc))
#endif /* ifndef ONIG_RUBY_M17N */ #endif /* ifndef ONIG_RUBY_M17N */

22
parse.y
View file

@ -4583,10 +4583,12 @@ ripper_dispatch_delayed_token(struct parser_params *parser, int t)
#endif #endif
#define parser_mbclen() mbclen((lex_p-1),lex_pend,parser->enc) #define parser_mbclen() mbclen((lex_p-1),lex_pend,parser->enc)
#define is_identchar(p,e,enc) (rb_enc_isalnum(*p,enc) || (*p) == '_' || ismbchar(p,e,enc)) #define parser_precise_mbclen() rb_enc_precise_mbclen((lex_p-1),lex_pend,parser->enc)
#define parser_ismbchar() ismbchar((lex_p-1), lex_pend, parser->enc) #define is_identchar(p,e,enc) (rb_enc_isalnum(*p,enc) || (*p) == '_' || !ISASCII(*p))
#define parser_is_identchar() (!parser->eofp && is_identchar((lex_p-1),lex_pend,parser->enc)) #define parser_is_identchar() (!parser->eofp && is_identchar((lex_p-1),lex_pend,parser->enc))
#define parser_isascii() ISASCII(*(lex_p-1))
static int static int
parser_yyerror(struct parser_params *parser, const char *msg) parser_yyerror(struct parser_params *parser, const char *msg)
{ {
@ -5305,8 +5307,8 @@ dispose_string(VALUE str)
static int static int
parser_tokadd_mbchar(struct parser_params *parser, int c) parser_tokadd_mbchar(struct parser_params *parser, int c)
{ {
int len = parser_mbclen(); int len = parser_precise_mbclen();
if (len <= 0 || lex_p + len - 1 > lex_pend) { if (!MBCLEN_CHARFOUND(len)) {
compile_error(PARSER_ARG "illegal multibyte char"); compile_error(PARSER_ARG "illegal multibyte char");
return -1; return -1;
} }
@ -5414,7 +5416,7 @@ parser_tokadd_string(struct parser_params *parser,
} }
} }
} }
else if (parser_ismbchar()) { else if (!parser_isascii()) {
has_nonascii = 1; has_nonascii = 1;
if (enc != *encp) { if (enc != *encp) {
mixed_error(enc, *encp); mixed_error(enc, *encp);
@ -6306,7 +6308,7 @@ parser_yylex(struct parser_params *parser)
} }
newtok(); newtok();
enc = parser->enc; enc = parser->enc;
if (parser_ismbchar()) { if (!parser_isascii()) {
if (tokadd_mbchar(c) == -1) return 0; if (tokadd_mbchar(c) == -1) return 0;
} }
else if ((rb_enc_isalnum(c, parser->enc) || c == '_') && else if ((rb_enc_isalnum(c, parser->enc) || c == '_') &&
@ -6889,7 +6891,7 @@ parser_yylex(struct parser_params *parser)
} }
else { else {
term = nextc(); term = nextc();
if (rb_enc_isalnum(term, parser->enc) || parser_ismbchar()) { if (rb_enc_isalnum(term, parser->enc) || !parser_isascii()) {
yyerror("unknown type of %string"); yyerror("unknown type of %string");
return 0; return 0;
} }
@ -8693,7 +8695,7 @@ is_special_global_name(const char *m, const char *e, rb_encoding *enc)
break; break;
case '-': case '-':
++m; ++m;
if (is_identchar(m, e, enc)) { if (m < e && is_identchar(m, e, enc)) {
if (!ISASCII(*m)) mb = 1; if (!ISASCII(*m)) mb = 1;
m += rb_enc_mbclen(m, e, enc); m += rb_enc_mbclen(m, e, enc);
} }
@ -8776,9 +8778,9 @@ rb_enc_symname_p(const char *name, rb_encoding *enc)
default: default:
localid = !rb_enc_isupper(*m, enc); localid = !rb_enc_isupper(*m, enc);
id: id:
if (*m != '_' && !rb_enc_isalpha(*m, enc) && !ismbchar(m, e, enc)) if (m >= e || (*m != '_' && !rb_enc_isalpha(*m, enc) && ISASCII(*m)))
return Qfalse; return Qfalse;
while (is_identchar(m, e, enc)) m += rb_enc_mbclen(m, e, enc); while (m < e && is_identchar(m, e, enc)) m += rb_enc_mbclen(m, e, enc);
if (localid) { if (localid) {
switch (*m) { switch (*m) {
case '!': case '?': case '=': ++m; case '!': case '?': case '=': ++m;

54
re.c
View file

@ -218,10 +218,12 @@ rb_reg_expr_str(VALUE str, const char *s, long len)
rb_encoding *enc = rb_enc_get(str); rb_encoding *enc = rb_enc_get(str);
const char *p, *pend; const char *p, *pend;
int need_escape = 0; int need_escape = 0;
int c;
p = s; pend = p + len; p = s; pend = p + len;
while (p<pend) { while (p<pend) {
if (*p == '/' || (!rb_enc_isprint(*p, enc) && !ismbchar(p, pend, enc))) { c = rb_enc_get_ascii(p, pend, enc);
if (c == '/' || (c != -1 && !rb_enc_isprint(c, enc))) {
need_escape = 1; need_escape = 1;
break; break;
} }
@ -233,29 +235,31 @@ rb_reg_expr_str(VALUE str, const char *s, long len)
else { else {
p = s; p = s;
while (p<pend) { while (p<pend) {
if (*p == '\\') { c = rb_enc_get_ascii(p, pend, enc);
if (c == '\\') {
int n = mbclen(p+1, pend, enc) + 1; int n = mbclen(p+1, pend, enc) + 1;
rb_str_buf_cat(str, p, n); rb_str_buf_cat(str, p, n);
p += n; p += n;
continue; continue;
} }
else if (*p == '/') { else if (c == '/') {
char c = '\\'; char c = '\\';
rb_str_buf_cat(str, &c, 1); rb_str_buf_cat(str, &c, 1);
rb_str_buf_cat(str, p, 1); rb_str_buf_cat(str, p, 1);
} }
else if (ismbchar(p, pend, enc)) { else if (c == -1) {
rb_str_buf_cat(str, p, mbclen(p, pend, enc)); int l = mbclen(p, pend, enc);
p += mbclen(p, pend, enc); rb_str_buf_cat(str, p, l);
p += l;
continue; continue;
} }
else if (rb_enc_isprint(*p, enc)) { else if (rb_enc_isprint(c, enc)) {
rb_str_buf_cat(str, p, 1); rb_str_buf_cat(str, p, 1);
} }
else if (!rb_enc_isspace(*p, enc)) { else if (!rb_enc_isspace(c, enc)) {
char b[8]; char b[8];
sprintf(b, "\\%03o", *p & 0377); sprintf(b, "\\%03o", c);
rb_str_buf_cat(str, b, 4); rb_str_buf_cat(str, b, 4);
} }
else { else {
@ -1377,6 +1381,7 @@ unescape_escaped_nonascii(const char **pp, const char *end, rb_encoding *enc,
char *chbuf = ALLOCA_N(char, chmaxlen); char *chbuf = ALLOCA_N(char, chmaxlen);
int chlen = 0; int chlen = 0;
int byte; int byte;
int l;
memset(chbuf, 0, chmaxlen); memset(chbuf, 0, chmaxlen);
@ -1386,7 +1391,8 @@ unescape_escaped_nonascii(const char **pp, const char *end, rb_encoding *enc,
} }
chbuf[chlen++] = byte; chbuf[chlen++] = byte;
while (chlen < chmaxlen && chlen != mbclen(chbuf, chbuf+chlen, enc)) { while (chlen < chmaxlen &&
MBCLEN_NEEDMORE(rb_enc_precise_mbclen(chbuf, chbuf+chlen, enc))) {
byte = read_escaped_byte(&p, end, err); byte = read_escaped_byte(&p, end, err);
if (byte == -1) { if (byte == -1) {
return -1; return -1;
@ -1394,11 +1400,11 @@ unescape_escaped_nonascii(const char **pp, const char *end, rb_encoding *enc,
chbuf[chlen++] = byte; chbuf[chlen++] = byte;
} }
if (chlen != mbclen(chbuf, chbuf+chlen, enc)) { l = rb_enc_precise_mbclen(chbuf, chbuf+chlen, enc);
if (MBCLEN_INVALID(l)) {
strcpy(err, "invalid multibyte escape"); strcpy(err, "invalid multibyte escape");
return -1; return -1;
} }
if (1 < chlen || (chbuf[0] & 0x80)) { if (1 < chlen || (chbuf[0] & 0x80)) {
rb_str_buf_cat(buf, chbuf, chlen); rb_str_buf_cat(buf, chbuf, chlen);
@ -1515,13 +1521,12 @@ unescape_nonascii(const char *p, const char *end, rb_encoding *enc,
char smallbuf[2]; char smallbuf[2];
while (p < end) { while (p < end) {
int chlen = mbclen(p, end, enc); int chlen = rb_enc_precise_mbclen(p, end, enc);
if (!MBCLEN_CHARFOUND(chlen)) {
strcpy(err, "invalid multibyte character");
return -1;
}
if (1 < chlen || (*p & 0x80)) { if (1 < chlen || (*p & 0x80)) {
if (end < p + chlen) {
strcpy(err, "too short multibyte character");
return -1;
}
/* xxx: validate the non-ascii character */
rb_str_buf_cat(buf, p, chlen); rb_str_buf_cat(buf, p, chlen);
p += chlen; p += chlen;
if (*encp == 0) if (*encp == 0)
@ -2093,8 +2098,8 @@ rb_reg_quote(VALUE str)
s = RSTRING_PTR(str); s = RSTRING_PTR(str);
send = s + RSTRING_LEN(str); send = s + RSTRING_LEN(str);
for (; s < send; s++) { for (; s < send; s++) {
c = *s; c = rb_enc_get_ascii(s, send, enc);
if (ismbchar(s, send, enc)) { if (c == -1) {
int n = mbclen(s, send, enc); int n = mbclen(s, send, enc);
while (n-- && s < send) while (n-- && s < send)
@ -2129,8 +2134,8 @@ rb_reg_quote(VALUE str)
t += s - RSTRING_PTR(str); t += s - RSTRING_PTR(str);
for (; s < send; s++) { for (; s < send; s++) {
c = *s; c = rb_enc_get_ascii(s, send, enc);
if (ismbchar(s, send, enc)) { if (c == -1) {
int n = mbclen(s, send, enc); int n = mbclen(s, send, enc);
while (n-- && s < send) while (n-- && s < send)
@ -2397,13 +2402,14 @@ rb_reg_regsub(VALUE str, VALUE src, struct re_registers *regs, VALUE regexp)
e = s + RSTRING_LEN(str); e = s + RSTRING_LEN(str);
while (s < e) { while (s < e) {
int c = rb_enc_get_ascii(s, e, enc);
char *ss = s++; char *ss = s++;
if (ismbchar(ss, e, enc)) { if (c == -1) {
s += mbclen(ss, e, enc) - 1; s += mbclen(ss, e, enc) - 1;
continue; continue;
} }
if (*ss != '\\' || s == e) continue; if (c != '\\' || s == e) continue;
if (!val) { if (!val) {
val = rb_str_buf_new(ss-p); val = rb_str_buf_new(ss-p);

View file

@ -2725,9 +2725,6 @@ rb_str_reverse(VALUE str)
while (s < e) { while (s < e) {
int clen = rb_enc_mbclen(s, e, enc); int clen = rb_enc_mbclen(s, e, enc);
if (clen == 0) {
rb_raise(rb_eArgError, "invalid mbstring sequence");
}
p -= clen; p -= clen;
memcpy(p, s, clen); memcpy(p, s, clen);
s += clen; s += clen;
@ -4079,7 +4076,10 @@ rb_str_split_m(int argc, VALUE *argv, VALUE str)
beg = start; beg = start;
} }
else { else {
start += rb_enc_mbclen(RSTRING_PTR(str)+start,RSTRING_END(str),enc); if (RSTRING_PTR(str)+start == RSTRING_END(str))
start++;
else
start += rb_enc_mbclen(RSTRING_PTR(str)+start,RSTRING_END(str),enc);
last_null = 1; last_null = 1;
continue; continue;
} }

View file

@ -77,8 +77,8 @@ class TestM17N < Test::Unit::TestCase
assert_raise(SyntaxError) { eval('/\xc2/u') } assert_raise(SyntaxError) { eval('/\xc2/u') }
assert_raise(SyntaxError) { eval('/\xe0\x80/u') } assert_raise(SyntaxError) { eval('/\xe0\x80/u') }
assert_raise(SyntaxError) { eval('/\xf0\x80\x80/u') } assert_raise(SyntaxError) { eval('/\xf0\x80\x80/u') }
#assert_raise(SyntaxError) { eval('/\xf8\x80\x80\x80/u') } assert_raise(SyntaxError) { eval('/\xf8\x80\x80\x80/u') }
#assert_raise(SyntaxError) { eval('/\xfc\x80\x80\x80\x80/u') } assert_raise(SyntaxError) { eval('/\xfc\x80\x80\x80\x80/u') }
# raw 8bit # raw 8bit
assert_raise(SyntaxError) { eval("/\xfe/e") } assert_raise(SyntaxError) { eval("/\xfe/e") }
@ -87,7 +87,7 @@ class TestM17N < Test::Unit::TestCase
# invalid suffix # invalid suffix
assert_raise(SyntaxError) { eval('/\xc2\xff/u') } assert_raise(SyntaxError) { eval('/\xc2\xff/u') }
assert_raise(SyntaxError) { eval('/\xc2 /u') } assert_raise(SyntaxError) { eval('/\xc2 /u') }
#assert_raise(SyntaxError) { eval('/\xc2\x20/u') } assert_raise(SyntaxError) { eval('/\xc2\x20/u') }
end end
def assert_regexp_generic_encoding(r) def assert_regexp_generic_encoding(r)

View file

@ -20,7 +20,7 @@ class TestRegexp < Test::Unit::TestCase
def test_yoshidam_net_20041111_2 def test_yoshidam_net_20041111_2
assert_raise(RegexpError) do assert_raise(RegexpError) do
s = "[\xFF-\xFF]" s = "[\xFF-\xFF]".force_encoding("utf-8")
Regexp.new(s, nil, "u") Regexp.new(s, nil, "u")
end end
end end
@ -42,8 +42,8 @@ class TestRegexp < Test::Unit::TestCase
assert_equal :ok, begin assert_equal :ok, begin
Regexp.union( Regexp.union(
"a", "a",
Regexp.new("\x80".force_encoding("euc-jp")), Regexp.new("\xc2\xa1".force_encoding("euc-jp")),
Regexp.new("\x80".force_encoding("utf-8"))) Regexp.new("\xc2\xa1".force_encoding("utf-8")))
:ng :ng
rescue ArgumentError rescue ArgumentError
:ok :ok