diff --git a/ChangeLog b/ChangeLog index b266f9c3c2..48343fc32b 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,9 @@ +Thu Nov 8 07:54:22 UTC 2007 David Flanagan + * parse.y: patch, based on Nobu's, work to support \u escapes + also modifications for better coderange detection + * test/ruby/test_unicode_escape.rb: test cases + * test/ruby/test_mixed_unicode_escapes.rb: mixed encoding test cases + Thu Nov 8 07:14:37 UTC 2007 David Flanagan * parse.y (rb_intern3): commented out broken code that prevented correct interning of multi-byte symbols. Without this patch diff --git a/parse.y b/parse.y index ff50c13a7e..dc20c59191 100644 --- a/parse.y +++ b/parse.y @@ -237,6 +237,7 @@ struct parser_params { int has_shebang; int parser_ruby_sourceline; /* current line no. */ rb_encoding *enc; + rb_encoding *utf8; #ifndef RIPPER /* Ruby core only */ @@ -260,10 +261,12 @@ struct parser_params { #endif }; +#define UTF8_ENC() (parser->utf8 ? parser->utf8 : \ + (parser->utf8 = rb_enc_find("utf-8"))) #define STR_NEW(p,n) rb_enc_str_new((p),(n),parser->enc) #define STR_NEW0() rb_str_new(0,0) #define STR_NEW2(p) rb_enc_str_new((p),strlen(p),parser->enc) -#define STR_NEW3(p,n,m) parser_str_new((p),(n),STR_ENC(!ENC_SINGLE(m)),(m)) +#define STR_NEW3(p,n,e,has8,hasmb) parser_str_new2((p),(n),(e),(has8),(hasmb)) #define STR_ENC(m) ((m)?parser->enc:rb_enc_from_index(0)) #define ENC_SINGLE(cr) ((cr)==ENC_CODERANGE_SINGLE) #define TOK_INTERN(mb) rb_intern3(tok(), toklen(), STR_ENC(mb)) @@ -4493,20 +4496,25 @@ none : /* none */ # define yylval (*((YYSTYPE*)(parser->parser_yylval))) static int parser_regx_options(struct parser_params*); -static int parser_tokadd_string(struct parser_params*,int,int,int,long*,int*); +static int parser_tokadd_string(struct parser_params*,int,int,int,long*,int*,int*,rb_encoding**); +static void parser_tokaddmbc(struct parser_params *parser, int c, rb_encoding *enc); static int parser_parse_string(struct parser_params*,NODE*); static int 
parser_here_document(struct parser_params*,NODE*); + # define nextc() parser_nextc(parser) # define pushback(c) parser_pushback(parser, c) # define newtok() parser_newtok(parser) # define tokspace(n) parser_tokspace(parser, n) # define tokadd(c) parser_tokadd(parser, c) -# define read_escape(m) parser_read_escape(parser, m) -# define tokadd_escape(t,m) parser_tokadd_escape(parser, t, m) +# define tok_hex(numlen) parser_tok_hex(parser, numlen) +# define tok_utf8(numlen,e) parser_tok_utf8(parser, numlen, e) +# define read_escape(flags,has8,hasmb,e) parser_read_escape(parser, flags, has8, hasmb, e) +# define tokadd_escape(t,has8,hasmb,e) parser_tokadd_escape(parser, t, has8,hasmb, e) # define regx_options() parser_regx_options(parser) -# define tokadd_string(f,t,p,n,m) parser_tokadd_string(parser,f,t,p,n,m) +# define tokadd_string(f,t,p,n,has8bit,hasmb,e) parser_tokadd_string(parser,f,t,p,n,has8bit,hasmb,e) # define parse_string(n) parser_parse_string(parser,n) +# define tokaddmbc(c, enc) parser_tokaddmbc(parser, c, enc) # define here_document(n) parser_here_document(parser,n) # define heredoc_identifier() parser_heredoc_identifier(parser) # define heredoc_restore(n) parser_heredoc_restore(parser,n) @@ -4829,6 +4837,15 @@ parser_str_new(const char *p, long n, rb_encoding *enc, int coderange) return str; } +static VALUE +parser_str_new2(const char *p, long n, rb_encoding *enc, int has8bit,int hasmb) +{ + int coderange = ENC_CODERANGE_SINGLE; + if (hasmb) coderange = ENC_CODERANGE_MULTI; + else if (has8bit) coderange = ENC_CODERANGE_UNKNOWN; + return parser_str_new(p, n, enc, coderange); +} + static inline int parser_nextc(struct parser_params *parser) { @@ -4943,10 +4960,145 @@ parser_tokadd(struct parser_params *parser, int c) } static int -parser_read_escape(struct parser_params *parser, int *mb) +parser_tok_hex(struct parser_params *parser, int *numlen) { int c; + c = scan_hex(lex_p, 2, numlen); + if (!*numlen) { + yyerror("invalid hex escape"); + return 0; + } + 
lex_p += *numlen; + return c; +} + +#if 0 +static int +parser_tok_utf8(struct parser_params *parser, int *numlen, rb_encoding **encp) +{ + int codepoint; + + if (peek('{')) { /* handle \u{...} form */ + nextc(); + codepoint = scan_hex(lex_p, 6, numlen); + if (*numlen == 0) { + yyerror("invalid Unicode escape"); + return 0; + } + if (codepoint > 0x10ffff) { + yyerror("illegal Unicode codepoint (too large)"); + return 0; + } + lex_p += *numlen; + if (!peek('}')) { + yyerror("unterminated Unicode escape"); + return 0; + } + nextc(); + } + else { /* handle \uxxxx form */ + codepoint = scan_hex(lex_p, 4, numlen); + if (*numlen < 4) { + yyerror("invalid Unicode escape"); + return 0; + } + lex_p += 4; + } + if (codepoint >= 0x80) { + *encp = UTF8_ENC(); + } + + return codepoint; +} +#endif + + + +static int +parser_tokadd_utf8(struct parser_params *parser, int *hasmb, + rb_encoding **encp, int string_literal, int symbol_literal) +{ + /* + * If string_literal is true, then we allow multiple codepoints + * in \u{}, and add the codepoints to the current token. 
+ * Otherwise we're parsing a character literal and return a single + * codepoint without adding it + */ + + int codepoint; + int numlen; + + if (peek('{')) { /* handle \u{...} form */ + do { + nextc(); + codepoint = scan_hex(lex_p, 6, &numlen); + if (numlen == 0) { + yyerror("invalid Unicode escape"); + return 0; + } + if (codepoint > 0x10ffff) { + yyerror("illegal Unicode codepoint (too large)"); + return 0; + } + lex_p += numlen; + if (codepoint >= 0x80) { + *hasmb = 1; + *encp = UTF8_ENC(); + if (string_literal) tokaddmbc(codepoint, *encp); + } + else if (string_literal) { + if (codepoint == 0 && symbol_literal) { + yyerror("symbol cannot contain '\\u{0}'"); + return 0; + } + + tokadd(codepoint); + } + } while(string_literal && (peek(' ') || peek('\t'))); + + if (!peek('}')) { + yyerror("unterminated Unicode escape"); + return 0; + } + + nextc(); + } + else { /* handle \uxxxx form */ + codepoint = scan_hex(lex_p, 4, &numlen); + if (numlen < 4) { + yyerror("invalid Unicode escape"); + return 0; + } + lex_p += 4; + if (codepoint >= 0x80) { + *hasmb = 1; + *encp = UTF8_ENC(); + if (string_literal) tokaddmbc(codepoint, *encp); + } + else if (string_literal) { + if (codepoint == 0 && symbol_literal) { + yyerror("symbol cannot contain '\\u0000'"); + return 0; + } + + tokadd(codepoint); + } + } + + return codepoint; +} + +#define ESCAPE_CONTROL 1 +#define ESCAPE_META 2 + +static int +parser_read_escape(struct parser_params *parser, int flags, + int *has8bit, int *hasmb, rb_encoding **encp) +{ + int c; + int numlen; + switch (c = nextc()) { case '\\': /* Backslash */ return c; @@ -4974,6 +5126,7 @@ parser_read_escape(struct parser_params *parser, int *mb) case '0': case '1': case '2': case '3': /* octal constant */ case '4': case '5': case '6': case '7': + if (flags & (ESCAPE_CONTROL|ESCAPE_META)) goto eof; { int numlen; @@ -4981,21 +5134,19 @@ parser_read_escape(struct parser_params *parser, int *mb) c = scan_oct(lex_p, 3, &numlen); lex_p += numlen; } - if (mb && (c 
>= 0200)) *mb = ENC_CODERANGE_UNKNOWN; + if (c >= 0200) *has8bit = 1; return c; case 'x': /* hex constant */ - { - int numlen; + if (flags & (ESCAPE_CONTROL|ESCAPE_META)) goto eof; + c = tok_hex(&numlen); + if (numlen == 0) return 0; + if (c >= 0x80) *has8bit = 1; + return c; - c = scan_hex(lex_p, 2, &numlen); - if (numlen == 0) { - yyerror("Invalid escape character syntax"); - return 0; - } - lex_p += numlen; - } - if (mb && (c >= 0x80)) *mb = ENC_CODERANGE_UNKNOWN; + case 'u': /* unicode constant: here only for char literal */ + if (flags & (ESCAPE_CONTROL|ESCAPE_META)) goto eof; + c = parser_tokadd_utf8(parser, hasmb, encp, 0, 0); return c; case 'b': /* backspace */ @@ -5005,30 +5156,32 @@ parser_read_escape(struct parser_params *parser, int *mb) return ' '; case 'M': + if (flags & ESCAPE_META) goto eof; if ((c = nextc()) != '-') { - yyerror("Invalid escape character syntax"); pushback(c); - return '\0'; + goto eof; } if ((c = nextc()) == '\\') { - if (mb) *mb = ENC_CODERANGE_UNKNOWN; - return read_escape(0) | 0x80; + *has8bit = 1; + int tmp; + return read_escape(flags|ESCAPE_META, &tmp, &tmp, encp) | 0x80; } else if (c == -1) goto eof; else { - if (mb) *mb = ENC_CODERANGE_UNKNOWN; + *has8bit = 1; return ((c & 0xff) | 0x80); } case 'C': if ((c = nextc()) != '-') { - yyerror("Invalid escape character syntax"); pushback(c); - return '\0'; + goto eof; } case 'c': + if (flags & ESCAPE_CONTROL) goto eof; if ((c = nextc())== '\\') { - c = read_escape(mb); + int tmp; + c = read_escape(flags|ESCAPE_CONTROL, &tmp, &tmp, encp); } else if (c == '?') return 0177; @@ -5045,76 +5198,98 @@ parser_read_escape(struct parser_params *parser, int *mb) } } +#define tokcopy(n) memcpy(tokspace(n), lex_p - (n), (n)) + +static void +parser_tokaddmbc(struct parser_params *parser, int c, rb_encoding *enc) +{ + int len = rb_enc_codelen(c, enc); + rb_enc_mbcput(c, tokspace(len), enc); +} + static int -parser_tokadd_escape(struct parser_params *parser, int term, int *mb) 
+parser_tokadd_escape(struct parser_params *parser, int term, + int *has8bit, int *hasmb, rb_encoding **encp) { int c; + int flags = 0; + first: switch (c = nextc()) { case '\n': return 0; /* just ignore */ case '0': case '1': case '2': case '3': /* octal constant */ case '4': case '5': case '6': case '7': + if (flags & (ESCAPE_CONTROL|ESCAPE_META)) goto eof; { int numlen; int oct; - tokadd('\\'); - pushback(c); - oct = scan_oct(lex_p, 3, &numlen); - if (numlen == 0) { - yyerror("Invalid escape character syntax"); - return -1; - } - while (numlen--) - tokadd(nextc()); - if (mb && (oct >= 0200)) *mb = ENC_CODERANGE_UNKNOWN; + oct = scan_oct(--lex_p, 3, &numlen); + if (numlen == 0) goto eof; + lex_p += numlen; + tokcopy(numlen + 1); + if (oct >= 0200) *has8bit = 1; } return 0; case 'x': /* hex constant */ + if (flags & (ESCAPE_CONTROL|ESCAPE_META)) goto eof; { int numlen; int hex; - tokadd('\\'); - tokadd(c); - hex = scan_hex(lex_p, 2, &numlen); - if (numlen == 0) { - yyerror("Invalid escape character syntax"); - return -1; - } - while (numlen--) - tokadd(nextc()); - if (mb && (hex >= 0x80)) *mb = ENC_CODERANGE_UNKNOWN; + hex = tok_hex(&numlen); + if (numlen == 0) goto eof; + lex_p += numlen; + tokcopy(numlen + 2); + if (hex >= 0x80) *has8bit = ENC_CODERANGE_UNKNOWN; } return 0; - case 'M': - if ((c = nextc()) != '-') { - yyerror("Invalid escape character syntax"); - pushback(c); - return 0; +#if 0 + case 'u': /* Unicode constant */ + if (flags & (ESCAPE_CONTROL|ESCAPE_META)) goto eof; + { + int numlen; + int uc; + + uc = tok_utf8(&numlen, encp); + if (numlen == 0) goto eof; + tokaddmbc(uc, *encp); + if (uc >= 0x80) *hasmb = 1; } - tokadd('\\'); tokadd('M'); tokadd('-'); - if (mb) *mb = ENC_CODERANGE_UNKNOWN; + return 0; +#endif + + case 'M': + if (flags & ESCAPE_META) goto eof; + if ((c = nextc()) != '-') { + pushback(c); + goto eof; + } + tokcopy(3); + *has8bit = 1; + flags |= ESCAPE_META; goto escaped; case 'C': + if (flags & ESCAPE_CONTROL) goto eof; if ((c = 
nextc()) != '-') { - yyerror("Invalid escape character syntax"); pushback(c); - return 0; + goto eof; } - tokadd('\\'); tokadd('C'); tokadd('-'); + tokcopy(3); goto escaped; case 'c': - tokadd('\\'); tokadd('c'); + if (flags & ESCAPE_CONTROL) goto eof; + tokcopy(2); + flags |= ESCAPE_CONTROL; escaped: if ((c = nextc()) == '\\') { - return tokadd_escape(term, mb); + goto first; } else if (c == -1) goto eof; tokadd(c); @@ -5195,18 +5370,40 @@ static void parser_tokadd_mbchar(struct parser_params *parser, int c) { int len = parser_mbclen(); - do { - tokadd(c); - } while (--len > 0 && (c = nextc()) != -1); + tokadd(c); + lex_p += --len; + if (len > 0) tokcopy(len); } #define tokadd_mbchar(c) parser_tokadd_mbchar(parser, c) static int parser_tokadd_string(struct parser_params *parser, - int func, int term, int paren, long *nest, int *mb) + int func, int term, int paren, long *nest, + int *has8bit, int *hasmb, rb_encoding **encp) { int c; + int has_mb = 0; + rb_encoding *enc = *encp; + char *errbuf = 0; + static const char mixed_msg[] = "%s mixed within %s source"; + +#define mixed_error(enc1, enc2) if (!errbuf) { \ + int len = sizeof(mixed_msg) - 4; \ + len += strlen(rb_enc_name(enc1)); \ + len += strlen(rb_enc_name(enc2)); \ + errbuf = ALLOCA_N(char, len); \ + snprintf(errbuf, len, mixed_msg, \ + rb_enc_name(enc1), \ + rb_enc_name(enc2)); \ + yyerror(errbuf); \ + } +#define mixed_escape(beg, enc1, enc2) do { \ + const char *pos = lex_p; \ + lex_p = beg; \ + mixed_error(enc1, enc2); \ + lex_p = pos; \ + } while (0) while ((c = nextc()) != -1) { if (paren && c == paren) { @@ -5227,6 +5424,7 @@ parser_tokadd_string(struct parser_params *parser, } } else if (c == '\\') { + const char *beg = lex_p - 1; c = nextc(); switch (c) { case '\n': @@ -5239,17 +5437,43 @@ parser_tokadd_string(struct parser_params *parser, if (func & STR_FUNC_ESCAPE) tokadd(c); break; + case 'u': + if ((func & STR_FUNC_EXPAND) == 0) { + tokadd('\\'); + break; + } + parser_tokadd_utf8(parser, hasmb, 
&enc, 1, + func & STR_FUNC_SYMBOL); + if (has_mb && enc != *encp) { + mixed_escape(beg, enc, *encp); + } + continue; + default: if (func & STR_FUNC_REGEXP) { pushback(c); - if (tokadd_escape(term, mb) < 0) + if ((c = tokadd_escape(term, has8bit, hasmb, &enc)) < 0) return -1; + if (has_mb && enc != *encp) { + mixed_escape(beg, enc, *encp); + } continue; } else if (func & STR_FUNC_EXPAND) { + int tmb = 0; pushback(c); if (func & STR_FUNC_ESCAPE) tokadd('\\'); - c = read_escape(mb); + c = read_escape(0, has8bit, &tmb, &enc); + if (tmb) { + *hasmb = tmb; + if (has_mb && enc != *encp) { + mixed_escape(beg, enc, *encp); + } + else { + tokaddmbc(c, enc); + } + continue; + } } else if ((func & STR_FUNC_QWORDS) && ISSPACE(c)) { /* ignore backslashed spaces in %w */ @@ -5260,8 +5484,13 @@ parser_tokadd_string(struct parser_params *parser, } } else if (parser_ismbchar()) { + has_mb = 1; + if (enc != *encp) { + mixed_error(enc, *encp); + continue; + } tokadd_mbchar(c); - if (mb) *mb = ENC_CODERANGE_MULTI; + if (hasmb) *hasmb = 1; continue; } else if ((func & STR_FUNC_QWORDS) && ISSPACE(c)) { @@ -5275,6 +5504,7 @@ parser_tokadd_string(struct parser_params *parser, } tokadd(c); } + *encp = enc; return c; } @@ -5287,7 +5517,8 @@ parser_parse_string(struct parser_params *parser, NODE *quote) int func = quote->nd_func; int term = nd_term(quote); int paren = nd_paren(quote); - int c, space = 0, mb = ENC_CODERANGE_SINGLE; + int c, space = 0, has8bit=0, hasmb=0; + rb_encoding *enc = parser->enc; if (func == -1) return tSTRING_END; c = nextc(); @@ -5321,21 +5552,21 @@ parser_parse_string(struct parser_params *parser, NODE *quote) tokadd('#'); } pushback(c); - if (tokadd_string(func, term, paren, "e->nd_nest, &mb) == -1) { + if (tokadd_string(func, term, paren, "e->nd_nest, + &has8bit, &hasmb, &enc) == -1) { + ruby_sourceline = nd_line(quote); if (func & STR_FUNC_REGEXP) { - ruby_sourceline = nd_line(quote); compile_error(PARSER_ARG "unterminated regexp meets end of file"); return 
tREGEXP_END; } else { - ruby_sourceline = nd_line(quote); compile_error(PARSER_ARG "unterminated string meets end of file"); return tSTRING_END; } } tokfix(); - set_yylval_str(STR_NEW3(tok(), toklen(), mb)); + set_yylval_str(STR_NEW3(tok(), toklen(), enc, has8bit, hasmb)); return tSTRING_CONTENT; } @@ -5498,7 +5729,9 @@ parser_here_document(struct parser_params *parser, NODE *here) } while (!whole_match_p(eos, len, indent)); } else { - int mb = ENC_CODERANGE_SINGLE, *mbp = &mb; + /* int mb = ENC_CODERANGE_SINGLE, *mbp = &mb;*/ + int has8bit=0, hasmb=0; + rb_encoding *enc = parser->enc; newtok(); if (c == '#') { switch (c = nextc()) { @@ -5513,16 +5746,17 @@ parser_here_document(struct parser_params *parser, NODE *here) } do { pushback(c); - if ((c = tokadd_string(func, '\n', 0, NULL, mbp)) == -1) goto error; + if ((c = tokadd_string(func, '\n', 0, NULL, + &has8bit, &hasmb, &enc)) == -1) goto error; if (c != '\n') { - set_yylval_str(STR_NEW3(tok(), toklen(), mb)); + set_yylval_str(STR_NEW3(tok(), toklen(), enc, has8bit,hasmb)); return tSTRING_CONTENT; } tokadd(nextc()); - if (mbp && mb == ENC_CODERANGE_UNKNOWN) mbp = 0; + /* if (mbp && mb == ENC_CODERANGE_UNKNOWN) mbp = 0;*/ if ((c = nextc()) == -1) goto error; } while (!whole_match_p(eos, len, indent)); - str = STR_NEW3(tok(), toklen(), mb); + str = STR_NEW3(tok(), toklen(), enc, has8bit,hasmb); } heredoc_restore(lex_strterm); lex_strterm = NEW_STRTERM(-1, 0, 0); @@ -5782,6 +6016,8 @@ parser_yylex(struct parser_params *parser) int space_seen = 0; int cmd_state; enum lex_state_e last_state; + rb_encoding *enc; + int has8bit = 0, hasmb = 0; int mb; #ifdef RIPPER int fallthru = Qfalse; @@ -6123,25 +6359,28 @@ parser_yylex(struct parser_params *parser) return '?'; } newtok(); + enc = parser->enc; if (parser_ismbchar()) { - mb = ENC_CODERANGE_MULTI; + hasmb = 1; tokadd_mbchar(c); } else if ((rb_enc_isalnum(c, parser->enc) || c == '_') && lex_p < lex_pend && is_identchar(lex_p, lex_pend, parser->enc)) { goto ternary; } - 
else if (c == '\\' && (c = read_escape(0)) >= 0x80) { - rb_encoding *enc = parser->enc; - mb = ENC_CODERANGE_UNKNOWN; - rb_enc_mbcput(c, tokspace(rb_enc_codelen(c, enc)), enc); + else if (c == '\\' && (c = read_escape(0, &has8bit, &hasmb, &enc)) >= 0x80) { + if (hasmb) { + tokaddmbc(c, enc); + } + else { + tokadd(c); + } } else { - mb = ENC_CODERANGE_SINGLE; tokadd(c); } tokfix(); - set_yylval_str(STR_NEW3(tok(), toklen(), mb)); + set_yylval_str(STR_NEW3(tok(), toklen(), enc, has8bit, hasmb)); lex_state = EXPR_ENDARG; return tCHAR; @@ -7211,6 +7450,17 @@ list_concat_gen(struct parser_params *parser, NODE *head, NODE *tail) return head; } +static void +literal_concat0(struct parser_params *parser, VALUE head, VALUE tail) +{ + if (!rb_enc_compatible(head, tail)) { + compile_error(PARSER_ARG "string literal encodings differ (%s / %s)", + rb_enc_name(rb_enc_get(head)), + rb_enc_name(rb_enc_get(tail))); + } + rb_str_buf_append(head, tail); +} + /* concat two string literals */ static NODE * literal_concat_gen(struct parser_params *parser, NODE *head, NODE *tail) @@ -7228,7 +7478,7 @@ literal_concat_gen(struct parser_params *parser, NODE *head, NODE *tail) switch (nd_type(tail)) { case NODE_STR: if (htype == NODE_STR) { - rb_str_concat(head->nd_lit, tail->nd_lit); + literal_concat0(parser, head->nd_lit, tail->nd_lit); rb_gc_force_recycle((VALUE)tail); } else { @@ -7238,7 +7488,7 @@ literal_concat_gen(struct parser_params *parser, NODE *head, NODE *tail) case NODE_DSTR: if (htype == NODE_STR) { - rb_str_concat(head->nd_lit, tail->nd_lit); + literal_concat0(parser, head->nd_lit, tail->nd_lit); tail->nd_lit = head->nd_lit; rb_gc_force_recycle((VALUE)head); head = tail; diff --git a/test/ruby/test_mixed_unicode_escapes.rb b/test/ruby/test_mixed_unicode_escapes.rb new file mode 100644 index 0000000000..e80e6fb14e --- /dev/null +++ b/test/ruby/test_mixed_unicode_escapes.rb @@ -0,0 +1,25 @@ +# -*- coding: sjis -*- +# This test is in a different file than TestUnicodeEscapes +# So 
that we can have a different coding comment above + +require 'test/unit' + +class TestMixedUnicodeEscape < Test::Unit::TestCase + def test_basic + # Unicode escapes do work in an sjis encoded file, but only + # if they don't contain other multi-byte chars + assert_equal("A", "\u0041") + # 8-bit character escapes are okay. + assert_equal("B\xFF", "\u0042\xFF") + + # sjis mb chars mixed with Unicode should not work + assert_raise(SyntaxError) { eval %q("\u1234")} + assert_raise(SyntaxError) { eval %q("\u{1234}")} + + # String interpolation turns into an expression and we get + # a different kind of error, but we still can't mix these + assert_raise(ArgumentError) { eval %q("\u{1234}#{nil}")} + assert_raise(ArgumentError) { eval %q("#{nil}\u1234")} + + end +end diff --git a/test/ruby/test_unicode_escape.rb b/test/ruby/test_unicode_escape.rb new file mode 100644 index 0000000000..46413cdcdb --- /dev/null +++ b/test/ruby/test_unicode_escape.rb @@ -0,0 +1,240 @@ +# -*- coding: utf-8 -*- + +require 'test/unit' + +class TestUnicodeEscape < Test::Unit::TestCase + def test_basic + assert_equal('Matz - 松本行弘', + "Matz - \u677E\u672C\u884C\u5F18") + assert_equal('Matz - まつもと ゆきひろ', + "Matz - \u307E\u3064\u3082\u3068 \u3086\u304D\u3072\u308D") + assert_equal('Matz - まつもと ゆきひろ', + "Matz - \u{307E}\u{3064}\u{3082}\u{3068} \u{3086}\u{304D}\u{3072}\u{308D}") + assert_equal('Matz - まつもと ゆきひろ', + "Matz - \u{307E 3064 3082 3068 20 3086 304D 3072 308D}") + assert_equal("Aoyama Gakuin University - \xE9\x9D\x92\xE5\xB1\xB1\xE5\xAD\xA6\xE9\x99\xA2\xE5\xA4\xA7\xE5\xAD\xA6", + "Aoyama Gakuin University - \u9752\u5C71\u5B66\u9662\u5927\u5B66") + assert_equal('Aoyama Gakuin University - 青山学院大学', + "Aoyama Gakuin University - \u9752\u5C71\u5B66\u9662\u5927\u5B66") + assert_equal('青山学院大学', "\u9752\u5C71\u5B66\u9662\u5927\u5B66") + assert_equal("Martin D\xC3\xBCrst", "Martin D\u00FCrst") + assert_equal('Martin Dürst', "Martin D\u00FCrst") + assert_equal('ü', "\u00FC") + assert_equal("Martin 
D\xC3\xBCrst", "Martin D\u{FC}rst") + assert_equal('Martin Dürst', "Martin D\u{FC}rst") + assert_equal('ü', "\u{FC}") + assert_equal('ü', %Q|\u{FC}|) + assert_equal('ü', %W{\u{FC}}[0]) + + # \u escapes in here documents + assert_equal('Matz - まつもと ゆきひろ', <, 3) + end + + def test_syntax_variants + # all hex digits + assert_equal("\xC4\xA3\xE4\x95\xA7\xE8\xA6\xAB\xEC\xB7\xAF", "\u0123\u4567\u89AB\uCDEF") + assert_equal("\xC4\xA3\xE4\x95\xA7\xE8\xA6\xAB\xEC\xB7\xAF", "\u0123\u4567\u89AB\uCDEF") + assert_equal("\xC4\xA3\xE4\x95\xA7\xE8\xA6\xAB\xEC\xB7\xAF", "\u0123\u4567\u89ab\ucdef") + assert_equal("\xC4\xA3\xE4\x95\xA7\xE8\xA6\xAB\xEC\xB7\xAF", "\u0123\u4567\u89ab\ucdef") + assert_equal("\xC4\xA3\xE4\x95\xA7\xE8\xA6\xAB\xEC\xB7\xAF", "\u0123\u4567\u89aB\uCdEf") + assert_equal("\xC4\xA3\xE4\x95\xA7\xE8\xA6\xAB\xEC\xB7\xAF", "\u0123\u4567\u89aB\ucDEF") + end + + def test_fulton + # examples from Hal Fulton's book (second edition), chapter 4 + # precomposed e'pe'e + assert_equal('épée', "\u00E9\u0070\u00E9\u0065") + assert_equal('épée', "\u00E9p\u00E9e") + assert_equal("\xC3\xA9\x70\xC3\xA9\x65", "\u00E9\u0070\u00E9\u0065") + assert_equal("\xC3\xA9\x70\xC3\xA9\x65", "\u00E9p\u00E9e") + # decomposed e'pe'e + assert_equal('épée', "\u0065\u0301\u0070\u0065\u0301\u0065") + assert_equal('épée', "e\u0301pe\u0301e") + assert_equal("\x65\xCC\x81\x70\x65\xCC\x81\x65", "\u0065\u0301\u0070\u0065\u0301\u0065") + assert_equal("\x65\xCC\x81\x70\x65\xCC\x81\x65", "e\u0301pe\u0301e") + # combinations of NFC/D, NFKC/D + assert_equal('öffnen', "\u00F6\u0066\u0066\u006E\u0065\u006E") + assert_equal("\xC3\xB6ffnen", "\u00F6\u0066\u0066\u006E\u0065\u006E") + assert_equal('öffnen', "\u00F6ffnen") + assert_equal("\xC3\xB6ffnen", "\u00F6ffnen") + assert_equal('öffnen', "\u006F\u0308\u0066\u0066\u006E\u0065\u006E") + assert_equal("\x6F\xCC\x88ffnen", "\u006F\u0308\u0066\u0066\u006E\u0065\u006E") + assert_equal('öffnen', "o\u0308ffnen") + assert_equal("\x6F\xCC\x88ffnen", "o\u0308ffnen") + 
assert_equal('öffnen', "\u00F6\uFB00\u006E\u0065\u006E") + assert_equal("\xC3\xB6\xEF\xAC\x80nen", "\u00F6\uFB00\u006E\u0065\u006E") + assert_equal('öffnen', "\u00F6\uFB00nen") + assert_equal("\xC3\xB6\xEF\xAC\x80nen", "\u00F6\uFB00nen") + assert_equal('öffnen', "\u006F\u0308\uFB00\u006E\u0065\u006E") + assert_equal("\x6F\xCC\x88\xEF\xAC\x80nen", "\u006F\u0308\uFB00\u006E\u0065\u006E") + assert_equal('öffnen', "o\u0308\uFB00nen") + assert_equal("\x6F\xCC\x88\xEF\xAC\x80nen", "o\u0308\uFB00nen") + # German sharp s (sz) + assert_equal('Straße', "\u0053\u0074\u0072\u0061\u00DF\u0065") + assert_equal("\x53\x74\x72\x61\xC3\x9F\x65", "\u0053\u0074\u0072\u0061\u00DF\u0065") + assert_equal('Straße', "Stra\u00DFe") + assert_equal("\x53\x74\x72\x61\xC3\x9F\x65", "Stra\u00DFe") + assert_equal('Straße', "\u{53}\u{74}\u{72}\u{61}\u{DF}\u{65}") + assert_equal("\x53\x74\x72\x61\xC3\x9F\x65", "\u{53}\u{74}\u{72}\u{61}\u{DF}\u{65}") + assert_equal("\x53\x74\x72\x61\xC3\x9F\x65", "\u{53 74 72 61 DF 65}") + assert_equal('Straße', "Stra\u{DF}e") + assert_equal("\x53\x74\x72\x61\xC3\x9F\x65", "Stra\u{DF}e") + end + + def test_edge_cases + # start and end of each outer plane + assert_equal("\xF4\x8F\xBF\xBF", "\u{10FFFF}") + assert_equal("\xF4\x80\x80\x80", "\u{100000}") + assert_equal("\xF3\xBF\xBF\xBF", "\u{FFFFF}") + assert_equal("\xF3\xB0\x80\x80", "\u{F0000}") + assert_equal("\xF3\xAF\xBF\xBF", "\u{EFFFF}") + assert_equal("\xF3\xA0\x80\x80", "\u{E0000}") + assert_equal("\xF3\x9F\xBF\xBF", "\u{DFFFF}") + assert_equal("\xF3\x90\x80\x80", "\u{D0000}") + assert_equal("\xF3\x8F\xBF\xBF", "\u{CFFFF}") + assert_equal("\xF3\x80\x80\x80", "\u{C0000}") + assert_equal("\xF2\xBF\xBF\xBF", "\u{BFFFF}") + assert_equal("\xF2\xB0\x80\x80", "\u{B0000}") + assert_equal("\xF2\xAF\xBF\xBF", "\u{AFFFF}") + assert_equal("\xF2\xA0\x80\x80", "\u{A0000}") + assert_equal("\xF2\x9F\xBF\xBF", "\u{9FFFF}") + assert_equal("\xF2\x90\x80\x80", "\u{90000}") + assert_equal("\xF2\x8F\xBF\xBF", "\u{8FFFF}") + 
assert_equal("\xF2\x80\x80\x80", "\u{80000}") + assert_equal("\xF1\xBF\xBF\xBF", "\u{7FFFF}") + assert_equal("\xF1\xB0\x80\x80", "\u{70000}") + assert_equal("\xF1\xAF\xBF\xBF", "\u{6FFFF}") + assert_equal("\xF1\xA0\x80\x80", "\u{60000}") + assert_equal("\xF1\x9F\xBF\xBF", "\u{5FFFF}") + assert_equal("\xF1\x90\x80\x80", "\u{50000}") + assert_equal("\xF1\x8F\xBF\xBF", "\u{4FFFF}") + assert_equal("\xF1\x80\x80\x80", "\u{40000}") + assert_equal("\xF0\xBF\xBF\xBF", "\u{3FFFF}") + assert_equal("\xF0\xB0\x80\x80", "\u{30000}") + assert_equal("\xF0\xAF\xBF\xBF", "\u{2FFFF}") + assert_equal("\xF0\xA0\x80\x80", "\u{20000}") + assert_equal("\xF0\x9F\xBF\xBF", "\u{1FFFF}") + assert_equal("\xF0\x90\x80\x80", "\u{10000}") + # BMP + assert_equal("\xEF\xBF\xBF", "\uFFFF") + assert_equal("\xEE\x80\x80", "\uE000") + assert_equal("\xED\x9F\xBF", "\uD7FF") + assert_equal("\xE0\xA0\x80", "\u0800") + assert_equal("\xDF\xBF", "\u07FF") + assert_equal("\xC2\x80", "\u0080") + assert_equal("\x7F", "\u007F") + assert_equal("\x00", "\u0000") + end + + def test_chars + assert_equal(?\u0041, ?A) + assert_equal(?\u{79}, ?\x79) + assert_equal(?\u{0}, ?\000) + assert_equal(?\u0000, ?\000) + end + + # Tests to make sure that disallowed cases fail + def test_fail + assert_raise(SyntaxError) { eval %q("\uabc") } # too short + assert_raise(SyntaxError) { eval %q("\uab") } # too short + assert_raise(SyntaxError) { eval %q("\ua") } # too short + assert_raise(SyntaxError) { eval %q("\u") } # too short + assert_raise(SyntaxError) { eval %q("\u{110000}") } # too high + assert_raise(SyntaxError) { eval %q("\u{abcdeff}") } # too long + assert_raise(SyntaxError) { eval %q("\ughij") } # bad hex digits + assert_raise(SyntaxError) { eval %q("\u{ghij}") } # bad hex digits + + assert_raise(SyntaxError) { eval %q("\u{123 456 }")} # extra space + assert_raise(SyntaxError) { eval %q("\u{ 123 456}")} # extra space + assert_raise(SyntaxError) { eval %q("\u{123 456}")} # extra space + +# The utf-8 encoding object 
currently does not object to codepoints +# in the surrogate blocks, so these do not raise an error. +# assert_raise(SyntaxError) { "\uD800" } # surrogate block +# assert_raise(SyntaxError) { "\uDCBA" } # surrogate block +# assert_raise(SyntaxError) { "\uDFFF" } # surrogate block +# assert_raise(SyntaxError) { "\uD847\uDD9A" } # surrogate pair + + end +end