1
0
Fork 0
mirror of https://github.com/ruby/ruby.git synced 2022-11-09 12:17:21 -05:00

* include/ruby/intern.h (rb_uv_to_utf8): declared.

* re.c (rb_reg_preprocess): new function for dynamic regexp with
  \u{} such as Regexp.new("\\u{6666}").
  (rb_reg_prepare_re): preprocess regexp for recompiling.
  (read_escaped_byte): new function.
  (unescape_escaped_nonascii): new function.
  (append_utf8): new function.
  (unescape_unicode_list): new function.
  (unescape_unicode_bmp): new function.
  (unescape_nonascii): new function.
  (rb_reg_initialize): preprocess regexp.

* pack.c (rb_uv_to_utf8): renamed from uv_to_utf8.

* parse.y (STR_NEW3): take func instead of has8 and hasmb.
  (parser_str_new): use default coderange mechanism except for regexp.
  (parser_tokadd_utf8): copy regexp source as-is.
  (parser_read_escape): UTF-8 stuff removed.
  (parser_tokadd_escape): has8bit and hasmb removed.
  (parser_tokadd_string): fix 8-bit single byte character with \u.
  (parser_parse_string): has8bit and hasmb removed.
  (parser_here_document): has8bit and hasmb removed.
  (parser_yylex): call parser_tokadd_utf8 instead of read_escape for
  UTF-8 character.


git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@14072 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
This commit is contained in:
akr 2007-12-01 16:56:19 +00:00
parent d92b461dd9
commit 7ff702406a
7 changed files with 637 additions and 176 deletions

View file

@ -1,3 +1,31 @@
Sun Dec 2 01:39:51 2007 Tanaka Akira <akr@fsij.org>
* include/ruby/intern.h (rb_uv_to_utf8): declared.
* re.c (rb_reg_preprocess): new function for dynamic regexp with
\u{} such as Regexp.new("\\u{6666}").
(rb_reg_prepare_re): preprocess regexp for recompiling.
(read_escaped_byte): new function.
(unescape_escaped_nonascii): new function.
(append_utf8): new function.
(unescape_unicode_list): new function.
(unescape_unicode_bmp): new function.
(unescape_nonascii): new function.
(rb_reg_initialize): preprocess regexp.
* pack.c (rb_uv_to_utf8): renamed from uv_to_utf8.
* parse.y (STR_NEW3): take func instead of has8 and hasmb.
(parser_str_new): use default coderange mechanism except for regexp.
(parser_tokadd_utf8): copy regexp source as-is.
(parser_read_escape): UTF-8 stuff removed.
(parser_tokadd_escape): has8bit and hasmb removed.
(parser_tokadd_string): fix 8-bit single byte character with \u.
(parser_parse_string): has8bit and hasmb removed.
(parser_here_document): has8bit and hasmb removed.
(parser_yylex): call parser_tokadd_utf8 instead of read_escape for
UTF-8 character.
Wed Dec 2 01:00:07 2007 James Edward Gray II <jeg2@ruby-lang.org>
* lib/xmlrpc/server.rb (XMLRPC::Server#server): Improve signal handling so

View file

@ -101,6 +101,7 @@ unsigned LONG_LONG rb_big2ull(VALUE);
#endif /* HAVE_LONG_LONG */
void rb_quad_pack(char*,VALUE);
VALUE rb_quad_unpack(const char*,int);
int rb_uv_to_utf8(char[6],unsigned long);
VALUE rb_dbl2big(double);
double rb_big2dbl(VALUE);
VALUE rb_big_cmp(VALUE, VALUE);

7
pack.c
View file

@ -365,7 +365,6 @@ static const char toofew[] = "too few arguments";
static void encodes(VALUE,const char*,long,int);
static void qpencode(VALUE,VALUE,long);
static int uv_to_utf8(char*,unsigned long);
static unsigned long utf8_to_uv(const char*,long*);
/*
@ -872,7 +871,7 @@ pack_pack(VALUE ary, VALUE fmt)
if (l < 0) {
rb_raise(rb_eRangeError, "pack(U): value out of range");
}
le = uv_to_utf8(buf, l);
le = rb_uv_to_utf8(buf, l);
rb_str_buf_cat(res, (char*)buf, le);
}
break;
@ -1991,8 +1990,8 @@ pack_unpack(VALUE str, VALUE fmt)
#define BYTEWIDTH 8
static int
uv_to_utf8(char *buf, unsigned long uv)
int
rb_uv_to_utf8(char buf[6], unsigned long uv)
{
if (uv <= 0x7f) {
buf[0] = (char)uv;

213
parse.y
View file

@ -269,7 +269,7 @@ struct parser_params {
#define STR_NEW(p,n) rb_enc_str_new((p),(n),parser->enc)
#define STR_NEW0() rb_str_new(0,0)
#define STR_NEW2(p) rb_enc_str_new((p),strlen(p),parser->enc)
#define STR_NEW3(p,n,e,has8,hasmb) parser_str_new2((p),(n),(e),(has8),(hasmb))
#define STR_NEW3(p,n,e,func) parser_str_new((p),(n),(e),(func))
#define STR_ENC(m) ((m)?parser->enc:rb_enc_from_index(0))
#define ENC_SINGLE(cr) ((cr)==ENC_CODERANGE_7BIT)
#define TOK_INTERN(mb) rb_intern3(tok(), toklen(), STR_ENC(mb))
@ -4488,7 +4488,7 @@ none : /* none */
# define yylval (*((YYSTYPE*)(parser->parser_yylval)))
static int parser_regx_options(struct parser_params*);
static int parser_tokadd_string(struct parser_params*,int,int,int,long*,int*,int*,rb_encoding**);
static int parser_tokadd_string(struct parser_params*,int,int,int,long*,rb_encoding**);
static void parser_tokaddmbc(struct parser_params *parser, int c, rb_encoding *enc);
static int parser_parse_string(struct parser_params*,NODE*);
static int parser_here_document(struct parser_params*,NODE*);
@ -4500,11 +4500,10 @@ static int parser_here_document(struct parser_params*,NODE*);
# define tokspace(n) parser_tokspace(parser, n)
# define tokadd(c) parser_tokadd(parser, c)
# define tok_hex(numlen) parser_tok_hex(parser, numlen)
# define tok_utf8(numlen,e) parser_tok_utf8(parser, numlen, e)
# define read_escape(flags,has8,hasmb,e) parser_read_escape(parser, flags, has8, hasmb, e)
# define tokadd_escape(t,has8,hasmb,e) parser_tokadd_escape(parser, t, has8,hasmb, e)
# define read_escape(flags,e) parser_read_escape(parser, flags, e)
# define tokadd_escape(t,e) parser_tokadd_escape(parser, t, e)
# define regx_options() parser_regx_options(parser)
# define tokadd_string(f,t,p,n,has8bit,hasmb,e) parser_tokadd_string(parser,f,t,p,n,has8bit,hasmb,e)
# define tokadd_string(f,t,p,n,e) parser_tokadd_string(parser,f,t,p,n,e)
# define parse_string(n) parser_parse_string(parser,n)
# define tokaddmbc(c, enc) parser_tokaddmbc(parser, c, enc)
# define here_document(n) parser_here_document(parser,n)
@ -4821,37 +4820,39 @@ rb_parser_compile_file(volatile VALUE vparser, const char *f, VALUE file, int st
}
#endif /* !RIPPER */
#define STR_FUNC_ESCAPE 0x01
#define STR_FUNC_EXPAND 0x02
#define STR_FUNC_REGEXP 0x04
#define STR_FUNC_QWORDS 0x08
#define STR_FUNC_SYMBOL 0x10
#define STR_FUNC_INDENT 0x20
enum string_type {
str_squote = (0),
str_dquote = (STR_FUNC_EXPAND),
str_xquote = (STR_FUNC_EXPAND),
str_regexp = (STR_FUNC_REGEXP|STR_FUNC_ESCAPE|STR_FUNC_EXPAND),
str_sword = (STR_FUNC_QWORDS),
str_dword = (STR_FUNC_QWORDS|STR_FUNC_EXPAND),
str_ssym = (STR_FUNC_SYMBOL),
str_dsym = (STR_FUNC_SYMBOL|STR_FUNC_EXPAND),
};
static VALUE
parser_str_new(const char *p, long n, rb_encoding *enc, int coderange)
parser_str_new(const char *p, long n, rb_encoding *enc, int func)
{
VALUE str = rb_enc_str_new(p, n, enc);
ENC_CODERANGE_SET(str, coderange);
VALUE str;
str = rb_enc_str_new(p, n, enc);
if (!(func & STR_FUNC_REGEXP) &&
rb_enc_asciicompat(enc) &&
rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT) {
rb_enc_associate(str, rb_default_encoding());
}
return str;
}
static VALUE
parser_str_new2(const char *p, long n, rb_encoding *enc, int has8bit,int hasmb)
{
/*
* Set coderange bit flags based on the presence of 8-bit and
* multi-byte characters in the string
*/
int coderange = ENC_CODERANGE_7BIT;
if (hasmb) coderange = ENC_CODERANGE_8BIT;
else if (has8bit) coderange = ENC_CODERANGE_UNKNOWN;
/*
* If it is all single byte characters with the 8th bit clear,
* and if the specified encoding is ASCII-compatible, then this
* string is in the ASCII subset, and we just use the ASCII encoding
* instead.
*/
if ((coderange == ENC_CODERANGE_7BIT) && rb_enc_asciicompat(enc))
enc = rb_default_encoding();
return parser_str_new(p, n, enc, coderange);
}
static inline int
parser_nextc(struct parser_params *parser)
{
@ -4979,9 +4980,11 @@ parser_tok_hex(struct parser_params *parser, int *numlen)
return c;
}
#define tokcopy(n) memcpy(tokspace(n), lex_p - (n), (n))
static int
parser_tokadd_utf8(struct parser_params *parser, int *hasmb,
rb_encoding **encp, int string_literal, int symbol_literal)
parser_tokadd_utf8(struct parser_params *parser, rb_encoding **encp,
int string_literal, int symbol_literal, int regexp_literal)
{
/*
* If string_literal is true, then we allow multiple codepoints
@ -4993,8 +4996,11 @@ parser_tokadd_utf8(struct parser_params *parser, int *hasmb,
int codepoint;
int numlen;
if (regexp_literal) { tokadd('\\'); tokadd('u'); }
if (peek('{')) { /* handle \u{...} form */
do {
if (regexp_literal) { tokadd(*lex_p); }
nextc();
codepoint = scan_hex(lex_p, 6, &numlen);
if (numlen == 0) {
@ -5006,8 +5012,10 @@ parser_tokadd_utf8(struct parser_params *parser, int *hasmb,
return 0;
}
lex_p += numlen;
if (codepoint >= 0x80) {
*hasmb = 1;
if (regexp_literal) {
tokcopy(numlen);
}
else if (codepoint >= 0x80) {
*encp = UTF8_ENC();
if (string_literal) tokaddmbc(codepoint, *encp);
}
@ -5026,6 +5034,7 @@ parser_tokadd_utf8(struct parser_params *parser, int *hasmb,
return 0;
}
if (regexp_literal) { tokadd('}'); }
nextc();
}
else { /* handle \uxxxx form */
@ -5035,8 +5044,10 @@ parser_tokadd_utf8(struct parser_params *parser, int *hasmb,
return 0;
}
lex_p += 4;
if (codepoint >= 0x80) {
*hasmb = 1;
if (regexp_literal) {
tokcopy(4);
}
else if (codepoint >= 0x80) {
*encp = UTF8_ENC();
if (string_literal) tokaddmbc(codepoint, *encp);
}
@ -5058,7 +5069,7 @@ parser_tokadd_utf8(struct parser_params *parser, int *hasmb,
static int
parser_read_escape(struct parser_params *parser, int flags,
int *has8bit, int *hasmb, rb_encoding **encp)
rb_encoding **encp)
{
int c;
int numlen;
@ -5098,19 +5109,12 @@ parser_read_escape(struct parser_params *parser, int flags,
c = scan_oct(lex_p, 3, &numlen);
lex_p += numlen;
}
if (c >= 0200) *has8bit = 1;
return c;
case 'x': /* hex constant */
if (flags & (ESCAPE_CONTROL|ESCAPE_META)) goto eof;
c = tok_hex(&numlen);
if (numlen == 0) return 0;
if (c >= 0x80) *has8bit = 1;
return c;
case 'u': /* unicode constant: here only for char literal */
if (flags & (ESCAPE_CONTROL|ESCAPE_META)) goto eof;
c = parser_tokadd_utf8(parser, hasmb, encp, 0, 0);
return c;
case 'b': /* backspace */
@ -5126,13 +5130,10 @@ parser_read_escape(struct parser_params *parser, int flags,
goto eof;
}
if ((c = nextc()) == '\\') {
int tmp;
*has8bit = 1;
return read_escape(flags|ESCAPE_META, &tmp, &tmp, encp) | 0x80;
return read_escape(flags|ESCAPE_META, encp) | 0x80;
}
else if (c == -1 || !ISASCII(c)) goto eof;
else {
*has8bit = 1;
return ((c & 0xff) | 0x80);
}
@ -5144,8 +5145,7 @@ parser_read_escape(struct parser_params *parser, int flags,
case 'c':
if (flags & ESCAPE_CONTROL) goto eof;
if ((c = nextc())== '\\') {
int tmp;
c = read_escape(flags|ESCAPE_CONTROL, has8bit, &tmp, encp);
c = read_escape(flags|ESCAPE_CONTROL, encp);
}
else if (c == '?')
return 0177;
@ -5162,8 +5162,6 @@ parser_read_escape(struct parser_params *parser, int flags,
}
}
#define tokcopy(n) memcpy(tokspace(n), lex_p - (n), (n))
static void
parser_tokaddmbc(struct parser_params *parser, int c, rb_encoding *enc)
{
@ -5173,7 +5171,7 @@ parser_tokaddmbc(struct parser_params *parser, int c, rb_encoding *enc)
static int
parser_tokadd_escape(struct parser_params *parser, int term,
int *has8bit, int *hasmb, rb_encoding **encp)
rb_encoding **encp)
{
int c;
int flags = 0;
@ -5194,7 +5192,6 @@ parser_tokadd_escape(struct parser_params *parser, int term,
if (numlen == 0) goto eof;
lex_p += numlen;
tokcopy(numlen + 1);
if (oct >= 0200) *has8bit = 1;
}
return 0;
@ -5207,7 +5204,6 @@ parser_tokadd_escape(struct parser_params *parser, int term,
hex = tok_hex(&numlen);
if (numlen == 0) goto eof;
tokcopy(numlen + 2);
if (hex >= 0x80) *has8bit = 1;
}
return 0;
@ -5218,7 +5214,6 @@ parser_tokadd_escape(struct parser_params *parser, int term,
goto eof;
}
tokcopy(3);
*has8bit = 1;
flags |= ESCAPE_META;
goto escaped;
@ -5287,24 +5282,6 @@ parser_regx_options(struct parser_params *parser)
return options | RE_OPTION_ENCODING(kcode);
}
#define STR_FUNC_ESCAPE 0x01
#define STR_FUNC_EXPAND 0x02
#define STR_FUNC_REGEXP 0x04
#define STR_FUNC_QWORDS 0x08
#define STR_FUNC_SYMBOL 0x10
#define STR_FUNC_INDENT 0x20
enum string_type {
str_squote = (0),
str_dquote = (STR_FUNC_EXPAND),
str_xquote = (STR_FUNC_EXPAND),
str_regexp = (STR_FUNC_REGEXP|STR_FUNC_ESCAPE|STR_FUNC_EXPAND),
str_sword = (STR_FUNC_QWORDS),
str_dword = (STR_FUNC_QWORDS|STR_FUNC_EXPAND),
str_ssym = (STR_FUNC_SYMBOL),
str_dsym = (STR_FUNC_SYMBOL|STR_FUNC_EXPAND),
};
static void
dispose_string(VALUE str)
{
@ -5328,10 +5305,10 @@ parser_tokadd_mbchar(struct parser_params *parser, int c)
static int
parser_tokadd_string(struct parser_params *parser,
int func, int term, int paren, long *nest,
int *has8bit, int *hasmb, rb_encoding **encp)
rb_encoding **encp)
{
int c;
int has_mb = 0;
int has_nonascii = 0;
rb_encoding *enc = *encp;
char *errbuf = 0;
static const char mixed_msg[] = "%s mixed within %s source";
@ -5390,9 +5367,10 @@ parser_tokadd_string(struct parser_params *parser,
tokadd('\\');
break;
}
parser_tokadd_utf8(parser, hasmb, &enc, 1,
func & STR_FUNC_SYMBOL);
if (has_mb && enc != *encp) {
parser_tokadd_utf8(parser, &enc, 1,
func & STR_FUNC_SYMBOL,
func & STR_FUNC_REGEXP);
if (has_nonascii && enc != *encp) {
mixed_escape(beg, enc, *encp);
}
continue;
@ -5400,28 +5378,17 @@ parser_tokadd_string(struct parser_params *parser,
default:
if (func & STR_FUNC_REGEXP) {
pushback(c);
if ((c = tokadd_escape(term, has8bit, hasmb, &enc)) < 0)
if ((c = tokadd_escape(term, &enc)) < 0)
return -1;
if (has_mb && enc != *encp) {
if (has_nonascii && enc != *encp) {
mixed_escape(beg, enc, *encp);
}
continue;
}
else if (func & STR_FUNC_EXPAND) {
int tmb = 0;
pushback(c);
if (func & STR_FUNC_ESCAPE) tokadd('\\');
c = read_escape(0, has8bit, &tmb, &enc);
if (tmb) {
*hasmb = tmb;
if (has_mb && enc != *encp) {
mixed_escape(beg, enc, *encp);
}
else {
tokaddmbc(c, enc);
}
continue;
}
c = read_escape(0, &enc);
}
else if ((func & STR_FUNC_QWORDS) && ISSPACE(c)) {
/* ignore backslashed spaces in %w */
@ -5432,13 +5399,12 @@ parser_tokadd_string(struct parser_params *parser,
}
}
else if (parser_ismbchar()) {
has_mb = 1;
has_nonascii = 1;
if (enc != *encp) {
mixed_error(enc, *encp);
continue;
}
tokadd_mbchar(c);
if (hasmb) *hasmb = 1;
continue;
}
else if ((func & STR_FUNC_QWORDS) && ISSPACE(c)) {
@ -5450,6 +5416,13 @@ parser_tokadd_string(struct parser_params *parser,
compile_error(PARSER_ARG "symbol cannot contain '\\0'");
continue;
}
if (c & 0x80) {
has_nonascii = 1;
if (enc != *encp) {
mixed_error(enc, *encp);
continue;
}
}
tokadd(c);
}
*encp = enc;
@ -5465,7 +5438,7 @@ parser_parse_string(struct parser_params *parser, NODE *quote)
int func = quote->nd_func;
int term = nd_term(quote);
int paren = nd_paren(quote);
int c, space = 0, has8bit=0, hasmb=0;
int c, space = 0;
rb_encoding *enc = parser->enc;
if (func == -1) return tSTRING_END;
@ -5501,7 +5474,7 @@ parser_parse_string(struct parser_params *parser, NODE *quote)
}
pushback(c);
if (tokadd_string(func, term, paren, &quote->nd_nest,
&has8bit, &hasmb, &enc) == -1) {
&enc) == -1) {
ruby_sourceline = nd_line(quote);
if (func & STR_FUNC_REGEXP) {
compile_error(PARSER_ARG "unterminated regexp meets end of file");
@ -5514,7 +5487,7 @@ parser_parse_string(struct parser_params *parser, NODE *quote)
}
tokfix();
set_yylval_str(STR_NEW3(tok(), toklen(), enc, has8bit, hasmb));
set_yylval_str(STR_NEW3(tok(), toklen(), enc, func));
return tSTRING_CONTENT;
}
@ -5678,7 +5651,6 @@ parser_here_document(struct parser_params *parser, NODE *here)
}
else {
/* int mb = ENC_CODERANGE_7BIT, *mbp = &mb;*/
int has8bit=0, hasmb=0;
rb_encoding *enc = parser->enc;
newtok();
if (c == '#') {
@ -5695,16 +5667,16 @@ parser_here_document(struct parser_params *parser, NODE *here)
do {
pushback(c);
if ((c = tokadd_string(func, '\n', 0, NULL,
&has8bit, &hasmb, &enc)) == -1) goto error;
&enc)) == -1) goto error;
if (c != '\n') {
set_yylval_str(STR_NEW3(tok(), toklen(), enc, has8bit,hasmb));
set_yylval_str(STR_NEW3(tok(), toklen(), enc, func));
return tSTRING_CONTENT;
}
tokadd(nextc());
/* if (mbp && mb == ENC_CODERANGE_UNKNOWN) mbp = 0;*/
if ((c = nextc()) == -1) goto error;
} while (!whole_match_p(eos, len, indent));
str = STR_NEW3(tok(), toklen(), enc, has8bit,hasmb);
str = STR_NEW3(tok(), toklen(), enc, func);
}
heredoc_restore(lex_strterm);
lex_strterm = NEW_STRTERM(-1, 0, 0);
@ -5966,7 +5938,6 @@ parser_yylex(struct parser_params *parser)
int cmd_state;
enum lex_state_e last_state;
rb_encoding *enc;
int has8bit = 0, hasmb = 0;
int mb;
#ifdef RIPPER
int fallthru = Qfalse;
@ -6317,26 +6288,33 @@ parser_yylex(struct parser_params *parser)
newtok();
enc = parser->enc;
if (parser_ismbchar()) {
hasmb = 1;
tokadd_mbchar(c);
}
else if ((rb_enc_isalnum(c, parser->enc) || c == '_') &&
lex_p < lex_pend && is_identchar(lex_p, lex_pend, parser->enc)) {
goto ternary;
}
else if (c == '\\' && (c = read_escape(0, &has8bit, &hasmb, &enc)) >= 0x80) {
if (hasmb) {
tokaddmbc(c, enc);
}
else {
tokadd(c);
}
}
else {
else if (c == '\\') {
if (peek('u')) {
nextc();
c = parser_tokadd_utf8(parser, &enc, 0, 0, 0);
if (0x80 <= c) {
tokaddmbc(c, enc);
}
else {
tokadd(c);
}
}
else {
c = read_escape(0, &enc);
tokadd(c);
}
}
else {
tokadd(c);
}
}
tokfix();
set_yylval_str(STR_NEW3(tok(), toklen(), enc, has8bit, hasmb));
set_yylval_str(STR_NEW3(tok(), toklen(), enc, 0));
lex_state = EXPR_ENDARG;
return tCHAR;
@ -8481,7 +8459,6 @@ reg_compile_gen(struct parser_params* parser, VALUE str, int options)
compile_error(PARSER_ARG "%s", RSTRING_PTR(re));
return Qnil;
}
if (str) rb_enc_copy(re, str);
return re;
}

446
re.c
View file

@ -12,6 +12,7 @@
#include "ruby/ruby.h"
#include "ruby/re.h"
#include "ruby/encoding.h"
#include "ruby/util.h"
#include "regint.h"
#include <ctype.h>
@ -715,6 +716,10 @@ rb_reg_fixed_encoding_p(VALUE re)
return Qfalse;
}
static VALUE
rb_reg_preprocess(const char *p, const char *end, rb_encoding *enc,
rb_encoding **fixed_enc, onig_errmsg_buffer err);
static void
rb_reg_prepare_re(VALUE re, VALUE str)
{
@ -740,13 +745,19 @@ rb_reg_prepare_re(VALUE re, VALUE str)
OnigErrorInfo einfo;
regex_t *reg, *reg2;
UChar *pattern;
VALUE unescaped;
rb_encoding *fixed_enc = 0;
rb_reg_check(re);
reg = RREGEXP(re)->ptr;
pattern = ((UChar*)RREGEXP(re)->str);
r = onig_new(&reg2, (UChar* )pattern,
(UChar* )(pattern + RREGEXP(re)->len),
unescaped = rb_reg_preprocess(
RREGEXP(re)->str, RREGEXP(re)->str + RREGEXP(re)->len, enc,
&fixed_enc, err);
r = onig_new(&reg2, (UChar* )RSTRING_PTR(unescaped),
(UChar* )(RSTRING_PTR(unescaped) + RSTRING_LEN(unescaped)),
reg->options, enc,
OnigDefaultSyntax, &einfo);
if (r) {
@ -756,6 +767,7 @@ rb_reg_prepare_re(VALUE re, VALUE str)
RREGEXP(re)->ptr = reg2;
onig_free(reg);
RB_GC_GUARD(unescaped);
}
}
@ -1235,13 +1247,408 @@ match_inspect(VALUE match)
VALUE rb_cRegexp;
/*
 * Decode one backslash escape that stands for a single byte.
 *
 * *pp must point at the backslash; end is one past the last readable byte.
 * Accepted forms: \\ \n \t \r \f \v \a \e, octal \OOO, hex \xHH, and the
 * meta/control forms \M-X, \C-X, \cX including their nested combinations
 * (each prefix may appear only once).
 *
 * On success the byte value (0..255) is returned and *pp is advanced past
 * the escape.  On failure -1 is returned, a message is copied into err and
 * *pp is left untouched.
 */
static int
read_escaped_byte(const char **pp, const char *end, onig_errmsg_buffer err)
{
    const char *cur = *pp;
    int value;
    int seen_meta = 0, seen_ctrl = 0;
    int ndigits;

    /* the sequence has to open with a backslash */
    if (cur == end || *cur++ != '\\') {
        strcpy(err, "too short escaped multibyte character");
        return -1;
    }

  again:
    /* re-entered after a \M- or \C- prefix that is followed by another escape */
    if (cur == end) {
        strcpy(err, "too short escape sequence");
        return -1;
    }

    switch (*cur++) {
      case '\\': value = '\\'; break;
      case 'n':  value = '\n'; break;
      case 't':  value = '\t'; break;
      case 'r':  value = '\r'; break;
      case 'f':  value = '\f'; break;
      case 'v':  value = '\013'; break;
      case 'a':  value = '\007'; break;
      case 'e':  value = '\033'; break;

      /* \OOO */
      case '0': case '1': case '2': case '3':
      case '4': case '5': case '6': case '7':
        cur--;                  /* the first digit belongs to the number */
        value = ruby_scan_oct(cur, end < cur+3 ? end-cur : 3, &ndigits);
        cur += ndigits;
        break;

      case 'x': /* \xHH */
        value = ruby_scan_hex(cur, end < cur+2 ? end-cur : 2, &ndigits);
        if (ndigits < 1) {
            strcpy(err, "invalid hex escape");
            return -1;
        }
        cur += ndigits;
        break;

      case 'M': /* \M-X, \M-\C-X, \M-\cX */
        if (seen_meta) {
            strcpy(err, "duplicate meta escape");
            return -1;
        }
        seen_meta = 1;
        if (cur+1 < end && *cur++ == '-' && (*cur & 0x80) == 0) {
            if (*cur == '\\') {
                cur++;
                goto again;
            }
            value = *cur++;
            break;
        }
        strcpy(err, "too short meta escape");
        return -1;

      case 'C': /* \C-X, \C-\M-X */
        if (cur == end || *cur++ != '-') {
            strcpy(err, "too short control escape");
            return -1;
        }
        /* fall through: after "C-" the handling is the same as for \c */
      case 'c': /* \cX, \c\M-X */
        if (seen_ctrl) {
            strcpy(err, "duplicate control escape");
            return -1;
        }
        seen_ctrl = 1;
        if (cur < end && (*cur & 0x80) == 0) {
            if (*cur == '\\') {
                cur++;
                goto again;
            }
            value = *cur++;
            break;
        }
        strcpy(err, "too short control escape");
        return -1;

      default:
        strcpy(err, "unexpected escape sequence");
        return -1;
    }

    if (value < 0 || 0xff < value) {
        strcpy(err, "invalid escape code");
        return -1;
    }

    /* apply the prefixes collected on the way: control first, then meta */
    if (seen_ctrl)
        value &= 0x1f;
    if (seen_meta)
        value |= 0x80;

    *pp = cur;
    return value;
}
/*
 * Assemble one character out of consecutive byte escapes (\xHH, \OOO,
 * \M-X, ...) starting at *pp and append the result to buf.
 *
 * Escaped bytes are read until mbclen() reports a length equal to the
 * number of bytes collected, or until rb_enc_mbmaxlen(enc) bytes were
 * read.  A multibyte or 8-bit result is appended as raw bytes and pins
 * *encp to enc (error if *encp is already set to a different encoding).
 * A single 7-bit byte is appended again in \xHH form and leaves *encp
 * alone.
 *
 * Returns 0 and advances *pp on success; returns -1 with a message in err
 * on failure.
 */
static int
unescape_escaped_nonascii(const char **pp, const char *end, rb_encoding *enc,
                          VALUE buf, rb_encoding **encp, onig_errmsg_buffer err)
{
    const char *cur = *pp;
    int maxlen = rb_enc_mbmaxlen(enc);
    char *mbc = ALLOCA_N(char, maxlen);
    int nread = 0;
    int b;

    memset(mbc, 0, maxlen);

    /* at least one escaped byte is always consumed */
    do {
        b = read_escaped_byte(&cur, end, err);
        if (b == -1)
            return -1;
        mbc[nread++] = b;
    } while (nread < maxlen && nread != mbclen(mbc, mbc+maxlen, enc));

    if (nread != mbclen(mbc, mbc+maxlen, enc)) {
        strcpy(err, "invalid multibyte escape");
        return -1;
    }

    if (nread <= 1 && (mbc[0] & 0x80) == 0) {
        /* plain 7-bit byte: keep it escaped */
        char hexesc[5];

        snprintf(hexesc, sizeof(hexesc), "\\x%02x", mbc[0]&0xff);
        rb_str_buf_cat(buf, hexesc, 4);
    }
    else {
        rb_str_buf_cat(buf, mbc, nread);
        if (*encp == 0) {
            *encp = enc;
        }
        else if (*encp != enc) {
            strcpy(err, "character encodings differ");
            return -1;
        }
    }

    *pp = cur;
    return 0;
}
/*
 * Append the code point uv to buf.
 *
 * Values below 0x80 are written as a four-character \xHH escape and do not
 * touch *encp.  Anything else is written as the bytes produced by
 * rb_uv_to_utf8() and pins *encp to the encoding returned by
 * rb_enc_find("utf-8"); if *encp already names a different encoding, -1 is
 * returned with a message in err.  Returns 0 otherwise.
 */
static int
append_utf8(unsigned long uv,
            VALUE buf, rb_encoding **encp, onig_errmsg_buffer err)
{
    char utf8buf[6];
    int nbytes;
    rb_encoding *utf8;

    if (uv < 0x80) {
        char hexesc[5];

        snprintf(hexesc, sizeof(hexesc), "\\x%02x", (int)uv);
        rb_str_buf_cat(buf, hexesc, 4);
        return 0;
    }

    nbytes = rb_uv_to_utf8(utf8buf, uv);
    rb_str_buf_cat(buf, utf8buf, nbytes);

    utf8 = rb_enc_find("utf-8");
    if (*encp == 0) {
        *encp = utf8;
    }
    else if (*encp != utf8) {
        strcpy(err, "character encodings differ");
        return -1;
    }
    return 0;
}
/*
 * Process the inside of a \u{...} list: hexadecimal code points separated
 * by whitespace, starting at *pp (just after the opening brace).
 *
 * Each code point may have at most 6 hex digits and must not exceed
 * 0x10ffff; each one is handed to append_utf8().  Scanning stops at the
 * first position that is not a hex digit (after skipping whitespace); the
 * closing brace is left for the caller to check.
 *
 * Returns 0 and advances *pp on success.  Returns -1 with a message in err
 * when a value is out of range, append_utf8() fails, or the list holds no
 * code point at all.
 */
static int
unescape_unicode_list(const char **pp, const char *end,
                      VALUE buf, rb_encoding **encp, onig_errmsg_buffer err)
{
    const char *cur = *pp;
    int seen_any = 0;
    unsigned long codepoint;
    int ndigits;

    for (;;) {
        /* separators before (and between) code points */
        while (cur < end && ISSPACE(*cur)) cur++;

        codepoint = ruby_scan_hex(cur, end-cur, &ndigits);
        if (ndigits == 0)
            break;
        if (6 < ndigits || 0x10ffff < codepoint) { /* max 10FFFF */
            strcpy(err, "invalid unicode range");
            return -1;
        }
        cur += ndigits;

        if (append_utf8(codepoint, buf, encp, err) != 0)
            return -1;
        seen_any = 1;
    }

    if (!seen_any) {
        strcpy(err, "invalid unicode list");
        return -1;
    }

    *pp = cur;
    return 0;
}
/*
 * Process the four hex digits of a \uHHHH escape starting at *pp (just
 * after the 'u') and append the code point through append_utf8().
 *
 * Returns 0 and advances *pp by four on success.  Returns -1 with a
 * message in err when fewer than four bytes remain, when the four bytes
 * are not all hex digits, or when append_utf8() fails.
 */
static int
unescape_unicode_bmp(const char **pp, const char *end,
                     VALUE buf, rb_encoding **encp, onig_errmsg_buffer err)
{
    const char *digits = *pp;
    unsigned long codepoint = 0;
    int ndigits;
    int wellformed;

    /* only scan when four bytes are really available */
    wellformed = (digits+4 <= end);
    if (wellformed) {
        codepoint = ruby_scan_hex(digits, 4, &ndigits);
        wellformed = (ndigits == 4);
    }
    if (!wellformed) {
        strcpy(err, "invalid unicode escape");
        return -1;
    }

    if (append_utf8(codepoint, buf, encp, err) != 0)
        return -1;

    *pp = digits + 4;
    return 0;
}
/*
 * Copy the regexp source p..end into buf, resolving the escapes that can
 * denote non-ASCII characters and leaving every other escape as written.
 *
 * - A raw multibyte or 8-bit character is copied verbatim and pins *encp
 *   to enc.
 * - \0.., \x.., \c., \C-., \M-. and octal escapes above 0177 are handed to
 *   unescape_escaped_nonascii().
 * - \uHHHH and \u{...} are handed to unescape_unicode_bmp() /
 *   unescape_unicode_list().
 * - Any other backslash sequence is copied through as backslash + char.
 *
 * *encp is only ever written by this function and its helpers when a
 * non-ASCII character is produced; a conflict between two such encodings
 * is an error.
 *
 * Returns 0 on success, -1 with a message in err on failure.
 */
static int
unescape_nonascii(const char *p, const char *end, rb_encoding *enc,
VALUE buf, rb_encoding **encp, onig_errmsg_buffer err)
{
char c;
char smallbuf[2];
while (p < end) {
int chlen = mbclen(p, end, enc);
/* raw (unescaped) multibyte or 8-bit character in the source */
if (1 < chlen || (*p & 0x80)) {
if (end < p + chlen) {
strcpy(err, "too short multibyte character");
return -1;
}
/* xxx: validate the non-ascii character */
rb_str_buf_cat(buf, p, chlen);
p += chlen;
if (*encp == 0)
*encp = enc;
else if (*encp != enc) {
strcpy(err, "character encodings differ");
return -1;
}
continue;
}
switch (c = *p++) {
case '\\':
if (p == end) {
strcpy(err, "too short escape sequence");
return -1;
}
/* c becomes the character following the backslash */
switch (c = *p++) {
case '1': case '2': case '3':
case '4': case '5': case '6': case '7': /* \O, \OO, \OOO or backref */
{
int octlen;
if (ruby_scan_oct(p-1, end-(p-1), &octlen) <= 0177) {
/* backref or 7bit octal.
no need to unescape anyway.
re-escaping may break backref */
goto escape_asis;
}
}
/* fall through: an octal value above 0177 is treated as a byte escape */
/* xxx: How about more than 199 subexpressions? */
case '0': /* \0, \0O, \0OO */
case 'x': /* \xHH */
case 'c': /* \cX, \c\M-X */
case 'C': /* \C-X, \C-\M-X */
case 'M': /* \M-X, \M-\C-X, \M-\cX */
/* rewind to the backslash; the helper re-reads the whole escape */
p = p-2;
if (unescape_escaped_nonascii(&p, end, enc, buf, encp, err) != 0)
return -1;
break;
case 'u':
if (p == end) {
strcpy(err, "too short escape sequence");
return -1;
}
if (*p == '{') {
/* \u{H HH HHH HHHH HHHHH HHHHHH ...} */
p++;
if (unescape_unicode_list(&p, end, buf, encp, err) != 0)
return -1;
/* the list helper stops before the closing brace; require it here */
if (p == end || *p++ != '}') {
strcpy(err, "invalid unicode list");
return -1;
}
break;
}
else {
/* \uHHHH */
if (unescape_unicode_bmp(&p, end, buf, encp, err) != 0)
return -1;
break;
}
default: /* \n, \\, \d, \9, etc. */
escape_asis:
/* also reached by goto from the octal/backref case above */
smallbuf[0] = '\\';
smallbuf[1] = c;
rb_str_buf_cat(buf, smallbuf, 2);
break;
}
break;
default:
/* ordinary single-byte character: copy unchanged */
rb_str_buf_cat(buf, &c, 1);
break;
}
}
return 0;
}
/*
 * Build the pattern text that is handed to the regexp engine from the
 * regexp source p..end written in encoding enc.
 *
 * The source is run through unescape_nonascii().  *fixed_enc is reset to 0
 * first and is set by that pass only when the pattern contains a non-ASCII
 * character (raw, byte-escaped or \u-escaped); it then names the encoding
 * the pattern is tied to.
 *
 * Returns the new string buffer, associated with *fixed_enc when one was
 * determined.  Returns Qnil with a message in err when the source is
 * malformed.
 */
static VALUE
rb_reg_preprocess(const char *p, const char *end, rb_encoding *enc,
                  rb_encoding **fixed_enc, onig_errmsg_buffer err)
{
    VALUE buf;

    buf = rb_str_buf_new(0);

    *fixed_enc = 0;
    if (unescape_nonascii(p, end, enc, buf, fixed_enc, err) != 0)
        return Qnil;

    /*
     * Test the encoding that was found, not the out-parameter itself:
     * fixed_enc is a valid pointer at the call sites, so checking it would
     * call rb_enc_associate() with a null encoding for every pure-ASCII
     * pattern.
     */
    if (*fixed_enc) {
        rb_enc_associate(buf, *fixed_enc);
    }

    return buf;
}
#if 0
/*
 * Compiled out.  Run rb_reg_preprocess() over the contents of a Ruby
 * object: str is converted with StringValue(), and its bytes and encoding
 * are passed on.  Returns whatever rb_reg_preprocess() returns.
 */
static VALUE
rb_reg_preprocess_obj(VALUE str,
                      rb_encoding **fixed_enc, onig_errmsg_buffer err)
{
    char *head, *tail;
    VALUE unescaped;

    StringValue(str);
    head = RSTRING_PTR(str);
    tail = head + RSTRING_LEN(str);

    unescaped = rb_reg_preprocess(head, tail, rb_enc_get(str), fixed_enc, err);

    /* keep str reachable while head/tail point into it */
    RB_GC_GUARD(str);
    return unescaped;
}

/*
 * Compiled out.  Method body taking (klass, obj): returns the pair
 * [preprocessed string, true/false], the flag telling whether a fixed
 * encoding was determined.  Raises ArgumentError carrying the
 * preprocessing error message on failure.
 */
static VALUE
rb_reg_preprocess_m(VALUE klass, VALUE obj)
{
    onig_errmsg_buffer err;
    rb_encoding *fixed_enc = 0;
    VALUE str = rb_reg_preprocess_obj(obj, &fixed_enc, err);
    VALUE is_fixed;

    if (str == Qnil)
        rb_raise(rb_eArgError, "%s", err);

    if (fixed_enc)
        is_fixed = Qtrue;
    else
        is_fixed = Qfalse;

    return rb_assoc_new(str, is_fixed);
}
#endif
static int
rb_reg_initialize(VALUE obj, const char *s, int len, rb_encoding *enc,
int options, onig_errmsg_buffer err)
{
struct RRegexp *re = RREGEXP(obj);
int raw8bit;
long i;
VALUE unescaped;
rb_encoding *fixed_enc = 0;
if (!OBJ_TAINTED(obj) && rb_safe_level() >= 4)
rb_raise(rb_eSecurityError, "Insecure: can't modify regexp");
@ -1253,33 +1660,38 @@ rb_reg_initialize(VALUE obj, const char *s, int len, rb_encoding *enc,
re->ptr = 0;
re->str = 0;
raw8bit = 0;
for (i = 0; i < len; i++) {
if (s[i] & 0x80) {
raw8bit = 1;
break;
}
unescaped = rb_reg_preprocess(s, s+len, enc, &fixed_enc, err);
if (unescaped == Qnil)
return -1;
if (fixed_enc && (options & ARG_ENCODING_FIXED) && fixed_enc != enc) {
strcpy(err, "character encodings differ");
return -1;
}
if (fixed_enc)
enc = fixed_enc;
else if (!(options & ARG_ENCODING_FIXED))
enc = rb_default_encoding();
rb_enc_associate((VALUE)re, enc);
if (options & ARG_ENCODING_FIXED || raw8bit) {
if ((options & ARG_ENCODING_FIXED) || fixed_enc) {
re->basic.flags |= KCODE_FIXED;
}
re->ptr = make_regexp(s, len, enc, options & ARG_REG_OPTION_MASK, err);
re->ptr = make_regexp(RSTRING_PTR(unescaped), RSTRING_LEN(unescaped), enc,
options & ARG_REG_OPTION_MASK, err);
if (!re->ptr) return -1;
re->str = ALLOC_N(char, len+1);
memcpy(re->str, s, len);
re->str[len] = '\0';
re->len = len;
RB_GC_GUARD(unescaped);
return 0;
}
/*
 * Initialize the regexp obj from the Ruby string str.
 *
 * When str is not ASCII-only according to rb_enc_str_asciionly_p(),
 * ARG_ENCODING_FIXED is added to options before delegating.  The string's
 * bytes, length and encoding are then passed to rb_reg_initialize(), whose
 * result is returned unchanged.
 */
static int
rb_reg_initialize_str(VALUE obj, VALUE str, int options, onig_errmsg_buffer err)
{
    int ascii_only = rb_enc_str_asciionly_p(str);

    if (ascii_only == 0)
        options = options | ARG_ENCODING_FIXED;

    return rb_reg_initialize(obj, RSTRING_PTR(str), RSTRING_LEN(str),
                             rb_enc_get(str), options, err);
}
@ -2183,6 +2595,10 @@ Init_Regexp(void)
rb_define_singleton_method(rb_cRegexp, "last_match", rb_reg_s_last_match, -1);
rb_define_singleton_method(rb_cRegexp, "try_convert", rb_reg_s_try_convert, 1);
#if 0
rb_define_singleton_method(rb_cRegexp, "preprocess", rb_reg_preprocess_m, 1);
#endif
rb_define_method(rb_cRegexp, "initialize", rb_reg_initialize_m, -1);
rb_define_method(rb_cRegexp, "initialize_copy", rb_reg_init_copy, 1);
rb_define_method(rb_cRegexp, "hash", rb_reg_hash, 0);

View file

@ -25,6 +25,17 @@ class TestM17N < Test::Unit::TestCase
assert_encoding("EUC-JP", eval(e(%{"\\x80"})).encoding)
end
# A string literal that combines raw non-ASCII bytes with a \u{...} escape
# must be rejected with SyntaxError, in either order, for the sources built
# by a(), e() and s(); only the source built by u() is accepted.
# NOTE(review): a/e/s/u presumably tag the source as ASCII-8BIT / EUC-JP /
# Shift_JIS / UTF-8 -- the helpers are defined outside this view, confirm.
def test_string_mixed_unicode
# raw bytes first, \u escape second
assert_raise(SyntaxError) { eval(a(%{"\xc0\xa0\\u{6666}"})) }
assert_raise(SyntaxError) { eval(e(%{"\xc0\xa0\\u{6666}"})) }
assert_raise(SyntaxError) { eval(s(%{"\xc0\xa0\\u{6666}"})) }
assert_nothing_raised { eval(u(%{"\xc0\xa0\\u{6666}"})) }
# \u escape first, raw bytes second
assert_raise(SyntaxError) { eval(a(%{"\\u{6666}\xc0\xa0"})) }
assert_raise(SyntaxError) { eval(e(%{"\\u{6666}\xc0\xa0"})) }
assert_raise(SyntaxError) { eval(s(%{"\\u{6666}\xc0\xa0"})) }
assert_nothing_raised { eval(u(%{"\\u{6666}\xc0\xa0"})) }
end
def test_regexp_too_short_multibyte_character
assert_raise(SyntaxError) { eval('/\xfe/e') }
assert_raise(SyntaxError) { eval('/\x8e/e') }
@ -38,11 +49,12 @@ class TestM17N < Test::Unit::TestCase
assert_raise(SyntaxError) { eval('/\xfc\x80\x80\x80\x80/u') }
# raw 8bit
#assert_raise(SyntaxError) { eval("/\xfe/e") }
#assert_raise(SyntaxError) { eval("/\xc0/u") }
assert_raise(SyntaxError) { eval("/\xfe/e") }
assert_raise(SyntaxError) { eval("/\xc0/u") }
# invalid suffix
#assert_raise(SyntaxError) { eval('/\xc0\xff/u') }
assert_raise(SyntaxError) { eval('/\xc0\xff/u') }
assert_raise(SyntaxError) { eval('/\xc0 /u') }
#assert_raise(SyntaxError) { eval('/\xc0\x20/u') }
end
@ -94,6 +106,9 @@ class TestM17N < Test::Unit::TestCase
def test_regexp_generic
assert_regexp_generic_ascii(/a/)
assert_regexp_generic_ascii(Regexp.new(a("a")))
assert_regexp_generic_ascii(Regexp.new(e("a")))
assert_regexp_generic_ascii(Regexp.new(s("a")))
assert_regexp_generic_ascii(Regexp.new(u("a")))
[/a/, Regexp.new(a("a"))].each {|r|
assert_equal(0, r =~ a("a"))
@ -112,7 +127,7 @@ class TestM17N < Test::Unit::TestCase
assert_regexp_fixed_ascii8bit(/\xc0\xa1/n)
assert_regexp_fixed_ascii8bit(eval(a(%{/\xc0\xa1/})))
assert_regexp_fixed_ascii8bit(eval(a(%{/\xc0\xa1/n})))
# assert_regexp_fixed_ascii8bit(eval(a(%q{/\xc0\xa1/})))
assert_regexp_fixed_ascii8bit(eval(a(%q{/\xc0\xa1/})))
[/a/n].each {|r|
assert_equal(0, r =~ a("a"))
@ -139,12 +154,11 @@ class TestM17N < Test::Unit::TestCase
def test_regexp_euc
assert_regexp_fixed_eucjp(/a/e)
assert_regexp_fixed_eucjp(Regexp.new(e("a")))
assert_regexp_fixed_eucjp(/\xc0\xa1/e)
assert_regexp_fixed_eucjp(eval(e(%{/\xc0\xa1/})))
assert_regexp_fixed_eucjp(eval(e(%q{/\xc0\xa1/})))
[/a/e, Regexp.new(e("a"))].each {|r|
[/a/e].each {|r|
assert_equal(0, r =~ a("a"))
assert_equal(0, r =~ e("a"))
assert_equal(0, r =~ s("a"))
@ -169,7 +183,6 @@ class TestM17N < Test::Unit::TestCase
def test_regexp_sjis
assert_regexp_fixed_sjis(/a/s)
assert_regexp_fixed_sjis(Regexp.new(s("a")))
assert_regexp_fixed_sjis(/\xc0\xa1/s)
assert_regexp_fixed_sjis(eval(s(%{/\xc0\xa1/})))
assert_regexp_fixed_sjis(eval(s(%q{/\xc0\xa1/})))

View file

@ -68,47 +68,74 @@ EOS
def test_regexp
# Compare regexps to regexps
assert_equal(/Yukihiro Matsumoto - 松本行弘/,
assert_not_equal(/Yukihiro Matsumoto - 松本行弘/,
/Yukihiro Matsumoto - \u677E\u672C\u884C\u5F18/)
assert_equal(/Yukihiro Matsumoto - 松本行弘/,
/Yukihiro Matsumoto - \u{677E 672C 884C 5F18}/)
assert_equal(/Matz - まつもと ゆきひろ/,
assert_not_equal(/Yukihiro Matsumoto - 松本行弘/,
/Yukihiro Matsumoto - \u{677E 672C 884C 5F18}/)
assert_not_equal(/Matz - まつもと ゆきひろ/,
/Matz - \u307E\u3064\u3082\u3068 \u3086\u304D\u3072\u308D/)
assert_equal(/Aoyama Gakuin University - 青山学院大学/,
assert_not_equal(/Aoyama Gakuin University - 青山学院大学/,
/Aoyama Gakuin University - \u9752\u5C71\u5B66\u9662\u5927\u5B66/)
assert_equal(/青山学院大学/, /\u9752\u5C71\u5B66\u9662\u5927\u5B66/)
assert_equal(/Martin Dürst/, /Martin D\u00FCrst/)
assert_equal(/ü/, /\u00FC/)
assert_equal(/Martin Dürst/, /Martin D\u{FC}rst/)
assert_equal(/ü/, /\u{FC}/)
assert_equal(/ü/, %r{\u{FC}})
assert_equal(/ü/i, %r{\u00FC}i)
assert_not_equal(/青山学院大学/, /\u9752\u5C71\u5B66\u9662\u5927\u5B66/)
assert_not_equal(/Martin Dürst/, /Martin D\u00FCrst/)
assert_not_equal(/ü/, /\u00FC/)
assert_not_equal(/Martin Dürst/, /Martin D\u{FC}rst/)
assert_not_equal(/ü/, /\u{FC}/)
assert_not_equal(/ü/, %r{\u{FC}})
assert_not_equal(/ü/i, %r{\u00FC}i)
assert_equal('Yukihiro Matsumoto - \u677E\u672C\u884C\u5F18',
/Yukihiro Matsumoto - \u677E\u672C\u884C\u5F18/.source)
assert_equal('Yukihiro Matsumoto - \u{677E 672C 884C 5F18}',
/Yukihiro Matsumoto - \u{677E 672C 884C 5F18}/.source)
assert_equal('Matz - \u307E\u3064\u3082\u3068 \u3086\u304D\u3072\u308D',
/Matz - \u307E\u3064\u3082\u3068 \u3086\u304D\u3072\u308D/.source)
assert_equal('Aoyama Gakuin University - \u9752\u5C71\u5B66\u9662\u5927\u5B66',
/Aoyama Gakuin University - \u9752\u5C71\u5B66\u9662\u5927\u5B66/.source)
assert_equal('\u9752\u5C71\u5B66\u9662\u5927\u5B66',
/\u9752\u5C71\u5B66\u9662\u5927\u5B66/.source)
assert_equal('Martin D\u00FCrst', /Martin D\u00FCrst/.source)
assert_equal('\u00FC', /\u00FC/.source)
assert_equal('Martin D\u{FC}rst', /Martin D\u{FC}rst/.source)
assert_equal('\u{FC}', /\u{FC}/.source)
assert_equal('\u{FC}', %r{\u{FC}}.source)
assert_equal('\u00FC', %r{\u00FC}i.source)
# match strings to regexps
assert_equal("Yukihiro Matsumoto - 松本行弘" =~ /Yukihiro Matsumoto - \u677E\u672C\u884C\u5F18/, 0)
assert_equal("Yukihiro Matsumoto - \u677E\u672C\u884C\u5F18" =~ /Yukihiro Matsumoto - \u677E\u672C\u884C/, 0)
assert_equal("Yukihiro Matsumoto - 松本行弘" =~ /Yukihiro Matsumoto - \u{677E 672C 884C 5F18}/, 0)
assert_equal(%Q{Yukihiro Matsumoto - \u{677E 672C 884C 5F18}} =~ /Yukihiro Matsumoto - \u{677E 672C 884C 5F18}/, 0)
assert_equal("Matz - まつもと ゆきひろ" =~ /Matz - \u307E\u3064\u3082\u3068 \u3086\u304D\u3072\u308D/, 0)
assert_equal("Aoyama Gakuin University - 青山学院大学" =~ /Aoyama Gakuin University - \u9752\u5C71\u5B66\u9662\u5927\u5B66/, 0)
assert_equal("青山学院大学" =~ /\u9752\u5C71\u5B66\u9662\u5927\u5B66/, 0)
assert_equal("Martin Dürst" =~ /Martin D\u00FCrst/, 0)
assert_equal("ü" =~ /\u00FC/, 0)
assert_equal("Martin Dürst" =~ /Martin D\u{FC}rst/, 0)
assert_equal("ü" =~ %r{\u{FC}}, 0)
assert_equal("ü" =~ %r{\u00FC}i, 0)
assert_equal(0, "Yukihiro Matsumoto - 松本行弘" =~ /Yukihiro Matsumoto - \u677E\u672C\u884C\u5F18/)
assert_equal(0, "Yukihiro Matsumoto - \u677E\u672C\u884C\u5F18" =~ /Yukihiro Matsumoto - \u677E\u672C\u884C/)
assert_equal(0, "Yukihiro Matsumoto - 松本行弘" =~ /Yukihiro Matsumoto - \u{677E 672C 884C 5F18}/)
assert_equal(0, %Q{Yukihiro Matsumoto - \u{677E 672C 884C 5F18}} =~ /Yukihiro Matsumoto - \u{677E 672C 884C 5F18}/)
assert_equal(0, "Matz - まつもと ゆきひろ" =~ /Matz - \u307E\u3064\u3082\u3068 \u3086\u304D\u3072\u308D/)
assert_equal(0, "Aoyama Gakuin University - 青山学院大学" =~ /Aoyama Gakuin University - \u9752\u5C71\u5B66\u9662\u5927\u5B66/)
assert_equal(0, "青山学院大学" =~ /\u9752\u5C71\u5B66\u9662\u5927\u5B66/)
assert_equal(0, "Martin Dürst" =~ /Martin D\u00FCrst/)
assert_equal(0, "ü" =~ /\u00FC/)
assert_equal(0, "Martin Dürst" =~ /Martin D\u{FC}rst/)
assert_equal(0, "ü" =~ %r{\u{FC}})
assert_equal(0, "ü" =~ %r{\u00FC}i)
# Flip order of the two operands
assert_equal(/Martin D\u00FCrst/ =~ "Martin Dürst", 0)
assert_equal(/\u00FC/ =~ "testü", 4)
assert_equal(/Martin D\u{FC}rst/ =~ "fooMartin Dürstbar", 3)
assert_equal(%r{\u{FC}} =~ "fooübar", 3)
assert_equal(0, /Martin D\u00FCrst/ =~ "Martin Dürst")
assert_equal(4, /\u00FC/ =~ "testü")
assert_equal(3, /Martin D\u{FC}rst/ =~ "fooMartin Dürstbar")
assert_equal(3, %r{\u{FC}} =~ "fooübar")
# Put \u in strings, literal character in regexp
assert_equal("Martin D\u00FCrst" =~ /Martin Dürst/, 0)
assert_equal("test\u00FC" =~ /ü/, 4)
assert_equal("fooMartin D\u{FC}rstbar" =~ /Martin Dürst/, 3)
assert_equal(%Q{foo\u{FC}bar} =~ %r<ü>, 3)
assert_equal(0, "Martin D\u00FCrst" =~ /Martin Dürst/)
assert_equal(4, "test\u00FC" =~ /ü/)
assert_equal(3, "fooMartin D\u{FC}rstbar" =~ /Martin Dürst/)
assert_equal(3, %Q{foo\u{FC}bar} =~ %r<ü>)
assert_match(eval('/\u{2a}/'), "*")
assert_raise(SyntaxError) { eval('/\u{6666}/n') }
assert_raise(SyntaxError) { eval('/\u{6666}/e') }
assert_raise(SyntaxError) { eval('/\u{6666}/s') }
assert_nothing_raised { eval('/\u{6666}/u') }
end
# A \u{...} escape that reaches Regexp.new as literal text (the backslash is
# doubled inside the string) must still match the character it names.
def test_dynamic_regexp
assert_match(Regexp.new("Martin D\\u{FC}rst"), "Martin Dürst")
end
def test_syntax_variants