1
0
Fork 0
mirror of https://github.com/ruby/ruby.git synced 2022-11-09 12:17:21 -05:00
ruby--ruby/regparse.c
naruse c11e648799 Regexp supports Unicoe 9.0.0's \X
* meta character \X matches Unicode 9.0.0 characters with some workarounds
  for UTR #51 Unicode Emoji, Version 4.0 emoji zwj sequences.
  [Feature #12831] [ruby-core:77586]

The term "character" can have many meanings bytes, codepoints, combined
characters, and so on. "grapheme cluster" is highest one of such words,
which means user-perceived characters.
Unicode Standard Annex #29 UNICODE TEXT SEGMENTATION specifies how to
handle grapheme clusters (extended grapheme cluster).
But some specs aren't updated to current situation because Unicode Emoji
is rapidly extended without well definition.
It breaks the precondition of UTR#29 "Grapheme cluster boundaries can be
easily tested by looking at immediately adjacent characters". (the
sentence will be removed in the next version)
Though some of its detail are described in Unicode Technical Report #51
UNICODE EMOJI but it is not merged into UTR#29 yet.

http://unicode.org/reports/tr29/
http://unicode.org/reports/tr51/
http://unicode.org/Public/emoji/4.0/

git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@56949 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
2016-11-30 17:29:19 +00:00

7148 lines
165 KiB
C

/**********************************************************************
regparse.c - Onigmo (Oniguruma-mod) (regular expression library)
**********************************************************************/
/*-
* Copyright (c) 2002-2008 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
* Copyright (c) 2011-2014 K.Takata <kentkt AT csc DOT jp>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include "regparse.h"
#define WARN_BUFSIZE 256
#define CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
const OnigSyntaxType OnigSyntaxRuby = {
(( SYN_GNU_REGEX_OP | ONIG_SYN_OP_QMARK_NON_GREEDY |
ONIG_SYN_OP_ESC_OCTAL3 | ONIG_SYN_OP_ESC_X_HEX2 |
ONIG_SYN_OP_ESC_X_BRACE_HEX8 | ONIG_SYN_OP_ESC_CONTROL_CHARS |
ONIG_SYN_OP_ESC_C_CONTROL )
& ~ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END )
, ( ONIG_SYN_OP2_QMARK_GROUP_EFFECT |
ONIG_SYN_OP2_OPTION_RUBY |
ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP | ONIG_SYN_OP2_ESC_K_NAMED_BACKREF |
ONIG_SYN_OP2_ESC_G_SUBEXP_CALL |
ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY |
ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT |
ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT |
ONIG_SYN_OP2_CCLASS_SET_OP | ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL |
ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META | ONIG_SYN_OP2_ESC_V_VTAB |
ONIG_SYN_OP2_ESC_H_XDIGIT |
ONIG_SYN_OP2_ESC_CAPITAL_X_EXTENDED_GRAPHEME_CLUSTER |
ONIG_SYN_OP2_QMARK_LPAREN_CONDITION |
ONIG_SYN_OP2_ESC_CAPITAL_R_LINEBREAK |
ONIG_SYN_OP2_ESC_CAPITAL_K_KEEP )
, ( SYN_GNU_REGEX_BV |
ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV |
ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND |
ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP |
ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME |
ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY |
ONIG_SYN_WARN_CC_OP_NOT_ESCAPED |
ONIG_SYN_WARN_CC_DUP |
ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT )
, ( ONIG_OPTION_ASCII_RANGE | ONIG_OPTION_POSIX_BRACKET_ALL_RANGE |
ONIG_OPTION_WORD_BOUND_ALL_RANGE )
,
{
(OnigCodePoint )'\\' /* esc */
, (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */
, (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */
, (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */
, (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */
, (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */
}
};
const OnigSyntaxType* OnigDefaultSyntax = ONIG_SYNTAX_RUBY;
extern void onig_null_warn(const char* s ARG_UNUSED) { }
#ifdef DEFAULT_WARN_FUNCTION
static OnigWarnFunc onig_warn = (OnigWarnFunc )DEFAULT_WARN_FUNCTION;
#else
static OnigWarnFunc onig_warn = onig_null_warn;
#endif
#ifdef DEFAULT_VERB_WARN_FUNCTION
static OnigWarnFunc onig_verb_warn = (OnigWarnFunc )DEFAULT_VERB_WARN_FUNCTION;
#else
static OnigWarnFunc onig_verb_warn = onig_null_warn;
#endif
extern void onig_set_warn_func(OnigWarnFunc f)
{
onig_warn = f;
}
extern void onig_set_verb_warn_func(OnigWarnFunc f)
{
onig_verb_warn = f;
}
static void CC_DUP_WARN(ScanEnv *env);
static void
bbuf_free(BBuf* bbuf)
{
if (IS_NOT_NULL(bbuf)) {
if (IS_NOT_NULL(bbuf->p)) xfree(bbuf->p);
xfree(bbuf);
}
}
static int
bbuf_clone(BBuf** rto, BBuf* from)
{
int r;
BBuf *to;
*rto = to = (BBuf* )xmalloc(sizeof(BBuf));
CHECK_NULL_RETURN_MEMERR(to);
r = BBUF_INIT(to, from->alloc);
if (r != 0) return r;
to->used = from->used;
xmemcpy(to->p, from->p, from->used);
return 0;
}
#define BACKREF_REL_TO_ABS(rel_no, env) \
((env)->num_mem + 1 + (rel_no))
#define ONOFF(v,f,negative) (negative) ? ((v) &= ~(f)) : ((v) |= (f))
#define MBCODE_START_POS(enc) \
(OnigCodePoint )(ONIGENC_MBC_MINLEN(enc) > 1 ? 0 : 0x80)
#define SET_ALL_MULTI_BYTE_RANGE(enc, pbuf) \
add_code_range_to_buf(pbuf, env, MBCODE_START_POS(enc), ONIG_LAST_CODE_POINT)
#define ADD_ALL_MULTI_BYTE_RANGE(enc, mbuf) do {\
if (! ONIGENC_IS_SINGLEBYTE(enc)) {\
r = SET_ALL_MULTI_BYTE_RANGE(enc, &(mbuf));\
if (r) return r;\
}\
} while (0)
#define BITSET_SET_BIT_CHKDUP(bs, pos) do { \
if (BITSET_AT(bs, pos)) CC_DUP_WARN(env); \
BS_ROOM(bs, pos) |= BS_BIT(pos); \
} while (0)
#define BITSET_IS_EMPTY(bs,empty) do {\
int i;\
empty = 1;\
for (i = 0; i < BITSET_SIZE; i++) {\
if ((bs)[i] != 0) {\
empty = 0; break;\
}\
}\
} while (0)
static void
bitset_set_range(ScanEnv *env, BitSetRef bs, int from, int to)
{
int i;
for (i = from; i <= to && i < SINGLE_BYTE_SIZE; i++) {
BITSET_SET_BIT_CHKDUP(bs, i);
}
}
#if 0
static void
bitset_set_all(BitSetRef bs)
{
int i;
for (i = 0; i < BITSET_SIZE; i++) { bs[i] = ~((Bits )0); }
}
#endif
static void
bitset_invert(BitSetRef bs)
{
int i;
for (i = 0; i < BITSET_SIZE; i++) { bs[i] = ~(bs[i]); }
}
static void
bitset_invert_to(BitSetRef from, BitSetRef to)
{
int i;
for (i = 0; i < BITSET_SIZE; i++) { to[i] = ~(from[i]); }
}
static void
bitset_and(BitSetRef dest, BitSetRef bs)
{
int i;
for (i = 0; i < BITSET_SIZE; i++) { dest[i] &= bs[i]; }
}
static void
bitset_or(BitSetRef dest, BitSetRef bs)
{
int i;
for (i = 0; i < BITSET_SIZE; i++) { dest[i] |= bs[i]; }
}
static void
bitset_copy(BitSetRef dest, BitSetRef bs)
{
int i;
for (i = 0; i < BITSET_SIZE; i++) { dest[i] = bs[i]; }
}
extern int
onig_strncmp(const UChar* s1, const UChar* s2, int n)
{
int x;
while (n-- > 0) {
x = *s2++ - *s1++;
if (x) return x;
}
return 0;
}
extern void
onig_strcpy(UChar* dest, const UChar* src, const UChar* end)
{
ptrdiff_t len = end - src;
if (len > 0) {
xmemcpy(dest, src, len);
dest[len] = (UChar )0;
}
}
#ifdef USE_NAMED_GROUP
static UChar*
strdup_with_null(OnigEncoding enc, UChar* s, UChar* end)
{
ptrdiff_t slen;
int term_len, i;
UChar *r;
slen = end - s;
term_len = ONIGENC_MBC_MINLEN(enc);
r = (UChar* )xmalloc(slen + term_len);
CHECK_NULL_RETURN(r);
xmemcpy(r, s, slen);
for (i = 0; i < term_len; i++)
r[slen + i] = (UChar )0;
return r;
}
#endif
/* scan pattern methods */
#define PEND_VALUE 0
#ifdef __GNUC__
/* get rid of Wunused-but-set-variable and Wuninitialized */
#define PFETCH_READY UChar* pfetch_prev = NULL; (void)pfetch_prev
#else
#define PFETCH_READY UChar* pfetch_prev
#endif
#define PEND (p < end ? 0 : 1)
#define PUNFETCH p = pfetch_prev
#define PINC do { \
pfetch_prev = p; \
p += enclen(enc, p, end); \
} while (0)
#define PFETCH(c) do { \
c = ((enc->max_enc_len == 1) ? *p : ONIGENC_MBC_TO_CODE(enc, p, end)); \
pfetch_prev = p; \
p += enclen(enc, p, end); \
} while (0)
#define PINC_S do { \
p += enclen(enc, p, end); \
} while (0)
#define PFETCH_S(c) do { \
c = ((enc->max_enc_len == 1) ? *p : ONIGENC_MBC_TO_CODE(enc, p, end)); \
p += enclen(enc, p, end); \
} while (0)
#define PPEEK (p < end ? ONIGENC_MBC_TO_CODE(enc, p, end) : PEND_VALUE)
#define PPEEK_IS(c) (PPEEK == (OnigCodePoint )c)
static UChar*
strcat_capa(UChar* dest, UChar* dest_end, const UChar* src, const UChar* src_end,
size_t capa)
{
UChar* r;
if (dest)
r = (UChar* )xrealloc(dest, capa + 1);
else
r = (UChar* )xmalloc(capa + 1);
CHECK_NULL_RETURN(r);
onig_strcpy(r + (dest_end - dest), src, src_end);
return r;
}
/* dest on static area */
static UChar*
strcat_capa_from_static(UChar* dest, UChar* dest_end,
const UChar* src, const UChar* src_end, size_t capa)
{
UChar* r;
r = (UChar* )xmalloc(capa + 1);
CHECK_NULL_RETURN(r);
onig_strcpy(r, dest, dest_end);
onig_strcpy(r + (dest_end - dest), src, src_end);
return r;
}
#ifdef USE_ST_LIBRARY
#include "ruby/st.h"
typedef struct {
const UChar* s;
const UChar* end;
} st_str_end_key;
static int
str_end_cmp(st_data_t xp, st_data_t yp)
{
const st_str_end_key *x, *y;
const UChar *p, *q;
int c;
x = (const st_str_end_key *)xp;
y = (const st_str_end_key *)yp;
if ((x->end - x->s) != (y->end - y->s))
return 1;
p = x->s;
q = y->s;
while (p < x->end) {
c = (int )*p - (int )*q;
if (c != 0) return c;
p++; q++;
}
return 0;
}
static st_index_t
str_end_hash(st_data_t xp)
{
const st_str_end_key *x = (const st_str_end_key *)xp;
const UChar *p;
st_index_t val = 0;
p = x->s;
while (p < x->end) {
val = val * 997 + (int )*p++;
}
return val + (val >> 5);
}
extern hash_table_type*
onig_st_init_strend_table_with_size(st_index_t size)
{
static const struct st_hash_type hashType = {
str_end_cmp,
str_end_hash,
};
return (hash_table_type* )
onig_st_init_table_with_size(&hashType, size);
}
extern int
onig_st_lookup_strend(hash_table_type* table, const UChar* str_key,
const UChar* end_key, hash_data_type *value)
{
st_str_end_key key;
key.s = (UChar* )str_key;
key.end = (UChar* )end_key;
return onig_st_lookup(table, (st_data_t )(&key), value);
}
extern int
onig_st_insert_strend(hash_table_type* table, const UChar* str_key,
const UChar* end_key, hash_data_type value)
{
st_str_end_key* key;
int result;
key = (st_str_end_key* )xmalloc(sizeof(st_str_end_key));
key->s = (UChar* )str_key;
key->end = (UChar* )end_key;
result = onig_st_insert(table, (st_data_t )key, value);
if (result) {
xfree(key);
}
return result;
}
#endif /* USE_ST_LIBRARY */
#ifdef USE_NAMED_GROUP
#define INIT_NAME_BACKREFS_ALLOC_NUM 8
typedef struct {
UChar* name;
size_t name_len; /* byte length */
int back_num; /* number of backrefs */
int back_alloc;
int back_ref1;
int* back_refs;
} NameEntry;
#ifdef USE_ST_LIBRARY
typedef st_table NameTable;
typedef st_data_t HashDataType; /* 1.6 st.h doesn't define st_data_t type */
#ifdef ONIG_DEBUG
static int
i_print_name_entry(UChar* key, NameEntry* e, void* arg)
{
int i;
FILE* fp = (FILE* )arg;
fprintf(fp, "%s: ", e->name);
if (e->back_num == 0)
fputs("-", fp);
else if (e->back_num == 1)
fprintf(fp, "%d", e->back_ref1);
else {
for (i = 0; i < e->back_num; i++) {
if (i > 0) fprintf(fp, ", ");
fprintf(fp, "%d", e->back_refs[i]);
}
}
fputs("\n", fp);
return ST_CONTINUE;
}
extern int
onig_print_names(FILE* fp, regex_t* reg)
{
NameTable* t = (NameTable* )reg->name_table;
if (IS_NOT_NULL(t)) {
fprintf(fp, "name table\n");
onig_st_foreach(t, i_print_name_entry, (HashDataType )fp);
fputs("\n", fp);
}
return 0;
}
#endif /* ONIG_DEBUG */
static int
i_free_name_entry(UChar* key, NameEntry* e, void* arg ARG_UNUSED)
{
xfree(e->name);
if (IS_NOT_NULL(e->back_refs)) xfree(e->back_refs);
xfree(key);
xfree(e);
return ST_DELETE;
}
static int
names_clear(regex_t* reg)
{
NameTable* t = (NameTable* )reg->name_table;
if (IS_NOT_NULL(t)) {
onig_st_foreach(t, i_free_name_entry, 0);
}
return 0;
}
extern int
onig_names_free(regex_t* reg)
{
int r;
NameTable* t;
r = names_clear(reg);
if (r) return r;
t = (NameTable* )reg->name_table;
if (IS_NOT_NULL(t)) onig_st_free_table(t);
reg->name_table = (void* )NULL;
return 0;
}
static NameEntry*
name_find(regex_t* reg, const UChar* name, const UChar* name_end)
{
NameEntry* e;
NameTable* t = (NameTable* )reg->name_table;
e = (NameEntry* )NULL;
if (IS_NOT_NULL(t)) {
onig_st_lookup_strend(t, name, name_end, (HashDataType* )((void* )(&e)));
}
return e;
}
typedef struct {
int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*);
regex_t* reg;
void* arg;
int ret;
OnigEncoding enc;
} INamesArg;
static int
i_names(UChar* key ARG_UNUSED, NameEntry* e, INamesArg* arg)
{
int r = (*(arg->func))(e->name,
e->name + e->name_len,
e->back_num,
(e->back_num > 1 ? e->back_refs : &(e->back_ref1)),
arg->reg, arg->arg);
if (r != 0) {
arg->ret = r;
return ST_STOP;
}
return ST_CONTINUE;
}
extern int
onig_foreach_name(regex_t* reg,
int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg)
{
INamesArg narg;
NameTable* t = (NameTable* )reg->name_table;
narg.ret = 0;
if (IS_NOT_NULL(t)) {
narg.func = func;
narg.reg = reg;
narg.arg = arg;
narg.enc = reg->enc; /* should be pattern encoding. */
onig_st_foreach(t, i_names, (HashDataType )&narg);
}
return narg.ret;
}
static int
i_renumber_name(UChar* key ARG_UNUSED, NameEntry* e, GroupNumRemap* map)
{
int i;
if (e->back_num > 1) {
for (i = 0; i < e->back_num; i++) {
e->back_refs[i] = map[e->back_refs[i]].new_val;
}
}
else if (e->back_num == 1) {
e->back_ref1 = map[e->back_ref1].new_val;
}
return ST_CONTINUE;
}
extern int
onig_renumber_name_table(regex_t* reg, GroupNumRemap* map)
{
NameTable* t = (NameTable* )reg->name_table;
if (IS_NOT_NULL(t)) {
onig_st_foreach(t, i_renumber_name, (HashDataType )map);
}
return 0;
}
extern int
onig_number_of_names(regex_t* reg)
{
NameTable* t = (NameTable* )reg->name_table;
if (IS_NOT_NULL(t))
return (int )t->num_entries;
else
return 0;
}
#else /* USE_ST_LIBRARY */
#define INIT_NAMES_ALLOC_NUM 8
typedef struct {
NameEntry* e;
int num;
int alloc;
} NameTable;
#ifdef ONIG_DEBUG
extern int
onig_print_names(FILE* fp, regex_t* reg)
{
int i, j;
NameEntry* e;
NameTable* t = (NameTable* )reg->name_table;
if (IS_NOT_NULL(t) && t->num > 0) {
fprintf(fp, "name table\n");
for (i = 0; i < t->num; i++) {
e = &(t->e[i]);
fprintf(fp, "%s: ", e->name);
if (e->back_num == 0) {
fputs("-", fp);
}
else if (e->back_num == 1) {
fprintf(fp, "%d", e->back_ref1);
}
else {
for (j = 0; j < e->back_num; j++) {
if (j > 0) fprintf(fp, ", ");
fprintf(fp, "%d", e->back_refs[j]);
}
}
fputs("\n", fp);
}
fputs("\n", fp);
}
return 0;
}
#endif
static int
names_clear(regex_t* reg)
{
int i;
NameEntry* e;
NameTable* t = (NameTable* )reg->name_table;
if (IS_NOT_NULL(t)) {
for (i = 0; i < t->num; i++) {
e = &(t->e[i]);
if (IS_NOT_NULL(e->name)) {
xfree(e->name);
e->name = NULL;
e->name_len = 0;
e->back_num = 0;
e->back_alloc = 0;
if (IS_NOT_NULL(e->back_refs)) xfree(e->back_refs);
e->back_refs = (int* )NULL;
}
}
if (IS_NOT_NULL(t->e)) {
xfree(t->e);
t->e = NULL;
}
t->num = 0;
}
return 0;
}
extern int
onig_names_free(regex_t* reg)
{
int r;
NameTable* t;
r = names_clear(reg);
if (r) return r;
t = (NameTable* )reg->name_table;
if (IS_NOT_NULL(t)) xfree(t);
reg->name_table = NULL;
return 0;
}
static NameEntry*
name_find(regex_t* reg, const UChar* name, const UChar* name_end)
{
int i, len;
NameEntry* e;
NameTable* t = (NameTable* )reg->name_table;
if (IS_NOT_NULL(t)) {
len = name_end - name;
for (i = 0; i < t->num; i++) {
e = &(t->e[i]);
if (len == e->name_len && onig_strncmp(name, e->name, len) == 0)
return e;
}
}
return (NameEntry* )NULL;
}
extern int
onig_foreach_name(regex_t* reg,
int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg)
{
int i, r;
NameEntry* e;
NameTable* t = (NameTable* )reg->name_table;
if (IS_NOT_NULL(t)) {
for (i = 0; i < t->num; i++) {
e = &(t->e[i]);
r = (*func)(e->name, e->name + e->name_len, e->back_num,
(e->back_num > 1 ? e->back_refs : &(e->back_ref1)),
reg, arg);
if (r != 0) return r;
}
}
return 0;
}
extern int
onig_number_of_names(regex_t* reg)
{
NameTable* t = (NameTable* )reg->name_table;
if (IS_NOT_NULL(t))
return t->num;
else
return 0;
}
#endif /* else USE_ST_LIBRARY */
static int
name_add(regex_t* reg, UChar* name, UChar* name_end, int backref, ScanEnv* env)
{
int alloc;
NameEntry* e;
NameTable* t = (NameTable* )reg->name_table;
if (name_end - name <= 0)
return ONIGERR_EMPTY_GROUP_NAME;
e = name_find(reg, name, name_end);
if (IS_NULL(e)) {
#ifdef USE_ST_LIBRARY
if (IS_NULL(t)) {
t = onig_st_init_strend_table_with_size(5);
reg->name_table = (void* )t;
}
e = (NameEntry* )xmalloc(sizeof(NameEntry));
CHECK_NULL_RETURN_MEMERR(e);
e->name = strdup_with_null(reg->enc, name, name_end);
if (IS_NULL(e->name)) {
xfree(e);
return ONIGERR_MEMORY;
}
onig_st_insert_strend(t, e->name, (e->name + (name_end - name)),
(HashDataType )e);
e->name_len = name_end - name;
e->back_num = 0;
e->back_alloc = 0;
e->back_refs = (int* )NULL;
#else
if (IS_NULL(t)) {
alloc = INIT_NAMES_ALLOC_NUM;
t = (NameTable* )xmalloc(sizeof(NameTable));
CHECK_NULL_RETURN_MEMERR(t);
t->e = NULL;
t->alloc = 0;
t->num = 0;
t->e = (NameEntry* )xmalloc(sizeof(NameEntry) * alloc);
if (IS_NULL(t->e)) {
xfree(t);
return ONIGERR_MEMORY;
}
t->alloc = alloc;
reg->name_table = t;
goto clear;
}
else if (t->num == t->alloc) {
int i;
NameEntry* p;
alloc = t->alloc * 2;
p = (NameEntry* )xrealloc(t->e, sizeof(NameEntry) * alloc);
CHECK_NULL_RETURN_MEMERR(p);
t->e = p;
t->alloc = alloc;
clear:
for (i = t->num; i < t->alloc; i++) {
t->e[i].name = NULL;
t->e[i].name_len = 0;
t->e[i].back_num = 0;
t->e[i].back_alloc = 0;
t->e[i].back_refs = (int* )NULL;
}
}
e = &(t->e[t->num]);
t->num++;
e->name = strdup_with_null(reg->enc, name, name_end);
if (IS_NULL(e->name)) return ONIGERR_MEMORY;
e->name_len = name_end - name;
#endif
}
if (e->back_num >= 1 &&
! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME)) {
onig_scan_env_set_error_string(env, ONIGERR_MULTIPLEX_DEFINED_NAME,
name, name_end);
return ONIGERR_MULTIPLEX_DEFINED_NAME;
}
e->back_num++;
if (e->back_num == 1) {
e->back_ref1 = backref;
}
else {
if (e->back_num == 2) {
alloc = INIT_NAME_BACKREFS_ALLOC_NUM;
e->back_refs = (int* )xmalloc(sizeof(int) * alloc);
CHECK_NULL_RETURN_MEMERR(e->back_refs);
e->back_alloc = alloc;
e->back_refs[0] = e->back_ref1;
e->back_refs[1] = backref;
}
else {
if (e->back_num > e->back_alloc) {
int* p;
alloc = e->back_alloc * 2;
p = (int* )xrealloc(e->back_refs, sizeof(int) * alloc);
CHECK_NULL_RETURN_MEMERR(p);
e->back_refs = p;
e->back_alloc = alloc;
}
e->back_refs[e->back_num - 1] = backref;
}
}
return 0;
}
extern int
onig_name_to_group_numbers(regex_t* reg, const UChar* name,
const UChar* name_end, int** nums)
{
NameEntry* e = name_find(reg, name, name_end);
if (IS_NULL(e)) return ONIGERR_UNDEFINED_NAME_REFERENCE;
switch (e->back_num) {
case 0:
*nums = 0;
break;
case 1:
*nums = &(e->back_ref1);
break;
default:
*nums = e->back_refs;
break;
}
return e->back_num;
}
extern int
onig_name_to_backref_number(regex_t* reg, const UChar* name,
const UChar* name_end, OnigRegion *region)
{
int i, n, *nums;
n = onig_name_to_group_numbers(reg, name, name_end, &nums);
if (n < 0)
return n;
else if (n == 0)
return ONIGERR_PARSER_BUG;
else if (n == 1)
return nums[0];
else {
if (IS_NOT_NULL(region)) {
for (i = n - 1; i >= 0; i--) {
if (region->beg[nums[i]] != ONIG_REGION_NOTPOS)
return nums[i];
}
}
return nums[n - 1];
}
}
#else /* USE_NAMED_GROUP */
extern int
onig_name_to_group_numbers(regex_t* reg, const UChar* name,
const UChar* name_end, int** nums)
{
return ONIG_NO_SUPPORT_CONFIG;
}
extern int
onig_name_to_backref_number(regex_t* reg, const UChar* name,
const UChar* name_end, OnigRegion* region)
{
return ONIG_NO_SUPPORT_CONFIG;
}
extern int
onig_foreach_name(regex_t* reg,
int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg)
{
return ONIG_NO_SUPPORT_CONFIG;
}
extern int
onig_number_of_names(regex_t* reg)
{
return 0;
}
#endif /* else USE_NAMED_GROUP */
extern int
onig_noname_group_capture_is_active(regex_t* reg)
{
if (ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_DONT_CAPTURE_GROUP))
return 0;
#ifdef USE_NAMED_GROUP
if (onig_number_of_names(reg) > 0 &&
IS_SYNTAX_BV(reg->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
!ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_CAPTURE_GROUP)) {
return 0;
}
#endif
return 1;
}
#define INIT_SCANENV_MEMNODES_ALLOC_SIZE 16
static void
scan_env_clear(ScanEnv* env)
{
int i;
BIT_STATUS_CLEAR(env->capture_history);
BIT_STATUS_CLEAR(env->bt_mem_start);
BIT_STATUS_CLEAR(env->bt_mem_end);
BIT_STATUS_CLEAR(env->backrefed_mem);
env->error = (UChar* )NULL;
env->error_end = (UChar* )NULL;
env->num_call = 0;
env->num_mem = 0;
#ifdef USE_NAMED_GROUP
env->num_named = 0;
#endif
env->mem_alloc = 0;
env->mem_nodes_dynamic = (Node** )NULL;
for (i = 0; i < SCANENV_MEMNODES_SIZE; i++)
env->mem_nodes_static[i] = NULL_NODE;
#ifdef USE_COMBINATION_EXPLOSION_CHECK
env->num_comb_exp_check = 0;
env->comb_exp_max_regnum = 0;
env->curr_max_regnum = 0;
env->has_recursion = 0;
#endif
env->warnings_flag = 0;
}
static int
scan_env_add_mem_entry(ScanEnv* env)
{
int i, need, alloc;
Node** p;
need = env->num_mem + 1;
if (need > ONIG_MAX_CAPTURE_GROUP_NUM)
return ONIGERR_TOO_MANY_CAPTURE_GROUPS;
if (need >= SCANENV_MEMNODES_SIZE) {
if (env->mem_alloc <= need) {
if (IS_NULL(env->mem_nodes_dynamic)) {
alloc = INIT_SCANENV_MEMNODES_ALLOC_SIZE;
p = (Node** )xmalloc(sizeof(Node*) * alloc);
xmemcpy(p, env->mem_nodes_static,
sizeof(Node*) * SCANENV_MEMNODES_SIZE);
}
else {
alloc = env->mem_alloc * 2;
p = (Node** )xrealloc(env->mem_nodes_dynamic, sizeof(Node*) * alloc);
}
CHECK_NULL_RETURN_MEMERR(p);
for (i = env->num_mem + 1; i < alloc; i++)
p[i] = NULL_NODE;
env->mem_nodes_dynamic = p;
env->mem_alloc = alloc;
}
}
env->num_mem++;
return env->num_mem;
}
static int
scan_env_set_mem_node(ScanEnv* env, int num, Node* node)
{
if (env->num_mem >= num)
SCANENV_MEM_NODES(env)[num] = node;
else
return ONIGERR_PARSER_BUG;
return 0;
}
#ifdef USE_PARSE_TREE_NODE_RECYCLE
typedef struct _FreeNode {
struct _FreeNode* next;
} FreeNode;
static FreeNode* FreeNodeList = (FreeNode* )NULL;
#endif
extern void
onig_node_free(Node* node)
{
start:
if (IS_NULL(node)) return ;
switch (NTYPE(node)) {
case NT_STR:
if (NSTR(node)->capa != 0 &&
IS_NOT_NULL(NSTR(node)->s) && NSTR(node)->s != NSTR(node)->buf) {
xfree(NSTR(node)->s);
}
break;
case NT_LIST:
case NT_ALT:
onig_node_free(NCAR(node));
{
Node* next_node = NCDR(node);
#ifdef USE_PARSE_TREE_NODE_RECYCLE
{
FreeNode* n = (FreeNode* )node;
THREAD_ATOMIC_START;
n->next = FreeNodeList;
FreeNodeList = n;
THREAD_ATOMIC_END;
}
#else
xfree(node);
#endif
node = next_node;
goto start;
}
break;
case NT_CCLASS:
{
CClassNode* cc = NCCLASS(node);
if (IS_NCCLASS_SHARE(cc)) return ;
if (cc->mbuf)
bbuf_free(cc->mbuf);
}
break;
case NT_QTFR:
if (NQTFR(node)->target)
onig_node_free(NQTFR(node)->target);
break;
case NT_ENCLOSE:
if (NENCLOSE(node)->target)
onig_node_free(NENCLOSE(node)->target);
break;
case NT_BREF:
if (IS_NOT_NULL(NBREF(node)->back_dynamic))
xfree(NBREF(node)->back_dynamic);
break;
case NT_ANCHOR:
if (NANCHOR(node)->target)
onig_node_free(NANCHOR(node)->target);
break;
}
#ifdef USE_PARSE_TREE_NODE_RECYCLE
{
FreeNode* n = (FreeNode* )node;
THREAD_ATOMIC_START;
n->next = FreeNodeList;
FreeNodeList = n;
THREAD_ATOMIC_END;
}
#else
xfree(node);
#endif
}
#ifdef USE_PARSE_TREE_NODE_RECYCLE
extern int
onig_free_node_list(void)
{
FreeNode* n;
/* THREAD_ATOMIC_START; */
while (IS_NOT_NULL(FreeNodeList)) {
n = FreeNodeList;
FreeNodeList = FreeNodeList->next;
xfree(n);
}
/* THREAD_ATOMIC_END; */
return 0;
}
#endif
static Node*
node_new(void)
{
Node* node;
#ifdef USE_PARSE_TREE_NODE_RECYCLE
THREAD_ATOMIC_START;
if (IS_NOT_NULL(FreeNodeList)) {
node = (Node* )FreeNodeList;
FreeNodeList = FreeNodeList->next;
THREAD_ATOMIC_END;
return node;
}
THREAD_ATOMIC_END;
#endif
node = (Node* )xmalloc(sizeof(Node));
/* xmemset(node, 0, sizeof(Node)); */
return node;
}
#if defined(USE_MULTI_THREAD_SYSTEM) && \
defined(USE_SHARED_CCLASS_TABLE) && \
defined(USE_PARSE_TREE_NODE_RECYCLE)
static Node*
node_new_locked(void)
{
Node* node;
if (IS_NOT_NULL(FreeNodeList)) {
node = (Node* )FreeNodeList;
FreeNodeList = FreeNodeList->next;
return node;
}
node = (Node* )xmalloc(sizeof(Node));
/* xmemset(node, 0, sizeof(Node)); */
return node;
}
#endif
static void
initialize_cclass(CClassNode* cc)
{
BITSET_CLEAR(cc->bs);
/* cc->base.flags = 0; */
cc->flags = 0;
cc->mbuf = NULL;
}
static Node*
node_new_cclass(void)
{
Node* node = node_new();
CHECK_NULL_RETURN(node);
SET_NTYPE(node, NT_CCLASS);
initialize_cclass(NCCLASS(node));
return node;
}
#if defined(USE_MULTI_THREAD_SYSTEM) && \
defined(USE_SHARED_CCLASS_TABLE) && \
defined(USE_PARSE_TREE_NODE_RECYCLE)
static Node*
node_new_cclass_locked(void)
{
Node* node = node_new_locked();
CHECK_NULL_RETURN(node);
SET_NTYPE(node, NT_CCLASS);
initialize_cclass(NCCLASS(node));
return node;
}
#else
#define node_new_cclass_locked() node_new_cclass()
#endif
#ifdef USE_SHARED_CCLASS_TABLE
static Node*
node_new_cclass_by_codepoint_range(int not, OnigCodePoint sb_out,
const OnigCodePoint ranges[])
{
int n, i;
CClassNode* cc;
OnigCodePoint j;
Node* node = node_new_cclass_locked();
CHECK_NULL_RETURN(node);
cc = NCCLASS(node);
if (not != 0) NCCLASS_SET_NOT(cc);
BITSET_CLEAR(cc->bs);
if (sb_out > 0 && IS_NOT_NULL(ranges)) {
n = ONIGENC_CODE_RANGE_NUM(ranges);
for (i = 0; i < n; i++) {
for (j = ONIGENC_CODE_RANGE_FROM(ranges, i);
j <= (OnigCodePoint )ONIGENC_CODE_RANGE_TO(ranges, i); j++) {
if (j >= sb_out) goto sb_end;
BITSET_SET_BIT(cc->bs, j);
}
}
}
sb_end:
if (IS_NULL(ranges)) {
is_null:
cc->mbuf = NULL;
}
else {
BBuf* bbuf;
n = ONIGENC_CODE_RANGE_NUM(ranges);
if (n == 0) goto is_null;
bbuf = (BBuf* )xmalloc(sizeof(BBuf));
CHECK_NULL_RETURN(bbuf);
bbuf->alloc = n + 1;
bbuf->used = n + 1;
bbuf->p = (UChar* )((void* )ranges);
cc->mbuf = bbuf;
}
return node;
}
#endif /* USE_SHARED_CCLASS_TABLE */
static Node*
node_new_ctype(int type, int not, int ascii_range)
{
Node* node = node_new();
CHECK_NULL_RETURN(node);
SET_NTYPE(node, NT_CTYPE);
NCTYPE(node)->ctype = type;
NCTYPE(node)->not = not;
NCTYPE(node)->ascii_range = ascii_range;
return node;
}
static Node*
node_new_anychar(void)
{
Node* node = node_new();
CHECK_NULL_RETURN(node);
SET_NTYPE(node, NT_CANY);
return node;
}
static Node*
node_new_list(Node* left, Node* right)
{
Node* node = node_new();
CHECK_NULL_RETURN(node);
SET_NTYPE(node, NT_LIST);
NCAR(node) = left;
NCDR(node) = right;
return node;
}
extern Node*
onig_node_new_list(Node* left, Node* right)
{
return node_new_list(left, right);
}
extern Node*
onig_node_list_add(Node* list, Node* x)
{
Node *n;
n = onig_node_new_list(x, NULL);
if (IS_NULL(n)) return NULL_NODE;
if (IS_NOT_NULL(list)) {
while (IS_NOT_NULL(NCDR(list)))
list = NCDR(list);
NCDR(list) = n;
}
return n;
}
extern Node*
onig_node_new_alt(Node* left, Node* right)
{
Node* node = node_new();
CHECK_NULL_RETURN(node);
SET_NTYPE(node, NT_ALT);
NCAR(node) = left;
NCDR(node) = right;
return node;
}
extern Node*
onig_node_new_anchor(int type)
{
Node* node = node_new();
CHECK_NULL_RETURN(node);
SET_NTYPE(node, NT_ANCHOR);
NANCHOR(node)->type = type;
NANCHOR(node)->target = NULL;
NANCHOR(node)->char_len = -1;
NANCHOR(node)->ascii_range = 0;
return node;
}
static Node*
node_new_backref(int back_num, int* backrefs, int by_name,
#ifdef USE_BACKREF_WITH_LEVEL
int exist_level, int nest_level,
#endif
ScanEnv* env)
{
int i;
Node* node = node_new();
CHECK_NULL_RETURN(node);
SET_NTYPE(node, NT_BREF);
NBREF(node)->state = 0;
NBREF(node)->back_num = back_num;
NBREF(node)->back_dynamic = (int* )NULL;
if (by_name != 0)
NBREF(node)->state |= NST_NAME_REF;
#ifdef USE_BACKREF_WITH_LEVEL
if (exist_level != 0) {
NBREF(node)->state |= NST_NEST_LEVEL;
NBREF(node)->nest_level = nest_level;
}
#endif
for (i = 0; i < back_num; i++) {
if (backrefs[i] <= env->num_mem &&
IS_NULL(SCANENV_MEM_NODES(env)[backrefs[i]])) {
NBREF(node)->state |= NST_RECURSION; /* /...(\1).../ */
break;
}
}
if (back_num <= NODE_BACKREFS_SIZE) {
for (i = 0; i < back_num; i++)
NBREF(node)->back_static[i] = backrefs[i];
}
else {
int* p = (int* )xmalloc(sizeof(int) * back_num);
if (IS_NULL(p)) {
onig_node_free(node);
return NULL;
}
NBREF(node)->back_dynamic = p;
for (i = 0; i < back_num; i++)
p[i] = backrefs[i];
}
return node;
}
#ifdef USE_SUBEXP_CALL
static Node*
node_new_call(UChar* name, UChar* name_end, int gnum)
{
Node* node = node_new();
CHECK_NULL_RETURN(node);
SET_NTYPE(node, NT_CALL);
NCALL(node)->state = 0;
NCALL(node)->target = NULL_NODE;
NCALL(node)->name = name;
NCALL(node)->name_end = name_end;
NCALL(node)->group_num = gnum; /* call by number if gnum != 0 */
return node;
}
#endif
static Node*
node_new_quantifier(int lower, int upper, int by_number)
{
Node* node = node_new();
CHECK_NULL_RETURN(node);
SET_NTYPE(node, NT_QTFR);
NQTFR(node)->state = 0;
NQTFR(node)->target = NULL;
NQTFR(node)->lower = lower;
NQTFR(node)->upper = upper;
NQTFR(node)->greedy = 1;
NQTFR(node)->target_empty_info = NQ_TARGET_ISNOT_EMPTY;
NQTFR(node)->head_exact = NULL_NODE;
NQTFR(node)->next_head_exact = NULL_NODE;
NQTFR(node)->is_refered = 0;
if (by_number != 0)
NQTFR(node)->state |= NST_BY_NUMBER;
#ifdef USE_COMBINATION_EXPLOSION_CHECK
NQTFR(node)->comb_exp_check_num = 0;
#endif
return node;
}
static Node*
node_new_enclose(int type)
{
Node* node = node_new();
CHECK_NULL_RETURN(node);
SET_NTYPE(node, NT_ENCLOSE);
NENCLOSE(node)->type = type;
NENCLOSE(node)->state = 0;
NENCLOSE(node)->regnum = 0;
NENCLOSE(node)->option = 0;
NENCLOSE(node)->target = NULL;
NENCLOSE(node)->call_addr = -1;
NENCLOSE(node)->opt_count = 0;
return node;
}
extern Node*
onig_node_new_enclose(int type)
{
return node_new_enclose(type);
}
static Node*
node_new_enclose_memory(OnigOptionType option, int is_named)
{
Node* node = node_new_enclose(ENCLOSE_MEMORY);
CHECK_NULL_RETURN(node);
if (is_named != 0)
SET_ENCLOSE_STATUS(node, NST_NAMED_GROUP);
#ifdef USE_SUBEXP_CALL
NENCLOSE(node)->option = option;
#endif
return node;
}
static Node*
node_new_option(OnigOptionType option)
{
Node* node = node_new_enclose(ENCLOSE_OPTION);
CHECK_NULL_RETURN(node);
NENCLOSE(node)->option = option;
return node;
}
extern int
onig_node_str_cat(Node* node, const UChar* s, const UChar* end)
{
ptrdiff_t addlen = end - s;
if (addlen > 0) {
ptrdiff_t len = NSTR(node)->end - NSTR(node)->s;
if (NSTR(node)->capa > 0 || (len + addlen > NODE_STR_BUF_SIZE - 1)) {
UChar* p;
ptrdiff_t capa = len + addlen + NODE_STR_MARGIN;
if (capa <= NSTR(node)->capa) {
onig_strcpy(NSTR(node)->s + len, s, end);
}
else {
if (NSTR(node)->s == NSTR(node)->buf)
p = strcat_capa_from_static(NSTR(node)->s, NSTR(node)->end,
s, end, capa);
else
p = strcat_capa(NSTR(node)->s, NSTR(node)->end, s, end, capa);
CHECK_NULL_RETURN_MEMERR(p);
NSTR(node)->s = p;
NSTR(node)->capa = (int )capa;
}
}
else {
onig_strcpy(NSTR(node)->s + len, s, end);
}
NSTR(node)->end = NSTR(node)->s + len + addlen;
}
return 0;
}
extern int
onig_node_str_set(Node* node, const UChar* s, const UChar* end)
{
onig_node_str_clear(node);
return onig_node_str_cat(node, s, end);
}
static int
node_str_cat_char(Node* node, UChar c)
{
UChar s[1];
s[0] = c;
return onig_node_str_cat(node, s, s + 1);
}
static int
node_str_cat_codepoint(Node* node, OnigEncoding enc, OnigCodePoint c)
{
UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
int num = ONIGENC_CODE_TO_MBC(enc, c, buf);
if (num < 0) return num;
return onig_node_str_cat(node, buf, buf + num);
}
extern void
onig_node_conv_to_str_node(Node* node, int flag)
{
SET_NTYPE(node, NT_STR);
NSTR(node)->flag = flag;
NSTR(node)->capa = 0;
NSTR(node)->s = NSTR(node)->buf;
NSTR(node)->end = NSTR(node)->buf;
}
extern void
onig_node_str_clear(Node* node)
{
if (NSTR(node)->capa != 0 &&
IS_NOT_NULL(NSTR(node)->s) && NSTR(node)->s != NSTR(node)->buf) {
xfree(NSTR(node)->s);
}
NSTR(node)->capa = 0;
NSTR(node)->flag = 0;
NSTR(node)->s = NSTR(node)->buf;
NSTR(node)->end = NSTR(node)->buf;
}
static Node*
node_new_str(const UChar* s, const UChar* end)
{
Node* node = node_new();
CHECK_NULL_RETURN(node);
SET_NTYPE(node, NT_STR);
NSTR(node)->capa = 0;
NSTR(node)->flag = 0;
NSTR(node)->s = NSTR(node)->buf;
NSTR(node)->end = NSTR(node)->buf;
if (onig_node_str_cat(node, s, end)) {
onig_node_free(node);
return NULL;
}
return node;
}
extern Node*
onig_node_new_str(const UChar* s, const UChar* end)
{
return node_new_str(s, end);
}
static Node*
node_new_str_raw(UChar* s, UChar* end)
{
Node* node = node_new_str(s, end);
if (IS_NOT_NULL(node))
NSTRING_SET_RAW(node);
return node;
}
static Node*
node_new_empty(void)
{
return node_new_str(NULL, NULL);
}
static Node*
node_new_str_raw_char(UChar c)
{
UChar p[1];
p[0] = c;
return node_new_str_raw(p, p + 1);
}
static Node*
str_node_split_last_char(StrNode* sn, OnigEncoding enc)
{
const UChar *p;
Node* n = NULL_NODE;
if (sn->end > sn->s) {
p = onigenc_get_prev_char_head(enc, sn->s, sn->end, sn->end);
if (p && p > sn->s) { /* can be split. */
n = node_new_str(p, sn->end);
if (IS_NOT_NULL(n) && (sn->flag & NSTR_RAW) != 0)
NSTRING_SET_RAW(n);
sn->end = (UChar* )p;
}
}
return n;
}
static int
str_node_can_be_split(StrNode* sn, OnigEncoding enc)
{
if (sn->end > sn->s) {
return ((enclen(enc, sn->s, sn->end) < sn->end - sn->s) ? 1 : 0);
}
return 0;
}
#ifdef USE_PAD_TO_SHORT_BYTE_CHAR
static int
node_str_head_pad(StrNode* sn, int num, UChar val)
{
UChar buf[NODE_STR_BUF_SIZE];
int i, len;
len = sn->end - sn->s;
onig_strcpy(buf, sn->s, sn->end);
onig_strcpy(&(sn->s[num]), buf, buf + len);
sn->end += num;
for (i = 0; i < num; i++) {
sn->s[i] = val;
}
}
#endif
extern int
onig_scan_unsigned_number(UChar** src, const UChar* end, OnigEncoding enc)
{
unsigned int num, val;
OnigCodePoint c;
UChar* p = *src;
PFETCH_READY;
num = 0;
while (!PEND) {
PFETCH(c);
if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
val = (unsigned int )DIGITVAL(c);
if ((INT_MAX_LIMIT - val) / 10UL < num)
return -1; /* overflow */
num = num * 10 + val;
}
else {
PUNFETCH;
break;
}
}
*src = p;
return num;
}
static int
scan_unsigned_hexadecimal_number(UChar** src, UChar* end, int minlen,
int maxlen, OnigEncoding enc)
{
OnigCodePoint c;
unsigned int num, val;
int restlen;
UChar* p = *src;
PFETCH_READY;
restlen = maxlen - minlen;
num = 0;
while (!PEND && maxlen-- != 0) {
PFETCH(c);
if (ONIGENC_IS_CODE_XDIGIT(enc, c)) {
val = (unsigned int )XDIGITVAL(enc,c);
if ((INT_MAX_LIMIT - val) / 16UL < num)
return -1; /* overflow */
num = (num << 4) + XDIGITVAL(enc,c);
}
else {
PUNFETCH;
break;
}
}
if (maxlen > restlen)
return -2; /* not enough digits */
*src = p;
return num;
}
static int
scan_unsigned_octal_number(UChar** src, UChar* end, int maxlen,
OnigEncoding enc)
{
OnigCodePoint c;
unsigned int num, val;
UChar* p = *src;
PFETCH_READY;
num = 0;
while (!PEND && maxlen-- != 0) {
PFETCH(c);
if (ONIGENC_IS_CODE_DIGIT(enc, c) && c < '8') {
val = ODIGITVAL(c);
if ((INT_MAX_LIMIT - val) / 8UL < num)
return -1; /* overflow */
num = (num << 3) + val;
}
else {
PUNFETCH;
break;
}
}
*src = p;
return num;
}
#define BBUF_WRITE_CODE_POINT(bbuf,pos,code) \
BBUF_WRITE(bbuf, pos, &(code), SIZE_CODE_POINT)
/* data format:
[n][from-1][to-1][from-2][to-2] ... [from-n][to-n]
(all data size is OnigCodePoint)
*/
static int
new_code_range(BBuf** pbuf)
{
#define INIT_MULTI_BYTE_RANGE_SIZE (SIZE_CODE_POINT * 5)
int r;
OnigCodePoint n;
BBuf* bbuf;
bbuf = *pbuf = (BBuf* )xmalloc(sizeof(BBuf));
CHECK_NULL_RETURN_MEMERR(*pbuf);
r = BBUF_INIT(*pbuf, INIT_MULTI_BYTE_RANGE_SIZE);
if (r) return r;
n = 0;
BBUF_WRITE_CODE_POINT(bbuf, 0, n);
return 0;
}
static int
add_code_range_to_buf0(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePoint to,
int checkdup)
{
int r, inc_n, pos;
OnigCodePoint low, high, bound, x;
OnigCodePoint n, *data;
BBuf* bbuf;
if (from > to) {
n = from; from = to; to = n;
}
if (IS_NULL(*pbuf)) {
r = new_code_range(pbuf);
if (r) return r;
bbuf = *pbuf;
n = 0;
}
else {
bbuf = *pbuf;
GET_CODE_POINT(n, bbuf->p);
}
data = (OnigCodePoint* )(bbuf->p);
data++;
bound = (from == 0) ? 0 : n;
for (low = 0; low < bound; ) {
x = (low + bound) >> 1;
if (from - 1 > data[x*2 + 1])
low = x + 1;
else
bound = x;
}
high = (to == ONIG_LAST_CODE_POINT) ? n : low;
for (bound = n; high < bound; ) {
x = (high + bound) >> 1;
if (to + 1 >= data[x*2])
high = x + 1;
else
bound = x;
}
/* data[(low-1)*2+1] << from <= data[low*2]
* data[(high-1)*2+1] <= to << data[high*2]
*/
inc_n = low + 1 - high;
if (n + inc_n > ONIG_MAX_MULTI_BYTE_RANGES_NUM)
return ONIGERR_TOO_MANY_MULTI_BYTE_RANGES;
if (inc_n != 1) {
if (checkdup && from <= data[low*2+1]
&& (data[low*2] <= from || data[low*2+1] <= to))
CC_DUP_WARN(env);
if (from > data[low*2])
from = data[low*2];
if (to < data[(high - 1)*2 + 1])
to = data[(high - 1)*2 + 1];
}
if (inc_n != 0) {
int from_pos = SIZE_CODE_POINT * (1 + high * 2);
int to_pos = SIZE_CODE_POINT * (1 + (low + 1) * 2);
if (inc_n > 0) {
if (high < n) {
int size = (n - high) * 2 * SIZE_CODE_POINT;
BBUF_MOVE_RIGHT(bbuf, from_pos, to_pos, size);
}
}
else {
BBUF_MOVE_LEFT_REDUCE(bbuf, from_pos, to_pos);
}
}
pos = SIZE_CODE_POINT * (1 + low * 2);
BBUF_ENSURE_SIZE(bbuf, pos + SIZE_CODE_POINT * 2);
BBUF_WRITE_CODE_POINT(bbuf, pos, from);
BBUF_WRITE_CODE_POINT(bbuf, pos + SIZE_CODE_POINT, to);
n += inc_n;
BBUF_WRITE_CODE_POINT(bbuf, 0, n);
return 0;
}
static int
add_code_range_to_buf(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePoint to)
{
return add_code_range_to_buf0(pbuf, env, from, to, 1);
}
static int
add_code_range0(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePoint to, int checkdup)
{
if (from > to) {
if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
return 0;
else
return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
}
return add_code_range_to_buf0(pbuf, env, from, to, checkdup);
}
static int
add_code_range(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePoint to)
{
return add_code_range0(pbuf, env, from, to, 1);
}
static int
not_code_range_buf(OnigEncoding enc, BBuf* bbuf, BBuf** pbuf, ScanEnv* env)
{
int r, i, n;
OnigCodePoint pre, from, *data, to = 0;
*pbuf = (BBuf* )NULL;
if (IS_NULL(bbuf)) {
set_all:
return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
}
data = (OnigCodePoint* )(bbuf->p);
GET_CODE_POINT(n, data);
data++;
if (n <= 0) goto set_all;
r = 0;
pre = MBCODE_START_POS(enc);
for (i = 0; i < n; i++) {
from = data[i*2];
to = data[i*2+1];
if (pre <= from - 1) {
r = add_code_range_to_buf(pbuf, env, pre, from - 1);
if (r != 0) return r;
}
if (to == ONIG_LAST_CODE_POINT) break;
pre = to + 1;
}
if (to < ONIG_LAST_CODE_POINT) {
r = add_code_range_to_buf(pbuf, env, to + 1, ONIG_LAST_CODE_POINT);
}
return r;
}
#define SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2) do {\
BBuf *tbuf; \
int tnot; \
tnot = not1; not1 = not2; not2 = tnot; \
tbuf = bbuf1; bbuf1 = bbuf2; bbuf2 = tbuf; \
} while (0)
static int
or_code_range_buf(OnigEncoding enc, BBuf* bbuf1, int not1,
BBuf* bbuf2, int not2, BBuf** pbuf, ScanEnv* env)
{
int r;
OnigCodePoint i, n1, *data1;
OnigCodePoint from, to;
*pbuf = (BBuf* )NULL;
if (IS_NULL(bbuf1) && IS_NULL(bbuf2)) {
if (not1 != 0 || not2 != 0)
return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
return 0;
}
r = 0;
if (IS_NULL(bbuf2))
SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2);
if (IS_NULL(bbuf1)) {
if (not1 != 0) {
return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
}
else {
if (not2 == 0) {
return bbuf_clone(pbuf, bbuf2);
}
else {
return not_code_range_buf(enc, bbuf2, pbuf, env);
}
}
}
if (not1 != 0)
SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2);
data1 = (OnigCodePoint* )(bbuf1->p);
GET_CODE_POINT(n1, data1);
data1++;
if (not2 == 0 && not1 == 0) { /* 1 OR 2 */
r = bbuf_clone(pbuf, bbuf2);
}
else if (not1 == 0) { /* 1 OR (not 2) */
r = not_code_range_buf(enc, bbuf2, pbuf, env);
}
if (r != 0) return r;
for (i = 0; i < n1; i++) {
from = data1[i*2];
to = data1[i*2+1];
r = add_code_range_to_buf(pbuf, env, from, to);
if (r != 0) return r;
}
return 0;
}
static int
and_code_range1(BBuf** pbuf, ScanEnv* env, OnigCodePoint from1, OnigCodePoint to1,
OnigCodePoint* data, int n)
{
int i, r;
OnigCodePoint from2, to2;
for (i = 0; i < n; i++) {
from2 = data[i*2];
to2 = data[i*2+1];
if (from2 < from1) {
if (to2 < from1) continue;
else {
from1 = to2 + 1;
}
}
else if (from2 <= to1) {
if (to2 < to1) {
if (from1 <= from2 - 1) {
r = add_code_range_to_buf(pbuf, env, from1, from2-1);
if (r != 0) return r;
}
from1 = to2 + 1;
}
else {
to1 = from2 - 1;
}
}
else {
from1 = from2;
}
if (from1 > to1) break;
}
if (from1 <= to1) {
r = add_code_range_to_buf(pbuf, env, from1, to1);
if (r != 0) return r;
}
return 0;
}
static int
and_code_range_buf(BBuf* bbuf1, int not1, BBuf* bbuf2, int not2, BBuf** pbuf, ScanEnv* env)
{
int r;
OnigCodePoint i, j, n1, n2, *data1, *data2;
OnigCodePoint from, to, from1, to1, from2, to2;
*pbuf = (BBuf* )NULL;
if (IS_NULL(bbuf1)) {
if (not1 != 0 && IS_NOT_NULL(bbuf2)) /* not1 != 0 -> not2 == 0 */
return bbuf_clone(pbuf, bbuf2);
return 0;
}
else if (IS_NULL(bbuf2)) {
if (not2 != 0)
return bbuf_clone(pbuf, bbuf1);
return 0;
}
if (not1 != 0)
SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2);
data1 = (OnigCodePoint* )(bbuf1->p);
data2 = (OnigCodePoint* )(bbuf2->p);
GET_CODE_POINT(n1, data1);
GET_CODE_POINT(n2, data2);
data1++;
data2++;
if (not2 == 0 && not1 == 0) { /* 1 AND 2 */
for (i = 0; i < n1; i++) {
from1 = data1[i*2];
to1 = data1[i*2+1];
for (j = 0; j < n2; j++) {
from2 = data2[j*2];
to2 = data2[j*2+1];
if (from2 > to1) break;
if (to2 < from1) continue;
from = MAX(from1, from2);
to = MIN(to1, to2);
r = add_code_range_to_buf(pbuf, env, from, to);
if (r != 0) return r;
}
}
}
else if (not1 == 0) { /* 1 AND (not 2) */
for (i = 0; i < n1; i++) {
from1 = data1[i*2];
to1 = data1[i*2+1];
r = and_code_range1(pbuf, env, from1, to1, data2, n2);
if (r != 0) return r;
}
}
return 0;
}
static int
and_cclass(CClassNode* dest, CClassNode* cc, ScanEnv* env)
{
OnigEncoding enc = env->enc;
int r, not1, not2;
BBuf *buf1, *buf2, *pbuf = 0;
BitSetRef bsr1, bsr2;
BitSet bs1, bs2;
not1 = IS_NCCLASS_NOT(dest);
bsr1 = dest->bs;
buf1 = dest->mbuf;
not2 = IS_NCCLASS_NOT(cc);
bsr2 = cc->bs;
buf2 = cc->mbuf;
if (not1 != 0) {
bitset_invert_to(bsr1, bs1);
bsr1 = bs1;
}
if (not2 != 0) {
bitset_invert_to(bsr2, bs2);
bsr2 = bs2;
}
bitset_and(bsr1, bsr2);
if (bsr1 != dest->bs) {
bitset_copy(dest->bs, bsr1);
bsr1 = dest->bs;
}
if (not1 != 0) {
bitset_invert(dest->bs);
}
if (! ONIGENC_IS_SINGLEBYTE(enc)) {
if (not1 != 0 && not2 != 0) {
r = or_code_range_buf(enc, buf1, 0, buf2, 0, &pbuf, env);
}
else {
r = and_code_range_buf(buf1, not1, buf2, not2, &pbuf, env);
if (r == 0 && not1 != 0) {
BBuf *tbuf = 0;
r = not_code_range_buf(enc, pbuf, &tbuf, env);
bbuf_free(pbuf);
pbuf = tbuf;
}
}
if (r != 0) {
bbuf_free(pbuf);
return r;
}
dest->mbuf = pbuf;
bbuf_free(buf1);
return r;
}
return 0;
}
static int
or_cclass(CClassNode* dest, CClassNode* cc, ScanEnv* env)
{
OnigEncoding enc = env->enc;
int r, not1, not2;
BBuf *buf1, *buf2, *pbuf = 0;
BitSetRef bsr1, bsr2;
BitSet bs1, bs2;
not1 = IS_NCCLASS_NOT(dest);
bsr1 = dest->bs;
buf1 = dest->mbuf;
not2 = IS_NCCLASS_NOT(cc);
bsr2 = cc->bs;
buf2 = cc->mbuf;
if (not1 != 0) {
bitset_invert_to(bsr1, bs1);
bsr1 = bs1;
}
if (not2 != 0) {
bitset_invert_to(bsr2, bs2);
bsr2 = bs2;
}
bitset_or(bsr1, bsr2);
if (bsr1 != dest->bs) {
bitset_copy(dest->bs, bsr1);
bsr1 = dest->bs;
}
if (not1 != 0) {
bitset_invert(dest->bs);
}
if (! ONIGENC_IS_SINGLEBYTE(enc)) {
if (not1 != 0 && not2 != 0) {
r = and_code_range_buf(buf1, 0, buf2, 0, &pbuf, env);
}
else {
r = or_code_range_buf(enc, buf1, not1, buf2, not2, &pbuf, env);
if (r == 0 && not1 != 0) {
BBuf *tbuf = 0;
r = not_code_range_buf(enc, pbuf, &tbuf, env);
bbuf_free(pbuf);
pbuf = tbuf;
}
}
if (r != 0) {
bbuf_free(pbuf);
return r;
}
dest->mbuf = pbuf;
bbuf_free(buf1);
return r;
}
else
return 0;
}
static void UNKNOWN_ESC_WARN(ScanEnv *env, int c);
static int
conv_backslash_value(int c, ScanEnv* env)
{
if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_CONTROL_CHARS)) {
switch (c) {
case 'n': return '\n';
case 't': return '\t';
case 'r': return '\r';
case 'f': return '\f';
case 'a': return '\007';
case 'b': return '\010';
case 'e': return '\033';
case 'v':
if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_V_VTAB))
return '\v';
break;
default:
if (('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z'))
UNKNOWN_ESC_WARN(env, c);
break;
}
}
return c;
}
#ifdef USE_NO_INVALID_QUANTIFIER
#define is_invalid_quantifier_target(node) 0
#else
static int
is_invalid_quantifier_target(Node* node)
{
switch (NTYPE(node)) {
case NT_ANCHOR:
return 1;
break;
case NT_ENCLOSE:
/* allow enclosed elements */
/* return is_invalid_quantifier_target(NENCLOSE(node)->target); */
break;
case NT_LIST:
do {
if (! is_invalid_quantifier_target(NCAR(node))) return 0;
} while (IS_NOT_NULL(node = NCDR(node)));
return 0;
break;
case NT_ALT:
do {
if (is_invalid_quantifier_target(NCAR(node))) return 1;
} while (IS_NOT_NULL(node = NCDR(node)));
break;
default:
break;
}
return 0;
}
#endif
/* ?:0, *:1, +:2, ??:3, *?:4, +?:5 */
static int
popular_quantifier_num(QtfrNode* q)
{
if (q->greedy) {
if (q->lower == 0) {
if (q->upper == 1) return 0;
else if (IS_REPEAT_INFINITE(q->upper)) return 1;
}
else if (q->lower == 1) {
if (IS_REPEAT_INFINITE(q->upper)) return 2;
}
}
else {
if (q->lower == 0) {
if (q->upper == 1) return 3;
else if (IS_REPEAT_INFINITE(q->upper)) return 4;
}
else if (q->lower == 1) {
if (IS_REPEAT_INFINITE(q->upper)) return 5;
}
}
return -1;
}
enum ReduceType {
RQ_ASIS = 0, /* as is */
RQ_DEL = 1, /* delete parent */
RQ_A, /* to '*' */
RQ_AQ, /* to '*?' */
RQ_QQ, /* to '??' */
RQ_P_QQ, /* to '+)??' */
RQ_PQ_Q /* to '+?)?' */
};
static enum ReduceType const ReduceTypeTable[6][6] = {
{RQ_DEL, RQ_A, RQ_A, RQ_QQ, RQ_AQ, RQ_ASIS}, /* '?' */
{RQ_DEL, RQ_DEL, RQ_DEL, RQ_P_QQ, RQ_P_QQ, RQ_DEL}, /* '*' */
{RQ_A, RQ_A, RQ_DEL, RQ_ASIS, RQ_P_QQ, RQ_DEL}, /* '+' */
{RQ_DEL, RQ_AQ, RQ_AQ, RQ_DEL, RQ_AQ, RQ_AQ}, /* '??' */
{RQ_DEL, RQ_DEL, RQ_DEL, RQ_DEL, RQ_DEL, RQ_DEL}, /* '*?' */
{RQ_ASIS, RQ_PQ_Q, RQ_DEL, RQ_AQ, RQ_AQ, RQ_DEL} /* '+?' */
};
extern void
onig_reduce_nested_quantifier(Node* pnode, Node* cnode)
{
int pnum, cnum;
QtfrNode *p, *c;
p = NQTFR(pnode);
c = NQTFR(cnode);
pnum = popular_quantifier_num(p);
cnum = popular_quantifier_num(c);
if (pnum < 0 || cnum < 0) return ;
switch (ReduceTypeTable[cnum][pnum]) {
case RQ_DEL:
*pnode = *cnode;
break;
case RQ_A:
p->target = c->target;
p->lower = 0; p->upper = REPEAT_INFINITE; p->greedy = 1;
break;
case RQ_AQ:
p->target = c->target;
p->lower = 0; p->upper = REPEAT_INFINITE; p->greedy = 0;
break;
case RQ_QQ:
p->target = c->target;
p->lower = 0; p->upper = 1; p->greedy = 0;
break;
case RQ_P_QQ:
p->target = cnode;
p->lower = 0; p->upper = 1; p->greedy = 0;
c->lower = 1; c->upper = REPEAT_INFINITE; c->greedy = 1;
return ;
break;
case RQ_PQ_Q:
p->target = cnode;
p->lower = 0; p->upper = 1; p->greedy = 1;
c->lower = 1; c->upper = REPEAT_INFINITE; c->greedy = 0;
return ;
break;
case RQ_ASIS:
p->target = cnode;
return ;
break;
}
c->target = NULL_NODE;
onig_node_free(cnode);
}
enum TokenSyms {
TK_EOT = 0, /* end of token */
TK_RAW_BYTE = 1,
TK_CHAR,
TK_STRING,
TK_CODE_POINT,
TK_ANYCHAR,
TK_CHAR_TYPE,
TK_BACKREF,
TK_CALL,
TK_ANCHOR,
TK_OP_REPEAT,
TK_INTERVAL,
TK_ANYCHAR_ANYTIME, /* SQL '%' == .* */
TK_ALT,
TK_SUBEXP_OPEN,
TK_SUBEXP_CLOSE,
TK_CC_OPEN,
TK_QUOTE_OPEN,
TK_CHAR_PROPERTY, /* \p{...}, \P{...} */
TK_LINEBREAK,
TK_EXTENDED_GRAPHEME_CLUSTER,
TK_KEEP,
/* in cc */
TK_CC_CLOSE,
TK_CC_RANGE,
TK_POSIX_BRACKET_OPEN,
TK_CC_AND, /* && */
TK_CC_CC_OPEN /* [ */
};
typedef struct {
enum TokenSyms type;
int escaped;
int base; /* is number: 8, 16 (used in [....]) */
UChar* backp;
union {
UChar* s;
int c;
OnigCodePoint code;
struct {
int subtype;
int ascii_range;
} anchor;
struct {
int lower;
int upper;
int greedy;
int possessive;
} repeat;
struct {
int num;
int ref1;
int* refs;
int by_name;
#ifdef USE_BACKREF_WITH_LEVEL
int exist_level;
int level; /* \k<name+n> */
#endif
} backref;
struct {
UChar* name;
UChar* name_end;
int gnum;
int rel;
} call;
struct {
int ctype;
int not;
} prop;
} u;
} OnigToken;
static int
fetch_range_quantifier(UChar** src, UChar* end, OnigToken* tok, ScanEnv* env)
{
int low, up, syn_allow, non_low = 0;
int r = 0;
OnigCodePoint c;
OnigEncoding enc = env->enc;
UChar* p = *src;
PFETCH_READY;
syn_allow = IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INVALID_INTERVAL);
if (PEND) {
if (syn_allow)
return 1; /* "....{" : OK! */
else
return ONIGERR_END_PATTERN_AT_LEFT_BRACE; /* "....{" syntax error */
}
if (! syn_allow) {
c = PPEEK;
if (c == ')' || c == '(' || c == '|') {
return ONIGERR_END_PATTERN_AT_LEFT_BRACE;
}
}
low = onig_scan_unsigned_number(&p, end, env->enc);
if (low < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
if (low > ONIG_MAX_REPEAT_NUM)
return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
if (p == *src) { /* can't read low */
if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV)) {
/* allow {,n} as {0,n} */
low = 0;
non_low = 1;
}
else
goto invalid;
}
if (PEND) goto invalid;
PFETCH(c);
if (c == ',') {
UChar* prev = p;
up = onig_scan_unsigned_number(&p, end, env->enc);
if (up < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
if (up > ONIG_MAX_REPEAT_NUM)
return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
if (p == prev) {
if (non_low != 0)
goto invalid;
up = REPEAT_INFINITE; /* {n,} : {n,infinite} */
}
}
else {
if (non_low != 0)
goto invalid;
PUNFETCH;
up = low; /* {n} : exact n times */
r = 2; /* fixed */
}
if (PEND) goto invalid;
PFETCH(c);
if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) {
if (c != MC_ESC(env->syntax)) goto invalid;
PFETCH(c);
}
if (c != '}') goto invalid;
if (!IS_REPEAT_INFINITE(up) && low > up) {
return ONIGERR_UPPER_SMALLER_THAN_LOWER_IN_REPEAT_RANGE;
}
tok->type = TK_INTERVAL;
tok->u.repeat.lower = low;
tok->u.repeat.upper = up;
*src = p;
return r; /* 0: normal {n,m}, 2: fixed {n} */
invalid:
if (syn_allow)
return 1; /* OK */
else
return ONIGERR_INVALID_REPEAT_RANGE_PATTERN;
}
/* \M-, \C-, \c, or \... */
static int
fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env)
{
int v;
OnigCodePoint c;
OnigEncoding enc = env->enc;
UChar* p = *src;
if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;
PFETCH_S(c);
switch (c) {
case 'M':
if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META)) {
if (PEND) return ONIGERR_END_PATTERN_AT_META;
PFETCH_S(c);
if (c != '-') return ONIGERR_META_CODE_SYNTAX;
if (PEND) return ONIGERR_END_PATTERN_AT_META;
PFETCH_S(c);
if (c == MC_ESC(env->syntax)) {
v = fetch_escaped_value(&p, end, env);
if (v < 0) return v;
c = (OnigCodePoint )v;
}
c = ((c & 0xff) | 0x80);
}
else
goto backslash;
break;
case 'C':
if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL)) {
if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL;
PFETCH_S(c);
if (c != '-') return ONIGERR_CONTROL_CODE_SYNTAX;
goto control;
}
else
goto backslash;
case 'c':
if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_C_CONTROL)) {
control:
if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL;
PFETCH_S(c);
if (c == '?') {
c = 0177;
}
else {
if (c == MC_ESC(env->syntax)) {
v = fetch_escaped_value(&p, end, env);
if (v < 0) return v;
c = (OnigCodePoint )v;
}
c &= 0x9f;
}
break;
}
/* fall through */
default:
{
backslash:
c = conv_backslash_value(c, env);
}
break;
}
*src = p;
return c;
}
static int fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env);
static OnigCodePoint
get_name_end_code_point(OnigCodePoint start)
{
switch (start) {
case '<': return (OnigCodePoint )'>'; break;
case '\'': return (OnigCodePoint )'\''; break;
case '(': return (OnigCodePoint )')'; break;
case '{': return (OnigCodePoint )'}'; break;
default:
break;
}
return (OnigCodePoint )0;
}
#ifdef USE_NAMED_GROUP
#define ONIGENC_IS_CODE_NAME(enc, c) TRUE
#ifdef USE_BACKREF_WITH_LEVEL
/*
\k<name+n>, \k<name-n>
\k<num+n>, \k<num-n>
\k<-num+n>, \k<-num-n>
*/
static int
fetch_name_with_level(OnigCodePoint start_code, UChar** src, UChar* end,
UChar** rname_end, ScanEnv* env,
int* rback_num, int* rlevel)
{
int r, sign, is_num, exist_level;
OnigCodePoint end_code;
OnigCodePoint c = 0;
OnigEncoding enc = env->enc;
UChar *name_end;
UChar *pnum_head;
UChar *p = *src;
PFETCH_READY;
*rback_num = 0;
is_num = exist_level = 0;
sign = 1;
pnum_head = *src;
end_code = get_name_end_code_point(start_code);
name_end = end;
r = 0;
if (PEND) {
return ONIGERR_EMPTY_GROUP_NAME;
}
else {
PFETCH(c);
if (c == end_code)
return ONIGERR_EMPTY_GROUP_NAME;
if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
is_num = 1;
}
else if (c == '-') {
is_num = 2;
sign = -1;
pnum_head = p;
}
else if (!ONIGENC_IS_CODE_NAME(enc, c)) {
r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
}
}
while (!PEND) {
name_end = p;
PFETCH(c);
if (c == end_code || c == ')' || c == '+' || c == '-') {
if (is_num == 2) r = ONIGERR_INVALID_GROUP_NAME;
break;
}
if (is_num != 0) {
if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
is_num = 1;
}
else {
r = ONIGERR_INVALID_GROUP_NAME;
is_num = 0;
}
}
else if (!ONIGENC_IS_CODE_NAME(enc, c)) {
r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
}
}
if (r == 0 && c != end_code) {
if (c == '+' || c == '-') {
int level;
int flag = (c == '-' ? -1 : 1);
PFETCH(c);
if (! ONIGENC_IS_CODE_DIGIT(enc, c)) goto err;
PUNFETCH;
level = onig_scan_unsigned_number(&p, end, enc);
if (level < 0) return ONIGERR_TOO_BIG_NUMBER;
*rlevel = (level * flag);
exist_level = 1;
PFETCH(c);
if (c == end_code)
goto end;
}
err:
r = ONIGERR_INVALID_GROUP_NAME;
name_end = end;
}
end:
if (r == 0) {
if (is_num != 0) {
*rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc);
if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER;
else if (*rback_num == 0) goto err;
*rback_num *= sign;
}
*rname_end = name_end;
*src = p;
return (exist_level ? 1 : 0);
}
else {
onig_scan_env_set_error_string(env, r, *src, name_end);
return r;
}
}
#endif /* USE_BACKREF_WITH_LEVEL */
/*
ref: 0 -> define name (don't allow number name)
1 -> reference name (allow number name)
*/
static int
fetch_name(OnigCodePoint start_code, UChar** src, UChar* end,
UChar** rname_end, ScanEnv* env, int* rback_num, int ref)
{
int r, is_num, sign;
OnigCodePoint end_code;
OnigCodePoint c = 0;
OnigEncoding enc = env->enc;
UChar *name_end;
UChar *pnum_head;
UChar *p = *src;
*rback_num = 0;
end_code = get_name_end_code_point(start_code);
name_end = end;
pnum_head = *src;
r = 0;
is_num = 0;
sign = 1;
if (PEND) {
return ONIGERR_EMPTY_GROUP_NAME;
}
else {
PFETCH_S(c);
if (c == end_code)
return ONIGERR_EMPTY_GROUP_NAME;
if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
if (ref == 1)
is_num = 1;
else {
r = ONIGERR_INVALID_GROUP_NAME;
is_num = 0;
}
}
else if (c == '-') {
if (ref == 1) {
is_num = 2;
sign = -1;
pnum_head = p;
}
else {
r = ONIGERR_INVALID_GROUP_NAME;
is_num = 0;
}
}
else if (!ONIGENC_IS_CODE_NAME(enc, c)) {
r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
}
}
if (r == 0) {
while (!PEND) {
name_end = p;
PFETCH_S(c);
if (c == end_code || c == ')') {
if (is_num == 2) {
r = ONIGERR_INVALID_GROUP_NAME;
goto teardown;
}
break;
}
if (is_num != 0) {
if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
is_num = 1;
}
else {
if (!ONIGENC_IS_CODE_WORD(enc, c))
r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
else
r = ONIGERR_INVALID_GROUP_NAME;
goto teardown;
}
}
else {
if (!ONIGENC_IS_CODE_NAME(enc, c)) {
r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
goto teardown;
}
}
}
if (c != end_code) {
r = ONIGERR_INVALID_GROUP_NAME;
name_end = end;
goto err;
}
if (is_num != 0) {
*rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc);
if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER;
else if (*rback_num == 0) {
r = ONIGERR_INVALID_GROUP_NAME;
goto err;
}
*rback_num *= sign;
}
*rname_end = name_end;
*src = p;
return 0;
}
else {
teardown:
while (!PEND) {
name_end = p;
PFETCH_S(c);
if (c == end_code || c == ')')
break;
}
if (PEND)
name_end = end;
err:
onig_scan_env_set_error_string(env, r, *src, name_end);
return r;
}
}
#else
static int
fetch_name(OnigCodePoint start_code, UChar** src, UChar* end,
UChar** rname_end, ScanEnv* env, int* rback_num, int ref)
{
int r, is_num, sign;
OnigCodePoint end_code;
OnigCodePoint c = 0;
UChar *name_end;
OnigEncoding enc = env->enc;
UChar *pnum_head;
UChar *p = *src;
PFETCH_READY;
*rback_num = 0;
end_code = get_name_end_code_point(start_code);
*rname_end = name_end = end;
r = 0;
pnum_head = *src;
is_num = 0;
sign = 1;
if (PEND) {
return ONIGERR_EMPTY_GROUP_NAME;
}
else {
PFETCH(c);
if (c == end_code)
return ONIGERR_EMPTY_GROUP_NAME;
if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
is_num = 1;
}
else if (c == '-') {
is_num = 2;
sign = -1;
pnum_head = p;
}
else {
r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
}
}
while (!PEND) {
name_end = p;
PFETCH(c);
if (c == end_code || c == ')') break;
if (! ONIGENC_IS_CODE_DIGIT(enc, c))
r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
}
if (r == 0 && c != end_code) {
r = ONIGERR_INVALID_GROUP_NAME;
name_end = end;
}
if (r == 0) {
*rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc);
if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER;
else if (*rback_num == 0) {
r = ONIGERR_INVALID_GROUP_NAME;
goto err;
}
*rback_num *= sign;
*rname_end = name_end;
*src = p;
return 0;
}
else {
err:
onig_scan_env_set_error_string(env, r, *src, name_end);
return r;
}
}
#endif /* USE_NAMED_GROUP */
void onig_vsnprintf_with_pattern(UChar buf[], int bufsize, OnigEncoding enc,
UChar* pat, UChar* pat_end, const UChar *fmt, va_list args);
static void
onig_syntax_warn(ScanEnv *env, const char *fmt, ...)
{
va_list args;
UChar buf[WARN_BUFSIZE];
va_start(args, fmt);
onig_vsnprintf_with_pattern(buf, WARN_BUFSIZE, env->enc,
env->pattern, env->pattern_end,
(const UChar *)fmt, args);
va_end(args);
if (env->sourcefile == NULL)
rb_warn("%s", (char *)buf);
else
rb_compile_warn(env->sourcefile, env->sourceline, "%s", (char *)buf);
}
static void
CC_ESC_WARN(ScanEnv *env, UChar *c)
{
if (onig_warn == onig_null_warn) return ;
if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED) &&
IS_SYNTAX_BV(env->syntax, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC)) {
onig_syntax_warn(env, "character class has '%s' without escape", c);
}
}
static void
CLOSE_BRACKET_WITHOUT_ESC_WARN(ScanEnv* env, UChar* c)
{
if (onig_warn == onig_null_warn) return ;
if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED)) {
onig_syntax_warn(env, "regular expression has '%s' without escape", c);
}
}
static void
CC_DUP_WARN(ScanEnv *env)
{
if (onig_warn == onig_null_warn || !RTEST(ruby_verbose)) return ;
if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_CC_DUP) &&
!(env->warnings_flag & ONIG_SYN_WARN_CC_DUP)) {
env->warnings_flag |= ONIG_SYN_WARN_CC_DUP;
onig_syntax_warn(env, "character class has duplicated range");
}
}
static void
UNKNOWN_ESC_WARN(ScanEnv *env, int c)
{
if (onig_warn == onig_null_warn || !RTEST(ruby_verbose)) return ;
onig_syntax_warn(env, "Unknown escape \\%c is ignored", c);
}
static UChar*
find_str_position(OnigCodePoint s[], int n, UChar* from, UChar* to,
UChar **next, OnigEncoding enc)
{
int i;
OnigCodePoint x;
UChar *q;
UChar *p = from;
while (p < to) {
x = ONIGENC_MBC_TO_CODE(enc, p, to);
q = p + enclen(enc, p, to);
if (x == s[0]) {
for (i = 1; i < n && q < to; i++) {
x = ONIGENC_MBC_TO_CODE(enc, q, to);
if (x != s[i]) break;
q += enclen(enc, q, to);
}
if (i >= n) {
if (IS_NOT_NULL(next))
*next = q;
return p;
}
}
p = q;
}
return NULL_UCHARP;
}
static int
str_exist_check_with_esc(OnigCodePoint s[], int n, UChar* from, UChar* to,
OnigCodePoint bad, OnigEncoding enc, const OnigSyntaxType* syn)
{
int i, in_esc;
OnigCodePoint x;
UChar *q;
UChar *p = from;
in_esc = 0;
while (p < to) {
if (in_esc) {
in_esc = 0;
p += enclen(enc, p, to);
}
else {
x = ONIGENC_MBC_TO_CODE(enc, p, to);
q = p + enclen(enc, p, to);
if (x == s[0]) {
for (i = 1; i < n && q < to; i++) {
x = ONIGENC_MBC_TO_CODE(enc, q, to);
if (x != s[i]) break;
q += enclen(enc, q, to);
}
if (i >= n) return 1;
p += enclen(enc, p, to);
}
else {
x = ONIGENC_MBC_TO_CODE(enc, p, to);
if (x == bad) return 0;
else if (x == MC_ESC(syn)) in_esc = 1;
p = q;
}
}
}
return 0;
}
static int
fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
{
int num;
OnigCodePoint c, c2;
const OnigSyntaxType* syn = env->syntax;
OnigEncoding enc = env->enc;
UChar* prev;
UChar* p = *src;
PFETCH_READY;
if (PEND) {
tok->type = TK_EOT;
return tok->type;
}
PFETCH(c);
tok->type = TK_CHAR;
tok->base = 0;
tok->u.c = c;
tok->escaped = 0;
if (c == ']') {
tok->type = TK_CC_CLOSE;
}
else if (c == '-') {
tok->type = TK_CC_RANGE;
}
else if (c == MC_ESC(syn)) {
if (! IS_SYNTAX_BV(syn, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC))
goto end;
if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;
PFETCH(c);
tok->escaped = 1;
tok->u.c = c;
switch (c) {
case 'w':
tok->type = TK_CHAR_TYPE;
tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
tok->u.prop.not = 0;
break;
case 'W':
tok->type = TK_CHAR_TYPE;
tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
tok->u.prop.not = 1;
break;
case 'd':
tok->type = TK_CHAR_TYPE;
tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
tok->u.prop.not = 0;
break;
case 'D':
tok->type = TK_CHAR_TYPE;
tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
tok->u.prop.not = 1;
break;
case 's':
tok->type = TK_CHAR_TYPE;
tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
tok->u.prop.not = 0;
break;
case 'S':
tok->type = TK_CHAR_TYPE;
tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
tok->u.prop.not = 1;
break;
case 'h':
if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
tok->type = TK_CHAR_TYPE;
tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
tok->u.prop.not = 0;
break;
case 'H':
if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
tok->type = TK_CHAR_TYPE;
tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
tok->u.prop.not = 1;
break;
case 'p':
case 'P':
c2 = PPEEK;
if (c2 == '{' &&
IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) {
PINC;
tok->type = TK_CHAR_PROPERTY;
tok->u.prop.not = (c == 'P' ? 1 : 0);
if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) {
PFETCH(c2);
if (c2 == '^') {
tok->u.prop.not = (tok->u.prop.not == 0 ? 1 : 0);
}
else
PUNFETCH;
}
}
else {
onig_syntax_warn(env, "invalid Unicode Property \\%c", c);
}
break;
case 'x':
if (PEND) break;
prev = p;
if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) {
PINC;
num = scan_unsigned_hexadecimal_number(&p, end, 0, 8, enc);
if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
if (!PEND) {
c2 = PPEEK;
if (ONIGENC_IS_CODE_XDIGIT(enc, c2))
return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
}
if (p > prev + enclen(enc, prev, end) && !PEND && (PPEEK_IS('}'))) {
PINC;
tok->type = TK_CODE_POINT;
tok->base = 16;
tok->u.code = (OnigCodePoint )num;
}
else {
/* can't read nothing or invalid format */
p = prev;
}
}
else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) {
num = scan_unsigned_hexadecimal_number(&p, end, 0, 2, enc);
if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
if (p == prev) { /* can't read nothing. */
num = 0; /* but, it's not error */
}
tok->type = TK_RAW_BYTE;
tok->base = 16;
tok->u.c = num;
}
break;
case 'u':
if (PEND) break;
prev = p;
if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) {
num = scan_unsigned_hexadecimal_number(&p, end, 4, 4, enc);
if (num < -1) return ONIGERR_TOO_SHORT_DIGITS;
else if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
if (p == prev) { /* can't read nothing. */
num = 0; /* but, it's not error */
}
tok->type = TK_CODE_POINT;
tok->base = 16;
tok->u.code = (OnigCodePoint )num;
}
break;
case '0':
case '1': case '2': case '3': case '4': case '5': case '6': case '7':
if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) {
PUNFETCH;
prev = p;
num = scan_unsigned_octal_number(&p, end, 3, enc);
if (num < 0 || 0xff < num) return ONIGERR_TOO_BIG_NUMBER;
if (p == prev) { /* can't read nothing. */
num = 0; /* but, it's not error */
}
tok->type = TK_RAW_BYTE;
tok->base = 8;
tok->u.c = num;
}
break;
default:
PUNFETCH;
num = fetch_escaped_value(&p, end, env);
if (num < 0) return num;
if (tok->u.c != num) {
tok->u.code = (OnigCodePoint )num;
tok->type = TK_CODE_POINT;
}
break;
}
}
else if (c == '[') {
if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_POSIX_BRACKET) && (PPEEK_IS(':'))) {
OnigCodePoint send[] = { (OnigCodePoint )':', (OnigCodePoint )']' };
tok->backp = p; /* point at '[' is read */
PINC;
if (str_exist_check_with_esc(send, 2, p, end,
(OnigCodePoint )']', enc, syn)) {
tok->type = TK_POSIX_BRACKET_OPEN;
}
else {
PUNFETCH;
goto cc_in_cc;
}
}
else {
cc_in_cc:
if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP)) {
tok->type = TK_CC_CC_OPEN;
}
else {
CC_ESC_WARN(env, (UChar* )"[");
}
}
}
else if (c == '&') {
if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP) &&
!PEND && (PPEEK_IS('&'))) {
PINC;
tok->type = TK_CC_AND;
}
}
end:
*src = p;
return tok->type;
}
#ifdef USE_NAMED_GROUP
static int
fetch_named_backref_token(OnigCodePoint c, OnigToken* tok, UChar** src,
UChar* end, ScanEnv* env)
{
int r, num;
const OnigSyntaxType* syn = env->syntax;
UChar* prev;
UChar* p = *src;
UChar* name_end;
int* backs;
int back_num;
prev = p;
#ifdef USE_BACKREF_WITH_LEVEL
name_end = NULL_UCHARP; /* no need. escape gcc warning. */
r = fetch_name_with_level(c, &p, end, &name_end,
env, &back_num, &tok->u.backref.level);
if (r == 1) tok->u.backref.exist_level = 1;
else tok->u.backref.exist_level = 0;
#else
r = fetch_name(&p, end, &name_end, env, &back_num, 1);
#endif
if (r < 0) return r;
if (back_num != 0) {
if (back_num < 0) {
back_num = BACKREF_REL_TO_ABS(back_num, env);
if (back_num <= 0)
return ONIGERR_INVALID_BACKREF;
}
if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
if (back_num > env->num_mem ||
IS_NULL(SCANENV_MEM_NODES(env)[back_num]))
return ONIGERR_INVALID_BACKREF;
}
tok->type = TK_BACKREF;
tok->u.backref.by_name = 0;
tok->u.backref.num = 1;
tok->u.backref.ref1 = back_num;
}
else {
num = onig_name_to_group_numbers(env->reg, prev, name_end, &backs);
if (num <= 0) {
onig_scan_env_set_error_string(env,
ONIGERR_UNDEFINED_NAME_REFERENCE, prev, name_end);
return ONIGERR_UNDEFINED_NAME_REFERENCE;
}
if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
int i;
for (i = 0; i < num; i++) {
if (backs[i] > env->num_mem ||
IS_NULL(SCANENV_MEM_NODES(env)[backs[i]]))
return ONIGERR_INVALID_BACKREF;
}
}
tok->type = TK_BACKREF;
tok->u.backref.by_name = 1;
if (num == 1) {
tok->u.backref.num = 1;
tok->u.backref.ref1 = backs[0];
}
else {
tok->u.backref.num = num;
tok->u.backref.refs = backs;
}
}
*src = p;
return 0;
}
#endif
static int
fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
{
int r, num;
OnigCodePoint c;
OnigEncoding enc = env->enc;
const OnigSyntaxType* syn = env->syntax;
UChar* prev;
UChar* p = *src;
PFETCH_READY;
start:
if (PEND) {
tok->type = TK_EOT;
return tok->type;
}
tok->type = TK_STRING;
tok->base = 0;
tok->backp = p;
PFETCH(c);
if (IS_MC_ESC_CODE(c, syn)) {
if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;
tok->backp = p;
PFETCH(c);
tok->u.c = c;
tok->escaped = 1;
switch (c) {
case '*':
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_ASTERISK_ZERO_INF)) break;
tok->type = TK_OP_REPEAT;
tok->u.repeat.lower = 0;
tok->u.repeat.upper = REPEAT_INFINITE;
goto greedy_check;
break;
case '+':
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_PLUS_ONE_INF)) break;
tok->type = TK_OP_REPEAT;
tok->u.repeat.lower = 1;
tok->u.repeat.upper = REPEAT_INFINITE;
goto greedy_check;
break;
case '?':
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_QMARK_ZERO_ONE)) break;
tok->type = TK_OP_REPEAT;
tok->u.repeat.lower = 0;
tok->u.repeat.upper = 1;
greedy_check:
if (!PEND && PPEEK_IS('?') &&
IS_SYNTAX_OP(syn, ONIG_SYN_OP_QMARK_NON_GREEDY)) {
PFETCH(c);
tok->u.repeat.greedy = 0;
tok->u.repeat.possessive = 0;
}
else {
possessive_check:
if (!PEND && PPEEK_IS('+') &&
((IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT) &&
tok->type != TK_INTERVAL) ||
(IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_INTERVAL) &&
tok->type == TK_INTERVAL))) {
PFETCH(c);
tok->u.repeat.greedy = 1;
tok->u.repeat.possessive = 1;
}
else {
tok->u.repeat.greedy = 1;
tok->u.repeat.possessive = 0;
}
}
break;
case '{':
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) break;
r = fetch_range_quantifier(&p, end, tok, env);
if (r < 0) return r; /* error */
if (r == 0) goto greedy_check;
else if (r == 2) { /* {n} */
if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY))
goto possessive_check;
goto greedy_check;
}
/* r == 1 : normal char */
break;
case '|':
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_VBAR_ALT)) break;
tok->type = TK_ALT;
break;
case '(':
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LPAREN_SUBEXP)) break;
tok->type = TK_SUBEXP_OPEN;
break;
case ')':
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LPAREN_SUBEXP)) break;
tok->type = TK_SUBEXP_CLOSE;
break;
case 'w':
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break;
tok->type = TK_CHAR_TYPE;
tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
tok->u.prop.not = 0;
break;
case 'W':
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break;
tok->type = TK_CHAR_TYPE;
tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
tok->u.prop.not = 1;
break;
case 'b':
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_B_WORD_BOUND)) break;
tok->type = TK_ANCHOR;
tok->u.anchor.subtype = ANCHOR_WORD_BOUND;
tok->u.anchor.ascii_range = IS_ASCII_RANGE(env->option)
&& ! IS_WORD_BOUND_ALL_RANGE(env->option);
break;
case 'B':
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_B_WORD_BOUND)) break;
tok->type = TK_ANCHOR;
tok->u.anchor.subtype = ANCHOR_NOT_WORD_BOUND;
tok->u.anchor.ascii_range = IS_ASCII_RANGE(env->option)
&& ! IS_WORD_BOUND_ALL_RANGE(env->option);
break;
#ifdef USE_WORD_BEGIN_END
case '<':
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END)) break;
tok->type = TK_ANCHOR;
tok->u.anchor.subtype = ANCHOR_WORD_BEGIN;
tok->u.anchor.ascii_range = IS_ASCII_RANGE(env->option);
break;
case '>':
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END)) break;
tok->type = TK_ANCHOR;
tok->u.anchor.subtype = ANCHOR_WORD_END;
tok->u.anchor.ascii_range = IS_ASCII_RANGE(env->option);
break;
#endif
case 's':
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break;
tok->type = TK_CHAR_TYPE;
tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
tok->u.prop.not = 0;
break;
case 'S':
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break;
tok->type = TK_CHAR_TYPE;
tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
tok->u.prop.not = 1;
break;
case 'd':
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break;
tok->type = TK_CHAR_TYPE;
tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
tok->u.prop.not = 0;
break;
case 'D':
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break;
tok->type = TK_CHAR_TYPE;
tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
tok->u.prop.not = 1;
break;
case 'h':
if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
tok->type = TK_CHAR_TYPE;
tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
tok->u.prop.not = 0;
break;
case 'H':
if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
tok->type = TK_CHAR_TYPE;
tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
tok->u.prop.not = 1;
break;
case 'A':
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
begin_buf:
tok->type = TK_ANCHOR;
tok->u.anchor.subtype = ANCHOR_BEGIN_BUF;
break;
case 'Z':
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
tok->type = TK_ANCHOR;
tok->u.anchor.subtype = ANCHOR_SEMI_END_BUF;
break;
case 'z':
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
end_buf:
tok->type = TK_ANCHOR;
tok->u.anchor.subtype = ANCHOR_END_BUF;
break;
case 'G':
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_CAPITAL_G_BEGIN_ANCHOR)) break;
tok->type = TK_ANCHOR;
tok->u.anchor.subtype = ANCHOR_BEGIN_POSITION;
break;
case '`':
if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR)) break;
goto begin_buf;
break;
case '\'':
if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR)) break;
goto end_buf;
break;
case 'x':
if (PEND) break;
prev = p;
if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) {
PINC;
num = scan_unsigned_hexadecimal_number(&p, end, 0, 8, enc);
if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
if (!PEND) {
if (ONIGENC_IS_CODE_XDIGIT(enc, PPEEK))
return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
}
if ((p > prev + enclen(enc, prev, end)) && !PEND && PPEEK_IS('}')) {
PINC;
tok->type = TK_CODE_POINT;
tok->u.code = (OnigCodePoint )num;
}
else {
/* can't read nothing or invalid format */
p = prev;
}
}
else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) {
num = scan_unsigned_hexadecimal_number(&p, end, 0, 2, enc);
if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
if (p == prev) { /* can't read nothing. */
num = 0; /* but, it's not error */
}
tok->type = TK_RAW_BYTE;
tok->base = 16;
tok->u.c = num;
}
break;
case 'u':
if (PEND) break;
prev = p;
if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) {
num = scan_unsigned_hexadecimal_number(&p, end, 4, 4, enc);
if (num < -1) return ONIGERR_TOO_SHORT_DIGITS;
else if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
if (p == prev) { /* can't read nothing. */
num = 0; /* but, it's not error */
}
tok->type = TK_CODE_POINT;
tok->base = 16;
tok->u.code = (OnigCodePoint )num;
}
break;
case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
PUNFETCH;
prev = p;
num = onig_scan_unsigned_number(&p, end, enc);
if (num < 0 || num > ONIG_MAX_BACKREF_NUM) {
goto skip_backref;
}
if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_DECIMAL_BACKREF) &&
(num <= env->num_mem || num <= 9)) { /* This spec. from GNU regex */
if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
if (num > env->num_mem || IS_NULL(SCANENV_MEM_NODES(env)[num]))
return ONIGERR_INVALID_BACKREF;
}
tok->type = TK_BACKREF;
tok->u.backref.num = 1;
tok->u.backref.ref1 = num;
tok->u.backref.by_name = 0;
#ifdef USE_BACKREF_WITH_LEVEL
tok->u.backref.exist_level = 0;
#endif
break;
}
skip_backref:
if (c == '8' || c == '9') {
/* normal char */
p = prev; PINC;
break;
}
p = prev;
/* fall through */
case '0':
if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) {
prev = p;
num = scan_unsigned_octal_number(&p, end, (c == '0' ? 2:3), enc);
if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
if (p == prev) { /* can't read nothing. */
num = 0; /* but, it's not error */
}
tok->type = TK_RAW_BYTE;
tok->base = 8;
tok->u.c = num;
}
else if (c != '0') {
PINC;
}
break;
#ifdef USE_NAMED_GROUP
case 'k':
if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_K_NAMED_BACKREF)) {
PFETCH(c);
if (c == '<' || c == '\'') {
r = fetch_named_backref_token(c, tok, &p, end, env);
if (r < 0) return r;
}
else {
PUNFETCH;
onig_syntax_warn(env, "invalid back reference");
}
}
break;
#endif
#if defined(USE_SUBEXP_CALL) || defined(USE_NAMED_GROUP)
case 'g':
#ifdef USE_NAMED_GROUP
if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_G_BRACE_BACKREF)) {
PFETCH(c);
if (c == '{') {
r = fetch_named_backref_token(c, tok, &p, end, env);
if (r < 0) return r;
}
else
PUNFETCH;
}
#endif
#ifdef USE_SUBEXP_CALL
if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_G_SUBEXP_CALL)) {
PFETCH(c);
if (c == '<' || c == '\'') {
int gnum = -1, rel = 0;
UChar* name_end;
OnigCodePoint cnext;
cnext = PPEEK;
if (cnext == '0') {
PINC;
if (PPEEK_IS(get_name_end_code_point(c))) { /* \g<0>, \g'0' */
PINC;
name_end = p;
gnum = 0;
}
}
else if (cnext == '+') {
PINC;
rel = 1;
}
prev = p;
if (gnum < 0) {
r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, &gnum, 1);
if (r < 0) return r;
}
tok->type = TK_CALL;
tok->u.call.name = prev;
tok->u.call.name_end = name_end;
tok->u.call.gnum = gnum;
tok->u.call.rel = rel;
}
else {
onig_syntax_warn(env, "invalid subexp call");
PUNFETCH;
}
}
#endif
break;
#endif
case 'Q':
if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_Q_QUOTE)) {
tok->type = TK_QUOTE_OPEN;
}
break;
case 'p':
case 'P':
if (PPEEK_IS('{') &&
IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) {
PINC;
tok->type = TK_CHAR_PROPERTY;
tok->u.prop.not = (c == 'P' ? 1 : 0);
if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) {
PFETCH(c);
if (c == '^') {
tok->u.prop.not = (tok->u.prop.not == 0 ? 1 : 0);
}
else
PUNFETCH;
}
}
else {
onig_syntax_warn(env, "invalid Unicode Property \\%c", c);
}
break;
case 'R':
if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_R_LINEBREAK)) {
tok->type = TK_LINEBREAK;
}
break;
case 'X':
if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_X_EXTENDED_GRAPHEME_CLUSTER)) {
tok->type = TK_EXTENDED_GRAPHEME_CLUSTER;
}
break;
case 'K':
if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_K_KEEP)) {
tok->type = TK_KEEP;
}
break;
default:
PUNFETCH;
num = fetch_escaped_value(&p, end, env);
if (num < 0) return num;
/* set_raw: */
if (tok->u.c != num) {
tok->type = TK_CODE_POINT;
tok->u.code = (OnigCodePoint )num;
}
else { /* string */
p = tok->backp + enclen(enc, tok->backp, end);
}
break;
}
}
else {
tok->u.c = c;
tok->escaped = 0;
#ifdef USE_VARIABLE_META_CHARS
if ((c != ONIG_INEFFECTIVE_META_CHAR) &&
IS_SYNTAX_OP(syn, ONIG_SYN_OP_VARIABLE_META_CHARACTERS)) {
if (c == MC_ANYCHAR(syn))
goto any_char;
else if (c == MC_ANYTIME(syn))
goto anytime;
else if (c == MC_ZERO_OR_ONE_TIME(syn))
goto zero_or_one_time;
else if (c == MC_ONE_OR_MORE_TIME(syn))
goto one_or_more_time;
else if (c == MC_ANYCHAR_ANYTIME(syn)) {
tok->type = TK_ANYCHAR_ANYTIME;
goto out;
}
}
#endif
switch (c) {
case '.':
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_DOT_ANYCHAR)) break;
#ifdef USE_VARIABLE_META_CHARS
any_char:
#endif
tok->type = TK_ANYCHAR;
break;
case '*':
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ASTERISK_ZERO_INF)) break;
#ifdef USE_VARIABLE_META_CHARS
anytime:
#endif
tok->type = TK_OP_REPEAT;
tok->u.repeat.lower = 0;
tok->u.repeat.upper = REPEAT_INFINITE;
goto greedy_check;
break;
case '+':
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_PLUS_ONE_INF)) break;
#ifdef USE_VARIABLE_META_CHARS
one_or_more_time:
#endif
tok->type = TK_OP_REPEAT;
tok->u.repeat.lower = 1;
tok->u.repeat.upper = REPEAT_INFINITE;
goto greedy_check;
break;
case '?':
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_QMARK_ZERO_ONE)) break;
#ifdef USE_VARIABLE_META_CHARS
zero_or_one_time:
#endif
tok->type = TK_OP_REPEAT;
tok->u.repeat.lower = 0;
tok->u.repeat.upper = 1;
goto greedy_check;
break;
case '{':
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACE_INTERVAL)) break;
r = fetch_range_quantifier(&p, end, tok, env);
if (r < 0) return r; /* error */
if (r == 0) goto greedy_check;
else if (r == 2) { /* {n} */
if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY))
goto possessive_check;
goto greedy_check;
}
/* r == 1 : normal char */
break;
case '|':
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_VBAR_ALT)) break;
tok->type = TK_ALT;
break;
case '(':
if (PPEEK_IS('?') &&
IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) {
PINC;
if (PPEEK_IS('#')) {
PFETCH(c);
while (1) {
if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
PFETCH(c);
if (c == MC_ESC(syn)) {
if (!PEND) PFETCH(c);
}
else {
if (c == ')') break;
}
}
goto start;
}
#ifdef USE_PERL_SUBEXP_CALL
/* (?&name), (?n), (?R), (?0), (?+n), (?-n) */
c = PPEEK;
if ((c == '&' || c == 'R' || ONIGENC_IS_CODE_DIGIT(enc, c)) &&
IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_SUBEXP_CALL)) {
/* (?&name), (?n), (?R), (?0) */
int gnum;
UChar *name;
UChar *name_end;
if (c == 'R' || c == '0') {
PINC; /* skip 'R' / '0' */
if (!PPEEK_IS(')')) return ONIGERR_INVALID_GROUP_NAME;
PINC; /* skip ')' */
name_end = name = p;
gnum = 0;
}
else {
int numref = 1;
if (c == '&') { /* (?&name) */
PINC;
numref = 0; /* don't allow number name */
}
name = p;
r = fetch_name((OnigCodePoint )'(', &p, end, &name_end, env, &gnum, numref);
if (r < 0) return r;
}
tok->type = TK_CALL;
tok->u.call.name = name;
tok->u.call.name_end = name_end;
tok->u.call.gnum = gnum;
tok->u.call.rel = 0;
break;
}
else if ((c == '-' || c == '+') &&
IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_SUBEXP_CALL)) {
/* (?+n), (?-n) */
int gnum;
UChar *name;
UChar *name_end;
OnigCodePoint cnext;
PFETCH_READY;
PINC; /* skip '-' / '+' */
cnext = PPEEK;
if (ONIGENC_IS_CODE_DIGIT(enc, cnext)) {
if (c == '-') PUNFETCH;
name = p;
r = fetch_name((OnigCodePoint )'(', &p, end, &name_end, env, &gnum, 1);
if (r < 0) return r;
tok->type = TK_CALL;
tok->u.call.name = name;
tok->u.call.name_end = name_end;
tok->u.call.gnum = gnum;
tok->u.call.rel = 1;
break;
}
}
#endif /* USE_PERL_SUBEXP_CALL */
#ifdef USE_CAPITAL_P_NAMED_GROUP
if (PPEEK_IS('P') &&
IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_CAPITAL_P_NAMED_GROUP)) {
int gnum;
UChar *name;
UChar *name_end;
PFETCH_READY;
PINC; /* skip 'P' */
PFETCH(c);
if (c == '=') { /* (?P=name): backref */
r = fetch_named_backref_token((OnigCodePoint )'(', tok, &p, end, env);
if (r < 0) return r;
break;
}
else if (c == '>') { /* (?P>name): subexp call */
name = p;
r = fetch_name((OnigCodePoint )'(', &p, end, &name_end, env, &gnum, 0);
if (r < 0) return r;
tok->type = TK_CALL;
tok->u.call.name = name;
tok->u.call.name_end = name_end;
tok->u.call.gnum = gnum;
tok->u.call.rel = 0;
break;
}
PUNFETCH;
}
#endif /* USE_CAPITAL_P_NAMED_GROUP */
PUNFETCH;
}
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LPAREN_SUBEXP)) break;
tok->type = TK_SUBEXP_OPEN;
break;
case ')':
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LPAREN_SUBEXP)) break;
tok->type = TK_SUBEXP_CLOSE;
break;
case '^':
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LINE_ANCHOR)) break;
tok->type = TK_ANCHOR;
tok->u.anchor.subtype = (IS_SINGLELINE(env->option)
? ANCHOR_BEGIN_BUF : ANCHOR_BEGIN_LINE);
break;
case '$':
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LINE_ANCHOR)) break;
tok->type = TK_ANCHOR;
tok->u.anchor.subtype = (IS_SINGLELINE(env->option)
? ANCHOR_SEMI_END_BUF : ANCHOR_END_LINE);
break;
case '[':
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACKET_CC)) break;
tok->type = TK_CC_OPEN;
break;
case ']':
if (*src > env->pattern) /* /].../ is allowed. */
CLOSE_BRACKET_WITHOUT_ESC_WARN(env, (UChar* )"]");
break;
case '#':
if (IS_EXTEND(env->option)) {
while (!PEND) {
PFETCH(c);
if (ONIGENC_IS_CODE_NEWLINE(enc, c))
break;
}
goto start;
break;
}
break;
case ' ': case '\t': case '\n': case '\r': case '\f':
if (IS_EXTEND(env->option))
goto start;
break;
default:
/* string */
break;
}
}
#ifdef USE_VARIABLE_META_CHARS
out:
#endif
*src = p;
return tok->type;
}
static int
add_ctype_to_cc_by_range(CClassNode* cc, int ctype ARG_UNUSED, int not,
ScanEnv* env,
OnigCodePoint sb_out, const OnigCodePoint mbr[])
{
int i, r;
OnigCodePoint j;
int n = ONIGENC_CODE_RANGE_NUM(mbr);
if (not == 0) {
for (i = 0; i < n; i++) {
for (j = ONIGENC_CODE_RANGE_FROM(mbr, i);
j <= ONIGENC_CODE_RANGE_TO(mbr, i); j++) {
if (j >= sb_out) {
if (j > ONIGENC_CODE_RANGE_FROM(mbr, i)) {
r = add_code_range_to_buf(&(cc->mbuf), env, j,
ONIGENC_CODE_RANGE_TO(mbr, i));
if (r != 0) return r;
i++;
}
goto sb_end;
}
BITSET_SET_BIT_CHKDUP(cc->bs, j);
}
}
sb_end:
for ( ; i < n; i++) {
r = add_code_range_to_buf(&(cc->mbuf), env,
ONIGENC_CODE_RANGE_FROM(mbr, i),
ONIGENC_CODE_RANGE_TO(mbr, i));
if (r != 0) return r;
}
}
else {
OnigCodePoint prev = 0;
for (i = 0; i < n; i++) {
for (j = prev;
j < ONIGENC_CODE_RANGE_FROM(mbr, i); j++) {
if (j >= sb_out) {
goto sb_end2;
}
BITSET_SET_BIT_CHKDUP(cc->bs, j);
}
prev = ONIGENC_CODE_RANGE_TO(mbr, i) + 1;
}
for (j = prev; j < sb_out; j++) {
BITSET_SET_BIT_CHKDUP(cc->bs, j);
}
sb_end2:
prev = sb_out;
for (i = 0; i < n; i++) {
if (prev < ONIGENC_CODE_RANGE_FROM(mbr, i)) {
r = add_code_range_to_buf(&(cc->mbuf), env, prev,
ONIGENC_CODE_RANGE_FROM(mbr, i) - 1);
if (r != 0) return r;
}
prev = ONIGENC_CODE_RANGE_TO(mbr, i) + 1;
}
if (prev < 0x7fffffff) {
r = add_code_range_to_buf(&(cc->mbuf), env, prev, 0x7fffffff);
if (r != 0) return r;
}
}
return 0;
}
static int
add_ctype_to_cc(CClassNode* cc, int ctype, int not, int ascii_range, ScanEnv* env)
{
int maxcode;
int c, r;
const OnigCodePoint *ranges;
OnigCodePoint sb_out;
OnigEncoding enc = env->enc;
r = ONIGENC_GET_CTYPE_CODE_RANGE(enc, ctype, &sb_out, &ranges);
if (r == 0) {
if (ascii_range) {
CClassNode ccwork;
initialize_cclass(&ccwork);
r = add_ctype_to_cc_by_range(&ccwork, ctype, not, env, sb_out,
ranges);
if (r == 0) {
if (not) {
r = add_code_range_to_buf0(&(ccwork.mbuf), env, 0x80, ONIG_LAST_CODE_POINT, FALSE);
}
else {
CClassNode ccascii;
initialize_cclass(&ccascii);
if (ONIGENC_MBC_MINLEN(env->enc) > 1) {
add_code_range(&(ccascii.mbuf), env, 0x00, 0x7F);
}
else {
bitset_set_range(env, ccascii.bs, 0x00, 0x7F);
}
r = and_cclass(&ccwork, &ccascii, env);
if (IS_NOT_NULL(ccascii.mbuf)) bbuf_free(ccascii.mbuf);
}
if (r == 0) {
r = or_cclass(cc, &ccwork, env);
}
if (IS_NOT_NULL(ccwork.mbuf)) bbuf_free(ccwork.mbuf);
}
}
else {
r = add_ctype_to_cc_by_range(cc, ctype, not, env, sb_out, ranges);
}
return r;
}
else if (r != ONIG_NO_SUPPORT_CONFIG) {
return r;
}
maxcode = ascii_range ? 0x80 : SINGLE_BYTE_SIZE;
r = 0;
switch (ctype) {
case ONIGENC_CTYPE_ALPHA:
case ONIGENC_CTYPE_BLANK:
case ONIGENC_CTYPE_CNTRL:
case ONIGENC_CTYPE_DIGIT:
case ONIGENC_CTYPE_LOWER:
case ONIGENC_CTYPE_PUNCT:
case ONIGENC_CTYPE_SPACE:
case ONIGENC_CTYPE_UPPER:
case ONIGENC_CTYPE_XDIGIT:
case ONIGENC_CTYPE_ASCII:
case ONIGENC_CTYPE_ALNUM:
if (not != 0) {
for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
if (! ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
BITSET_SET_BIT_CHKDUP(cc->bs, c);
}
ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
}
else {
for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
if (ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
BITSET_SET_BIT_CHKDUP(cc->bs, c);
}
}
break;
case ONIGENC_CTYPE_GRAPH:
case ONIGENC_CTYPE_PRINT:
if (not != 0) {
for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
if (! ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype)
|| c >= maxcode)
BITSET_SET_BIT_CHKDUP(cc->bs, c);
}
if (ascii_range)
ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
}
else {
for (c = 0; c < maxcode; c++) {
if (ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
BITSET_SET_BIT_CHKDUP(cc->bs, c);
}
if (! ascii_range)
ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
}
break;
case ONIGENC_CTYPE_WORD:
if (not == 0) {
for (c = 0; c < maxcode; c++) {
if (ONIGENC_IS_CODE_WORD(enc, c)) BITSET_SET_BIT_CHKDUP(cc->bs, c);
}
if (! ascii_range)
ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
}
else {
for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
if ((ONIGENC_CODE_TO_MBCLEN(enc, c) > 0) /* check invalid code point */
&& (! ONIGENC_IS_CODE_WORD(enc, c) || c >= maxcode))
BITSET_SET_BIT_CHKDUP(cc->bs, c);
}
if (ascii_range)
ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
}
break;
default:
return ONIGERR_PARSER_BUG;
break;
}
return r;
}
static int
parse_posix_bracket(CClassNode* cc, CClassNode* asc_cc,
UChar** src, UChar* end, ScanEnv* env)
{
#define POSIX_BRACKET_CHECK_LIMIT_LENGTH 20
#define POSIX_BRACKET_NAME_MIN_LEN 4
static const PosixBracketEntryType PBS[] = {
POSIX_BRACKET_ENTRY_INIT("alnum", ONIGENC_CTYPE_ALNUM),
POSIX_BRACKET_ENTRY_INIT("alpha", ONIGENC_CTYPE_ALPHA),
POSIX_BRACKET_ENTRY_INIT("blank", ONIGENC_CTYPE_BLANK),
POSIX_BRACKET_ENTRY_INIT("cntrl", ONIGENC_CTYPE_CNTRL),
POSIX_BRACKET_ENTRY_INIT("digit", ONIGENC_CTYPE_DIGIT),
POSIX_BRACKET_ENTRY_INIT("graph", ONIGENC_CTYPE_GRAPH),
POSIX_BRACKET_ENTRY_INIT("lower", ONIGENC_CTYPE_LOWER),
POSIX_BRACKET_ENTRY_INIT("print", ONIGENC_CTYPE_PRINT),
POSIX_BRACKET_ENTRY_INIT("punct", ONIGENC_CTYPE_PUNCT),
POSIX_BRACKET_ENTRY_INIT("space", ONIGENC_CTYPE_SPACE),
POSIX_BRACKET_ENTRY_INIT("upper", ONIGENC_CTYPE_UPPER),
POSIX_BRACKET_ENTRY_INIT("xdigit", ONIGENC_CTYPE_XDIGIT),
POSIX_BRACKET_ENTRY_INIT("ascii", ONIGENC_CTYPE_ASCII),
POSIX_BRACKET_ENTRY_INIT("word", ONIGENC_CTYPE_WORD),
};
const PosixBracketEntryType *pb;
int not, i, r;
int ascii_range;
OnigCodePoint c;
OnigEncoding enc = env->enc;
UChar *p = *src;
if (PPEEK_IS('^')) {
PINC_S;
not = 1;
}
else
not = 0;
if (onigenc_strlen(enc, p, end) < POSIX_BRACKET_NAME_MIN_LEN + 3)
goto not_posix_bracket;
ascii_range = IS_ASCII_RANGE(env->option) &&
! IS_POSIX_BRACKET_ALL_RANGE(env->option);
for (pb = PBS; pb < PBS + numberof(PBS); pb++) {
if (onigenc_with_ascii_strncmp(enc, p, end, pb->name, pb->len) == 0) {
p = (UChar* )onigenc_step(enc, p, end, pb->len);
if (onigenc_with_ascii_strncmp(enc, p, end, (UChar* )":]", 2) != 0)
return ONIGERR_INVALID_POSIX_BRACKET_TYPE;
r = add_ctype_to_cc(cc, pb->ctype, not, ascii_range, env);
if (r != 0) return r;
if (IS_NOT_NULL(asc_cc)) {
if (pb->ctype != ONIGENC_CTYPE_WORD &&
pb->ctype != ONIGENC_CTYPE_ASCII &&
!ascii_range)
r = add_ctype_to_cc(asc_cc, pb->ctype, not, ascii_range, env);
if (r != 0) return r;
}
PINC_S; PINC_S;
*src = p;
return 0;
}
}
not_posix_bracket:
c = 0;
i = 0;
while (!PEND && ((c = PPEEK) != ':') && c != ']') {
PINC_S;
if (++i > POSIX_BRACKET_CHECK_LIMIT_LENGTH) break;
}
if (c == ':' && ! PEND) {
PINC_S;
if (! PEND) {
PFETCH_S(c);
if (c == ']')
return ONIGERR_INVALID_POSIX_BRACKET_TYPE;
}
}
return 1; /* 1: is not POSIX bracket, but no error. */
}
static int
fetch_char_property_to_ctype(UChar** src, UChar* end, ScanEnv* env)
{
int r;
OnigCodePoint c;
OnigEncoding enc = env->enc;
UChar *prev, *start, *p = *src;
r = 0;
start = prev = p;
while (!PEND) {
prev = p;
PFETCH_S(c);
if (c == '}') {
r = ONIGENC_PROPERTY_NAME_TO_CTYPE(enc, start, prev);
if (r < 0) break;
*src = p;
return r;
}
else if (c == '(' || c == ')' || c == '{' || c == '|') {
r = ONIGERR_INVALID_CHAR_PROPERTY_NAME;
break;
}
}
onig_scan_env_set_error_string(env, r, *src, prev);
return r;
}
static int cclass_case_fold(Node** np, CClassNode* cc, CClassNode* asc_cc, ScanEnv* env);
static int
parse_char_property(Node** np, OnigToken* tok, UChar** src, UChar* end,
ScanEnv* env)
{
int r, ctype;
CClassNode* cc;
ctype = fetch_char_property_to_ctype(src, end, env);
if (ctype < 0) return ctype;
*np = node_new_cclass();
CHECK_NULL_RETURN_MEMERR(*np);
cc = NCCLASS(*np);
r = add_ctype_to_cc(cc, ctype, 0, 0, env);
if (r != 0) return r;
if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc);
if (IS_IGNORECASE(env->option)) {
if (ctype != ONIGENC_CTYPE_ASCII)
r = cclass_case_fold(np, cc, cc, env);
}
return r;
}
enum CCSTATE {
CCS_VALUE,
CCS_RANGE,
CCS_COMPLETE,
CCS_START
};
enum CCVALTYPE {
CCV_SB,
CCV_CODE_POINT,
CCV_CLASS
};
static int
next_state_class(CClassNode* cc, CClassNode* asc_cc,
OnigCodePoint* vs, enum CCVALTYPE* type,
enum CCSTATE* state, ScanEnv* env)
{
int r;
if (*state == CCS_RANGE)
return ONIGERR_CHAR_CLASS_VALUE_AT_END_OF_RANGE;
if (*state == CCS_VALUE && *type != CCV_CLASS) {
if (*type == CCV_SB) {
BITSET_SET_BIT_CHKDUP(cc->bs, (int )(*vs));
if (IS_NOT_NULL(asc_cc))
BITSET_SET_BIT(asc_cc->bs, (int )(*vs));
}
else if (*type == CCV_CODE_POINT) {
r = add_code_range(&(cc->mbuf), env, *vs, *vs);
if (r < 0) return r;
if (IS_NOT_NULL(asc_cc)) {
r = add_code_range0(&(asc_cc->mbuf), env, *vs, *vs, 0);
if (r < 0) return r;
}
}
}
*state = CCS_VALUE;
*type = CCV_CLASS;
return 0;
}
static int
next_state_val(CClassNode* cc, CClassNode* asc_cc,
OnigCodePoint *vs, OnigCodePoint v,
int* vs_israw, int v_israw,
enum CCVALTYPE intype, enum CCVALTYPE* type,
enum CCSTATE* state, ScanEnv* env)
{
int r;
switch (*state) {
case CCS_VALUE:
if (*type == CCV_SB) {
BITSET_SET_BIT_CHKDUP(cc->bs, (int )(*vs));
if (IS_NOT_NULL(asc_cc))
BITSET_SET_BIT(asc_cc->bs, (int )(*vs));
}
else if (*type == CCV_CODE_POINT) {
r = add_code_range(&(cc->mbuf), env, *vs, *vs);
if (r < 0) return r;
if (IS_NOT_NULL(asc_cc)) {
r = add_code_range0(&(asc_cc->mbuf), env, *vs, *vs, 0);
if (r < 0) return r;
}
}
break;
case CCS_RANGE:
if (intype == *type) {
if (intype == CCV_SB) {
if (*vs > 0xff || v > 0xff)
return ONIGERR_INVALID_CODE_POINT_VALUE;
if (*vs > v) {
if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
goto ccs_range_end;
else
return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
}
bitset_set_range(env, cc->bs, (int )*vs, (int )v);
if (IS_NOT_NULL(asc_cc))
bitset_set_range(env, asc_cc->bs, (int )*vs, (int )v);
}
else {
r = add_code_range(&(cc->mbuf), env, *vs, v);
if (r < 0) return r;
if (IS_NOT_NULL(asc_cc)) {
r = add_code_range0(&(asc_cc->mbuf), env, *vs, v, 0);
if (r < 0) return r;
}
}
}
else {
#if 0
if (intype == CCV_CODE_POINT && *type == CCV_SB) {
#endif
if (*vs > v) {
if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
goto ccs_range_end;
else
return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
}
bitset_set_range(env, cc->bs, (int )*vs, (int )(v < 0xff ? v : 0xff));
r = add_code_range(&(cc->mbuf), env, (OnigCodePoint )*vs, v);
if (r < 0) return r;
if (IS_NOT_NULL(asc_cc)) {
bitset_set_range(env, asc_cc->bs, (int )*vs, (int )(v < 0xff ? v : 0xff));
r = add_code_range0(&(asc_cc->mbuf), env, (OnigCodePoint )*vs, v, 0);
if (r < 0) return r;
}
#if 0
}
else
return ONIGERR_MISMATCH_CODE_LENGTH_IN_CLASS_RANGE;
#endif
}
ccs_range_end:
*state = CCS_COMPLETE;
break;
case CCS_COMPLETE:
case CCS_START:
*state = CCS_VALUE;
break;
default:
break;
}
*vs_israw = v_israw;
*vs = v;
*type = intype;
return 0;
}
static int
code_exist_check(OnigCodePoint c, UChar* from, UChar* end, int ignore_escaped,
ScanEnv* env)
{
int in_esc;
OnigCodePoint code;
OnigEncoding enc = env->enc;
UChar* p = from;
in_esc = 0;
while (! PEND) {
if (ignore_escaped && in_esc) {
in_esc = 0;
}
else {
PFETCH_S(code);
if (code == c) return 1;
if (code == MC_ESC(env->syntax)) in_esc = 1;
}
}
return 0;
}
static int
parse_char_class(Node** np, Node** asc_np, OnigToken* tok, UChar** src, UChar* end,
ScanEnv* env)
{
int r, neg, len, fetched, and_start;
OnigCodePoint v, vs;
UChar *p;
Node* node;
Node* asc_node;
CClassNode *cc, *prev_cc;
CClassNode *asc_cc, *asc_prev_cc;
CClassNode work_cc, asc_work_cc;
enum CCSTATE state;
enum CCVALTYPE val_type, in_type;
int val_israw, in_israw;
prev_cc = asc_prev_cc = (CClassNode* )NULL;
*np = *asc_np = NULL_NODE;
r = fetch_token_in_cc(tok, src, end, env);
if (r == TK_CHAR && tok->u.c == '^' && tok->escaped == 0) {
neg = 1;
r = fetch_token_in_cc(tok, src, end, env);
}
else {
neg = 0;
}
if (r < 0) return r;
if (r == TK_CC_CLOSE) {
if (! code_exist_check((OnigCodePoint )']',
*src, env->pattern_end, 1, env))
return ONIGERR_EMPTY_CHAR_CLASS;
CC_ESC_WARN(env, (UChar* )"]");
r = tok->type = TK_CHAR; /* allow []...] */
}
*np = node = node_new_cclass();
CHECK_NULL_RETURN_MEMERR(node);
cc = NCCLASS(node);
if (IS_IGNORECASE(env->option)) {
*asc_np = asc_node = node_new_cclass();
CHECK_NULL_RETURN_MEMERR(asc_node);
asc_cc = NCCLASS(asc_node);
}
else {
asc_node = NULL_NODE;
asc_cc = NULL;
}
and_start = 0;
state = CCS_START;
p = *src;
while (r != TK_CC_CLOSE) {
fetched = 0;
switch (r) {
case TK_CHAR:
if ((tok->u.code >= SINGLE_BYTE_SIZE) ||
(len = ONIGENC_CODE_TO_MBCLEN(env->enc, tok->u.c)) > 1) {
in_type = CCV_CODE_POINT;
}
else if (len < 0) {
r = len;
goto err;
}
else {
sb_char:
in_type = CCV_SB;
}
v = (OnigCodePoint )tok->u.c;
in_israw = 0;
goto val_entry2;
break;
case TK_RAW_BYTE:
/* tok->base != 0 : octal or hexadec. */
if (! ONIGENC_IS_SINGLEBYTE(env->enc) && tok->base != 0) {
UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
UChar* bufe = buf + ONIGENC_CODE_TO_MBC_MAXLEN;
UChar* psave = p;
int i, base = tok->base;
buf[0] = (UChar )tok->u.c;
for (i = 1; i < ONIGENC_MBC_MAXLEN(env->enc); i++) {
r = fetch_token_in_cc(tok, &p, end, env);
if (r < 0) goto err;
if (r != TK_RAW_BYTE || tok->base != base) {
fetched = 1;
break;
}
buf[i] = (UChar )tok->u.c;
}
if (i < ONIGENC_MBC_MINLEN(env->enc)) {
r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
goto err;
}
len = enclen(env->enc, buf, buf+i);
if (i < len) {
r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
goto err;
}
else if (i > len) { /* fetch back */
p = psave;
for (i = 1; i < len; i++) {
r = fetch_token_in_cc(tok, &p, end, env);
}
fetched = 0;
}
if (i == 1) {
v = (OnigCodePoint )buf[0];
goto raw_single;
}
else {
v = ONIGENC_MBC_TO_CODE(env->enc, buf, bufe);
in_type = CCV_CODE_POINT;
}
}
else {
v = (OnigCodePoint )tok->u.c;
raw_single:
in_type = CCV_SB;
}
in_israw = 1;
goto val_entry2;
break;
case TK_CODE_POINT:
v = tok->u.code;
in_israw = 1;
val_entry:
len = ONIGENC_CODE_TO_MBCLEN(env->enc, v);
if (len < 0) {
r = len;
goto err;
}
in_type = (len == 1 ? CCV_SB : CCV_CODE_POINT);
val_entry2:
r = next_state_val(cc, asc_cc, &vs, v, &val_israw, in_israw, in_type, &val_type,
&state, env);
if (r != 0) goto err;
break;
case TK_POSIX_BRACKET_OPEN:
r = parse_posix_bracket(cc, asc_cc, &p, end, env);
if (r < 0) goto err;
if (r == 1) { /* is not POSIX bracket */
CC_ESC_WARN(env, (UChar* )"[");
p = tok->backp;
v = (OnigCodePoint )tok->u.c;
in_israw = 0;
goto val_entry;
}
goto next_class;
break;
case TK_CHAR_TYPE:
r = add_ctype_to_cc(cc, tok->u.prop.ctype, tok->u.prop.not,
IS_ASCII_RANGE(env->option), env);
if (r != 0) return r;
if (IS_NOT_NULL(asc_cc)) {
if (tok->u.prop.ctype != ONIGENC_CTYPE_WORD)
r = add_ctype_to_cc(asc_cc, tok->u.prop.ctype, tok->u.prop.not,
IS_ASCII_RANGE(env->option), env);
if (r != 0) return r;
}
next_class:
r = next_state_class(cc, asc_cc, &vs, &val_type, &state, env);
if (r != 0) goto err;
break;
case TK_CHAR_PROPERTY:
{
int ctype;
ctype = fetch_char_property_to_ctype(&p, end, env);
if (ctype < 0) return ctype;
r = add_ctype_to_cc(cc, ctype, tok->u.prop.not, 0, env);
if (r != 0) return r;
if (IS_NOT_NULL(asc_cc)) {
if (ctype != ONIGENC_CTYPE_ASCII)
r = add_ctype_to_cc(asc_cc, ctype, tok->u.prop.not, 0, env);
if (r != 0) return r;
}
goto next_class;
}
break;
case TK_CC_RANGE:
if (state == CCS_VALUE) {
r = fetch_token_in_cc(tok, &p, end, env);
if (r < 0) goto err;
fetched = 1;
if (r == TK_CC_CLOSE) { /* allow [x-] */
range_end_val:
v = (OnigCodePoint )'-';
in_israw = 0;
goto val_entry;
}
else if (r == TK_CC_AND) {
CC_ESC_WARN(env, (UChar* )"-");
goto range_end_val;
}
state = CCS_RANGE;
}
else if (state == CCS_START) {
/* [-xa] is allowed */
v = (OnigCodePoint )tok->u.c;
in_israw = 0;
r = fetch_token_in_cc(tok, &p, end, env);
if (r < 0) goto err;
fetched = 1;
/* [--x] or [a&&-x] is warned. */
if (r == TK_CC_RANGE || and_start != 0)
CC_ESC_WARN(env, (UChar* )"-");
goto val_entry;
}
else if (state == CCS_RANGE) {
CC_ESC_WARN(env, (UChar* )"-");
goto sb_char; /* [!--x] is allowed */
}
else { /* CCS_COMPLETE */
r = fetch_token_in_cc(tok, &p, end, env);
if (r < 0) goto err;
fetched = 1;
if (r == TK_CC_CLOSE) goto range_end_val; /* allow [a-b-] */
else if (r == TK_CC_AND) {
CC_ESC_WARN(env, (UChar* )"-");
goto range_end_val;
}
if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_DOUBLE_RANGE_OP_IN_CC)) {
CC_ESC_WARN(env, (UChar* )"-");
goto range_end_val; /* [0-9-a] is allowed as [0-9\-a] */
}
r = ONIGERR_UNMATCHED_RANGE_SPECIFIER_IN_CHAR_CLASS;
goto err;
}
break;
case TK_CC_CC_OPEN: /* [ */
{
Node *anode, *aasc_node;
CClassNode* acc;
r = parse_char_class(&anode, &aasc_node, tok, &p, end, env);
if (r == 0) {
acc = NCCLASS(anode);
r = or_cclass(cc, acc, env);
}
if (r == 0 && IS_NOT_NULL(aasc_node)) {
acc = NCCLASS(aasc_node);
r = or_cclass(asc_cc, acc, env);
}
onig_node_free(anode);
onig_node_free(aasc_node);
if (r != 0) goto err;
}
break;
case TK_CC_AND: /* && */
{
if (state == CCS_VALUE) {
r = next_state_val(cc, asc_cc, &vs, 0, &val_israw, 0, val_type,
&val_type, &state, env);
if (r != 0) goto err;
}
/* initialize local variables */
and_start = 1;
state = CCS_START;
if (IS_NOT_NULL(prev_cc)) {
r = and_cclass(prev_cc, cc, env);
if (r != 0) goto err;
bbuf_free(cc->mbuf);
if (IS_NOT_NULL(asc_cc)) {
r = and_cclass(asc_prev_cc, asc_cc, env);
if (r != 0) goto err;
bbuf_free(asc_cc->mbuf);
}
}
else {
prev_cc = cc;
cc = &work_cc;
if (IS_NOT_NULL(asc_cc)) {
asc_prev_cc = asc_cc;
asc_cc = &asc_work_cc;
}
}
initialize_cclass(cc);
if (IS_NOT_NULL(asc_cc))
initialize_cclass(asc_cc);
}
break;
case TK_EOT:
r = ONIGERR_PREMATURE_END_OF_CHAR_CLASS;
goto err;
break;
default:
r = ONIGERR_PARSER_BUG;
goto err;
break;
}
if (fetched)
r = tok->type;
else {
r = fetch_token_in_cc(tok, &p, end, env);
if (r < 0) goto err;
}
}
if (state == CCS_VALUE) {
r = next_state_val(cc, asc_cc, &vs, 0, &val_israw, 0, val_type,
&val_type, &state, env);
if (r != 0) goto err;
}
if (IS_NOT_NULL(prev_cc)) {
r = and_cclass(prev_cc, cc, env);
if (r != 0) goto err;
bbuf_free(cc->mbuf);
cc = prev_cc;
if (IS_NOT_NULL(asc_cc)) {
r = and_cclass(asc_prev_cc, asc_cc, env);
if (r != 0) goto err;
bbuf_free(asc_cc->mbuf);
asc_cc = asc_prev_cc;
}
}
if (neg != 0) {
NCCLASS_SET_NOT(cc);
if (IS_NOT_NULL(asc_cc))
NCCLASS_SET_NOT(asc_cc);
}
else {
NCCLASS_CLEAR_NOT(cc);
if (IS_NOT_NULL(asc_cc))
NCCLASS_CLEAR_NOT(asc_cc);
}
if (IS_NCCLASS_NOT(cc) &&
IS_SYNTAX_BV(env->syntax, ONIG_SYN_NOT_NEWLINE_IN_NEGATIVE_CC)) {
int is_empty;
is_empty = (IS_NULL(cc->mbuf) ? 1 : 0);
if (is_empty != 0)
BITSET_IS_EMPTY(cc->bs, is_empty);
if (is_empty == 0) {
#define NEWLINE_CODE 0x0a
if (ONIGENC_IS_CODE_NEWLINE(env->enc, NEWLINE_CODE)) {
if (ONIGENC_CODE_TO_MBCLEN(env->enc, NEWLINE_CODE) == 1)
BITSET_SET_BIT_CHKDUP(cc->bs, NEWLINE_CODE);
else {
r = add_code_range(&(cc->mbuf), env, NEWLINE_CODE, NEWLINE_CODE);
if (r < 0) goto err;
}
}
}
}
*src = p;
return 0;
err:
if (cc != NCCLASS(*np))
bbuf_free(cc->mbuf);
if (IS_NOT_NULL(asc_cc) && (asc_cc != NCCLASS(*asc_np)))
bbuf_free(asc_cc->mbuf);
return r;
}
static int parse_subexp(Node** top, OnigToken* tok, int term,
UChar** src, UChar* end, ScanEnv* env);
static int
parse_enclose(Node** np, OnigToken* tok, int term, UChar** src, UChar* end,
ScanEnv* env)
{
int r = 0, num;
Node *target, *work1 = NULL, *work2 = NULL;
OnigOptionType option;
OnigCodePoint c;
OnigEncoding enc = env->enc;
#ifdef USE_NAMED_GROUP
int list_capture;
#endif
UChar* p = *src;
PFETCH_READY;
*np = NULL;
if (PEND) return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS;
option = env->option;
if (PPEEK_IS('?') &&
IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) {
PINC;
if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
PFETCH(c);
switch (c) {
case ':': /* (?:...) grouping only */
group:
r = fetch_token(tok, &p, end, env);
if (r < 0) return r;
r = parse_subexp(np, tok, term, &p, end, env);
if (r < 0) return r;
*src = p;
return 1; /* group */
break;
case '=':
*np = onig_node_new_anchor(ANCHOR_PREC_READ);
break;
case '!': /* preceding read */
*np = onig_node_new_anchor(ANCHOR_PREC_READ_NOT);
break;
case '>': /* (?>...) stop backtrack */
*np = node_new_enclose(ENCLOSE_STOP_BACKTRACK);
break;
#ifdef USE_NAMED_GROUP
case '\'':
if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) {
goto named_group1;
}
else
return ONIGERR_UNDEFINED_GROUP_OPTION;
break;
#ifdef USE_CAPITAL_P_NAMED_GROUP
case 'P': /* (?P<name>...) */
if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_CAPITAL_P_NAMED_GROUP)) {
PFETCH(c);
if (c == '<') goto named_group1;
}
return ONIGERR_UNDEFINED_GROUP_OPTION;
break;
#endif
#endif
case '<': /* look behind (?<=...), (?<!...) */
PFETCH(c);
if (c == '=')
*np = onig_node_new_anchor(ANCHOR_LOOK_BEHIND);
else if (c == '!')
*np = onig_node_new_anchor(ANCHOR_LOOK_BEHIND_NOT);
#ifdef USE_NAMED_GROUP
else { /* (?<name>...) */
if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) {
UChar *name;
UChar *name_end;
PUNFETCH;
c = '<';
named_group1:
list_capture = 0;
named_group2:
name = p;
r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, &num, 0);
if (r < 0) return r;
num = scan_env_add_mem_entry(env);
if (num < 0) return num;
if (list_capture != 0 && num >= (int )BIT_STATUS_BITS_NUM)
return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY;
r = name_add(env->reg, name, name_end, num, env);
if (r != 0) return r;
*np = node_new_enclose_memory(env->option, 1);
CHECK_NULL_RETURN_MEMERR(*np);
NENCLOSE(*np)->regnum = num;
if (list_capture != 0)
BIT_STATUS_ON_AT_SIMPLE(env->capture_history, num);
env->num_named++;
}
else {
return ONIGERR_UNDEFINED_GROUP_OPTION;
}
}
#else
else {
return ONIGERR_UNDEFINED_GROUP_OPTION;
}
#endif
break;
case '@':
if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ATMARK_CAPTURE_HISTORY)) {
#ifdef USE_NAMED_GROUP
if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) {
PFETCH(c);
if (c == '<' || c == '\'') {
list_capture = 1;
goto named_group2; /* (?@<name>...) */
}
PUNFETCH;
}
#endif
*np = node_new_enclose_memory(env->option, 0);
CHECK_NULL_RETURN_MEMERR(*np);
num = scan_env_add_mem_entry(env);
if (num < 0) return num;
if (num >= (int )BIT_STATUS_BITS_NUM)
return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY;
NENCLOSE(*np)->regnum = num;
BIT_STATUS_ON_AT_SIMPLE(env->capture_history, num);
}
else {
return ONIGERR_UNDEFINED_GROUP_OPTION;
}
break;
case '(': /* conditional expression: (?(cond)yes), (?(cond)yes|no) */
if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LPAREN_CONDITION)) {
UChar *name = NULL;
UChar *name_end;
PFETCH(c);
if (ONIGENC_IS_CODE_DIGIT(enc, c)) { /* (n) */
PUNFETCH;
r = fetch_name((OnigCodePoint )'(', &p, end, &name_end, env, &num, 1);
if (r < 0) return r;
#if 0
/* Relative number is not currently supported. (same as Perl) */
if (num < 0) {
num = BACKREF_REL_TO_ABS(num, env);
if (num <= 0)
return ONIGERR_INVALID_BACKREF;
}
#endif
if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_STRICT_CHECK_BACKREF)) {
if (num > env->num_mem ||
IS_NULL(SCANENV_MEM_NODES(env)[num]))
return ONIGERR_INVALID_BACKREF;
}
}
#ifdef USE_NAMED_GROUP
else if (c == '<' || c == '\'') { /* (<name>), ('name') */
int nums;
int *backs;
name = p;
r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, &num, 0);
if (r < 0) return r;
PFETCH(c);
if (c != ')') return ONIGERR_UNDEFINED_GROUP_OPTION;
nums = onig_name_to_group_numbers(env->reg, name, name_end, &backs);
if (nums <= 0) {
onig_scan_env_set_error_string(env,
ONIGERR_UNDEFINED_NAME_REFERENCE, name, name_end);
return ONIGERR_UNDEFINED_NAME_REFERENCE;
}
if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_STRICT_CHECK_BACKREF)) {
int i;
for (i = 0; i < nums; i++) {
if (backs[i] > env->num_mem ||
IS_NULL(SCANENV_MEM_NODES(env)[backs[i]]))
return ONIGERR_INVALID_BACKREF;
}
}
num = backs[0]; /* XXX: use left most named group as Perl */
}
#endif
else
return ONIGERR_INVALID_CONDITION_PATTERN;
*np = node_new_enclose(ENCLOSE_CONDITION);
CHECK_NULL_RETURN_MEMERR(*np);
NENCLOSE(*np)->regnum = num;
if (IS_NOT_NULL(name)) NENCLOSE(*np)->state |= NST_NAME_REF;
}
else
return ONIGERR_UNDEFINED_GROUP_OPTION;
break;
#if 0
case '|': /* branch reset: (?|...) */
if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_VBAR_BRANCH_RESET)) {
/* TODO */
}
else
return ONIGERR_UNDEFINED_GROUP_OPTION;
break;
#endif
case '^': /* loads default options */
if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) {
/* d-imsx */
ONOFF(option, ONIG_OPTION_ASCII_RANGE, 1);
ONOFF(option, ONIG_OPTION_IGNORECASE, 1);
ONOFF(option, ONIG_OPTION_SINGLELINE, 0);
ONOFF(option, ONIG_OPTION_MULTILINE, 1);
ONOFF(option, ONIG_OPTION_EXTEND, 1);
PFETCH(c);
}
#if 0
else if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY)) {
/* d-imx */
ONOFF(option, ONIG_OPTION_ASCII_RANGE, 0);
ONOFF(option, ONIG_OPTION_POSIX_BRACKET_ALL_RANGE, 0);
ONOFF(option, ONIG_OPTION_WORD_BOUND_ALL_RANGE, 0);
ONOFF(option, ONIG_OPTION_IGNORECASE, 1);
ONOFF(option, ONIG_OPTION_MULTILINE, 1);
ONOFF(option, ONIG_OPTION_EXTEND, 1);
PFETCH(c);
}
#endif
else {
return ONIGERR_UNDEFINED_GROUP_OPTION;
}
/* fall through */
#ifdef USE_POSIXLINE_OPTION
case 'p':
#endif
case '-': case 'i': case 'm': case 's': case 'x':
case 'a': case 'd': case 'l': case 'u':
{
int neg = 0;
while (1) {
switch (c) {
case ':':
case ')':
break;
case '-': neg = 1; break;
case 'x': ONOFF(option, ONIG_OPTION_EXTEND, neg); break;
case 'i': ONOFF(option, ONIG_OPTION_IGNORECASE, neg); break;
case 's':
if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) {
ONOFF(option, ONIG_OPTION_MULTILINE, neg);
}
else
return ONIGERR_UNDEFINED_GROUP_OPTION;
break;
case 'm':
if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) {
ONOFF(option, ONIG_OPTION_SINGLELINE, (neg == 0 ? 1 : 0));
}
else if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY)) {
ONOFF(option, ONIG_OPTION_MULTILINE, neg);
}
else
return ONIGERR_UNDEFINED_GROUP_OPTION;
break;
#ifdef USE_POSIXLINE_OPTION
case 'p':
ONOFF(option, ONIG_OPTION_MULTILINE|ONIG_OPTION_SINGLELINE, neg);
break;
#endif
case 'a': /* limits \d, \s, \w and POSIX brackets to ASCII range */
if ((IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL) ||
IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY)) &&
(neg == 0)) {
ONOFF(option, ONIG_OPTION_ASCII_RANGE, 0);
ONOFF(option, ONIG_OPTION_POSIX_BRACKET_ALL_RANGE, 1);
ONOFF(option, ONIG_OPTION_WORD_BOUND_ALL_RANGE, 1);
}
else
return ONIGERR_UNDEFINED_GROUP_OPTION;
break;
case 'u':
if ((IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL) ||
IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY)) &&
(neg == 0)) {
ONOFF(option, ONIG_OPTION_ASCII_RANGE, 1);
ONOFF(option, ONIG_OPTION_POSIX_BRACKET_ALL_RANGE, 1);
ONOFF(option, ONIG_OPTION_WORD_BOUND_ALL_RANGE, 1);
}
else
return ONIGERR_UNDEFINED_GROUP_OPTION;
break;
case 'd':
if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL) &&
(neg == 0)) {
ONOFF(option, ONIG_OPTION_ASCII_RANGE, 1);
}
else if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY) &&
(neg == 0)) {
ONOFF(option, ONIG_OPTION_ASCII_RANGE, 0);
ONOFF(option, ONIG_OPTION_POSIX_BRACKET_ALL_RANGE, 0);
ONOFF(option, ONIG_OPTION_WORD_BOUND_ALL_RANGE, 0);
}
else
return ONIGERR_UNDEFINED_GROUP_OPTION;
break;
case 'l':
if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL) && (neg == 0)) {
ONOFF(option, ONIG_OPTION_ASCII_RANGE, 1);
}
else
return ONIGERR_UNDEFINED_GROUP_OPTION;
break;
default:
return ONIGERR_UNDEFINED_GROUP_OPTION;
}
if (c == ')') {
*np = node_new_option(option);
CHECK_NULL_RETURN_MEMERR(*np);
*src = p;
return 2; /* option only */
}
else if (c == ':') {
OnigOptionType prev = env->option;
env->option = option;
r = fetch_token(tok, &p, end, env);
if (r < 0) return r;
r = parse_subexp(&target, tok, term, &p, end, env);
env->option = prev;
if (r < 0) return r;
*np = node_new_option(option);
CHECK_NULL_RETURN_MEMERR(*np);
NENCLOSE(*np)->target = target;
*src = p;
return 0;
}
if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
PFETCH(c);
}
}
break;
default:
return ONIGERR_UNDEFINED_GROUP_OPTION;
}
}
else {
if (ONIG_IS_OPTION_ON(env->option, ONIG_OPTION_DONT_CAPTURE_GROUP))
goto group;
*np = node_new_enclose_memory(env->option, 0);
CHECK_NULL_RETURN_MEMERR(*np);
num = scan_env_add_mem_entry(env);
if (num < 0) return num;
NENCLOSE(*np)->regnum = num;
}
CHECK_NULL_RETURN_MEMERR(*np);
r = fetch_token(tok, &p, end, env);
if (r < 0) return r;
r = parse_subexp(&target, tok, term, &p, end, env);
if (r < 0) {
onig_node_free(target);
return r;
}
if (NTYPE(*np) == NT_ANCHOR)
NANCHOR(*np)->target = target;
else {
NENCLOSE(*np)->target = target;
if (NENCLOSE(*np)->type == ENCLOSE_MEMORY) {
/* Don't move this to previous of parse_subexp() */
r = scan_env_set_mem_node(env, NENCLOSE(*np)->regnum, *np);
if (r != 0) return r;
}
else if (NENCLOSE(*np)->type == ENCLOSE_CONDITION) {
if (NTYPE(target) != NT_ALT) {
/* convert (?(cond)yes) to (?(cond)yes|empty) */
work1 = node_new_empty();
if (IS_NULL(work1)) goto err;
work2 = onig_node_new_alt(work1, NULL_NODE);
if (IS_NULL(work2)) goto err;
work1 = onig_node_new_alt(target, work2);
if (IS_NULL(work1)) goto err;
NENCLOSE(*np)->target = work1;
}
}
}
*src = p;
return 0;
err:
onig_node_free(work1);
onig_node_free(work2);
onig_node_free(*np);
*np = NULL;
return ONIGERR_MEMORY;
}
static const char* const PopularQStr[] = {
"?", "*", "+", "??", "*?", "+?"
};
static const char* const ReduceQStr[] = {
"", "", "*", "*?", "??", "+ and ??", "+? and ?"
};
static int
set_quantifier(Node* qnode, Node* target, int group, ScanEnv* env)
{
QtfrNode* qn;
qn = NQTFR(qnode);
if (qn->lower == 1 && qn->upper == 1) {
return 1;
}
switch (NTYPE(target)) {
case NT_STR:
if (! group) {
StrNode* sn = NSTR(target);
if (str_node_can_be_split(sn, env->enc)) {
Node* n = str_node_split_last_char(sn, env->enc);
if (IS_NOT_NULL(n)) {
qn->target = n;
return 2;
}
}
}
break;
case NT_QTFR:
{ /* check redundant double repeat. */
/* verbose warn (?:.?)? etc... but not warn (.?)? etc... */
QtfrNode* qnt = NQTFR(target);
int nestq_num = popular_quantifier_num(qn);
int targetq_num = popular_quantifier_num(qnt);
#ifdef USE_WARNING_REDUNDANT_NESTED_REPEAT_OPERATOR
if (!IS_QUANTIFIER_BY_NUMBER(qn) && !IS_QUANTIFIER_BY_NUMBER(qnt) &&
IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT)) {
switch (ReduceTypeTable[targetq_num][nestq_num]) {
case RQ_ASIS:
break;
case RQ_DEL:
if (onig_warn != onig_null_warn) {
onig_syntax_warn(env, "regular expression has redundant nested repeat operator '%s'",
PopularQStr[targetq_num]);
}
goto warn_exit;
break;
default:
if (onig_warn != onig_null_warn) {
onig_syntax_warn(env, "nested repeat operator '%s' and '%s' was replaced with '%s' in regular expression",
PopularQStr[targetq_num], PopularQStr[nestq_num],
ReduceQStr[ReduceTypeTable[targetq_num][nestq_num]]);
}
goto warn_exit;
break;
}
}
warn_exit:
#endif
if (targetq_num >= 0) {
if (nestq_num >= 0) {
onig_reduce_nested_quantifier(qnode, target);
goto q_exit;
}
else if (targetq_num == 1 || targetq_num == 2) { /* * or + */
/* (?:a*){n,m}, (?:a+){n,m} => (?:a*){n,n}, (?:a+){n,n} */
if (! IS_REPEAT_INFINITE(qn->upper) && qn->upper > 1 && qn->greedy) {
qn->upper = (qn->lower == 0 ? 1 : qn->lower);
}
}
}
}
break;
default:
break;
}
qn->target = target;
q_exit:
return 0;
}
#ifdef USE_SHARED_CCLASS_TABLE
#define THRESHOLD_RANGE_NUM_FOR_SHARE_CCLASS 8
/* for ctype node hash table */
typedef struct {
OnigEncoding enc;
int not;
int type;
} type_cclass_key;
static int type_cclass_cmp(type_cclass_key* x, type_cclass_key* y)
{
if (x->type != y->type) return 1;
if (x->enc != y->enc) return 1;
if (x->not != y->not) return 1;
return 0;
}
static st_index_t type_cclass_hash(type_cclass_key* key)
{
int i, val;
UChar *p;
val = 0;
p = (UChar* )&(key->enc);
for (i = 0; i < (int )sizeof(key->enc); i++) {
val = val * 997 + (int )*p++;
}
p = (UChar* )(&key->type);
for (i = 0; i < (int )sizeof(key->type); i++) {
val = val * 997 + (int )*p++;
}
val += key->not;
return val + (val >> 5);
}
static const struct st_hash_type type_type_cclass_hash = {
type_cclass_cmp,
type_cclass_hash,
};
static st_table* OnigTypeCClassTable;
static int
i_free_shared_class(type_cclass_key* key, Node* node, void* arg ARG_UNUSED)
{
if (IS_NOT_NULL(node)) {
CClassNode* cc = NCCLASS(node);
if (IS_NOT_NULL(cc->mbuf)) xfree(cc->mbuf);
xfree(node);
}
if (IS_NOT_NULL(key)) xfree(key);
return ST_DELETE;
}
extern int
onig_free_shared_cclass_table(void)
{
/* THREAD_ATOMIC_START; */
if (IS_NOT_NULL(OnigTypeCClassTable)) {
onig_st_foreach(OnigTypeCClassTable, i_free_shared_class, 0);
onig_st_free_table(OnigTypeCClassTable);
OnigTypeCClassTable = NULL;
}
/* THREAD_ATOMIC_END; */
return 0;
}
#endif /* USE_SHARED_CCLASS_TABLE */
#ifndef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
static int
clear_not_flag_cclass(CClassNode* cc, OnigEncoding enc)
{
BBuf *tbuf;
int r;
if (IS_NCCLASS_NOT(cc)) {
bitset_invert(cc->bs);
if (! ONIGENC_IS_SINGLEBYTE(enc)) {
r = not_code_range_buf(enc, cc->mbuf, &tbuf);
if (r != 0) return r;
bbuf_free(cc->mbuf);
cc->mbuf = tbuf;
}
NCCLASS_CLEAR_NOT(cc);
}
return 0;
}
#endif /* CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS */
typedef struct {
ScanEnv* env;
CClassNode* cc;
CClassNode* asc_cc;
Node* alt_root;
Node** ptail;
} IApplyCaseFoldArg;
static int
i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[],
int to_len, void* arg)
{
IApplyCaseFoldArg* iarg;
ScanEnv* env;
CClassNode* cc;
CClassNode* asc_cc;
BitSetRef bs;
int add_flag;
iarg = (IApplyCaseFoldArg* )arg;
env = iarg->env;
cc = iarg->cc;
asc_cc = iarg->asc_cc;
bs = cc->bs;
if (IS_NULL(asc_cc)) {
add_flag = 0;
}
else if (ONIGENC_IS_ASCII_CODE(from) == ONIGENC_IS_ASCII_CODE(*to)) {
add_flag = 1;
}
else {
add_flag = onig_is_code_in_cc(env->enc, from, asc_cc);
if (IS_NCCLASS_NOT(asc_cc))
add_flag = !add_flag;
}
if (to_len == 1) {
int is_in = onig_is_code_in_cc(env->enc, from, cc);
#ifdef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
if ((is_in != 0 && !IS_NCCLASS_NOT(cc)) ||
(is_in == 0 && IS_NCCLASS_NOT(cc))) {
if (add_flag) {
if (ONIGENC_MBC_MINLEN(env->enc) > 1 || *to >= SINGLE_BYTE_SIZE) {
add_code_range0(&(cc->mbuf), env, *to, *to, 0);
}
else {
BITSET_SET_BIT(bs, *to);
}
}
}
#else
if (is_in != 0) {
if (add_flag) {
if (ONIGENC_MBC_MINLEN(env->enc) > 1 || *to >= SINGLE_BYTE_SIZE) {
if (IS_NCCLASS_NOT(cc)) clear_not_flag_cclass(cc, env->enc);
add_code_range0(&(cc->mbuf), env, *to, *to, 0);
}
else {
if (IS_NCCLASS_NOT(cc)) {
BITSET_CLEAR_BIT(bs, *to);
}
else {
BITSET_SET_BIT(bs, *to);
}
}
}
}
#endif /* CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS */
}
else {
int r, i, len;
UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
Node *snode = NULL_NODE;
if (onig_is_code_in_cc(env->enc, from, cc)
#ifdef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
&& !IS_NCCLASS_NOT(cc)
#endif
) {
for (i = 0; i < to_len; i++) {
len = ONIGENC_CODE_TO_MBC(env->enc, to[i], buf);
if (i == 0) {
snode = onig_node_new_str(buf, buf + len);
CHECK_NULL_RETURN_MEMERR(snode);
/* char-class expanded multi-char only
compare with string folded at match time. */
NSTRING_SET_AMBIG(snode);
}
else {
r = onig_node_str_cat(snode, buf, buf + len);
if (r < 0) {
onig_node_free(snode);
return r;
}
}
}
*(iarg->ptail) = onig_node_new_alt(snode, NULL_NODE);
CHECK_NULL_RETURN_MEMERR(*(iarg->ptail));
iarg->ptail = &(NCDR((*(iarg->ptail))));
}
}
return 0;
}
static int
cclass_case_fold(Node** np, CClassNode* cc, CClassNode* asc_cc, ScanEnv* env)
{
int r;
IApplyCaseFoldArg iarg;
iarg.env = env;
iarg.cc = cc;
iarg.asc_cc = asc_cc;
iarg.alt_root = NULL_NODE;
iarg.ptail = &(iarg.alt_root);
r = ONIGENC_APPLY_ALL_CASE_FOLD(env->enc, env->case_fold_flag,
i_apply_case_fold, &iarg);
if (r != 0) {
onig_node_free(iarg.alt_root);
return r;
}
if (IS_NOT_NULL(iarg.alt_root)) {
Node* work = onig_node_new_alt(*np, iarg.alt_root);
if (IS_NULL(work)) {
onig_node_free(iarg.alt_root);
return ONIGERR_MEMORY;
}
*np = work;
}
return r;
}
static int
node_linebreak(Node** np, ScanEnv* env)
{
/* same as (?>\x0D\x0A|[\x0A-\x0D\x{85}\x{2028}\x{2029}]) */
Node* left = NULL;
Node* right = NULL;
Node* target1 = NULL;
Node* target2 = NULL;
CClassNode* cc;
int num1, num2;
UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN * 2];
/* \x0D\x0A */
num1 = ONIGENC_CODE_TO_MBC(env->enc, 0x0D, buf);
if (num1 < 0) return num1;
num2 = ONIGENC_CODE_TO_MBC(env->enc, 0x0A, buf + num1);
if (num2 < 0) return num2;
left = node_new_str_raw(buf, buf + num1 + num2);
if (IS_NULL(left)) goto err;
/* [\x0A-\x0D] or [\x0A-\x0D\x{85}\x{2028}\x{2029}] */
right = node_new_cclass();
if (IS_NULL(right)) goto err;
cc = NCCLASS(right);
if (ONIGENC_MBC_MINLEN(env->enc) > 1) {
add_code_range(&(cc->mbuf), env, 0x0A, 0x0D);
}
else {
bitset_set_range(env, cc->bs, 0x0A, 0x0D);
}
/* TODO: move this block to enc/unicode.c */
if (ONIGENC_IS_UNICODE(env->enc)) {
/* UTF-8, UTF-16BE/LE, UTF-32BE/LE */
add_code_range(&(cc->mbuf), env, 0x85, 0x85);
add_code_range(&(cc->mbuf), env, 0x2028, 0x2029);
}
/* ...|... */
target1 = onig_node_new_alt(right, NULL_NODE);
if (IS_NULL(target1)) goto err;
right = NULL;
target2 = onig_node_new_alt(left, target1);
if (IS_NULL(target2)) goto err;
left = NULL;
target1 = NULL;
/* (?>...) */
*np = node_new_enclose(ENCLOSE_STOP_BACKTRACK);
if (IS_NULL(*np)) goto err;
NENCLOSE(*np)->target = target2;
return ONIG_NORMAL;
err:
onig_node_free(left);
onig_node_free(right);
onig_node_free(target1);
onig_node_free(target2);
return ONIGERR_MEMORY;
}
static int
propname2ctype(ScanEnv* env, const char* propname)
{
UChar* name = (UChar*)propname;
int ctype = env->enc->property_name_to_ctype(ONIG_ENCODING_ASCII,
name, name + strlen(propname));
return ctype;
}
static int
node_extended_grapheme_cluster(Node** np, ScanEnv* env)
{
Node* np1 = NULL;
Node* list = NULL;
Node* list2 = NULL;
Node* alt = NULL;
Node* alt2 = NULL;
int r = 0;
#ifdef USE_UNICODE_PROPERTIES
if (ONIGENC_IS_UNICODE(env->enc)) {
/* UTF-8, UTF-16BE/LE, UTF-32BE/LE */
Node* tmp = NULL;
int num1, num2;
UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN * 2];
CClassNode* cc;
OnigOptionType option;
int extend = propname2ctype(env, "Grapheme_Cluster_Break=Extend");
/* Prepend*
* ( RI-sequence | Hangul-Syllable | !Control )
* ( Grapheme_Extend | SpacingMark )* */
/* ( Grapheme_Extend | SpacingMark )* */
np1 = node_new_cclass();
if (IS_NULL(np1)) goto err;
cc = NCCLASS(np1);
r = add_ctype_to_cc(cc, extend, 0, 0, env);
if (r != 0) goto err;
r = add_ctype_to_cc(cc, propname2ctype(env, "Grapheme_Cluster_Break=SpacingMark"), 0, 0, env);
if (r != 0) goto err;
r = add_code_range(&(cc->mbuf), env, 0x200D, 0x200D);
if (r != 0) goto err;
tmp = node_new_quantifier(0, REPEAT_INFINITE, 0);
if (IS_NULL(tmp)) goto err;
NQTFR(tmp)->target = np1;
np1 = tmp;
tmp = node_new_list(np1, NULL_NODE);
if (IS_NULL(tmp)) goto err;
list = tmp;
np1 = NULL;
/* ( RI-sequence | Hangul-Syllable | !Control ) */
/* !Control */
np1 = node_new_cclass();
if (IS_NULL(np1)) goto err;
cc = NCCLASS(np1);
r = add_ctype_to_cc(cc, propname2ctype(env, "Grapheme_Cluster_Break=Control"), 1, 0, env);
if (r != 0) goto err;
BITSET_CLEAR_BIT(cc->bs, 0x0a);
BITSET_CLEAR_BIT(cc->bs, 0x0d);
tmp = onig_node_new_alt(np1, NULL_NODE);
if (IS_NULL(tmp)) goto err;
alt = tmp;
np1 = NULL;
/* Hangul-Syllable
* := L* V+ T*
* | L* LV V* T*
* | L* LVT T*
* | L+
* | T+ */
/* T+ */
np1 = node_new_cclass();
if (IS_NULL(np1)) goto err;
cc = NCCLASS(np1);
r = add_ctype_to_cc(cc, propname2ctype(env, "Grapheme_Cluster_Break=T"), 0, 0, env);
if (r != 0) goto err;
tmp = node_new_quantifier(1, REPEAT_INFINITE, 0);
if (IS_NULL(tmp)) goto err;
NQTFR(tmp)->target = np1;
np1 = tmp;
tmp = onig_node_new_alt(np1, alt);
if (IS_NULL(tmp)) goto err;
alt = tmp;
np1 = NULL;
/* L+ */
np1 = node_new_cclass();
if (IS_NULL(np1)) goto err;
cc = NCCLASS(np1);
r = add_ctype_to_cc(cc, propname2ctype(env, "Grapheme_Cluster_Break=L"), 0, 0, env);
if (r != 0) goto err;
tmp = node_new_quantifier(1, REPEAT_INFINITE, 0);
if (IS_NULL(tmp)) goto err;
NQTFR(tmp)->target = np1;
np1 = tmp;
tmp = onig_node_new_alt(np1, alt);
if (IS_NULL(tmp)) goto err;
alt = tmp;
np1 = NULL;
/* L* LVT T* */
np1 = node_new_cclass();
if (IS_NULL(np1)) goto err;
cc = NCCLASS(np1);
r = add_ctype_to_cc(cc, propname2ctype(env, "Grapheme_Cluster_Break=T"), 0, 0, env);
if (r != 0) goto err;
tmp = node_new_quantifier(0, REPEAT_INFINITE, 0);
if (IS_NULL(tmp)) goto err;
NQTFR(tmp)->target = np1;
np1 = tmp;
tmp = node_new_list(np1, NULL_NODE);
if (IS_NULL(tmp)) goto err;
list2 = tmp;
np1 = NULL;
np1 = node_new_cclass();
if (IS_NULL(np1)) goto err;
cc = NCCLASS(np1);
r = add_ctype_to_cc(cc, propname2ctype(env, "Grapheme_Cluster_Break=LVT"), 0, 0, env);
if (r != 0) goto err;
tmp = node_new_list(np1, list2);
if (IS_NULL(tmp)) goto err;
list2 = tmp;
np1 = NULL;
np1 = node_new_cclass();
if (IS_NULL(np1)) goto err;
cc = NCCLASS(np1);
r = add_ctype_to_cc(cc, propname2ctype(env, "Grapheme_Cluster_Break=L"), 0, 0, env);
if (r != 0) goto err;
tmp = node_new_quantifier(0, REPEAT_INFINITE, 0);
if (IS_NULL(tmp)) goto err;
NQTFR(tmp)->target = np1;
np1 = tmp;
tmp = node_new_list(np1, list2);
if (IS_NULL(tmp)) goto err;
list2 = tmp;
np1 = NULL;
tmp = onig_node_new_alt(list2, alt);
if (IS_NULL(tmp)) goto err;
alt = tmp;
list2 = NULL;
/* L* LV V* T* */
np1 = node_new_cclass();
if (IS_NULL(np1)) goto err;
cc = NCCLASS(np1);
r = add_ctype_to_cc(cc, propname2ctype(env, "Grapheme_Cluster_Break=T"), 0, 0, env);
if (r != 0) goto err;
tmp = node_new_quantifier(0, REPEAT_INFINITE, 0);
if (IS_NULL(tmp)) goto err;
NQTFR(tmp)->target = np1;
np1 = tmp;
tmp = node_new_list(np1, NULL_NODE);
if (IS_NULL(tmp)) goto err;
list2 = tmp;
np1 = NULL;
np1 = node_new_cclass();
if (IS_NULL(np1)) goto err;
cc = NCCLASS(np1);
r = add_ctype_to_cc(cc, propname2ctype(env, "Grapheme_Cluster_Break=V"), 0, 0, env);
if (r != 0) goto err;
tmp = node_new_quantifier(0, REPEAT_INFINITE, 0);
if (IS_NULL(tmp)) goto err;
NQTFR(tmp)->target = np1;
np1 = tmp;
tmp = node_new_list(np1, list2);
if (IS_NULL(tmp)) goto err;
list2 = tmp;
np1 = NULL;
np1 = node_new_cclass();
if (IS_NULL(np1)) goto err;
cc = NCCLASS(np1);
r = add_ctype_to_cc(cc, propname2ctype(env, "Grapheme_Cluster_Break=LV"), 0, 0, env);
if (r != 0) goto err;
tmp = node_new_list(np1, list2);
if (IS_NULL(tmp)) goto err;
list2 = tmp;
np1 = NULL;
np1 = node_new_cclass();
if (IS_NULL(np1)) goto err;
cc = NCCLASS(np1);
r = add_ctype_to_cc(cc, propname2ctype(env, "Grapheme_Cluster_Break=L"), 0, 0, env);
if (r != 0) goto err;
tmp = node_new_quantifier(0, REPEAT_INFINITE, 0);
if (IS_NULL(tmp)) goto err;
NQTFR(tmp)->target = np1;
np1 = tmp;
tmp = node_new_list(np1, list2);
if (IS_NULL(tmp)) goto err;
list2 = tmp;
np1 = NULL;
tmp = onig_node_new_alt(list2, alt);
if (IS_NULL(tmp)) goto err;
alt = tmp;
list2 = NULL;
/* L* V+ T* */
np1 = node_new_cclass();
if (IS_NULL(np1)) goto err;
cc = NCCLASS(np1);
r = add_ctype_to_cc(cc, propname2ctype(env, "Grapheme_Cluster_Break=T"), 0, 0, env);
if (r != 0) goto err;
tmp = node_new_quantifier(0, REPEAT_INFINITE, 0);
if (IS_NULL(tmp)) goto err;
NQTFR(tmp)->target = np1;
np1 = tmp;
tmp = node_new_list(np1, NULL_NODE);
if (IS_NULL(tmp)) goto err;
list2 = tmp;
np1 = NULL;
np1 = node_new_cclass();
if (IS_NULL(np1)) goto err;
cc = NCCLASS(np1);
r = add_ctype_to_cc(cc, propname2ctype(env, "Grapheme_Cluster_Break=V"), 0, 0, env);
if (r != 0) goto err;
tmp = node_new_quantifier(1, REPEAT_INFINITE, 0);
if (IS_NULL(tmp)) goto err;
NQTFR(tmp)->target = np1;
np1 = tmp;
tmp = node_new_list(np1, list2);
if (IS_NULL(tmp)) goto err;
list2 = tmp;
np1 = NULL;
np1 = node_new_cclass();
if (IS_NULL(np1)) goto err;
cc = NCCLASS(np1);
r = add_ctype_to_cc(cc, propname2ctype(env, "Grapheme_Cluster_Break=L"), 0, 0, env);
if (r != 0) goto err;
tmp = node_new_quantifier(0, REPEAT_INFINITE, 0);
if (IS_NULL(tmp)) goto err;
NQTFR(tmp)->target = np1;
np1 = tmp;
tmp = node_new_list(np1, list2);
if (IS_NULL(tmp)) goto err;
list2 = tmp;
np1 = NULL;
tmp = onig_node_new_alt(list2, alt);
if (IS_NULL(tmp)) goto err;
alt = tmp;
list2 = NULL;
/* Emoji sequence := (E_Base | EBG) Extend* E_Modifier?
* (ZWJ (Glue_After_Zwj | EBG Extend* E_Modifier?) )* */
/* ZWJ (Glue_After_Zwj | E_Base_GAZ Extend* E_Modifier?) */
np1 = node_new_cclass();
if (IS_NULL(np1)) goto err;
cc = NCCLASS(np1);
r = add_ctype_to_cc(cc, propname2ctype(env, "Grapheme_Cluster_Break=E_Modifier"), 0, 0, env);
if (r != 0) goto err;
tmp = node_new_quantifier(0, 1, 0);
if (IS_NULL(tmp)) goto err;
NQTFR(tmp)->target = np1;
np1 = tmp;
tmp = node_new_list(np1, NULL_NODE);
if (IS_NULL(tmp)) goto err;
list2 = tmp;
np1 = NULL;
np1 = node_new_cclass();
if (IS_NULL(np1)) goto err;
cc = NCCLASS(np1);
r = add_ctype_to_cc(cc, extend, 0, 0, env);
if (r != 0) goto err;
tmp = node_new_quantifier(0, REPEAT_INFINITE, 0);
if (IS_NULL(tmp)) goto err;
NQTFR(tmp)->target = np1;
np1 = tmp;
tmp = node_new_list(np1, list2);
if (IS_NULL(tmp)) goto err;
list2 = tmp;
np1 = NULL;
np1 = node_new_cclass();
if (IS_NULL(np1)) goto err;
cc = NCCLASS(np1);
r = add_ctype_to_cc(cc, propname2ctype(env, "Grapheme_Cluster_Break=E_Base_GAZ"), 0, 0, env);
if (r != 0) goto err;
tmp = node_new_list(np1, list2);
if (IS_NULL(tmp)) goto err;
list2 = tmp;
np1 = NULL;
tmp = onig_node_new_alt(list2, NULL_NODE);
if (IS_NULL(tmp)) goto err;
alt2 = tmp;
list2 = NULL;
/* Glue_After_Zwj */
np1 = node_new_cclass();
if (IS_NULL(np1)) goto err;
cc = NCCLASS(np1);
r = add_ctype_to_cc(cc, extend, 0, 0, env);
if (r != 0) goto err;
tmp = node_new_quantifier(0, REPEAT_INFINITE, 0);
if (IS_NULL(tmp)) goto err;
NQTFR(tmp)->target = np1;
np1 = tmp;
tmp = node_new_list(np1, NULL_NODE);
if (IS_NULL(tmp)) goto err;
list2 = tmp;
np1 = NULL;
np1 = node_new_cclass();
if (IS_NULL(np1)) goto err;
cc = NCCLASS(np1);
r = add_code_range(&(cc->mbuf), env, 0x1F308, 0x1F308);
if (r != 0) goto err;
r = add_code_range(&(cc->mbuf), env, 0x1F33E, 0x1F33E);
if (r != 0) goto err;
r = add_code_range(&(cc->mbuf), env, 0x1F373, 0x1F373);
if (r != 0) goto err;
r = add_code_range(&(cc->mbuf), env, 0x1F393, 0x1F393);
if (r != 0) goto err;
r = add_code_range(&(cc->mbuf), env, 0x1F3A4, 0x1F3A4);
if (r != 0) goto err;
r = add_code_range(&(cc->mbuf), env, 0x1F3A8, 0x1F3A8);
if (r != 0) goto err;
r = add_code_range(&(cc->mbuf), env, 0x1F3EB, 0x1F3EB);
if (r != 0) goto err;
r = add_code_range(&(cc->mbuf), env, 0x1F3ED, 0x1F3ED);
if (r != 0) goto err;
r = add_code_range(&(cc->mbuf), env, 0x1F4BB, 0x1F4BC);
if (r != 0) goto err;
r = add_code_range(&(cc->mbuf), env, 0x1F527, 0x1F527);
if (r != 0) goto err;
r = add_code_range(&(cc->mbuf), env, 0x1F52C, 0x1F52C);
if (r != 0) goto err;
r = add_code_range(&(cc->mbuf), env, 0x1F680, 0x1F680);
if (r != 0) goto err;
r = add_code_range(&(cc->mbuf), env, 0x1F692, 0x1F692);
if (r != 0) goto err;
r = add_ctype_to_cc(cc, propname2ctype(env, "Grapheme_Cluster_Break=Glue_After_Zwj"), 0, 0, env);
if (r != 0) goto err;
tmp = node_new_list(np1, list2);
if (IS_NULL(tmp)) goto err;
list2 = tmp;
np1 = NULL;
tmp = onig_node_new_alt(list2, alt2);
if (IS_NULL(tmp)) goto err;
alt2 = tmp;
list2 = NULL;
/* Emoji variation sequence
* http://unicode.org/Public/emoji/4.0/emoji-zwj-sequences.txt
*/
np1 = node_new_cclass();
if (IS_NULL(np1)) goto err;
cc = NCCLASS(np1);
r = add_code_range(&(cc->mbuf), env, 0xfe0f, 0xfe0f);
if (r != 0) goto err;
tmp = node_new_quantifier(0, 1, 0);
if (IS_NULL(tmp)) goto err;
NQTFR(tmp)->target = np1;
np1 = tmp;
tmp = node_new_list(np1, NULL_NODE);
if (IS_NULL(tmp)) goto err;
list2 = tmp;
np1 = NULL;
np1 = node_new_cclass();
if (IS_NULL(np1)) goto err;
cc = NCCLASS(np1);
r = add_code_range(&(cc->mbuf), env, 0x2640, 0x2640);
if (r != 0) goto err;
r = add_code_range(&(cc->mbuf), env, 0x2642, 0x2642);
if (r != 0) goto err;
r = add_code_range(&(cc->mbuf), env, 0x2695, 0x2696);
if (r != 0) goto err;
r = add_code_range(&(cc->mbuf), env, 0x2708, 0x2708);
if (r != 0) goto err;
tmp = node_new_list(np1, list2);
if (IS_NULL(tmp)) goto err;
list2 = tmp;
np1 = NULL;
tmp = onig_node_new_alt(list2, alt2);
if (IS_NULL(tmp)) goto err;
alt2 = tmp;
list2 = NULL;
tmp = node_new_list(alt2, NULL_NODE);
if (IS_NULL(tmp)) goto err;
list2 = tmp;
alt2 = NULL;
/* ZWJ */
np1 = node_new_cclass();
if (IS_NULL(np1)) goto err;
cc = NCCLASS(np1);
r = add_code_range(&(cc->mbuf), env, 0x200D, 0x200D);
if (r != 0) goto err;
tmp = node_new_list(np1, list2);
if (IS_NULL(tmp)) goto err;
list2 = tmp;
np1 = NULL;
tmp = node_new_quantifier(0, REPEAT_INFINITE, 0);
if (IS_NULL(tmp)) goto err;
NQTFR(tmp)->target = list2;
np1 = tmp;
list2 = NULL;
tmp = node_new_list(np1, NULL_NODE);
if (IS_NULL(tmp)) goto err;
list2 = tmp;
np1 = NULL;
/* E_Modifier? */
np1 = node_new_cclass();
if (IS_NULL(np1)) goto err;
cc = NCCLASS(np1);
r = add_ctype_to_cc(cc, propname2ctype(env, "Grapheme_Cluster_Break=E_Modifier"), 0, 0, env);
if (r != 0) goto err;
tmp = node_new_quantifier(0, 1, 0);
if (IS_NULL(tmp)) goto err;
NQTFR(tmp)->target = np1;
np1 = tmp;
tmp = node_new_list(np1, list2);
if (IS_NULL(tmp)) goto err;
list2 = tmp;
np1 = NULL;
/* Extend* */
np1 = node_new_cclass();
if (IS_NULL(np1)) goto err;
cc = NCCLASS(np1);
r = add_ctype_to_cc(cc, extend, 0, 0, env);
if (r != 0) goto err;
tmp = node_new_quantifier(0, REPEAT_INFINITE, 0);
if (IS_NULL(tmp)) goto err;
NQTFR(tmp)->target = np1;
np1 = tmp;
tmp = node_new_list(np1, list2);
if (IS_NULL(tmp)) goto err;
list2 = tmp;
np1 = NULL;
/* (E_Base | EBG) */
np1 = node_new_cclass();
if (IS_NULL(np1)) goto err;
cc = NCCLASS(np1);
r = add_code_range(&(cc->mbuf), env, 0x1F3C2, 0x1F3C2);
if (r != 0) goto err;
r = add_code_range(&(cc->mbuf), env, 0x1F3C7, 0x1F3C7);
if (r != 0) goto err;
r = add_code_range(&(cc->mbuf), env, 0x1F3CC, 0x1F3CC);
if (r != 0) goto err;
r = add_code_range(&(cc->mbuf), env, 0x1F3F3, 0x1F3F3);
if (r != 0) goto err;
r = add_code_range(&(cc->mbuf), env, 0x1F441, 0x1F441);
if (r != 0) goto err;
r = add_code_range(&(cc->mbuf), env, 0x1F46F, 0x1F46F);
if (r != 0) goto err;
r = add_code_range(&(cc->mbuf), env, 0x1F574, 0x1F574);
if (r != 0) goto err;
r = add_code_range(&(cc->mbuf), env, 0x1F6CC, 0x1F6CC);
r = add_ctype_to_cc(cc, propname2ctype(env, "Grapheme_Cluster_Break=E_Base"), 0, 0, env);
if (r != 0) goto err;
r = add_ctype_to_cc(cc, propname2ctype(env, "Grapheme_Cluster_Break=E_Base_GAZ"), 0, 0, env);
if (r != 0) goto err;
tmp = node_new_list(np1, list2);
if (IS_NULL(tmp)) goto err;
list2 = tmp;
np1 = NULL;
tmp = onig_node_new_alt(list2, alt);
if (IS_NULL(tmp)) goto err;
alt = tmp;
list2 = NULL;
/* ZWJ (E_Base_GAZ | Glue_After_Zwj) E_Modifier? */
/* a sequence starting with ZWJ seems artificial, but GraphemeBreakTest
* has such examples.
* http://www.unicode.org/Public/9.0.0/ucd/auxiliary/GraphemeBreakTest.html
*/
np1 = node_new_cclass();
if (IS_NULL(np1)) goto err;
cc = NCCLASS(np1);
r = add_ctype_to_cc(cc, propname2ctype(env, "Grapheme_Cluster_Break=E_Modifier"), 0, 0, env);
if (r != 0) goto err;
tmp = node_new_quantifier(0, 1, 0);
if (IS_NULL(tmp)) goto err;
NQTFR(tmp)->target = np1;
np1 = tmp;
tmp = node_new_list(np1, NULL_NODE);
if (IS_NULL(tmp)) goto err;
list2 = tmp;
np1 = NULL;
np1 = node_new_cclass();
if (IS_NULL(np1)) goto err;
cc = NCCLASS(np1);
r = add_ctype_to_cc(cc, propname2ctype(env, "Grapheme_Cluster_Break=Glue_After_Zwj"), 0, 0, env);
if (r != 0) goto err;
r = add_ctype_to_cc(cc, propname2ctype(env, "Grapheme_Cluster_Break=E_Base_GAZ"), 0, 0, env);
if (r != 0) goto err;
tmp = node_new_list(np1, list2);
if (IS_NULL(tmp)) goto err;
list2 = tmp;
np1 = NULL;
np1 = node_new_cclass();
if (IS_NULL(np1)) goto err;
cc = NCCLASS(np1);
r = add_code_range(&(cc->mbuf), env, 0x200D, 0x200D);
if (r != 0) goto err;
tmp = node_new_list(np1, list2);
if (IS_NULL(tmp)) goto err;
list2 = tmp;
np1 = NULL;
tmp = onig_node_new_alt(list2, alt);
if (IS_NULL(tmp)) goto err;
alt = tmp;
list2 = NULL;
/* RI-Sequence := Regional_Indicator{2} */
np1 = node_new_cclass();
if (IS_NULL(np1)) goto err;
cc = NCCLASS(np1);
r = add_code_range(&(cc->mbuf), env, 0x1F1E6, 0x1F1FF);
if (r != 0) goto err;
tmp = node_new_quantifier(2, 2, 0);
if (IS_NULL(tmp)) goto err;
NQTFR(tmp)->target = np1;
np1 = tmp;
tmp = node_new_list(np1, list2);
if (IS_NULL(tmp)) goto err;
list2 = tmp;
np1 = NULL;
tmp = onig_node_new_alt(list2, alt);
if (IS_NULL(tmp)) goto err;
alt = tmp;
list2 = NULL;
tmp = node_new_list(alt, list);
if (IS_NULL(tmp)) goto err;
list = tmp;
alt = NULL;
/* Prepend* */
np1 = node_new_cclass();
if (IS_NULL(np1)) goto err;
cc = NCCLASS(np1);
r = add_ctype_to_cc(cc, propname2ctype(env, "Grapheme_Cluster_Break=Prepend"), 0, 0, env);
if (r != 0) goto err;
tmp = node_new_quantifier(0, REPEAT_INFINITE, 0);
if (IS_NULL(tmp)) goto err;
NQTFR(tmp)->target = np1;
np1 = tmp;
tmp = node_new_list(np1, list);
if (IS_NULL(tmp)) goto err;
list = tmp;
np1 = NULL;
/* PerlSyntax: (?s:.), RubySyntax: (?m:.) */
np1 = node_new_anychar();
if (IS_NULL(np1)) goto err;
option = env->option;
ONOFF(option, ONIG_OPTION_MULTILINE, 0);
tmp = node_new_option(option);
if (IS_NULL(tmp)) goto err;
NENCLOSE(tmp)->target = np1;
np1 = tmp;
tmp = onig_node_new_alt(np1, NULL_NODE);
if (IS_NULL(tmp)) goto err;
alt = tmp;
np1 = NULL;
/* Prepend+ */
np1 = node_new_cclass();
if (IS_NULL(np1)) goto err;
cc = NCCLASS(np1);
r = add_code_range(&(cc->mbuf), env, 0x200D, 0x200D);
if (r != 0) goto err;
tmp = node_new_quantifier(0, 1, 0);
if (IS_NULL(tmp)) goto err;
NQTFR(tmp)->target = np1;
np1 = tmp;
tmp = node_new_list(np1, NULL_NODE);
if (IS_NULL(tmp)) goto err;
list2 = tmp;
np1 = NULL;
np1 = node_new_cclass();
if (IS_NULL(np1)) goto err;
cc = NCCLASS(np1);
r = add_ctype_to_cc(cc, propname2ctype(env, "Grapheme_Cluster_Break=Prepend"), 0, 0, env);
if (r != 0) goto err;
tmp = node_new_quantifier(1, REPEAT_INFINITE, 0);
if (IS_NULL(tmp)) goto err;
NQTFR(tmp)->target = np1;
np1 = tmp;
tmp = node_new_list(np1, list2);
if (IS_NULL(tmp)) goto err;
list2 = tmp;
np1 = NULL;
tmp = onig_node_new_alt(list2, alt);
if (IS_NULL(tmp)) goto err;
alt = tmp;
list2 = NULL;
tmp = onig_node_new_alt(list, alt);
if (IS_NULL(tmp)) goto err;
alt = tmp;
list = NULL;
/* \x0D\x0A */
num1 = ONIGENC_CODE_TO_MBC(env->enc, 0x0D, buf);
if (num1 < 0) return num1;
num2 = ONIGENC_CODE_TO_MBC(env->enc, 0x0A, buf + num1);
if (num2 < 0) return num2;
np1 = node_new_str_raw(buf, buf + num1 + num2);
if (IS_NULL(np1)) goto err;
tmp = onig_node_new_alt(np1, alt);
if (IS_NULL(tmp)) goto err;
alt = tmp;
np1 = NULL;
/* (?>...) */
*np = node_new_enclose(ENCLOSE_STOP_BACKTRACK);
if (IS_NULL(*np)) goto err;
NENCLOSE(*np)->target = alt;
return ONIG_NORMAL;
}
#endif /* USE_UNICODE_PROPERTIES */
if (IS_NULL(*np)) {
/* PerlSyntax: (?s:.), RubySyntax: (?m:.) */
OnigOptionType option;
np1 = node_new_anychar();
if (IS_NULL(np1)) goto err;
option = env->option;
ONOFF(option, ONIG_OPTION_MULTILINE, 0);
*np = node_new_option(option);
if (IS_NULL(*np)) goto err;
NENCLOSE(*np)->target = np1;
}
return ONIG_NORMAL;
err:
onig_node_free(np1);
onig_node_free(list);
onig_node_free(list2);
onig_node_free(alt);
onig_node_free(alt2);
return (r == 0) ? ONIGERR_MEMORY : r;
}
static int
countbits(unsigned int bits)
{
bits = (bits & 0x55555555) + ((bits >> 1) & 0x55555555);
bits = (bits & 0x33333333) + ((bits >> 2) & 0x33333333);
bits = (bits & 0x0f0f0f0f) + ((bits >> 4) & 0x0f0f0f0f);
bits = (bits & 0x00ff00ff) + ((bits >> 8) & 0x00ff00ff);
return (bits & 0x0000ffff) + ((bits >>16) & 0x0000ffff);
}
static int
is_onechar_cclass(CClassNode* cc, OnigCodePoint* code)
{
const OnigCodePoint not_found = ONIG_LAST_CODE_POINT;
OnigCodePoint c = not_found;
int i;
BBuf *bbuf = cc->mbuf;
if (IS_NCCLASS_NOT(cc)) return 0;
/* check bbuf */
if (IS_NOT_NULL(bbuf)) {
OnigCodePoint n, *data;
GET_CODE_POINT(n, bbuf->p);
data = (OnigCodePoint* )(bbuf->p) + 1;
if ((n == 1) && (data[0] == data[1])) {
/* only one char found in the bbuf, save the code point. */
c = data[0];
if (((c < SINGLE_BYTE_SIZE) && BITSET_AT(cc->bs, c))) {
/* skip if c is included in the bitset */
c = not_found;
}
}
else {
return 0; /* the bbuf contains multiple chars */
}
}
/* check bitset */
for (i = 0; i < BITSET_SIZE; i++) {
Bits b1 = cc->bs[i];
if (b1 != 0) {
if (((b1 & (b1 - 1)) == 0) && (c == not_found)) {
c = BITS_IN_ROOM * i + countbits(b1 - 1);
} else {
return 0; /* the character class contains multiple chars */
}
}
}
if (c != not_found) {
*code = c;
return 1;
}
/* the character class contains no char. */
return 0;
}
static int
parse_exp(Node** np, OnigToken* tok, int term,
UChar** src, UChar* end, ScanEnv* env)
{
int r, len, group = 0;
Node* qn;
Node** targetp;
*np = NULL;
if (tok->type == (enum TokenSyms )term)
goto end_of_token;
switch (tok->type) {
case TK_ALT:
case TK_EOT:
end_of_token:
*np = node_new_empty();
return tok->type;
break;
case TK_SUBEXP_OPEN:
r = parse_enclose(np, tok, TK_SUBEXP_CLOSE, src, end, env);
if (r < 0) return r;
if (r == 1) group = 1;
else if (r == 2) { /* option only */
Node* target;
OnigOptionType prev = env->option;
env->option = NENCLOSE(*np)->option;
r = fetch_token(tok, src, end, env);
if (r < 0) return r;
r = parse_subexp(&target, tok, term, src, end, env);
env->option = prev;
if (r < 0) {
onig_node_free(target);
return r;
}
NENCLOSE(*np)->target = target;
return tok->type;
}
break;
case TK_SUBEXP_CLOSE:
if (! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_UNMATCHED_CLOSE_SUBEXP))
return ONIGERR_UNMATCHED_CLOSE_PARENTHESIS;
if (tok->escaped) goto tk_raw_byte;
else goto tk_byte;
break;
case TK_LINEBREAK:
r = node_linebreak(np, env);
if (r < 0) return r;
break;
case TK_EXTENDED_GRAPHEME_CLUSTER:
r = node_extended_grapheme_cluster(np, env);
if (r < 0) return r;
break;
case TK_KEEP:
*np = onig_node_new_anchor(ANCHOR_KEEP);
CHECK_NULL_RETURN_MEMERR(*np);
break;
case TK_STRING:
tk_byte:
{
*np = node_new_str(tok->backp, *src);
CHECK_NULL_RETURN_MEMERR(*np);
string_loop:
while (1) {
r = fetch_token(tok, src, end, env);
if (r < 0) return r;
if (r == TK_STRING) {
r = onig_node_str_cat(*np, tok->backp, *src);
}
#ifndef NUMBERED_CHAR_IS_NOT_CASE_AMBIG
else if (r == TK_CODE_POINT) {
r = node_str_cat_codepoint(*np, env->enc, tok->u.code);
}
#endif
else {
break;
}
if (r < 0) return r;
}
string_end:
targetp = np;
goto repeat;
}
break;
case TK_RAW_BYTE:
tk_raw_byte:
{
*np = node_new_str_raw_char((UChar )tok->u.c);
CHECK_NULL_RETURN_MEMERR(*np);
len = 1;
while (1) {
if (len >= ONIGENC_MBC_MINLEN(env->enc)) {
if (len == enclen(env->enc, NSTR(*np)->s, NSTR(*np)->end)) {
r = fetch_token(tok, src, end, env);
NSTRING_CLEAR_RAW(*np);
goto string_end;
}
}
r = fetch_token(tok, src, end, env);
if (r < 0) return r;
if (r != TK_RAW_BYTE) {
/* Don't use this, it is wrong for little endian encodings. */
#ifdef USE_PAD_TO_SHORT_BYTE_CHAR
int rem;
if (len < ONIGENC_MBC_MINLEN(env->enc)) {
rem = ONIGENC_MBC_MINLEN(env->enc) - len;
(void )node_str_head_pad(NSTR(*np), rem, (UChar )0);
if (len + rem == enclen(env->enc, NSTR(*np)->s)) {
NSTRING_CLEAR_RAW(*np);
goto string_end;
}
}
#endif
return ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
}
r = node_str_cat_char(*np, (UChar )tok->u.c);
if (r < 0) return r;
len++;
}
}
break;
case TK_CODE_POINT:
{
*np = node_new_empty();
CHECK_NULL_RETURN_MEMERR(*np);
r = node_str_cat_codepoint(*np, env->enc, tok->u.code);
if (r != 0) return r;
#ifdef NUMBERED_CHAR_IS_NOT_CASE_AMBIG
NSTRING_SET_RAW(*np);
#else
goto string_loop;
#endif
}
break;
case TK_QUOTE_OPEN:
{
OnigCodePoint end_op[2];
UChar *qstart, *qend, *nextp;
end_op[0] = (OnigCodePoint )MC_ESC(env->syntax);
end_op[1] = (OnigCodePoint )'E';
qstart = *src;
qend = find_str_position(end_op, 2, qstart, end, &nextp, env->enc);
if (IS_NULL(qend)) {
nextp = qend = end;
}
*np = node_new_str(qstart, qend);
CHECK_NULL_RETURN_MEMERR(*np);
*src = nextp;
}
break;
case TK_CHAR_TYPE:
{
switch (tok->u.prop.ctype) {
case ONIGENC_CTYPE_WORD:
*np = node_new_ctype(tok->u.prop.ctype, tok->u.prop.not,
IS_ASCII_RANGE(env->option));
CHECK_NULL_RETURN_MEMERR(*np);
break;
case ONIGENC_CTYPE_SPACE:
case ONIGENC_CTYPE_DIGIT:
case ONIGENC_CTYPE_XDIGIT:
{
CClassNode* cc;
#ifdef USE_SHARED_CCLASS_TABLE
const OnigCodePoint *mbr;
OnigCodePoint sb_out;
r = ONIGENC_GET_CTYPE_CODE_RANGE(env->enc, tok->u.prop.ctype,
&sb_out, &mbr);
if (r == 0 &&
! IS_ASCII_RANGE(env->option) &&
ONIGENC_CODE_RANGE_NUM(mbr)
>= THRESHOLD_RANGE_NUM_FOR_SHARE_CCLASS) {
type_cclass_key key;
type_cclass_key* new_key;
key.enc = env->enc;
key.not = tok->u.prop.not;
key.type = tok->u.prop.ctype;
THREAD_ATOMIC_START;
if (IS_NULL(OnigTypeCClassTable)) {
OnigTypeCClassTable
= onig_st_init_table_with_size(&type_type_cclass_hash, 10);
if (IS_NULL(OnigTypeCClassTable)) {
THREAD_ATOMIC_END;
return ONIGERR_MEMORY;
}
}
else {
if (onig_st_lookup(OnigTypeCClassTable, (st_data_t )&key,
(st_data_t* )np)) {
THREAD_ATOMIC_END;
break;
}
}
*np = node_new_cclass_by_codepoint_range(tok->u.prop.not,
sb_out, mbr);
if (IS_NULL(*np)) {
THREAD_ATOMIC_END;
return ONIGERR_MEMORY;
}
cc = NCCLASS(*np);
NCCLASS_SET_SHARE(cc);
new_key = (type_cclass_key* )xmalloc(sizeof(type_cclass_key));
xmemcpy(new_key, &key, sizeof(type_cclass_key));
onig_st_add_direct(OnigTypeCClassTable, (st_data_t )new_key,
(st_data_t )*np);
THREAD_ATOMIC_END;
}
else {
#endif
*np = node_new_cclass();
CHECK_NULL_RETURN_MEMERR(*np);
cc = NCCLASS(*np);
r = add_ctype_to_cc(cc, tok->u.prop.ctype, 0,
IS_ASCII_RANGE(env->option), env);
if (r != 0) return r;
if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc);
#ifdef USE_SHARED_CCLASS_TABLE
}
#endif
}
break;
default:
return ONIGERR_PARSER_BUG;
break;
}
}
break;
case TK_CHAR_PROPERTY:
r = parse_char_property(np, tok, src, end, env);
if (r != 0) return r;
break;
case TK_CC_OPEN:
{
Node *asc_node;
CClassNode* cc;
OnigCodePoint code;
r = parse_char_class(np, &asc_node, tok, src, end, env);
if (r != 0) {
onig_node_free(asc_node);
return r;
}
cc = NCCLASS(*np);
if (is_onechar_cclass(cc, &code)) {
onig_node_free(*np);
onig_node_free(asc_node);
*np = node_new_empty();
CHECK_NULL_RETURN_MEMERR(*np);
r = node_str_cat_codepoint(*np, env->enc, code);
if (r != 0) return r;
goto string_loop;
}
if (IS_IGNORECASE(env->option)) {
r = cclass_case_fold(np, cc, NCCLASS(asc_node), env);
if (r != 0) {
onig_node_free(asc_node);
return r;
}
}
onig_node_free(asc_node);
}
break;
case TK_ANYCHAR:
*np = node_new_anychar();
CHECK_NULL_RETURN_MEMERR(*np);
break;
case TK_ANYCHAR_ANYTIME:
*np = node_new_anychar();
CHECK_NULL_RETURN_MEMERR(*np);
qn = node_new_quantifier(0, REPEAT_INFINITE, 0);
CHECK_NULL_RETURN_MEMERR(qn);
NQTFR(qn)->target = *np;
*np = qn;
break;
case TK_BACKREF:
len = tok->u.backref.num;
*np = node_new_backref(len,
(len > 1 ? tok->u.backref.refs : &(tok->u.backref.ref1)),
tok->u.backref.by_name,
#ifdef USE_BACKREF_WITH_LEVEL
tok->u.backref.exist_level,
tok->u.backref.level,
#endif
env);
CHECK_NULL_RETURN_MEMERR(*np);
break;
#ifdef USE_SUBEXP_CALL
case TK_CALL:
{
int gnum = tok->u.call.gnum;
if (gnum < 0 || tok->u.call.rel != 0) {
if (gnum > 0) gnum--;
gnum = BACKREF_REL_TO_ABS(gnum, env);
if (gnum <= 0)
return ONIGERR_INVALID_BACKREF;
}
*np = node_new_call(tok->u.call.name, tok->u.call.name_end, gnum);
CHECK_NULL_RETURN_MEMERR(*np);
env->num_call++;
}
break;
#endif
case TK_ANCHOR:
*np = onig_node_new_anchor(tok->u.anchor.subtype);
CHECK_NULL_RETURN_MEMERR(*np);
NANCHOR(*np)->ascii_range = tok->u.anchor.ascii_range;
break;
case TK_OP_REPEAT:
case TK_INTERVAL:
if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_CONTEXT_INDEP_REPEAT_OPS)) {
if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_CONTEXT_INVALID_REPEAT_OPS))
return ONIGERR_TARGET_OF_REPEAT_OPERATOR_NOT_SPECIFIED;
else
*np = node_new_empty();
}
else {
goto tk_byte;
}
break;
default:
return ONIGERR_PARSER_BUG;
break;
}
{
targetp = np;
re_entry:
r = fetch_token(tok, src, end, env);
if (r < 0) return r;
repeat:
if (r == TK_OP_REPEAT || r == TK_INTERVAL) {
if (is_invalid_quantifier_target(*targetp))
return ONIGERR_TARGET_OF_REPEAT_OPERATOR_INVALID;
qn = node_new_quantifier(tok->u.repeat.lower, tok->u.repeat.upper,
(r == TK_INTERVAL ? 1 : 0));
CHECK_NULL_RETURN_MEMERR(qn);
NQTFR(qn)->greedy = tok->u.repeat.greedy;
r = set_quantifier(qn, *targetp, group, env);
if (r < 0) {
onig_node_free(qn);
return r;
}
if (tok->u.repeat.possessive != 0) {
Node* en;
en = node_new_enclose(ENCLOSE_STOP_BACKTRACK);
if (IS_NULL(en)) {
onig_node_free(qn);
return ONIGERR_MEMORY;
}
NENCLOSE(en)->target = qn;
qn = en;
}
if (r == 0) {
*targetp = qn;
}
else if (r == 1) {
onig_node_free(qn);
}
else if (r == 2) { /* split case: /abc+/ */
Node *tmp;
*targetp = node_new_list(*targetp, NULL);
if (IS_NULL(*targetp)) {
onig_node_free(qn);
return ONIGERR_MEMORY;
}
tmp = NCDR(*targetp) = node_new_list(qn, NULL);
if (IS_NULL(tmp)) {
onig_node_free(qn);
return ONIGERR_MEMORY;
}
targetp = &(NCAR(tmp));
}
goto re_entry;
}
}
return r;
}
static int
parse_branch(Node** top, OnigToken* tok, int term,
UChar** src, UChar* end, ScanEnv* env)
{
int r;
Node *node, **headp;
*top = NULL;
r = parse_exp(&node, tok, term, src, end, env);
if (r < 0) {
onig_node_free(node);
return r;
}
if (r == TK_EOT || r == term || r == TK_ALT) {
*top = node;
}
else {
*top = node_new_list(node, NULL);
headp = &(NCDR(*top));
while (r != TK_EOT && r != term && r != TK_ALT) {
r = parse_exp(&node, tok, term, src, end, env);
if (r < 0) {
onig_node_free(node);
return r;
}
if (NTYPE(node) == NT_LIST) {
*headp = node;
while (IS_NOT_NULL(NCDR(node))) node = NCDR(node);
headp = &(NCDR(node));
}
else {
*headp = node_new_list(node, NULL);
headp = &(NCDR(*headp));
}
}
}
return r;
}
/* term_tok: TK_EOT or TK_SUBEXP_CLOSE */
static int
parse_subexp(Node** top, OnigToken* tok, int term,
UChar** src, UChar* end, ScanEnv* env)
{
int r;
Node *node, **headp;
*top = NULL;
r = parse_branch(&node, tok, term, src, end, env);
if (r < 0) {
onig_node_free(node);
return r;
}
if (r == term) {
*top = node;
}
else if (r == TK_ALT) {
*top = onig_node_new_alt(node, NULL);
headp = &(NCDR(*top));
while (r == TK_ALT) {
r = fetch_token(tok, src, end, env);
if (r < 0) return r;
r = parse_branch(&node, tok, term, src, end, env);
if (r < 0) {
onig_node_free(node);
return r;
}
*headp = onig_node_new_alt(node, NULL);
headp = &(NCDR(*headp));
}
if (tok->type != (enum TokenSyms )term)
goto err;
}
else {
onig_node_free(node);
err:
if (term == TK_SUBEXP_CLOSE)
return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS;
else
return ONIGERR_PARSER_BUG;
}
return r;
}
static int
parse_regexp(Node** top, UChar** src, UChar* end, ScanEnv* env)
{
int r;
OnigToken tok;
r = fetch_token(&tok, src, end, env);
if (r < 0) return r;
r = parse_subexp(top, &tok, TK_EOT, src, end, env);
if (r < 0) return r;
#ifdef USE_SUBEXP_CALL
if (env->num_call > 0) {
/* Capture the pattern itself. It is used for (?R), (?0) and \g<0>. */
const int num = 0;
Node* np;
np = node_new_enclose_memory(env->option, 0);
CHECK_NULL_RETURN_MEMERR(np);
NENCLOSE(np)->regnum = num;
NENCLOSE(np)->target = *top;
r = scan_env_set_mem_node(env, num, np);
if (r != 0) {
onig_node_free(np);
return r;
}
*top = np;
}
#endif
return 0;
}
extern int
onig_parse_make_tree(Node** root, const UChar* pattern, const UChar* end,
regex_t* reg, ScanEnv* env)
{
int r;
UChar* p;
#ifdef USE_NAMED_GROUP
names_clear(reg);
#endif
scan_env_clear(env);
env->option = reg->options;
env->case_fold_flag = reg->case_fold_flag;
env->enc = reg->enc;
env->syntax = reg->syntax;
env->pattern = (UChar* )pattern;
env->pattern_end = (UChar* )end;
env->reg = reg;
*root = NULL;
p = (UChar* )pattern;
r = parse_regexp(root, &p, (UChar* )end, env);
reg->num_mem = env->num_mem;
return r;
}
extern void
onig_scan_env_set_error_string(ScanEnv* env, int ecode ARG_UNUSED,
UChar* arg, UChar* arg_end)
{
env->error = arg;
env->error_end = arg_end;
}