1
0
Fork 0
mirror of https://github.com/ruby/ruby.git synced 2022-11-09 12:17:21 -05:00
ruby--ruby/regcomp.c
naruse c11e648799 Regexp supports Unicode 9.0.0's \X
* meta character \X matches Unicode 9.0.0 characters with some workarounds
  for UTR #51 Unicode Emoji, Version 4.0 emoji zwj sequences.
  [Feature #12831] [ruby-core:77586]

The term "character" can have many meanings: bytes, codepoints, combined
characters, and so on. "Grapheme cluster" is the highest-level of these terms;
it means a user-perceived character.
Unicode Standard Annex #29 UNICODE TEXT SEGMENTATION specifies how to
handle grapheme clusters (extended grapheme cluster).
But some specs have not been updated to the current situation because Unicode Emoji
is being rapidly extended without being well defined.
It breaks the precondition of UTR#29 "Grapheme cluster boundaries can be
easily tested by looking at immediately adjacent characters". (the
sentence will be removed in the next version)
Some of the details are described in Unicode Technical Report #51,
UNICODE EMOJI, but they have not been merged into UTR#29 yet.

http://unicode.org/reports/tr29/
http://unicode.org/reports/tr51/
http://unicode.org/Public/emoji/4.0/

git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@56949 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
2016-11-30 17:29:19 +00:00

6756 lines
156 KiB
C

/**********************************************************************
regcomp.c - Onigmo (Oniguruma-mod) (regular expression library)
**********************************************************************/
/*-
* Copyright (c) 2002-2013 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
* Copyright (c) 2011-2014 K.Takata <kentkt AT csc DOT jp>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include "regparse.h"
/* Global mutex object.  It is only defined when both USE_MULTI_THREAD_SYSTEM
 * and USE_DEFAULT_MULTI_THREAD_SYSTEM are set: a CRITICAL_SECTION on Windows,
 * a pthread mutex everywhere else. */
#if defined(USE_MULTI_THREAD_SYSTEM) \
&& defined(USE_DEFAULT_MULTI_THREAD_SYSTEM)
#ifdef _WIN32
CRITICAL_SECTION gOnigMutex;
#else
pthread_mutex_t gOnigMutex;
#endif
#endif
/* Default case-fold flag, initialised to ONIGENC_CASE_FOLD_MIN and
 * read/written by onig_get/set_default_case_fold_flag(). */
OnigCaseFoldType OnigDefaultCaseFoldFlag = ONIGENC_CASE_FOLD_MIN;
/* Return the current value of the global default case-fold flag. */
extern OnigCaseFoldType
onig_get_default_case_fold_flag(void)
{
  OnigCaseFoldType current = OnigDefaultCaseFoldFlag;

  return current;
}
/* Overwrite the global default case-fold flag with case_fold_flag.
 * Always returns 0. */
extern int
onig_set_default_case_fold_flag(OnigCaseFoldType case_fold_flag)
{
  OnigDefaultCaseFoldFlag = case_fold_flag;

  return 0;
}
#ifndef PLATFORM_UNALIGNED_WORD_ACCESS
/* Source of filler bytes that add_multi_byte_cclass() writes around
 * multi-byte class data when alignment padding is required.  Only needed
 * when unaligned word access is not available on the platform. */
static unsigned char PadBuf[WORD_ALIGNMENT_SIZE];
#endif
/* Compiled out by "#if 0".  Copies the bytes in [s, end) into a newly
 * allocated buffer with a trailing 0 byte; returns NULL when the range is
 * empty or reversed. */
#if 0
static UChar*
str_dup(UChar* s, UChar* end)
{
ptrdiff_t len = end - s;
if (len > 0) {
UChar* r = (UChar* )xmalloc(len + 1);
CHECK_NULL_RETURN(r);
xmemcpy(r, s, len);
r[len] = (UChar )0;
return r;
}
else return NULL;
}
#endif
/* Exchange the contents of two nodes in place.
 *
 * After the struct copy, a string node whose capa is 0 has its s pointer
 * reset to its own buf and its end pointer rebuilt from the preserved
 * length, so neither pointer refers to the other node's storage.
 * NOTE(review): capa == 0 presumably marks strings stored in the node's
 * inline buf -- confirm against the StrNode definition. */
static void
swap_node(Node* a, Node* b)
{
  Node* pair[2];
  Node tmp;
  int k;

  tmp = *a;
  *a = *b;
  *b = tmp;

  pair[0] = a;
  pair[1] = b;
  for (k = 0; k < 2; k++) {
    StrNode* str;

    if (NTYPE(pair[k]) != NT_STR) continue;

    str = NSTR(pair[k]);
    if (str->capa == 0) {
      size_t used = str->end - str->s;
      str->s   = str->buf;
      str->end = str->s + used;
    }
  }
}
/* Saturating addition of two distances.  The result is
 * ONIG_INFINITE_DISTANCE when either operand is infinite or when the sum
 * would exceed ONIG_INFINITE_DISTANCE. */
static OnigDistance
distance_add(OnigDistance d1, OnigDistance d2)
{
  if (d1 == ONIG_INFINITE_DISTANCE) return ONIG_INFINITE_DISTANCE;
  if (d2 == ONIG_INFINITE_DISTANCE) return ONIG_INFINITE_DISTANCE;

  return (d1 <= ONIG_INFINITE_DISTANCE - d2) ? d1 + d2
                                             : ONIG_INFINITE_DISTANCE;
}
/* Saturating multiplication of a distance by m.  Returns 0 for m == 0,
 * d * m while d is below ONIG_INFINITE_DISTANCE / m, and
 * ONIG_INFINITE_DISTANCE otherwise. */
static OnigDistance
distance_multiply(OnigDistance d, int m)
{
  if (m == 0) return 0;

  return (d < ONIG_INFINITE_DISTANCE / m) ? d * m : ONIG_INFINITE_DISTANCE;
}
/* Return 1 when every one of the BITSET_SIZE words of bs is zero,
 * otherwise 0. */
static int
bitset_is_empty(BitSetRef bs)
{
  int idx = 0;

  while (idx < BITSET_SIZE) {
    if (bs[idx] != 0) return 0;
    idx++;
  }
  return 1;
}
#ifdef ONIG_DEBUG
/* Debug helper: count how many of the first SINGLE_BYTE_SIZE bit positions
 * of bs are set. */
static int
bitset_on_num(BitSetRef bs)
{
  int count = 0;
  int bit;

  for (bit = 0; bit < SINGLE_BYTE_SIZE; bit++) {
    if (BITSET_AT(bs, bit)) count++;
  }
  return count;
}
#endif
/* Initialise buf with room for size bytes.
 *
 * A size of 0 (or less) leaves the buffer without storage (p == NULL,
 * alloc == 0).  Returns 0 on success, ONIGERR_MEMORY when the allocation
 * fails.  used is reset to 0 on success. */
extern int
onig_bbuf_init(BBuf* buf, OnigDistance size)
{
  if (size > 0) {
    buf->p = (UChar* )xmalloc(size);
    if (IS_NULL(buf->p)) return(ONIGERR_MEMORY);
  }
  else {
    buf->p = NULL;
    size = 0;
  }

  buf->used  = 0;
  buf->alloc = (unsigned int )size;
  return 0;
}
#ifdef USE_SUBEXP_CALL
/* Allocate storage for size UnsetAddr entries and reset the list to empty.
 * Returns 0 on success; CHECK_NULL_RETURN_MEMERR handles a failed
 * allocation. */
static int
unset_addr_list_init(UnsetAddrList* uslist, int size)
{
  UnsetAddr* entries = (UnsetAddr* )xmalloc(sizeof(UnsetAddr)* size);

  CHECK_NULL_RETURN_MEMERR(entries);

  uslist->us    = entries;
  uslist->alloc = size;
  uslist->num   = 0;
  return 0;
}
/* Release the entry array owned by uslist, if there is one. */
static void
unset_addr_list_end(UnsetAddrList* uslist)
{
  if (IS_NULL(uslist->us)) return;

  xfree(uslist->us);
}
/* Append an (offset, target node) pair to uslist, growing the entry array
 * when it is full.
 *
 * Returns 0 on success; CHECK_NULL_RETURN_MEMERR handles a failed
 * reallocation (the existing array is left in place in that case). */
static int
unset_addr_list_add(UnsetAddrList* uslist, int offset, struct _Node* node)
{
  UnsetAddr* p;
  int size;

  if (uslist->num >= uslist->alloc) {
    /* Doubling a capacity of 0 would stay 0 and the store below would then
     * write past the array, so start from a small non-zero capacity. */
    size = (uslist->alloc > 0) ? uslist->alloc * 2 : 4;
    p = (UnsetAddr* )xrealloc(uslist->us, sizeof(UnsetAddr) * size);
    CHECK_NULL_RETURN_MEMERR(p);
    uslist->alloc = size;
    uslist->us    = p;
  }

  uslist->us[uslist->num].offset = offset;
  uslist->us[uslist->num].target = node;
  uslist->num++;
  return 0;
}
#endif /* USE_SUBEXP_CALL */
/* Append one opcode to reg through BBUF_ADD1 and return 0.
 * NOTE(review): BBUF_ADD1 is a macro defined elsewhere; presumably it can
 * return an error code from this function when the buffer cannot grow --
 * confirm. */
static int
add_opcode(regex_t* reg, int opcode)
{
BBUF_ADD1(reg, opcode);
return 0;
}
#ifdef USE_COMBINATION_EXPLOSION_CHECK
/* Append num, converted to StateCheckNumType, to reg
 * (SIZE_STATE_CHECK_NUM bytes).  Returns 0. */
static int
add_state_check_num(regex_t* reg, int num)
{
  StateCheckNumType value;

  value = (StateCheckNumType )num;
  BBUF_ADD(reg, &value, SIZE_STATE_CHECK_NUM);
  return 0;
}
#endif
/* Append addr, converted to RelAddrType, to reg (SIZE_RELADDR bytes).
 * Returns 0. */
static int
add_rel_addr(regex_t* reg, int addr)
{
  RelAddrType rel;

  rel = (RelAddrType )addr;
  BBUF_ADD(reg, &rel, SIZE_RELADDR);
  return 0;
}
/* Append addr, converted to AbsAddrType, to reg (SIZE_ABSADDR bytes).
 * Returns 0. */
static int
add_abs_addr(regex_t* reg, int addr)
{
  AbsAddrType abs_addr;

  abs_addr = (AbsAddrType )addr;
  BBUF_ADD(reg, &abs_addr, SIZE_ABSADDR);
  return 0;
}
/* Append len, converted to LengthType, to reg (SIZE_LENGTH bytes).
 * Returns 0. */
static int
add_length(regex_t* reg, OnigDistance len)
{
  LengthType encoded;

  encoded = (LengthType )len;
  BBUF_ADD(reg, &encoded, SIZE_LENGTH);
  return 0;
}
/* Append num, converted to MemNumType, to reg (SIZE_MEMNUM bytes).
 * Returns 0. */
static int
add_mem_num(regex_t* reg, int num)
{
  MemNumType id;

  id = (MemNumType )num;
  BBUF_ADD(reg, &id, SIZE_MEMNUM);
  return 0;
}
/* Append the pointer value addr, converted to PointerType, to reg
 * (SIZE_POINTER bytes).  Returns 0. */
static int
add_pointer(regex_t* reg, void* addr)
{
  PointerType value;

  value = (PointerType )addr;
  BBUF_ADD(reg, &value, SIZE_POINTER);
  return 0;
}
/* Append the option value to reg (SIZE_OPTION bytes) through BBUF_ADD and
 * return 0. */
static int
add_option(regex_t* reg, OnigOptionType option)
{
BBUF_ADD(reg, &option, SIZE_OPTION);
return 0;
}
/* Emit opcode followed by the relative address addr.  Returns the first
 * non-zero result of add_opcode / add_rel_addr, or 0. */
static int
add_opcode_rel_addr(regex_t* reg, int opcode, int addr)
{
  int r = add_opcode(reg, opcode);

  if (r != 0) return r;
  return add_rel_addr(reg, addr);
}
/* Append len raw bytes starting at bytes to reg through BBUF_ADD and
 * return 0. */
static int
add_bytes(regex_t* reg, UChar* bytes, OnigDistance len)
{
BBUF_ADD(reg, bytes, len);
return 0;
}
/* Append the SIZE_BITSET bytes of bitset bs to reg through BBUF_ADD and
 * return 0. */
static int
add_bitset(regex_t* reg, BitSetRef bs)
{
BBUF_ADD(reg, bs, SIZE_BITSET);
return 0;
}
/* Emit opcode followed by the option value.  Returns the first non-zero
 * result of add_opcode / add_option, or 0. */
static int
add_opcode_option(regex_t* reg, int opcode, OnigOptionType option)
{
  int r = add_opcode(reg, opcode);

  if (r != 0) return r;
  return add_option(reg, option);
}
/* Forward declarations: the per-node helpers below recurse into these. */
static int compile_length_tree(Node* node, regex_t* reg);
static int compile_tree(Node* node, regex_t* reg);
/* True for the exact-string opcodes that add_compile_string() and
 * add_compile_string_length() follow with an explicit length field. */
#define IS_NEED_STR_LEN_OP_EXACT(op) \
((op) == OP_EXACTN || (op) == OP_EXACTMB2N ||\
(op) == OP_EXACTMB3N || (op) == OP_EXACTMBN || (op) == OP_EXACTN_IC)
/* Choose the exact-match opcode for a string run.
 *
 * mb_len      - byte length of each character in the run
 * byte_len    - total byte length of the run
 * ignore_case - non-zero selects the case-insensitive opcodes
 *
 * The character count is byte_len / mb_len rounded up.  Case-insensitive
 * runs only distinguish "one character" from "many"; otherwise the opcode
 * depends on mb_len and, for 1- and 2-byte characters, on the count. */
static int
select_str_opcode(int mb_len, OnigDistance byte_len, int ignore_case)
{
  OnigDistance n_chars = (byte_len + mb_len - 1) / mb_len;

  if (ignore_case)
    return (n_chars == 1) ? OP_EXACT1_IC : OP_EXACTN_IC;

  if (mb_len == 1) {
    if (n_chars == 1) return OP_EXACT1;
    if (n_chars == 2) return OP_EXACT2;
    if (n_chars == 3) return OP_EXACT3;
    if (n_chars == 4) return OP_EXACT4;
    if (n_chars == 5) return OP_EXACT5;
    return OP_EXACTN;
  }

  if (mb_len == 2) {
    if (n_chars == 1) return OP_EXACTMB2N1;
    if (n_chars == 2) return OP_EXACTMB2N2;
    if (n_chars == 3) return OP_EXACTMB2N3;
    return OP_EXACTMB2N;
  }

  if (mb_len == 3)
    return OP_EXACTMB3N;

  return OP_EXACTMBN;
}
/* Compile node, optionally bracketed by a null-check start/end pair.
 *
 * With empty_info == 0 the node is compiled as is.  Otherwise
 * OP_NULL_CHECK_START and the current null-check id are emitted first, the
 * id counter is advanced, and after the node the end opcode matching
 * empty_info is emitted followed by the same id.
 * Returns 0 or the first non-zero result from an emit/compile call. */
static int
compile_tree_empty_check(Node* node, regex_t* reg, int empty_info)
{
  int r;
  int check_id;

  if (empty_info == 0)
    return compile_tree(node, reg);

  /* Remember the id now: it is incremented below, before the node is
   * compiled, but the END opcode must carry the same id as START. */
  check_id = reg->num_null_check;

  r = add_opcode(reg, OP_NULL_CHECK_START);
  if (r) return r;
  r = add_mem_num(reg, reg->num_null_check); /* NULL CHECK ID */
  if (r) return r;
  reg->num_null_check++;

  r = compile_tree(node, reg);
  if (r) return r;

  if (empty_info == NQ_TARGET_IS_EMPTY)
    r = add_opcode(reg, OP_NULL_CHECK_END);
  else if (empty_info == NQ_TARGET_IS_EMPTY_MEM)
    r = add_opcode(reg, OP_NULL_CHECK_END_MEMST);
  else if (empty_info == NQ_TARGET_IS_EMPTY_REC)
    r = add_opcode(reg, OP_NULL_CHECK_END_MEMST_PUSH);
  if (r) return r;

  return add_mem_num(reg, check_id); /* NULL CHECK ID */
}
#ifdef USE_SUBEXP_CALL
/* Emit OP_CALL with a placeholder absolute address of 0.  The buffer offset
 * of that address and the call target are recorded in the node's
 * unset-address list.
 * NOTE(review): presumably the placeholder is patched later from that
 * list -- confirm where the list is consumed. */
static int
compile_call(CallNode* node, regex_t* reg)
{
  int r = add_opcode(reg, OP_CALL);

  if (r != 0) return r;

  r = unset_addr_list_add(node->unset_addr_list, BBUF_GET_OFFSET_POS(reg),
                          node->target);
  if (r != 0) return r;

  return add_abs_addr(reg, 0 /*dummy addr.*/);
}
#endif
/* Compile node n times in a row.  Stops at, and returns, the first non-zero
 * result from compile_tree; returns 0 otherwise (including n <= 0). */
static int
compile_tree_n_times(Node* node, int n, regex_t* reg)
{
  int remaining;

  for (remaining = n; remaining > 0; remaining--) {
    int r = compile_tree(node, reg);
    if (r) return r;
  }
  return 0;
}
/* Compute the encoded size of what add_compile_string() emits for the same
 * arguments: the opcode, one length field for OP_EXACTMBN, one length field
 * for opcodes matched by IS_NEED_STR_LEN_OP_EXACT, and the string bytes. */
static int
add_compile_string_length(UChar* s ARG_UNUSED, int mb_len, OnigDistance byte_len,
                          regex_t* reg ARG_UNUSED, int ignore_case)
{
  int op = select_str_opcode(mb_len, byte_len, ignore_case);
  int total = SIZE_OPCODE;

  if (op == OP_EXACTMBN)
    total += SIZE_LENGTH;
  if (IS_NEED_STR_LEN_OP_EXACT(op))
    total += SIZE_LENGTH;

  total += (int )byte_len;
  return total;
}
/* Emit one exact-string instruction.
 *
 * s           - first byte of the run
 * mb_len      - byte length of each character in the run
 * byte_len    - total byte length of the run
 * ignore_case - non-zero selects the case-insensitive opcodes
 *
 * Layout: opcode, then mb_len for OP_EXACTMBN, then a length for opcodes
 * matched by IS_NEED_STR_LEN_OP_EXACT (byte count for OP_EXACTN_IC,
 * character count otherwise), then the raw bytes.
 *
 * Returns 0 on success or the first non-zero result from an emit helper.
 * Earlier code discarded those results and always returned 0, so a failed
 * buffer append went unnoticed by the callers' "if (r) return r" checks. */
static int
add_compile_string(UChar* s, int mb_len, OnigDistance byte_len,
                   regex_t* reg, int ignore_case)
{
  int r;
  int op = select_str_opcode(mb_len, byte_len, ignore_case);

  r = add_opcode(reg, op);
  if (r) return r;

  if (op == OP_EXACTMBN) {
    r = add_length(reg, mb_len);
    if (r) return r;
  }

  if (IS_NEED_STR_LEN_OP_EXACT(op)) {
    if (op == OP_EXACTN_IC)
      r = add_length(reg, byte_len);
    else
      r = add_length(reg, byte_len / mb_len);
    if (r) return r;
  }

  return add_bytes(reg, s, byte_len);
}
/* Compute the encoded size of a string node.
 *
 * The string is walked character by character and split into runs of
 * characters with the same encoded byte length; when the node is flagged
 * ambiguous the whole string forms a single run.  The sizes reported by
 * add_compile_string_length() for each run are summed.  An empty string
 * yields 0. */
static int
compile_length_string_node(Node* node, regex_t* reg)
{
  int total, run_char_len, run_bytes, ambig;
  OnigEncoding enc = reg->enc;
  UChar *cursor, *run_start;
  StrNode* sn = NSTR(node);

  if (sn->end <= sn->s)
    return 0;

  ambig = NSTRING_IS_AMBIG(node);

  run_start = cursor = sn->s;
  run_char_len = enclen(enc, cursor, sn->end);
  cursor += run_char_len;
  run_bytes = run_char_len;
  total = 0;

  while (cursor < sn->end) {
    int cur_len = enclen(enc, cursor, sn->end);

    if (!ambig && cur_len != run_char_len) {
      /* Character width changed: close the current run, start a new one. */
      total += add_compile_string_length(run_start, run_char_len, run_bytes,
                                         reg, ambig);
      run_start = cursor;
      run_bytes = 0;
      run_char_len = cur_len;
    }
    run_bytes += cur_len;
    cursor += cur_len;
  }

  total += add_compile_string_length(run_start, run_char_len, run_bytes,
                                     reg, ambig);
  return total;
}
/* Encoded size of a raw string node: the whole byte range is treated as one
 * run of single-byte characters, case-sensitive.  Empty strings yield 0. */
static int
compile_length_string_raw_node(StrNode* sn, regex_t* reg)
{
  if (sn->s < sn->end)
    return add_compile_string_length(sn->s, 1 /* sb */, sn->end - sn->s, reg, 0);

  return 0;
}
/* Emit the instructions for a string node.
 *
 * Mirrors compile_length_string_node(): the string is split into runs of
 * characters with equal encoded byte length (a single run when the node is
 * flagged ambiguous) and each run is emitted with add_compile_string().
 * Returns 0 for an empty string, otherwise the result of the emit calls
 * (stopping at the first non-zero one). */
static int
compile_string_node(Node* node, regex_t* reg)
{
  int r, run_char_len, run_bytes, ambig;
  OnigEncoding enc = reg->enc;
  UChar *cursor, *run_start, *end;
  StrNode* sn = NSTR(node);

  if (sn->end <= sn->s)
    return 0;

  end   = sn->end;
  ambig = NSTRING_IS_AMBIG(node);

  run_start = cursor = sn->s;
  run_char_len = enclen(enc, cursor, end);
  cursor += run_char_len;
  run_bytes = run_char_len;

  while (cursor < end) {
    int cur_len = enclen(enc, cursor, end);

    if (!ambig && cur_len != run_char_len) {
      /* Character width changed: flush the current run, start a new one. */
      r = add_compile_string(run_start, run_char_len, run_bytes, reg, ambig);
      if (r) return r;
      run_start = cursor;
      run_bytes = 0;
      run_char_len = cur_len;
    }
    run_bytes += cur_len;
    cursor += cur_len;
  }

  return add_compile_string(run_start, run_char_len, run_bytes, reg, ambig);
}
/* Emit a raw string node as one run of single-byte characters,
 * case-sensitive.  Empty strings emit nothing and return 0. */
static int
compile_string_raw_node(StrNode* sn, regex_t* reg)
{
  if (sn->s < sn->end)
    return add_compile_string(sn->s, 1 /* sb */, sn->end - sn->s, reg, 0);

  return 0;
}
/* Emit the multi-byte range data of a character class.
 *
 * With PLATFORM_UNALIGNED_WORD_ACCESS: a length field followed by the
 * mbuf bytes.
 * Otherwise: a length field of used + (WORD_ALIGNMENT_SIZE - 1), then
 * pad_size filler bytes, the mbuf bytes, and the remaining
 * (WORD_ALIGNMENT_SIZE - 1) - pad_size filler bytes, so the total size is
 * fixed and matches compile_length_cclass_node().
 *
 * Returns 0 on success or the first non-zero result from an emit helper.
 * Earlier code ignored the results of the length and padding writes. */
static int
add_multi_byte_cclass(BBuf* mbuf, regex_t* reg)
{
#ifdef PLATFORM_UNALIGNED_WORD_ACCESS
  int r;

  r = add_length(reg, mbuf->used);
  if (r) return r;
  return add_bytes(reg, mbuf->p, mbuf->used);
#else
  int r, pad_size;
  UChar* p = BBUF_GET_ADD_ADDRESS(reg) + SIZE_LENGTH;

  GET_ALIGNMENT_PAD_SIZE(p, pad_size);
  r = add_length(reg, mbuf->used + (WORD_ALIGNMENT_SIZE - 1));
  if (r) return r;

  if (pad_size != 0) {
    r = add_bytes(reg, PadBuf, pad_size);
    if (r) return r;
  }

  r = add_bytes(reg, mbuf->p, mbuf->used);
  if (r) return r;

  /* padding for return value from compile_length_cclass_node() to be fix. */
  pad_size = (WORD_ALIGNMENT_SIZE - 1) - pad_size;
  if (pad_size != 0)
    r = add_bytes(reg, PadBuf, pad_size);
  return r;
#endif
}
/* Compute the encoded size of a character-class node.
 *
 * - shared class: opcode + pointer
 * - no multi-byte data: opcode + bitset
 * - multi-byte data: opcode, plus the bitset unless the encoding's minimum
 *   character length exceeds 1 or the bitset is empty, plus a length field
 *   and the mbuf bytes (with WORD_ALIGNMENT_SIZE - 1 bytes of padding when
 *   unaligned word access is unavailable). */
static int
compile_length_cclass_node(CClassNode* cc, regex_t* reg)
{
  int len;

  if (IS_NCCLASS_SHARE(cc))
    return SIZE_OPCODE + SIZE_POINTER;

  if (IS_NULL(cc->mbuf))
    return SIZE_OPCODE + SIZE_BITSET;

  len = SIZE_OPCODE;
  if (ONIGENC_MBC_MINLEN(reg->enc) <= 1 && !bitset_is_empty(cc->bs))
    len = SIZE_OPCODE + SIZE_BITSET;

#ifdef PLATFORM_UNALIGNED_WORD_ACCESS
  len += SIZE_LENGTH + cc->mbuf->used;
#else
  len += SIZE_LENGTH + cc->mbuf->used + (WORD_ALIGNMENT_SIZE - 1);
#endif
  return len;
}
/* Emit the instruction for a character-class node.
 *
 * - shared class: OP_CCLASS_NODE + pointer to the node
 * - no multi-byte data: OP_CCLASS / OP_CCLASS_NOT + bitset
 * - multi-byte only (minimum character length > 1 or empty bitset):
 *   OP_CCLASS_MB / OP_CCLASS_MB_NOT + multi-byte data
 * - otherwise: OP_CCLASS_MIX / OP_CCLASS_MIX_NOT + bitset + multi-byte data
 *
 * Returns 0 on success or the first non-zero result from an emit helper.
 * Earlier code ignored the result of every add_opcode() call here. */
static int
compile_cclass_node(CClassNode* cc, regex_t* reg)
{
  int r;

  if (IS_NCCLASS_SHARE(cc)) {
    r = add_opcode(reg, OP_CCLASS_NODE);
    if (r) return r;
    return add_pointer(reg, cc);
  }

  if (IS_NULL(cc->mbuf)) {
    if (IS_NCCLASS_NOT(cc))
      r = add_opcode(reg, OP_CCLASS_NOT);
    else
      r = add_opcode(reg, OP_CCLASS);
    if (r) return r;
    return add_bitset(reg, cc->bs);
  }

  if (ONIGENC_MBC_MINLEN(reg->enc) > 1 || bitset_is_empty(cc->bs)) {
    if (IS_NCCLASS_NOT(cc))
      r = add_opcode(reg, OP_CCLASS_MB_NOT);
    else
      r = add_opcode(reg, OP_CCLASS_MB);
    if (r) return r;
    return add_multi_byte_cclass(cc->mbuf, reg);
  }

  if (IS_NCCLASS_NOT(cc))
    r = add_opcode(reg, OP_CCLASS_MIX_NOT);
  else
    r = add_opcode(reg, OP_CCLASS_MIX);
  if (r) return r;
  r = add_bitset(reg, cc->bs);
  if (r) return r;
  return add_multi_byte_cclass(cc->mbuf, reg);
}
/* Record the [lower, upper] bounds for repeat number id in
 * reg->repeat_range.
 *
 * The table is allocated on first use with REPEAT_RANGE_ALLOC entries and
 * grown by REPEAT_RANGE_ALLOC entries when id is not below the current
 * capacity.  An infinite upper bound is stored as 0x7fffffff.
 * Returns 0 on success; CHECK_NULL_RETURN_MEMERR handles allocation
 * failure. */
static int
entry_repeat_range(regex_t* reg, int id, int lower, int upper)
{
#define REPEAT_RANGE_ALLOC 4
  OnigRepeatRange* ranges = reg->repeat_range;

  if (reg->repeat_range_alloc == 0) {
    ranges = (OnigRepeatRange* )xmalloc(sizeof(OnigRepeatRange) * REPEAT_RANGE_ALLOC);
    CHECK_NULL_RETURN_MEMERR(ranges);
    reg->repeat_range = ranges;
    reg->repeat_range_alloc = REPEAT_RANGE_ALLOC;
  }
  else if (reg->repeat_range_alloc <= id) {
    int grown = reg->repeat_range_alloc + REPEAT_RANGE_ALLOC;

    ranges = (OnigRepeatRange* )xrealloc(reg->repeat_range,
                                         sizeof(OnigRepeatRange) * grown);
    CHECK_NULL_RETURN_MEMERR(ranges);
    reg->repeat_range = ranges;
    reg->repeat_range_alloc = grown;
  }

  ranges[id].lower = lower;
  if (IS_REPEAT_INFINITE(upper))
    ranges[id].upper = 0x7fffffff;
  else
    ranges[id].upper = upper;
  return 0;
}
/* Emit a counted repeat: OP_REPEAT(_NG) with a fresh repeat id and a
 * relative address of target_len + SIZE_OP_REPEAT_INC, the (optionally
 * null-checked) target, then a REPEAT_INC opcode carrying the same id.
 *
 * The *_SG increment opcodes are chosen when the quantifier is flagged as
 * being inside a repeat or, with USE_SUBEXP_CALL, when the regex contains
 * calls.  The bounds are registered through entry_repeat_range().
 * Returns 0 or the first non-zero result from a helper. */
static int
compile_range_repeat_node(QtfrNode* qn, int target_len, int empty_info,
                          regex_t* reg)
{
  int r;
  int inc_op;
  int repeat_id = reg->num_repeat;

  r = add_opcode(reg, qn->greedy ? OP_REPEAT : OP_REPEAT_NG);
  if (r) return r;

  r = add_mem_num(reg, repeat_id); /* OP_REPEAT ID */
  reg->num_repeat++;               /* the id is consumed even on failure */
  if (r) return r;

  r = add_rel_addr(reg, target_len + SIZE_OP_REPEAT_INC);
  if (r) return r;

  r = entry_repeat_range(reg, repeat_id, qn->lower, qn->upper);
  if (r) return r;

  r = compile_tree_empty_check(qn->target, reg, empty_info);
  if (r) return r;

  if (
#ifdef USE_SUBEXP_CALL
      reg->num_call > 0 ||
#endif
      IS_QUANTIFIER_IN_REPEAT(qn))
    inc_op = qn->greedy ? OP_REPEAT_INC_SG : OP_REPEAT_INC_NG_SG;
  else
    inc_op = qn->greedy ? OP_REPEAT_INC : OP_REPEAT_INC_NG;

  r = add_opcode(reg, inc_op);
  if (r) return r;

  return add_mem_num(reg, repeat_id); /* OP_REPEAT ID */
}
/* Return 1 when qn is a greedy quantifier with no upper bound whose target
 * is an any-character node, otherwise 0. */
static int
is_anychar_star_quantifier(QtfrNode* qn)
{
  if (!qn->greedy) return 0;
  if (!IS_REPEAT_INFINITE(qn->upper)) return 0;

  return (NTYPE(qn->target) == NT_CANY) ? 1 : 0;
}
/* Size threshold below which the quantifier compilers unroll the target
 * inline instead of emitting a counted repeat. */
#define QUANTIFIER_EXPAND_LIMIT_SIZE 50
/* True when the local variable ckn (state-check number) is positive;
 * expects a variable named ckn in the enclosing scope. */
#define CKN_ON (ckn > 0)
#ifdef USE_COMBINATION_EXPLOSION_CHECK
/* Combination-explosion-check variant.
 * Compute the encoded size of what compile_quantifier_node() emits for qn.
 * Returns the size, or the negative value from compile_length_tree() when
 * sizing the target fails. */
static int
compile_length_quantifier_node(QtfrNode* qn, regex_t* reg)
{
int len, mod_tlen, cklen;
int ckn;
int infinite = IS_REPEAT_INFINITE(qn->upper);
int empty_info = qn->target_empty_info;
int tlen = compile_length_tree(qn->target, reg);
if (tlen < 0) return tlen;
/* State-check number is only honoured when the regex has check slots. */
ckn = ((reg->num_comb_exp_check > 0) ? qn->comb_exp_check_num : 0);
cklen = (CKN_ON ? SIZE_STATE_CHECK_NUM: 0);
/* anychar repeat */
if (NTYPE(qn->target) == NT_CANY) {
if (qn->greedy && infinite) {
if (IS_NOT_NULL(qn->next_head_exact) && !CKN_ON)
return SIZE_OP_ANYCHAR_STAR_PEEK_NEXT + tlen * qn->lower + cklen;
else
return SIZE_OP_ANYCHAR_STAR + tlen * qn->lower + cklen;
}
}
/* Target length including the null-check bracket, when one is needed. */
if (empty_info != 0)
mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END);
else
mod_tlen = tlen;
/* Unbounded repeat with lower bound 0 or 1: PUSH/JUMP loop. */
if (infinite && qn->lower <= 1) {
if (qn->greedy) {
if (qn->lower == 1)
len = SIZE_OP_JUMP;
else
len = 0;
len += SIZE_OP_PUSH + cklen + mod_tlen + SIZE_OP_JUMP;
}
else {
if (qn->lower == 0)
len = SIZE_OP_JUMP;
else
len = 0;
len += mod_tlen + SIZE_OP_PUSH + cklen;
}
}
else if (qn->upper == 0) {
if (qn->is_refered != 0) /* /(?<n>..){0}/ */
len = SIZE_OP_JUMP + tlen;
else
len = 0;
}
else if (qn->upper == 1 && qn->greedy) {
if (qn->lower == 0) {
if (CKN_ON) {
len = SIZE_OP_STATE_CHECK_PUSH + tlen;
}
else {
len = SIZE_OP_PUSH + tlen;
}
}
else {
len = tlen;
}
}
else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
len = SIZE_OP_PUSH + cklen + SIZE_OP_JUMP + tlen;
}
else {
/* Counted repeat (see compile_range_repeat_node). */
len = SIZE_OP_REPEAT_INC
+ mod_tlen + SIZE_OPCODE + SIZE_RELADDR + SIZE_MEMNUM;
if (CKN_ON)
len += SIZE_OP_STATE_CHECK;
}
return len;
}
/* Combination-explosion-check variant.
 * Emit the instructions for quantifier node qn.  The branch structure
 * mirrors compile_length_quantifier_node() above.
 * Returns 0 on success, a negative value from compile_length_tree(), or the
 * first non-zero result from an emit/compile helper. */
static int
compile_quantifier_node(QtfrNode* qn, regex_t* reg)
{
int r, mod_tlen;
int ckn;
int infinite = IS_REPEAT_INFINITE(qn->upper);
int empty_info = qn->target_empty_info;
int tlen = compile_length_tree(qn->target, reg);
if (tlen < 0) return tlen;
ckn = ((reg->num_comb_exp_check > 0) ? qn->comb_exp_check_num : 0);
/* Greedy unbounded any-char: lower copies of the target, then a single
 * ANYCHAR_STAR-family opcode. */
if (is_anychar_star_quantifier(qn)) {
r = compile_tree_n_times(qn->target, qn->lower, reg);
if (r) return r;
if (IS_NOT_NULL(qn->next_head_exact) && !CKN_ON) {
if (IS_MULTILINE(reg->options))
r = add_opcode(reg, OP_ANYCHAR_ML_STAR_PEEK_NEXT);
else
r = add_opcode(reg, OP_ANYCHAR_STAR_PEEK_NEXT);
if (r) return r;
/* NOTE(review): unreachable -- the enclosing condition requires !CKN_ON. */
if (CKN_ON) {
r = add_state_check_num(reg, ckn);
if (r) return r;
}
/* Operand: first byte of the string that follows the quantifier. */
return add_bytes(reg, NSTR(qn->next_head_exact)->s, 1);
}
else {
if (IS_MULTILINE(reg->options)) {
r = add_opcode(reg, (CKN_ON ?
OP_STATE_CHECK_ANYCHAR_ML_STAR
: OP_ANYCHAR_ML_STAR));
}
else {
r = add_opcode(reg, (CKN_ON ?
OP_STATE_CHECK_ANYCHAR_STAR
: OP_ANYCHAR_STAR));
}
if (r) return r;
if (CKN_ON)
r = add_state_check_num(reg, ckn);
return r;
}
}
/* Target length including the null-check bracket, when one is needed. */
if (empty_info != 0)
mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END);
else
mod_tlen = tlen;
if (infinite && qn->lower <= 1) {
if (qn->greedy) {
/* lower == 1: jump over the PUSH so the target runs once first. */
if (qn->lower == 1) {
r = add_opcode_rel_addr(reg, OP_JUMP,
(CKN_ON ? SIZE_OP_STATE_CHECK_PUSH : SIZE_OP_PUSH));
if (r) return r;
}
if (CKN_ON) {
r = add_opcode(reg, OP_STATE_CHECK_PUSH);
if (r) return r;
r = add_state_check_num(reg, ckn);
if (r) return r;
r = add_rel_addr(reg, mod_tlen + SIZE_OP_JUMP);
}
else {
r = add_opcode_rel_addr(reg, OP_PUSH, mod_tlen + SIZE_OP_JUMP);
}
if (r) return r;
r = compile_tree_empty_check(qn->target, reg, empty_info);
if (r) return r;
/* Jump back to the PUSH at the head of the loop. */
r = add_opcode_rel_addr(reg, OP_JUMP,
-(mod_tlen + (int )SIZE_OP_JUMP
+ (int )(CKN_ON ? SIZE_OP_STATE_CHECK_PUSH : SIZE_OP_PUSH)));
}
else {
/* Non-greedy: lower == 0 skips the target on entry. */
if (qn->lower == 0) {
r = add_opcode_rel_addr(reg, OP_JUMP, mod_tlen);
if (r) return r;
}
r = compile_tree_empty_check(qn->target, reg, empty_info);
if (r) return r;
if (CKN_ON) {
r = add_opcode(reg, OP_STATE_CHECK_PUSH_OR_JUMP);
if (r) return r;
r = add_state_check_num(reg, ckn);
if (r) return r;
r = add_rel_addr(reg,
-(mod_tlen + (int )SIZE_OP_STATE_CHECK_PUSH_OR_JUMP));
}
else
r = add_opcode_rel_addr(reg, OP_PUSH, -(mod_tlen + (int )SIZE_OP_PUSH));
}
}
else if (qn->upper == 0) {
/* Zero repetitions: the target is only emitted (and jumped over) when
 * the quantifier is marked as referenced. */
if (qn->is_refered != 0) { /* /(?<n>..){0}/ */
r = add_opcode_rel_addr(reg, OP_JUMP, tlen);
if (r) return r;
r = compile_tree(qn->target, reg);
}
else
r = 0;
}
else if (qn->upper == 1 && qn->greedy) {
if (qn->lower == 0) {
if (CKN_ON) {
r = add_opcode(reg, OP_STATE_CHECK_PUSH);
if (r) return r;
r = add_state_check_num(reg, ckn);
if (r) return r;
r = add_rel_addr(reg, tlen);
}
else {
r = add_opcode_rel_addr(reg, OP_PUSH, tlen);
}
if (r) return r;
}
r = compile_tree(qn->target, reg);
}
else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
if (CKN_ON) {
r = add_opcode(reg, OP_STATE_CHECK_PUSH);
if (r) return r;
r = add_state_check_num(reg, ckn);
if (r) return r;
r = add_rel_addr(reg, SIZE_OP_JUMP);
}
else {
r = add_opcode_rel_addr(reg, OP_PUSH, SIZE_OP_JUMP);
}
if (r) return r;
r = add_opcode_rel_addr(reg, OP_JUMP, tlen);
if (r) return r;
r = compile_tree(qn->target, reg);
}
else {
/* General case: counted repeat, followed by a state check when enabled. */
r = compile_range_repeat_node(qn, mod_tlen, empty_info, reg);
if (CKN_ON) {
if (r) return r;
r = add_opcode(reg, OP_STATE_CHECK);
if (r) return r;
r = add_state_check_num(reg, ckn);
}
}
return r;
}
#else /* USE_COMBINATION_EXPLOSION_CHECK */
/* Default variant (no combination-explosion check).
 * Compute the encoded size of what compile_quantifier_node() emits for qn.
 * Returns the size, or the negative value from compile_length_tree() when
 * sizing the target fails. */
static int
compile_length_quantifier_node(QtfrNode* qn, regex_t* reg)
{
int len, mod_tlen;
int infinite = IS_REPEAT_INFINITE(qn->upper);
int empty_info = qn->target_empty_info;
int tlen = compile_length_tree(qn->target, reg);
if (tlen < 0) return tlen;
/* anychar repeat */
if (NTYPE(qn->target) == NT_CANY) {
if (qn->greedy && infinite) {
if (IS_NOT_NULL(qn->next_head_exact))
return SIZE_OP_ANYCHAR_STAR_PEEK_NEXT + tlen * qn->lower;
else
return SIZE_OP_ANYCHAR_STAR + tlen * qn->lower;
}
}
/* Target length including the null-check bracket, when one is needed. */
if (empty_info != 0)
mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END);
else
mod_tlen = tlen;
/* Unbounded repeat whose mandatory part is small enough to unroll. */
if (infinite &&
(qn->lower <= 1 || tlen * qn->lower <= QUANTIFIER_EXPAND_LIMIT_SIZE)) {
/* A large target with lower == 1 is entered via a JUMP instead of being
 * duplicated. */
if (qn->lower == 1 && tlen > QUANTIFIER_EXPAND_LIMIT_SIZE) {
len = SIZE_OP_JUMP;
}
else {
len = tlen * qn->lower;
}
if (qn->greedy) {
if (IS_NOT_NULL(qn->head_exact))
len += SIZE_OP_PUSH_OR_JUMP_EXACT1 + mod_tlen + SIZE_OP_JUMP;
else if (IS_NOT_NULL(qn->next_head_exact))
len += SIZE_OP_PUSH_IF_PEEK_NEXT + mod_tlen + SIZE_OP_JUMP;
else
len += SIZE_OP_PUSH + mod_tlen + SIZE_OP_JUMP;
}
else
len += SIZE_OP_JUMP + mod_tlen + SIZE_OP_PUSH;
}
else if (qn->upper == 0 && qn->is_refered != 0) { /* /(?<n>..){0}/ */
len = SIZE_OP_JUMP + tlen;
}
/* Bounded greedy repeat small enough to unroll: lower mandatory copies
 * plus (upper - lower) optional PUSH+target copies. */
else if (!infinite && qn->greedy &&
(qn->upper == 1 || (tlen + SIZE_OP_PUSH) * qn->upper
<= QUANTIFIER_EXPAND_LIMIT_SIZE)) {
len = tlen * qn->lower;
len += (SIZE_OP_PUSH + tlen) * (qn->upper - qn->lower);
}
else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
len = SIZE_OP_PUSH + SIZE_OP_JUMP + tlen;
}
else {
/* Counted repeat (see compile_range_repeat_node). */
len = SIZE_OP_REPEAT_INC
+ mod_tlen + SIZE_OPCODE + SIZE_RELADDR + SIZE_MEMNUM;
}
return len;
}
/* Default variant (no combination-explosion check).
 * Emit the instructions for quantifier node qn.  The branch structure
 * mirrors compile_length_quantifier_node() above.
 *
 * Returns 0 on success, a negative value from compile_length_tree(), or the
 * first non-zero result from an emit/compile helper.  The one-byte operand
 * written after OP_PUSH_OR_JUMP_EXACT1 / OP_PUSH_IF_PEEK_NEXT is now
 * checked as well; its result used to be discarded. */
static int
compile_quantifier_node(QtfrNode* qn, regex_t* reg)
{
  int i, r, mod_tlen;
  int infinite = IS_REPEAT_INFINITE(qn->upper);
  int empty_info = qn->target_empty_info;
  int tlen = compile_length_tree(qn->target, reg);

  if (tlen < 0) return tlen;

  /* Greedy unbounded any-char: lower copies of the target, then a single
   * ANYCHAR_STAR-family opcode. */
  if (is_anychar_star_quantifier(qn)) {
    r = compile_tree_n_times(qn->target, qn->lower, reg);
    if (r) return r;
    if (IS_NOT_NULL(qn->next_head_exact)) {
      if (IS_MULTILINE(reg->options))
        r = add_opcode(reg, OP_ANYCHAR_ML_STAR_PEEK_NEXT);
      else
        r = add_opcode(reg, OP_ANYCHAR_STAR_PEEK_NEXT);
      if (r) return r;
      /* Operand: first byte of the string that follows the quantifier. */
      return add_bytes(reg, NSTR(qn->next_head_exact)->s, 1);
    }
    else {
      if (IS_MULTILINE(reg->options))
        return add_opcode(reg, OP_ANYCHAR_ML_STAR);
      else
        return add_opcode(reg, OP_ANYCHAR_STAR);
    }
  }

  /* Target length including the null-check bracket, when one is needed. */
  if (empty_info != 0)
    mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END);
  else
    mod_tlen = tlen;

  if (infinite &&
      (qn->lower <= 1 || tlen * qn->lower <= QUANTIFIER_EXPAND_LIMIT_SIZE)) {
    if (qn->lower == 1 && tlen > QUANTIFIER_EXPAND_LIMIT_SIZE) {
      /* Large target, lower == 1: rather than duplicating the target,
       * jump over the loop-head instruction so the body runs once. */
      if (qn->greedy) {
        if (IS_NOT_NULL(qn->head_exact))
          r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_PUSH_OR_JUMP_EXACT1);
        else if (IS_NOT_NULL(qn->next_head_exact))
          r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_PUSH_IF_PEEK_NEXT);
        else
          r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_PUSH);
      }
      else {
        r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_JUMP);
      }
      if (r) return r;
    }
    else {
      r = compile_tree_n_times(qn->target, qn->lower, reg);
      if (r) return r;
    }

    if (qn->greedy) {
      if (IS_NOT_NULL(qn->head_exact)) {
        r = add_opcode_rel_addr(reg, OP_PUSH_OR_JUMP_EXACT1,
                                mod_tlen + SIZE_OP_JUMP);
        if (r) return r;
        r = add_bytes(reg, NSTR(qn->head_exact)->s, 1);
        if (r) return r;
        r = compile_tree_empty_check(qn->target, reg, empty_info);
        if (r) return r;
        r = add_opcode_rel_addr(reg, OP_JUMP,
              -(mod_tlen + (int )SIZE_OP_JUMP + (int )SIZE_OP_PUSH_OR_JUMP_EXACT1));
      }
      else if (IS_NOT_NULL(qn->next_head_exact)) {
        r = add_opcode_rel_addr(reg, OP_PUSH_IF_PEEK_NEXT,
                                mod_tlen + SIZE_OP_JUMP);
        if (r) return r;
        r = add_bytes(reg, NSTR(qn->next_head_exact)->s, 1);
        if (r) return r;
        r = compile_tree_empty_check(qn->target, reg, empty_info);
        if (r) return r;
        r = add_opcode_rel_addr(reg, OP_JUMP,
              -(mod_tlen + (int )SIZE_OP_JUMP + (int )SIZE_OP_PUSH_IF_PEEK_NEXT));
      }
      else {
        r = add_opcode_rel_addr(reg, OP_PUSH, mod_tlen + SIZE_OP_JUMP);
        if (r) return r;
        r = compile_tree_empty_check(qn->target, reg, empty_info);
        if (r) return r;
        r = add_opcode_rel_addr(reg, OP_JUMP,
              -(mod_tlen + (int )SIZE_OP_JUMP + (int )SIZE_OP_PUSH));
      }
    }
    else {
      /* Non-greedy loop: skip the body first, PUSH back into it after. */
      r = add_opcode_rel_addr(reg, OP_JUMP, mod_tlen);
      if (r) return r;
      r = compile_tree_empty_check(qn->target, reg, empty_info);
      if (r) return r;
      r = add_opcode_rel_addr(reg, OP_PUSH, -(mod_tlen + (int )SIZE_OP_PUSH));
    }
  }
  else if (qn->upper == 0 && qn->is_refered != 0) { /* /(?<n>..){0}/ */
    r = add_opcode_rel_addr(reg, OP_JUMP, tlen);
    if (r) return r;
    r = compile_tree(qn->target, reg);
  }
  else if (!infinite && qn->greedy &&
           (qn->upper == 1 || (tlen + SIZE_OP_PUSH) * qn->upper
                              <= QUANTIFIER_EXPAND_LIMIT_SIZE)) {
    /* Small bounded greedy repeat: unroll.  Each optional copy is guarded
     * by a PUSH whose target is the end of the whole unrolled sequence. */
    int n = qn->upper - qn->lower;

    r = compile_tree_n_times(qn->target, qn->lower, reg);
    if (r) return r;

    for (i = 0; i < n; i++) {
      r = add_opcode_rel_addr(reg, OP_PUSH,
                              (n - i) * tlen + (n - i - 1) * SIZE_OP_PUSH);
      if (r) return r;
      r = compile_tree(qn->target, reg);
      if (r) return r;
    }
  }
  else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
    r = add_opcode_rel_addr(reg, OP_PUSH, SIZE_OP_JUMP);
    if (r) return r;
    r = add_opcode_rel_addr(reg, OP_JUMP, tlen);
    if (r) return r;
    r = compile_tree(qn->target, reg);
  }
  else {
    r = compile_range_repeat_node(qn, mod_tlen, empty_info, reg);
  }
  return r;
}
#endif /* USE_COMBINATION_EXPLOSION_CHECK */
/* Compute the encoded size of an option-enclose node.
 *
 * The target is sized with reg->options temporarily replaced by the node's
 * option.  When the changed option bits are dynamic, the size of the
 * SET_OPTION_PUSH / SET_OPTION / FAIL prologue and the trailing SET_OPTION
 * is added.  Returns the size, or the negative value from
 * compile_length_tree(). */
static int
compile_length_option_node(EncloseNode* node, regex_t* reg)
{
  int body_len;
  OnigOptionType outer = reg->options;

  reg->options = node->option;
  body_len = compile_length_tree(node->target, reg);
  reg->options = outer;

  if (body_len < 0) return body_len;

  if (!IS_DYNAMIC_OPTION(outer ^ node->option))
    return body_len;

  return SIZE_OP_SET_OPTION_PUSH + SIZE_OP_SET_OPTION + SIZE_OP_FAIL
         + body_len + SIZE_OP_SET_OPTION;
}
/* Emit an option-enclose node.
 *
 * When the changed option bits are dynamic the target is wrapped as
 *   SET_OPTION_PUSH(new) SET_OPTION(old) FAIL <target> SET_OPTION(old)
 * and in every case the target is compiled with reg->options temporarily
 * set to the node's option.  Returns 0 or the first non-zero result from a
 * helper. */
static int
compile_option_node(EncloseNode* node, regex_t* reg)
{
  int r;
  OnigOptionType outer = reg->options;

  if (IS_DYNAMIC_OPTION(outer ^ node->option)) {
    r = add_opcode_option(reg, OP_SET_OPTION_PUSH, node->option);
    if (r) return r;
    r = add_opcode_option(reg, OP_SET_OPTION, outer);
    if (r) return r;
    r = add_opcode(reg, OP_FAIL);
    if (r) return r;
  }

  reg->options = node->option;
  r = compile_tree(node->target, reg);
  reg->options = outer;

  if (!IS_DYNAMIC_OPTION(outer ^ node->option))
    return r;
  if (r) return r;

  return add_opcode_option(reg, OP_SET_OPTION, outer);
}
/* Compute the encoded size of what compile_enclose_node() emits for node.
 * Option nodes are delegated to compile_length_option_node().
 * Returns the size, a negative value from compile_length_tree(), or
 * ONIGERR_PARSER_BUG / ONIGERR_INVALID_CONDITION_PATTERN / ONIGERR_TYPE_BUG
 * for malformed nodes. */
static int
compile_length_enclose_node(EncloseNode* node, regex_t* reg)
{
int len;
int tlen;
if (node->type == ENCLOSE_OPTION)
return compile_length_option_node(node, reg);
if (node->target) {
tlen = compile_length_tree(node->target, reg);
if (tlen < 0) return tlen;
}
else
tlen = 0;
switch (node->type) {
case ENCLOSE_MEMORY:
#ifdef USE_SUBEXP_CALL
/* Called group: CALL + JUMP prologue, body, memory-end, RETURN. */
if (IS_ENCLOSE_CALLED(node)) {
len = SIZE_OP_MEMORY_START_PUSH + tlen
+ SIZE_OP_CALL + SIZE_OP_JUMP + SIZE_OP_RETURN;
if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
len += (IS_ENCLOSE_RECURSION(node)
? SIZE_OP_MEMORY_END_PUSH_REC : SIZE_OP_MEMORY_END_PUSH);
else
len += (IS_ENCLOSE_RECURSION(node)
? SIZE_OP_MEMORY_END_REC : SIZE_OP_MEMORY_END);
}
else
#endif
{
if (BIT_STATUS_AT(reg->bt_mem_start, node->regnum))
len = SIZE_OP_MEMORY_START_PUSH;
else
len = SIZE_OP_MEMORY_START;
len += tlen + (BIT_STATUS_AT(reg->bt_mem_end, node->regnum)
? SIZE_OP_MEMORY_END_PUSH : SIZE_OP_MEMORY_END);
}
break;
case ENCLOSE_STOP_BACKTRACK:
if (IS_ENCLOSE_STOP_BT_SIMPLE_REPEAT(node)) {
/* Sized from the quantifier's own target: lower copies plus one
 * PUSH / target / POP / JUMP loop. */
QtfrNode* qn = NQTFR(node->target);
tlen = compile_length_tree(qn->target, reg);
if (tlen < 0) return tlen;
len = tlen * qn->lower
+ SIZE_OP_PUSH + tlen + SIZE_OP_POP + SIZE_OP_JUMP;
}
else {
len = SIZE_OP_PUSH_STOP_BT + tlen + SIZE_OP_POP_STOP_BT;
}
break;
case ENCLOSE_CONDITION:
/* The target must be an alternation of exactly two branches. */
len = SIZE_OP_CONDITION;
if (NTYPE(node->target) == NT_ALT) {
Node* x = node->target;
tlen = compile_length_tree(NCAR(x), reg); /* yes-node */
if (tlen < 0) return tlen;
len += tlen + SIZE_OP_JUMP;
if (NCDR(x) == NULL) return ONIGERR_PARSER_BUG;
x = NCDR(x);
tlen = compile_length_tree(NCAR(x), reg); /* no-node */
if (tlen < 0) return tlen;
len += tlen;
if (NCDR(x) != NULL) return ONIGERR_INVALID_CONDITION_PATTERN;
}
else {
return ONIGERR_PARSER_BUG;
}
break;
default:
return ONIGERR_TYPE_BUG;
break;
}
return len;
}
/* Forward declaration; the definition appears later in this file. */
static int get_char_length_tree(Node* node, regex_t* reg, int* len);
/* Emit the instructions for an enclose node (capture group, atomic group,
 * conditional, or option group).
 *
 * Option nodes are delegated to compile_option_node().  The emitted layout
 * must match compile_length_enclose_node().
 *
 * Returns 0 on success, a negative value from compile_length_tree(), the
 * first non-zero result from an emit/compile helper, or
 * ONIGERR_PARSER_BUG / ONIGERR_INVALID_CONDITION_PATTERN / ONIGERR_TYPE_BUG
 * for malformed nodes.
 *
 * The length of a called group's body is now checked for an error before it
 * is folded into the jump offset, as every other compile_length_tree() call
 * in this function already was. */
static int
compile_enclose_node(EncloseNode* node, regex_t* reg)
{
  int r, len;

  if (node->type == ENCLOSE_OPTION)
    return compile_option_node(node, reg);

  switch (node->type) {
  case ENCLOSE_MEMORY:
#ifdef USE_SUBEXP_CALL
    if (IS_ENCLOSE_CALLED(node)) {
      /* Prologue: CALL <body>; JUMP <past body>.  The body starts right
       * after the CALL operand and the JUMP, and ends with RETURN. */
      r = add_opcode(reg, OP_CALL);
      if (r) return r;
      node->call_addr = BBUF_GET_OFFSET_POS(reg) + SIZE_ABSADDR + SIZE_OP_JUMP;
      node->state |= NST_ADDR_FIXED;
      r = add_abs_addr(reg, (int )node->call_addr);
      if (r) return r;
      len = compile_length_tree(node->target, reg);
      if (len < 0) return len;
      len += (SIZE_OP_MEMORY_START_PUSH + SIZE_OP_RETURN);
      if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
        len += (IS_ENCLOSE_RECURSION(node)
                ? SIZE_OP_MEMORY_END_PUSH_REC : SIZE_OP_MEMORY_END_PUSH);
      else
        len += (IS_ENCLOSE_RECURSION(node)
                ? SIZE_OP_MEMORY_END_REC : SIZE_OP_MEMORY_END);

      r = add_opcode_rel_addr(reg, OP_JUMP, len);
      if (r) return r;
    }
#endif
    if (BIT_STATUS_AT(reg->bt_mem_start, node->regnum))
      r = add_opcode(reg, OP_MEMORY_START_PUSH);
    else
      r = add_opcode(reg, OP_MEMORY_START);
    if (r) return r;
    r = add_mem_num(reg, node->regnum);
    if (r) return r;
    r = compile_tree(node->target, reg);
    if (r) return r;
#ifdef USE_SUBEXP_CALL
    if (IS_ENCLOSE_CALLED(node)) {
      if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
        r = add_opcode(reg, (IS_ENCLOSE_RECURSION(node)
                             ? OP_MEMORY_END_PUSH_REC : OP_MEMORY_END_PUSH));
      else
        r = add_opcode(reg, (IS_ENCLOSE_RECURSION(node)
                             ? OP_MEMORY_END_REC : OP_MEMORY_END));

      if (r) return r;
      r = add_mem_num(reg, node->regnum);
      if (r) return r;
      r = add_opcode(reg, OP_RETURN);
    }
    else
#endif
    {
      if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
        r = add_opcode(reg, OP_MEMORY_END_PUSH);
      else
        r = add_opcode(reg, OP_MEMORY_END);
      if (r) return r;
      r = add_mem_num(reg, node->regnum);
    }
    break;

  case ENCLOSE_STOP_BACKTRACK:
    if (IS_ENCLOSE_STOP_BT_SIMPLE_REPEAT(node)) {
      /* lower copies of the quantifier's target, then a
       * PUSH / target / POP / JUMP-back loop. */
      QtfrNode* qn = NQTFR(node->target);
      r = compile_tree_n_times(qn->target, qn->lower, reg);
      if (r) return r;

      len = compile_length_tree(qn->target, reg);
      if (len < 0) return len;

      r = add_opcode_rel_addr(reg, OP_PUSH, len + SIZE_OP_POP + SIZE_OP_JUMP);
      if (r) return r;
      r = compile_tree(qn->target, reg);
      if (r) return r;
      r = add_opcode(reg, OP_POP);
      if (r) return r;
      r = add_opcode_rel_addr(reg, OP_JUMP,
            -((int )SIZE_OP_PUSH + len + (int )SIZE_OP_POP + (int )SIZE_OP_JUMP));
    }
    else {
      r = add_opcode(reg, OP_PUSH_STOP_BT);
      if (r) return r;
      r = compile_tree(node->target, reg);
      if (r) return r;
      r = add_opcode(reg, OP_POP_STOP_BT);
    }
    break;

  case ENCLOSE_CONDITION:
    r = add_opcode(reg, OP_CONDITION);
    if (r) return r;
    r = add_mem_num(reg, node->regnum);
    if (r) return r;

    if (NTYPE(node->target) == NT_ALT) {
      Node* x = node->target;
      int len2;

      /* Size both branches first so the jump offsets are known before
       * anything is emitted. */
      len = compile_length_tree(NCAR(x), reg);  /* yes-node */
      if (len < 0) return len;
      if (NCDR(x) == NULL) return ONIGERR_PARSER_BUG;
      x = NCDR(x);
      len2 = compile_length_tree(NCAR(x), reg); /* no-node */
      if (len2 < 0) return len2;
      if (NCDR(x) != NULL) return ONIGERR_INVALID_CONDITION_PATTERN;

      x = node->target;
      r = add_rel_addr(reg, len + SIZE_OP_JUMP);
      if (r) return r;
      r = compile_tree(NCAR(x), reg);           /* yes-node */
      if (r) return r;
      r = add_opcode_rel_addr(reg, OP_JUMP, len2);
      if (r) return r;
      x = NCDR(x);
      r = compile_tree(NCAR(x), reg);           /* no-node */
    }
    else {
      return ONIGERR_PARSER_BUG;
    }
    break;

  default:
    return ONIGERR_TYPE_BUG;
    break;
  }

  return r;
}
/*
 * Compute the code size that compile_anchor_node() accounts for when it
 * handles the given anchor.
 *
 * node: anchor to measure; its optional target is measured recursively.
 * reg:  regex being compiled (passed through to compile_length_tree()).
 *
 * Returns the size, or the negative value returned by compile_length_tree()
 * for the target.
 */
static int
compile_length_anchor_node(AnchorNode* node, regex_t* reg)
{
  int body_len = 0;

  if (node->target) {
    body_len = compile_length_tree(node->target, reg);
    if (body_len < 0) return body_len;
  }

  switch (node->type) {
  case ANCHOR_PREC_READ:
    return SIZE_OP_PUSH_POS + body_len + SIZE_OP_POP_POS;

  case ANCHOR_PREC_READ_NOT:
    return SIZE_OP_PUSH_POS_NOT + body_len + SIZE_OP_FAIL_POS;

  case ANCHOR_LOOK_BEHIND:
    return SIZE_OP_LOOK_BEHIND + body_len;

  case ANCHOR_LOOK_BEHIND_NOT:
    return SIZE_OP_PUSH_LOOK_BEHIND_NOT + body_len + SIZE_OP_FAIL_LOOK_BEHIND_NOT;

  default:
    /* every other anchor is a single opcode without a body */
    break;
  }
  return SIZE_OPCODE;
}
/*
 * Emit the code for an anchor node into reg.
 *
 * Simple anchors become a single opcode.  Look-ahead and look-behind
 * anchors wrap the compiled target between an opening and a closing
 * opcode; the negative forms also carry a relative address that skips
 * the body and the closing opcode.
 *
 * Returns 0 on success, otherwise a non-zero error code:
 *   - the error reported by add_opcode()/add_length()/compile_tree(),
 *   - the negative value of compile_length_tree() for the target,
 *   - ONIGERR_INVALID_LOOK_BEHIND_PATTERN when the look-behind length
 *     cannot be determined by get_char_length_tree(),
 *   - ONIGERR_TYPE_BUG for an unknown anchor type.
 */
static int
compile_anchor_node(AnchorNode* node, regex_t* reg)
{
  int r, len;

  switch (node->type) {
  case ANCHOR_BEGIN_BUF:      r = add_opcode(reg, OP_BEGIN_BUF);      break;
  case ANCHOR_END_BUF:        r = add_opcode(reg, OP_END_BUF);        break;
  case ANCHOR_BEGIN_LINE:     r = add_opcode(reg, OP_BEGIN_LINE);     break;
  case ANCHOR_END_LINE:       r = add_opcode(reg, OP_END_LINE);       break;
  case ANCHOR_SEMI_END_BUF:   r = add_opcode(reg, OP_SEMI_END_BUF);   break;
  case ANCHOR_BEGIN_POSITION: r = add_opcode(reg, OP_BEGIN_POSITION); break;

  /* used for implicit anchor optimization: /.*a/ ==> /(?:^|\G).*a/ */
  case ANCHOR_ANYCHAR_STAR:   r = add_opcode(reg, OP_BEGIN_POS_OR_LINE); break;

  case ANCHOR_WORD_BOUND:
    if (node->ascii_range)    r = add_opcode(reg, OP_ASCII_WORD_BOUND);
    else                      r = add_opcode(reg, OP_WORD_BOUND);
    break;

  case ANCHOR_NOT_WORD_BOUND:
    if (node->ascii_range)    r = add_opcode(reg, OP_NOT_ASCII_WORD_BOUND);
    else                      r = add_opcode(reg, OP_NOT_WORD_BOUND);
    break;

#ifdef USE_WORD_BEGIN_END
  case ANCHOR_WORD_BEGIN:
    if (node->ascii_range)    r = add_opcode(reg, OP_ASCII_WORD_BEGIN);
    else                      r = add_opcode(reg, OP_WORD_BEGIN);
    break;

  case ANCHOR_WORD_END:
    if (node->ascii_range)    r = add_opcode(reg, OP_ASCII_WORD_END);
    else                      r = add_opcode(reg, OP_WORD_END);
    break;
#endif

  case ANCHOR_KEEP:           r = add_opcode(reg, OP_KEEP);           break;

  case ANCHOR_PREC_READ:
    r = add_opcode(reg, OP_PUSH_POS);
    if (r) return r;
    r = compile_tree(node->target, reg);
    if (r) return r;
    r = add_opcode(reg, OP_POP_POS);
    break;

  case ANCHOR_PREC_READ_NOT:
    len = compile_length_tree(node->target, reg);
    if (len < 0) return len;
    r = add_opcode_rel_addr(reg, OP_PUSH_POS_NOT, len + SIZE_OP_FAIL_POS);
    if (r) return r;
    r = compile_tree(node->target, reg);
    if (r) return r;
    r = add_opcode(reg, OP_FAIL_POS);
    break;

  case ANCHOR_LOOK_BEHIND:
    {
      int n;

      r = add_opcode(reg, OP_LOOK_BEHIND);
      if (r) return r;
      if (node->char_len < 0) {
        /* length not stored on the node yet: compute it now */
        r = get_char_length_tree(node->target, reg, &n);
        if (r) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
      }
      else
        n = node->char_len;

      r = add_length(reg, n);
      if (r) return r;
      r = compile_tree(node->target, reg);
    }
    break;

  case ANCHOR_LOOK_BEHIND_NOT:
    {
      int n;

      len = compile_length_tree(node->target, reg);
      /* A negative value is an error code, not a length: it must not be
         folded into the relative address below. */
      if (len < 0) return len;
      r = add_opcode_rel_addr(reg, OP_PUSH_LOOK_BEHIND_NOT,
                              len + SIZE_OP_FAIL_LOOK_BEHIND_NOT);
      if (r) return r;
      if (node->char_len < 0) {
        r = get_char_length_tree(node->target, reg, &n);
        if (r) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
      }
      else
        n = node->char_len;

      r = add_length(reg, n);
      if (r) return r;
      r = compile_tree(node->target, reg);
      if (r) return r;
      r = add_opcode(reg, OP_FAIL_LOOK_BEHIND_NOT);
    }
    break;

  default:
    return ONIGERR_TYPE_BUG;
    break;
  }

  return r;
}
/*
 * Compute the code size of the tree rooted at node by dispatching on the
 * node type to the matching compile_length_*() helper.
 *
 * Returns the size (>= 0), a negative value propagated from a sub-tree,
 * or ONIGERR_TYPE_BUG for a node type that is not handled here.
 */
static int
compile_length_tree(Node* node, regex_t* reg)
{
  int total, sub;

  switch (NTYPE(node)) {
  case NT_LIST:
    /* a list is the plain sum of its elements */
    total = 0;
    for (; IS_NOT_NULL(node); node = NCDR(node)) {
      sub = compile_length_tree(NCAR(node), reg);
      if (sub < 0) return sub;
      total += sub;
    }
    return total;

  case NT_ALT:
    {
      int branches = 0;

      total = 0;
      for (; IS_NOT_NULL(node); node = NCDR(node)) {
        sub = compile_length_tree(NCAR(node), reg);
        if (sub < 0) return sub;
        total += sub;
        branches++;
      }
      /* one PUSH + JUMP pair between each two adjacent branches */
      total += (SIZE_OP_PUSH + SIZE_OP_JUMP) * (branches - 1);
      return total;
    }

  case NT_STR:
    if (NSTRING_IS_RAW(node))
      return compile_length_string_raw_node(NSTR(node), reg);
    return compile_length_string_node(node, reg);

  case NT_CCLASS:
    return compile_length_cclass_node(NCCLASS(node), reg);

  case NT_CTYPE:
  case NT_CANY:
    return SIZE_OPCODE;

  case NT_BREF:
    {
      BRefNode* br = NBREF(node);

#ifdef USE_BACKREF_WITH_LEVEL
      if (IS_BACKREF_NEST_LEVEL(br)) {
        return SIZE_OPCODE + SIZE_OPTION + SIZE_LENGTH +
               SIZE_LENGTH + (SIZE_MEMNUM * br->back_num);
      }
#endif
      if (br->back_num != 1)
        return SIZE_OPCODE + SIZE_LENGTH + (SIZE_MEMNUM * br->back_num);

      /* single reference: case-sensitive refs to group 1 or 2 need no
         group-number operand (see the matching branch in compile_tree) */
      if (!IS_IGNORECASE(reg->options) && br->back_static[0] <= 2)
        return SIZE_OPCODE;
      return SIZE_OPCODE + SIZE_MEMNUM;
    }

#ifdef USE_SUBEXP_CALL
  case NT_CALL:
    return SIZE_OP_CALL;
#endif

  case NT_QTFR:
    return compile_length_quantifier_node(NQTFR(node), reg);

  case NT_ENCLOSE:
    return compile_length_enclose_node(NENCLOSE(node), reg);

  case NT_ANCHOR:
    return compile_length_anchor_node(NANCHOR(node), reg);

  default:
    return ONIGERR_TYPE_BUG;
  }
}
/*
 * Emit the code for the tree rooted at node into reg.
 *
 * Dispatches on the node type; composite nodes (list, alternation,
 * quantifier, enclose, anchor) recurse into their children.
 *
 * Returns 0 on success, otherwise a non-zero error code propagated from
 * the add_*() emitters, from a child, or from compile_length_tree().
 * An unknown node type emits nothing and leaves the result at 0 (with a
 * diagnostic on stderr when ONIG_DEBUG is defined).
 */
static int
compile_tree(Node* node, regex_t* reg)
{
  int n, type, len, pos, r = 0;

  type = NTYPE(node);
  switch (type) {
  case NT_LIST:
    do {
      r = compile_tree(NCAR(node), reg);
    } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
    break;

  case NT_ALT:
    {
      Node* x = node;
      int alt_len;

      /* Pass 1: total size of all branches plus the PUSH/JUMP glue that
         follows every branch except the last one.  A negative value from
         compile_length_tree() is an error code and must not be summed. */
      len = 0;
      do {
        alt_len = compile_length_tree(NCAR(x), reg);
        if (alt_len < 0) return alt_len;
        len += alt_len;
        if (NCDR(x) != NULL) {
          len += SIZE_OP_PUSH + SIZE_OP_JUMP;
        }
      } while (IS_NOT_NULL(x = NCDR(x)));
      pos = reg->used + len;  /* goal position */

      /* Pass 2: emit  PUSH <next branch>; <branch>; JUMP <goal>  for
         every branch but the last, which is emitted bare. */
      do {
        len = compile_length_tree(NCAR(node), reg);
        if (len < 0) return len;
        if (IS_NOT_NULL(NCDR(node))) {
          r = add_opcode_rel_addr(reg, OP_PUSH, len + SIZE_OP_JUMP);
          if (r) break;
        }
        r = compile_tree(NCAR(node), reg);
        if (r) break;
        if (IS_NOT_NULL(NCDR(node))) {
          len = pos - (reg->used + SIZE_OP_JUMP);
          r = add_opcode_rel_addr(reg, OP_JUMP, len);
          if (r) break;
        }
      } while (IS_NOT_NULL(node = NCDR(node)));
    }
    break;

  case NT_STR:
    if (NSTRING_IS_RAW(node))
      r = compile_string_raw_node(NSTR(node), reg);
    else
      r = compile_string_node(node, reg);
    break;

  case NT_CCLASS:
    r = compile_cclass_node(NCCLASS(node), reg);
    break;

  case NT_CTYPE:
    {
      int op;

      /* only the "word" character type has dedicated opcodes here */
      switch (NCTYPE(node)->ctype) {
      case ONIGENC_CTYPE_WORD:
        if (NCTYPE(node)->ascii_range != 0) {
          if (NCTYPE(node)->not != 0)  op = OP_NOT_ASCII_WORD;
          else                         op = OP_ASCII_WORD;
        }
        else {
          if (NCTYPE(node)->not != 0)  op = OP_NOT_WORD;
          else                         op = OP_WORD;
        }
        break;
      default:
        return ONIGERR_TYPE_BUG;
        break;
      }
      r = add_opcode(reg, op);
    }
    break;

  case NT_CANY:
    if (IS_MULTILINE(reg->options))
      r = add_opcode(reg, OP_ANYCHAR_ML);
    else
      r = add_opcode(reg, OP_ANYCHAR);
    break;

  case NT_BREF:
    {
      BRefNode* br = NBREF(node);

#ifdef USE_BACKREF_WITH_LEVEL
      if (IS_BACKREF_NEST_LEVEL(br)) {
        r = add_opcode(reg, OP_BACKREF_WITH_LEVEL);
        if (r) return r;
        r = add_option(reg, (reg->options & ONIG_OPTION_IGNORECASE));
        if (r) return r;
        r = add_length(reg, br->nest_level);
        if (r) return r;

        goto add_bacref_mems;
      }
      else
#endif
      if (br->back_num == 1) {
        n = br->back_static[0];
        if (IS_IGNORECASE(reg->options)) {
          r = add_opcode(reg, OP_BACKREFN_IC);
          if (r) return r;
          r = add_mem_num(reg, n);
        }
        else {
          /* groups 1 and 2 have dedicated opcodes without an operand */
          switch (n) {
          case 1:  r = add_opcode(reg, OP_BACKREF1); break;
          case 2:  r = add_opcode(reg, OP_BACKREF2); break;
          default:
            r = add_opcode(reg, OP_BACKREFN);
            if (r) return r;
            r = add_mem_num(reg, n);
            break;
          }
        }
      }
      else {
        int i;
        int* p;

        if (IS_IGNORECASE(reg->options)) {
          r = add_opcode(reg, OP_BACKREF_MULTI_IC);
        }
        else {
          r = add_opcode(reg, OP_BACKREF_MULTI);
        }
        if (r) return r;

#ifdef USE_BACKREF_WITH_LEVEL
      add_bacref_mems:
#endif
        /* reference count followed by the group numbers, last one first */
        r = add_length(reg, br->back_num);
        if (r) return r;
        p = BACKREFS_P(br);
        for (i = br->back_num - 1; i >= 0; i--) {
          r = add_mem_num(reg, p[i]);
          if (r) return r;
        }
      }
    }
    break;

#ifdef USE_SUBEXP_CALL
  case NT_CALL:
    r = compile_call(NCALL(node), reg);
    break;
#endif

  case NT_QTFR:
    r = compile_quantifier_node(NQTFR(node), reg);
    break;

  case NT_ENCLOSE:
    r = compile_enclose_node(NENCLOSE(node), reg);
    break;

  case NT_ANCHOR:
    r = compile_anchor_node(NANCHOR(node), reg);
    break;

  default:
#ifdef ONIG_DEBUG
    fprintf(stderr, "compile_tree: undefined node type %d\n", NTYPE(node));
#endif
    break;
  }

  return r;
}
#ifdef USE_NAMED_GROUP
/*
 * Walk the tree reachable through *plink, removing unnamed memory groups
 * and renumbering named ones.
 *
 * plink:   address of the link that points at the current node; it is
 *          rewritten when the current node is an unnamed group that gets
 *          replaced by its own target.
 * map:     map[old_regnum].new_val receives the new number of every named
 *          group that is encountered.
 * counter: running count of named groups seen so far; incremented here.
 *
 * Returns 0, or the first non-zero value returned by a recursive call.
 */
static int
noname_disable_map(Node** plink, GroupNumRemap* map, int* counter)
{
  Node* node = *plink;
  int r = 0;

  switch (NTYPE(node)) {
  case NT_LIST:
  case NT_ALT:
    for (; IS_NOT_NULL(node); node = NCDR(node)) {
      r = noname_disable_map(&(NCAR(node)), map, counter);
      if (r != 0) break;
    }
    break;

  case NT_QTFR:
    {
      Node** slot = &(NQTFR(node)->target);
      Node* before = *slot;

      r = noname_disable_map(slot, map, counter);
      /* the target was replaced and is now itself a quantifier */
      if (*slot != before && NTYPE(*slot) == NT_QTFR) {
        onig_reduce_nested_quantifier(node, *slot);
      }
    }
    break;

  case NT_ENCLOSE:
    {
      EncloseNode* en = NENCLOSE(node);

      if (en->type == ENCLOSE_MEMORY) {
        if (IS_ENCLOSE_NAMED_GROUP(en)) {
          int renumbered = ++(*counter);

          map[en->regnum].new_val = renumbered;
          en->regnum = renumbered;
        }
        else if (en->regnum != 0) {
          /* unnamed group: splice its target into the parent link, free
             the wrapper, then continue with whatever is linked now */
          *plink = en->target;
          en->target = NULL_NODE;
          onig_node_free(node);
          return noname_disable_map(plink, map, counter);
        }
      }
      return noname_disable_map(&(en->target), map, counter);
    }

  case NT_ANCHOR:
    {
      AnchorNode* an = NANCHOR(node);

      switch (an->type) {
      case ANCHOR_PREC_READ:
      case ANCHOR_PREC_READ_NOT:
      case ANCHOR_LOOK_BEHIND:
      case ANCHOR_LOOK_BEHIND_NOT:
        r = noname_disable_map(&(an->target), map, counter);
        break;
      default:
        break;
      }
    }
    break;

  default:
    break;
  }

  return r;
}
/*
 * Rewrite the group numbers stored in a back-reference node through map.
 *
 * References whose mapped number is not positive are dropped; the
 * remaining ones are compacted to the front of the array and back_num is
 * updated accordingly.
 *
 * Returns 0 on success, or ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED
 * when the node is not a by-name reference.
 */
static int
renumber_node_backref(Node* node, GroupNumRemap* map)
{
  BRefNode* bn = NBREF(node);
  int* refs;
  int total, src, dst, mapped;

  if (! IS_BACKREF_NAME_REF(bn))
    return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;

  total = bn->back_num;
  refs = IS_NULL(bn->back_dynamic) ? bn->back_static : bn->back_dynamic;

  dst = 0;
  for (src = 0; src < total; src++) {
    mapped = map[refs[src]].new_val;
    if (mapped > 0) {
      refs[dst++] = mapped;
    }
  }

  bn->back_num = dst;
  return 0;
}
/*
 * Apply the group-number map to every node below node that stores a
 * group number: back-references (via renumber_node_backref()) and the
 * regnum of ENCLOSE_CONDITION nodes.
 *
 * Returns 0, or the first non-zero value produced while walking.
 */
static int
renumber_by_map(Node* node, GroupNumRemap* map)
{
  int r = 0;

  switch (NTYPE(node)) {
  case NT_LIST:
  case NT_ALT:
    for (; IS_NOT_NULL(node); node = NCDR(node)) {
      r = renumber_by_map(NCAR(node), map);
      if (r != 0) break;
    }
    break;

  case NT_QTFR:
    return renumber_by_map(NQTFR(node)->target, map);

  case NT_ENCLOSE:
    {
      EncloseNode* en = NENCLOSE(node);

      if (en->type == ENCLOSE_CONDITION)
        en->regnum = map[en->regnum].new_val;

      return renumber_by_map(en->target, map);
    }

  case NT_BREF:
    return renumber_node_backref(node, map);

  case NT_ANCHOR:
    {
      AnchorNode* an = NANCHOR(node);

      switch (an->type) {
      case ANCHOR_PREC_READ:
      case ANCHOR_PREC_READ_NOT:
      case ANCHOR_LOOK_BEHIND:
      case ANCHOR_LOOK_BEHIND_NOT:
        r = renumber_by_map(an->target, map);
        break;
      default:
        break;
      }
    }
    break;

  default:
    break;
  }

  return r;
}
/*
 * Check that the tree contains no back-reference that is not a by-name
 * reference.  Only lists, alternations, quantifiers and enclose nodes are
 * descended into.
 *
 * Returns 0 when none is found, otherwise
 * ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED.
 */
static int
numbered_ref_check(Node* node)
{
  int r = 0;

  switch (NTYPE(node)) {
  case NT_LIST:
  case NT_ALT:
    for (; IS_NOT_NULL(node); node = NCDR(node)) {
      r = numbered_ref_check(NCAR(node));
      if (r != 0) break;
    }
    break;

  case NT_QTFR:
    return numbered_ref_check(NQTFR(node)->target);

  case NT_ENCLOSE:
    return numbered_ref_check(NENCLOSE(node)->target);

  case NT_BREF:
    if (! IS_BACKREF_NAME_REF(NBREF(node)))
      return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
    break;

  default:
    break;
  }

  return r;
}
/*
 * Turn off capturing for unnamed groups and renumber the named ones.
 *
 * Steps, in order:
 *   1. build an old-number -> new-number map with noname_disable_map();
 *   2. rewrite stored group numbers with renumber_by_map();
 *   3. compact the environment's memory-node table to the kept groups;
 *   4. rebuild the capture-history bit set with the new numbers;
 *   5. set num_mem of env and reg to env->num_named;
 *   6. renumber the name table of reg.
 *
 * Returns 0 on success or the first non-zero error code.
 */
static int
disable_noname_group_capture(Node** root, regex_t* reg, ScanEnv* env)
{
  GroupNumRemap* remap;
  BitStatusType old_history;
  int status, old_num, new_num, named_count;

  remap = (GroupNumRemap* )xalloca(sizeof(GroupNumRemap) * (env->num_mem + 1));
  CHECK_NULL_RETURN_MEMERR(remap);

  old_num = 1;
  while (old_num <= env->num_mem) {
    remap[old_num].new_val = 0;
    old_num++;
  }

  named_count = 0;
  status = noname_disable_map(root, remap, &named_count);
  if (status != 0) return status;

  status = renumber_by_map(*root, remap);
  if (status != 0) return status;

  /* slide the nodes of the kept groups down to their new slots */
  new_num = 1;
  for (old_num = 1; old_num <= env->num_mem; old_num++) {
    if (remap[old_num].new_val <= 0) continue;
    SCANENV_MEM_NODES(env)[new_num] = SCANENV_MEM_NODES(env)[old_num];
    new_num++;
  }

  /* re-express the capture history bits in terms of the new numbers */
  old_history = env->capture_history;
  BIT_STATUS_CLEAR(env->capture_history);
  for (old_num = 1; old_num <= ONIG_MAX_CAPTURE_HISTORY_GROUP; old_num++) {
    if (BIT_STATUS_AT(old_history, old_num)) {
      BIT_STATUS_ON_AT_SIMPLE(env->capture_history, remap[old_num].new_val);
    }
  }

  env->num_mem = env->num_named;
  reg->num_mem = env->num_named;

  return onig_renumber_name_table(reg, remap);
}
#endif /* USE_NAMED_GROUP */
#ifdef USE_SUBEXP_CALL
/*
 * Patch every recorded offset in uslist with the call address of its
 * target group.
 *
 * For each entry the target enclose node must already have a fixed
 * address; its call_addr is written into reg at the entry's offset.
 *
 * Returns 0 on success, ONIGERR_PARSER_BUG when a target address is not
 * fixed yet.
 */
static int
unset_addr_list_fix(UnsetAddrList* uslist, regex_t* reg)
{
  int idx = 0;

  while (idx < uslist->num) {
    EncloseNode* target = NENCLOSE(uslist->us[idx].target);
    AbsAddrType addr;
    int offset;

    if (! IS_ENCLOSE_ADDR_FIXED(target))
      return ONIGERR_PARSER_BUG;

    addr = target->call_addr;
    offset = uslist->us[idx].offset;
    BBUF_WRITE(reg, offset, &addr, SIZE_ABSADDR);

    idx++;
  }
  return 0;
}
#endif
#ifdef USE_MONOMANIAC_CHECK_CAPTURES_IN_ENDLESS_REPEAT
/*
 * Classify the tree below node for the empty-loop check of quantifiers.
 *
 * Returns 0 when nothing relevant is found, NQ_TARGET_IS_EMPTY_MEM when a
 * memory (capture) group is found, NQ_TARGET_IS_EMPTY_REC when a
 * recursive call is found.  For lists and alternations the largest value
 * among the children is returned.
 */
static int
quantifiers_memory_node_info(Node* node)
{
  int info = 0;

  switch (NTYPE(node)) {
  case NT_LIST:
  case NT_ALT:
    for (; IS_NOT_NULL(node); node = NCDR(node)) {
      int child = quantifiers_memory_node_info(NCAR(node));

      if (child > info) info = child;
      if (child < 0) break;
    }
    break;

#ifdef USE_SUBEXP_CALL
  case NT_CALL:
    if (IS_CALL_RECURSION(NCALL(node)))
      return NQ_TARGET_IS_EMPTY_REC; /* tiny version */
    info = quantifiers_memory_node_info(NCALL(node)->target);
    break;
#endif

  case NT_QTFR:
    /* a quantifier whose upper bound is 0 contributes nothing */
    if (NQTFR(node)->upper != 0)
      info = quantifiers_memory_node_info(NQTFR(node)->target);
    break;

  case NT_ENCLOSE:
    {
      EncloseNode* en = NENCLOSE(node);

      switch (en->type) {
      case ENCLOSE_MEMORY:
        return NQ_TARGET_IS_EMPTY_MEM;

      case ENCLOSE_OPTION:
      case ENCLOSE_STOP_BACKTRACK:
      case ENCLOSE_CONDITION:
        info = quantifiers_memory_node_info(en->target);
        break;

      default:
        break;
      }
    }
    break;

  default:
    /* back-references, strings, character types/classes, any-char and
       anchors carry no information for this check */
    break;
  }

  return info;
}
#endif /* USE_MONOMANIAC_CHECK_CAPTURES_IN_ENDLESS_REPEAT */
/*
 * Compute a lower bound for the length matched by the tree at node.
 *
 * node: tree to measure.
 * min:  receives the result; it is reset to 0 on entry.
 * env:  scan environment (group nodes and group count).
 *
 * Returns 0 on success, ONIGERR_INVALID_BACKREF when a back-reference
 * names a group number above env->num_mem, or a non-zero value
 * propagated from a recursive call.
 */
static int
get_min_match_length(Node* node, OnigDistance *min, ScanEnv* env)
{
  OnigDistance sub_min;
  int r = 0;

  *min = 0;
  switch (NTYPE(node)) {
  case NT_BREF:
    {
      BRefNode* br = NBREF(node);
      Node** group_nodes = SCANENV_MEM_NODES(env);
      int* refs;
      int k;

      if (br->state & NST_RECURSION) return r;

      refs = BACKREFS_P(br);
      if (refs[0] > env->num_mem) return ONIGERR_INVALID_BACKREF;
      r = get_min_match_length(group_nodes[refs[0]], min, env);
      if (r != 0) return r;

      /* several candidate groups: keep the smallest minimum */
      for (k = 1; k < br->back_num; k++) {
        if (refs[k] > env->num_mem) return ONIGERR_INVALID_BACKREF;
        r = get_min_match_length(group_nodes[refs[k]], &sub_min, env);
        if (r != 0) return r;
        if (sub_min < *min) *min = sub_min;
      }
    }
    break;

#ifdef USE_SUBEXP_CALL
  case NT_CALL:
    if (IS_CALL_RECURSION(NCALL(node))) {
      /* recursive call: use the stored minimum if there is one */
      EncloseNode* callee = NENCLOSE(NCALL(node)->target);

      if (IS_ENCLOSE_MIN_FIXED(callee))
        *min = callee->min_len;
    }
    else
      r = get_min_match_length(NCALL(node)->target, min, env);
    break;
#endif

  case NT_LIST:
    for (; IS_NOT_NULL(node); node = NCDR(node)) {
      r = get_min_match_length(NCAR(node), &sub_min, env);
      if (r != 0) break;
      *min += sub_min;
    }
    break;

  case NT_ALT:
    {
      Node* alt;

      for (alt = node; IS_NOT_NULL(alt); alt = NCDR(alt)) {
        r = get_min_match_length(NCAR(alt), &sub_min, env);
        if (r != 0) break;
        /* the first branch seeds the result, later ones may lower it */
        if (alt == node || sub_min < *min) *min = sub_min;
      }
    }
    break;

  case NT_STR:
    *min = NSTR(node)->end - NSTR(node)->s;
    break;

  case NT_CTYPE:
  case NT_CCLASS:
  case NT_CANY:
    *min = 1;
    break;

  case NT_QTFR:
    {
      QtfrNode* qn = NQTFR(node);

      if (qn->lower > 0) {
        r = get_min_match_length(qn->target, min, env);
        if (r == 0)
          *min = distance_multiply(*min, qn->lower);
      }
    }
    break;

  case NT_ENCLOSE:
    {
      EncloseNode* en = NENCLOSE(node);

      switch (en->type) {
      case ENCLOSE_MEMORY:
#ifdef USE_SUBEXP_CALL
        if (IS_ENCLOSE_MIN_FIXED(en)) {
          *min = en->min_len;
        }
        else {
          r = get_min_match_length(en->target, min, env);
          if (r == 0) {
            /* remember the result on the node for later queries */
            en->min_len = *min;
            SET_ENCLOSE_STATUS(node, NST_MIN_FIXED);
          }
        }
        break;
#endif
      /* without USE_SUBEXP_CALL a memory group falls through to here */
      case ENCLOSE_OPTION:
      case ENCLOSE_STOP_BACKTRACK:
      case ENCLOSE_CONDITION:
        r = get_min_match_length(en->target, min, env);
        break;
      }
    }
    break;

  case NT_ANCHOR:
  default:
    break;
  }

  return r;
}
/*
 * Compute an upper bound for the length matched by the tree at node.
 *
 * node: tree to measure.
 * max:  receives the result; it is reset to 0 on entry and set to
 *       ONIG_INFINITE_DISTANCE for recursion and unbounded repeats of a
 *       non-empty target.
 * env:  scan environment (encoding, group nodes and group count).
 *
 * Returns 0 on success, ONIGERR_INVALID_BACKREF when a back-reference
 * names a group number above env->num_mem, or a non-zero value
 * propagated from a recursive call.
 */
static int
get_max_match_length(Node* node, OnigDistance *max, ScanEnv* env)
{
  OnigDistance sub_max;
  int r = 0;

  *max = 0;
  switch (NTYPE(node)) {
  case NT_LIST:
    for (; IS_NOT_NULL(node); node = NCDR(node)) {
      r = get_max_match_length(NCAR(node), &sub_max, env);
      if (r != 0) break;
      *max = distance_add(*max, sub_max);
    }
    break;

  case NT_ALT:
    for (; IS_NOT_NULL(node); node = NCDR(node)) {
      r = get_max_match_length(NCAR(node), &sub_max, env);
      if (r != 0) break;
      if (sub_max > *max) *max = sub_max;
    }
    break;

  case NT_STR:
    *max = NSTR(node)->end - NSTR(node)->s;
    break;

  case NT_CTYPE:
  case NT_CCLASS:
  case NT_CANY:
    *max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
    break;

  case NT_BREF:
    {
      BRefNode* br = NBREF(node);
      Node** group_nodes = SCANENV_MEM_NODES(env);
      int* refs;
      int k;

      if (br->state & NST_RECURSION) {
        *max = ONIG_INFINITE_DISTANCE;
        return r;
      }

      /* several candidate groups: keep the largest maximum */
      refs = BACKREFS_P(br);
      for (k = 0; k < br->back_num; k++) {
        if (refs[k] > env->num_mem) return ONIGERR_INVALID_BACKREF;
        r = get_max_match_length(group_nodes[refs[k]], &sub_max, env);
        if (r != 0) break;
        if (sub_max > *max) *max = sub_max;
      }
    }
    break;

#ifdef USE_SUBEXP_CALL
  case NT_CALL:
    if (IS_CALL_RECURSION(NCALL(node)))
      *max = ONIG_INFINITE_DISTANCE;
    else
      r = get_max_match_length(NCALL(node)->target, max, env);
    break;
#endif

  case NT_QTFR:
    {
      QtfrNode* qn = NQTFR(node);

      if (qn->upper != 0) {
        r = get_max_match_length(qn->target, max, env);
        if (r == 0 && *max != 0) {
          *max = IS_REPEAT_INFINITE(qn->upper)
                   ? ONIG_INFINITE_DISTANCE
                   : distance_multiply(*max, qn->upper);
        }
      }
    }
    break;

  case NT_ENCLOSE:
    {
      EncloseNode* en = NENCLOSE(node);

      switch (en->type) {
      case ENCLOSE_MEMORY:
#ifdef USE_SUBEXP_CALL
        if (IS_ENCLOSE_MAX_FIXED(en)) {
          *max = en->max_len;
        }
        else {
          r = get_max_match_length(en->target, max, env);
          if (r == 0) {
            /* remember the result on the node for later queries */
            en->max_len = *max;
            SET_ENCLOSE_STATUS(node, NST_MAX_FIXED);
          }
        }
        break;
#endif
      /* without USE_SUBEXP_CALL a memory group falls through to here */
      case ENCLOSE_OPTION:
      case ENCLOSE_STOP_BACKTRACK:
      case ENCLOSE_CONDITION:
        r = get_max_match_length(en->target, max, env);
        break;
      }
    }
    break;

  case NT_ANCHOR:
  default:
    break;
  }

  return r;
}
/* result codes of get_char_length_tree1() for patterns without a fixed
   character length; the second one is used for an alternation met at
   nesting level 1 */
#define GET_CHAR_LEN_VARLEN           -1
#define GET_CHAR_LEN_TOP_ALT_VARLEN   -2

/*
 * Compute the length in characters of a fixed-length pattern.
 *
 * node:  tree to measure.
 * reg:   regex being compiled (supplies the encoding).
 * len:   receives the character count; reset to 0 on entry.
 * level: nesting depth of the caller; incremented on entry, so the node
 *        passed by get_char_length_tree() is at level 1.
 *
 * Returns 0 when the length is fixed, GET_CHAR_LEN_VARLEN or
 * GET_CHAR_LEN_TOP_ALT_VARLEN when it is not.
 */
static int
get_char_length_tree1(Node* node, regex_t* reg, int* len, int level)
{
  int sub_len;
  int r = 0;

  level++;
  *len = 0;
  switch (NTYPE(node)) {
  case NT_LIST:
    for (; IS_NOT_NULL(node); node = NCDR(node)) {
      r = get_char_length_tree1(NCAR(node), reg, &sub_len, level);
      if (r != 0) break;
      *len = (int )distance_add(*len, sub_len);
    }
    break;

  case NT_ALT:
    {
      int other_len;
      int differs = 0;

      /* every branch must have the same length as the first one */
      r = get_char_length_tree1(NCAR(node), reg, &sub_len, level);
      if (r != 0) break;

      for (node = NCDR(node); IS_NOT_NULL(node); node = NCDR(node)) {
        r = get_char_length_tree1(NCAR(node), reg, &other_len, level);
        if (r != 0) break;
        if (other_len != sub_len) differs = 1;
      }
      if (r != 0) break;

      if (differs)
        r = (level == 1) ? GET_CHAR_LEN_TOP_ALT_VARLEN : GET_CHAR_LEN_VARLEN;
      else
        *len = sub_len;
    }
    break;

  case NT_STR:
    {
      StrNode* sn = NSTR(node);
      UChar* p;

      /* step through the string one encoded character at a time */
      for (p = sn->s; p < sn->end; p += enclen(reg->enc, p, sn->end))
        (*len)++;
    }
    break;

  case NT_QTFR:
    {
      QtfrNode* qn = NQTFR(node);

      if (qn->lower != qn->upper) {
        r = GET_CHAR_LEN_VARLEN;
      }
      else {
        r = get_char_length_tree1(qn->target, reg, &sub_len, level);
        if (r == 0)
          *len = (int )distance_multiply(sub_len, qn->lower);
      }
    }
    break;

#ifdef USE_SUBEXP_CALL
  case NT_CALL:
    if (IS_CALL_RECURSION(NCALL(node)))
      r = GET_CHAR_LEN_VARLEN;
    else
      r = get_char_length_tree1(NCALL(node)->target, reg, len, level);
    break;
#endif

  case NT_CTYPE:
  case NT_CCLASS:
  case NT_CANY:
    *len = 1;
    break;

  case NT_ENCLOSE:
    {
      EncloseNode* en = NENCLOSE(node);

      switch (en->type) {
      case ENCLOSE_MEMORY:
#ifdef USE_SUBEXP_CALL
        if (IS_ENCLOSE_CLEN_FIXED(en)) {
          *len = en->char_len;
        }
        else {
          r = get_char_length_tree1(en->target, reg, len, level);
          if (r == 0) {
            /* remember the result on the node for later queries */
            en->char_len = *len;
            SET_ENCLOSE_STATUS(node, NST_CLEN_FIXED);
          }
        }
        break;
#endif
      /* without USE_SUBEXP_CALL a memory group falls through to here */
      case ENCLOSE_OPTION:
      case ENCLOSE_STOP_BACKTRACK:
      case ENCLOSE_CONDITION:
        r = get_char_length_tree1(en->target, reg, len, level);
        break;
      default:
        break;
      }
    }
    break;

  case NT_ANCHOR:
    /* anchors contribute no characters */
    break;

  default:
    r = GET_CHAR_LEN_VARLEN;
    break;
  }

  return r;
}
/*
 * Entry point for the fixed character length computation: measures node
 * starting from nesting level 0 and stores the result in *len.
 * Returns whatever get_char_length_tree1() returns.
 */
static int
get_char_length_tree(Node* node, regex_t* reg, int* len)
{
  int status = get_char_length_tree1(node, reg, len, 0);

  return status;
}
/*
 * Returns 1 when x is not included in y, 0 otherwise.
 *
 * Only character-type, character-class and string nodes are examined;
 * every other combination, and every case the code does not decide,
 * ends at the final "return 0".  Pairs are normalized by swapping x and
 * y and retrying, so each combination is handled in one place only.
 *
 * x, y: nodes to compare.
 * reg:  regex being compiled (supplies the encoding).
 */
static int
is_not_included(Node* x, Node* y, regex_t* reg)
{
int i;
OnigDistance len;
OnigCodePoint code;
UChar *p;
int ytype;
retry:
ytype = NTYPE(y);
switch (NTYPE(x)) {
case NT_CTYPE:
{
switch (ytype) {
case NT_CTYPE:
/* same character type and range, opposite negation */
if (NCTYPE(y)->ctype == NCTYPE(x)->ctype &&
NCTYPE(y)->not != NCTYPE(x)->not &&
NCTYPE(y)->ascii_range == NCTYPE(x)->ascii_range)
return 1;
else
return 0;
break;
case NT_CCLASS:
/* exchange x and y and start over; also the target of "goto swap" */
swap:
{
Node* tmp;
tmp = x; x = y; y = tmp;
goto retry;
}
break;
case NT_STR:
goto swap;
break;
default:
break;
}
}
break;
case NT_CCLASS:
{
CClassNode* xc = NCCLASS(x);
switch (ytype) {
case NT_CTYPE:
switch (NCTYPE(y)->ctype) {
case ONIGENC_CTYPE_WORD:
if (NCTYPE(y)->not == 0) {
/* y is "word": only decided for a non-negated class without mbuf;
   any word code set in the class's bit set gives 0 */
if (IS_NULL(xc->mbuf) && !IS_NCCLASS_NOT(xc)) {
for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
if (BITSET_AT(xc->bs, i)) {
if (NCTYPE(y)->ascii_range) {
if (IS_CODE_SB_WORD(reg->enc, i)) return 0;
}
else {
if (ONIGENC_IS_CODE_WORD(reg->enc, i)) return 0;
}
}
}
return 1;
}
return 0;
}
else {
/* y is "not word": any non-word code accepted by the class gives 0 */
if (IS_NOT_NULL(xc->mbuf)) return 0;
for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
int is_word;
if (NCTYPE(y)->ascii_range)
is_word = IS_CODE_SB_WORD(reg->enc, i);
else
is_word = ONIGENC_IS_CODE_WORD(reg->enc, i);
if (! is_word) {
if (!IS_NCCLASS_NOT(xc)) {
if (BITSET_AT(xc->bs, i))
return 0;
}
else {
if (! BITSET_AT(xc->bs, i))
return 0;
}
}
}
return 1;
}
break;
default:
break;
}
break;
case NT_CCLASS:
{
int v;
CClassNode* yc = NCCLASS(y);
/* a code of the single-byte range accepted by both classes gives 0 */
for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
v = BITSET_AT(xc->bs, i);
if ((v != 0 && !IS_NCCLASS_NOT(xc)) ||
(v == 0 && IS_NCCLASS_NOT(xc))) {
v = BITSET_AT(yc->bs, i);
if ((v != 0 && !IS_NCCLASS_NOT(yc)) ||
(v == 0 && IS_NCCLASS_NOT(yc)))
return 0;
}
}
/* no common code in the bit sets: 1 only if at least one class is
   non-negated and has no mbuf */
if ((IS_NULL(xc->mbuf) && !IS_NCCLASS_NOT(xc)) ||
(IS_NULL(yc->mbuf) && !IS_NCCLASS_NOT(yc)))
return 1;
return 0;
}
break;
case NT_STR:
goto swap;
break;
default:
break;
}
}
break;
case NT_STR:
{
StrNode* xs = NSTR(x);
/* an empty string decides nothing */
if (NSTRING_LEN(x) == 0)
break;
switch (ytype) {
case NT_CTYPE:
switch (NCTYPE(y)->ctype) {
case ONIGENC_CTYPE_WORD:
/* test the character at the start of the string against "word" */
if (NCTYPE(y)->ascii_range) {
if (ONIGENC_IS_MBC_ASCII_WORD(reg->enc, xs->s, xs->end))
return NCTYPE(y)->not;
else
return !(NCTYPE(y)->not);
}
else {
if (ONIGENC_IS_MBC_WORD(reg->enc, xs->s, xs->end))
return NCTYPE(y)->not;
else
return !(NCTYPE(y)->not);
}
break;
default:
break;
}
break;
case NT_CCLASS:
{
CClassNode* cc = NCCLASS(y);
/* decode the first code point of the string and look it up in y */
code = ONIGENC_MBC_TO_CODE(reg->enc, xs->s,
xs->s + ONIGENC_MBC_MAXLEN(reg->enc));
return (onig_is_code_in_cc(reg->enc, code, cc) != 0 ? 0 : 1);
}
break;
case NT_STR:
{
UChar *q;
StrNode* ys = NSTR(y);
/* compare over the length of the shorter string */
len = NSTRING_LEN(x);
if (len > NSTRING_LEN(y)) len = NSTRING_LEN(y);
if (NSTRING_IS_AMBIG(x) || NSTRING_IS_AMBIG(y)) {
/* tiny version */
return 0;
}
else {
/* any differing byte in the common prefix gives 1 */
for (i = 0, p = ys->s, q = xs->s; (OnigDistance )i < len; i++, p++, q++) {
if (*p != *q) return 1;
}
}
}
break;
default:
break;
}
}
break;
default:
break;
}
return 0;
}
/*
 * Find the node that determines what the tree at node must begin with.
 *
 * node:  tree to inspect.
 * exact: when non-zero, character types and classes are not accepted,
 *        and neither are non-raw strings while the ignore-case option
 *        is on.
 * reg:   regex being compiled; its options are temporarily replaced
 *        while descending into an option group and restored afterwards.
 *
 * Returns the head node, or NULL_NODE when none can be named.
 */
static Node*
get_head_value_node(Node* node, int exact, regex_t* reg)
{
  switch (NTYPE(node)) {
  case NT_BREF:
  case NT_ALT:
  case NT_CANY:
#ifdef USE_SUBEXP_CALL
  case NT_CALL:
#endif
    return NULL_NODE;

  case NT_CTYPE:
  case NT_CCLASS:
    return (exact == 0) ? node : NULL_NODE;

  case NT_LIST:
    return get_head_value_node(NCAR(node), exact, reg);

  case NT_STR:
    {
      StrNode* sn = NSTR(node);

      if (sn->end <= sn->s)
        return NULL_NODE;

      if (exact == 0 || NSTRING_IS_RAW(node) || !IS_IGNORECASE(reg->options))
        return node;
      return NULL_NODE;
    }

  case NT_QTFR:
    {
      QtfrNode* qn = NQTFR(node);

      /* a target that may repeat zero times has no mandatory head */
      if (qn->lower <= 0)
        return NULL_NODE;
      if (IS_NOT_NULL(qn->head_exact))
        return qn->head_exact;
      return get_head_value_node(qn->target, exact, reg);
    }

  case NT_ENCLOSE:
    {
      EncloseNode* en = NENCLOSE(node);

      switch (en->type) {
      case ENCLOSE_OPTION:
        {
          OnigOptionType saved = reg->options;
          Node* found;

          reg->options = en->option;
          found = get_head_value_node(en->target, exact, reg);
          reg->options = saved;
          return found;
        }

      case ENCLOSE_MEMORY:
      case ENCLOSE_STOP_BACKTRACK:
      case ENCLOSE_CONDITION:
        return get_head_value_node(en->target, exact, reg);
      }
    }
    return NULL_NODE;

  case NT_ANCHOR:
    if (NANCHOR(node)->type == ANCHOR_PREC_READ)
      return get_head_value_node(NANCHOR(node)->target, exact, reg);
    return NULL_NODE;

  default:
    break;
  }

  return NULL_NODE;
}
/*
 * Check that the tree consists only of permitted kinds of nodes.
 *
 * type_mask:    bit set (NTYPE2BIT) of permitted node types.
 * enclose_mask: bit set of permitted enclose types.
 * anchor_mask:  bit set of permitted anchor types.
 *
 * Returns 0 when every visited node is permitted, 1 as soon as one is
 * not.
 */
static int
check_type_tree(Node* node, int type_mask, int enclose_mask, int anchor_mask)
{
  int status = 0;
  int ntype = NTYPE(node);

  if ((NTYPE2BIT(ntype) & type_mask) == 0)
    return 1;

  switch (ntype) {
  case NT_LIST:
  case NT_ALT:
    for (; IS_NOT_NULL(node); node = NCDR(node)) {
      status = check_type_tree(NCAR(node), type_mask, enclose_mask,
                               anchor_mask);
      if (status != 0) break;
    }
    break;

  case NT_QTFR:
    return check_type_tree(NQTFR(node)->target, type_mask, enclose_mask,
                           anchor_mask);

  case NT_ENCLOSE:
    {
      EncloseNode* en = NENCLOSE(node);

      if ((en->type & enclose_mask) == 0)
        return 1;
      return check_type_tree(en->target, type_mask, enclose_mask, anchor_mask);
    }

  case NT_ANCHOR:
    {
      int anchor_type = NANCHOR(node)->type;

      if ((anchor_type & anchor_mask) == 0)
        return 1;
      /* only some anchors carry a target */
      if (NANCHOR(node)->target)
        status = check_type_tree(NANCHOR(node)->target,
                                 type_mask, enclose_mask, anchor_mask);
    }
    break;

  default:
    break;
  }

  return status;
}
#ifdef USE_SUBEXP_CALL
/* results of subexp_inf_recursive_check() */
#define RECURSION_EXIST       1
#define RECURSION_INFINITE    2

/*
 * Look below node for a path back to the group currently marked with
 * NST_MARK1.
 *
 * node: tree to search.
 * env:  scan environment (used for minimum length computation).
 * head: non-zero while nothing with a non-zero minimum length has been
 *       passed yet in the enclosing list.
 *
 * Returns 0 (no recursion found), RECURSION_EXIST, RECURSION_INFINITE
 * (the marked group is reached while head is still non-zero), or a
 * negative/non-zero error propagated from a callee.
 */
static int
subexp_inf_recursive_check(Node* node, ScanEnv* env, int head)
{
  int found = 0;
  int ret;

  switch (NTYPE(node)) {
  case NT_LIST:
    {
      OnigDistance min;
      Node* item;

      for (item = node; IS_NOT_NULL(item); item = NCDR(item)) {
        ret = subexp_inf_recursive_check(NCAR(item), env, head);
        if (ret < 0 || ret == RECURSION_INFINITE) return ret;
        found |= ret;

        /* once an element has a non-zero minimum length, the following
           elements are no longer at the head */
        if (head) {
          ret = get_min_match_length(NCAR(item), &min, env);
          if (ret != 0) return ret;
          if (min != 0) head = 0;
        }
      }
    }
    break;

  case NT_ALT:
    /* recursion "exists" only if every branch reports it */
    found = RECURSION_EXIST;
    for (; IS_NOT_NULL(node); node = NCDR(node)) {
      ret = subexp_inf_recursive_check(NCAR(node), env, head);
      if (ret < 0 || ret == RECURSION_INFINITE) return ret;
      found &= ret;
    }
    break;

  case NT_QTFR:
    found = subexp_inf_recursive_check(NQTFR(node)->target, env, head);
    /* a target that may repeat zero times does not force recursion */
    if (found == RECURSION_EXIST && NQTFR(node)->lower == 0)
      found = 0;
    break;

  case NT_ANCHOR:
    {
      AnchorNode* an = NANCHOR(node);

      switch (an->type) {
      case ANCHOR_PREC_READ:
      case ANCHOR_PREC_READ_NOT:
      case ANCHOR_LOOK_BEHIND:
      case ANCHOR_LOOK_BEHIND_NOT:
        found = subexp_inf_recursive_check(an->target, env, head);
        break;
      default:
        break;
      }
    }
    break;

  case NT_CALL:
    found = subexp_inf_recursive_check(NCALL(node)->target, env, head);
    break;

  case NT_ENCLOSE:
    /* MARK2: already on the current path; MARK1: the group under test */
    if (IS_ENCLOSE_MARK2(NENCLOSE(node)))
      return 0;
    if (IS_ENCLOSE_MARK1(NENCLOSE(node)))
      return (head == 0 ? RECURSION_EXIST : RECURSION_INFINITE);

    SET_ENCLOSE_STATUS(node, NST_MARK2);
    found = subexp_inf_recursive_check(NENCLOSE(node)->target, env, head);
    CLEAR_ENCLOSE_STATUS(node, NST_MARK2);
    break;

  default:
    break;
  }

  return found;
}
/*
 * Traverse the tree and run subexp_inf_recursive_check() on the target of
 * every enclose node flagged as recursive.
 *
 * Returns 0 when no problem is found,
 * ONIGERR_NEVER_ENDING_RECURSION when a check reports a positive result,
 * or the negative error code produced by a check or a nested traversal.
 */
static int
subexp_inf_recursive_check_trav(Node* node, ScanEnv* env)
{
  int type;
  int r = 0;

  type = NTYPE(node);
  switch (type) {
  case NT_LIST:
  case NT_ALT:
    do {
      r = subexp_inf_recursive_check_trav(NCAR(node), env);
    } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
    break;

  case NT_QTFR:
    r = subexp_inf_recursive_check_trav(NQTFR(node)->target, env);
    break;

  case NT_ANCHOR:
    {
      AnchorNode* an = NANCHOR(node);
      switch (an->type) {
      case ANCHOR_PREC_READ:
      case ANCHOR_PREC_READ_NOT:
      case ANCHOR_LOOK_BEHIND:
      case ANCHOR_LOOK_BEHIND_NOT:
        r = subexp_inf_recursive_check_trav(an->target, env);
        break;
      }
    }
    break;

  case NT_ENCLOSE:
    {
      EncloseNode* en = NENCLOSE(node);

      if (IS_ENCLOSE_RECURSION(en)) {
        /* mark this group as the one under test while checking it */
        SET_ENCLOSE_STATUS(node, NST_MARK1);
        r = subexp_inf_recursive_check(en->target, env, 1);
        if (r > 0) return ONIGERR_NEVER_ENDING_RECURSION;
        CLEAR_ENCLOSE_STATUS(node, NST_MARK1);
        /* A negative result is an error code; return it here, because the
           traversal below would overwrite r and lose it. */
        if (r < 0) return r;
      }
      r = subexp_inf_recursive_check_trav(en->target, env);
    }
    break;

  default:
    break;
  }

  return r;
}
/*
 * Report whether the group currently marked with NST_MARK1 can be
 * reached from node.  Call nodes through which it is reached are flagged
 * with SET_CALL_RECURSION().
 *
 * Returns non-zero when the marked group is reachable, 0 otherwise.
 */
static int
subexp_recursive_check(Node* node)
{
  int found = 0;

  switch (NTYPE(node)) {
  case NT_LIST:
  case NT_ALT:
    /* visit every element; do not stop at the first hit */
    for (; IS_NOT_NULL(node); node = NCDR(node))
      found |= subexp_recursive_check(NCAR(node));
    break;

  case NT_QTFR:
    return subexp_recursive_check(NQTFR(node)->target);

  case NT_ANCHOR:
    {
      AnchorNode* an = NANCHOR(node);

      switch (an->type) {
      case ANCHOR_PREC_READ:
      case ANCHOR_PREC_READ_NOT:
      case ANCHOR_LOOK_BEHIND:
      case ANCHOR_LOOK_BEHIND_NOT:
        found = subexp_recursive_check(an->target);
        break;
      default:
        break;
      }
    }
    break;

  case NT_CALL:
    found = subexp_recursive_check(NCALL(node)->target);
    if (found != 0) SET_CALL_RECURSION(node);
    break;

  case NT_ENCLOSE:
    /* MARK2: already on the current path; MARK1: the group under test */
    if (IS_ENCLOSE_MARK2(NENCLOSE(node)))
      return 0;
    if (IS_ENCLOSE_MARK1(NENCLOSE(node)))
      return 1; /* recursion */

    SET_ENCLOSE_STATUS(node, NST_MARK2);
    found = subexp_recursive_check(NENCLOSE(node)->target);
    CLEAR_ENCLOSE_STATUS(node, NST_MARK2);
    break;

  default:
    break;
  }

  return found;
}
/*
 * Traverse the tree and, for every called group not yet flagged as
 * recursive, run subexp_recursive_check() on its target and set
 * NST_RECURSION when the group can reach itself.
 *
 * A quantifier with upper bound 0 whose target contains a called group
 * gets is_refered set.
 *
 * Returns FOUND_CALLED_NODE when a called group lies below node, 0 when
 * none does, or a negative value propagated from a list element.
 */
static int
subexp_recursive_check_trav(Node* node, ScanEnv* env)
{
#define FOUND_CALLED_NODE  1

  int result = 0;

  switch (NTYPE(node)) {
  case NT_LIST:
  case NT_ALT:
    for (; IS_NOT_NULL(node); node = NCDR(node)) {
      int child = subexp_recursive_check_trav(NCAR(node), env);

      if (child == FOUND_CALLED_NODE)
        result = FOUND_CALLED_NODE;
      else if (child < 0)
        return child;
    }
    break;

  case NT_QTFR:
    result = subexp_recursive_check_trav(NQTFR(node)->target, env);
    if (NQTFR(node)->upper == 0 && result == FOUND_CALLED_NODE)
      NQTFR(node)->is_refered = 1;
    break;

  case NT_ANCHOR:
    {
      AnchorNode* an = NANCHOR(node);

      switch (an->type) {
      case ANCHOR_PREC_READ:
      case ANCHOR_PREC_READ_NOT:
      case ANCHOR_LOOK_BEHIND:
      case ANCHOR_LOOK_BEHIND_NOT:
        result = subexp_recursive_check_trav(an->target, env);
        break;
      default:
        break;
      }
    }
    break;

  case NT_ENCLOSE:
    {
      EncloseNode* en = NENCLOSE(node);

      if (! IS_ENCLOSE_RECURSION(en) && IS_ENCLOSE_CALLED(en)) {
        /* mark this group as the one under test while checking it */
        SET_ENCLOSE_STATUS(node, NST_MARK1);
        if (subexp_recursive_check(en->target) != 0)
          SET_ENCLOSE_STATUS(node, NST_RECURSION);
        CLEAR_ENCLOSE_STATUS(node, NST_MARK1);
      }

      result = subexp_recursive_check_trav(en->target, env);
      if (IS_ENCLOSE_CALLED(en))
        result |= FOUND_CALLED_NODE;
    }
    break;

  default:
    break;
  }

  return result;
}
/*
 * Resolve every call node in the tree to the group node it calls.
 *
 * For each NT_CALL node the target group is looked up in the memory-node
 * table of env, either by number (cn->group_num != 0) or by name.  The
 * target is then flagged NST_CALLED, the group's bit is set in
 * env->bt_mem_start, and the environment's unset-address list is stored
 * on the call node.
 *
 * Returns 0 on success or one of these errors (the error string of env
 * is set for all but the first):
 *   ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED
 *   ONIGERR_UNDEFINED_GROUP_REFERENCE
 *   ONIGERR_UNDEFINED_NAME_REFERENCE
 *   ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL
 */
static int
setup_subexp_call(Node* node, ScanEnv* env)
{
int type;
int r = 0;
type = NTYPE(node);
switch (type) {
case NT_LIST:
do {
r = setup_subexp_call(NCAR(node), env);
} while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
break;
case NT_ALT:
do {
r = setup_subexp_call(NCAR(node), env);
} while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
break;
case NT_QTFR:
r = setup_subexp_call(NQTFR(node)->target, env);
break;
case NT_ENCLOSE:
r = setup_subexp_call(NENCLOSE(node)->target, env);
break;
case NT_CALL:
{
CallNode* cn = NCALL(node);
Node** nodes = SCANENV_MEM_NODES(env);
/* call by group number */
if (cn->group_num != 0) {
int gnum = cn->group_num;
#ifdef USE_NAMED_GROUP
/* numbered calls are rejected when named groups exist, the syntax
   captures only named groups and the capture-group option is off */
if (env->num_named > 0 &&
IS_SYNTAX_BV(env->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
!ONIG_IS_OPTION_ON(env->option, ONIG_OPTION_CAPTURE_GROUP)) {
return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
}
#endif
if (gnum > env->num_mem) {
onig_scan_env_set_error_string(env,
ONIGERR_UNDEFINED_GROUP_REFERENCE, cn->name, cn->name_end);
return ONIGERR_UNDEFINED_GROUP_REFERENCE;
}
#ifdef USE_NAMED_GROUP
/* shared tail: the by-name branches jump here once group_num is known */
set_call_attr:
#endif
cn->target = nodes[cn->group_num];
if (IS_NULL(cn->target)) {
onig_scan_env_set_error_string(env,
ONIGERR_UNDEFINED_NAME_REFERENCE, cn->name, cn->name_end);
return ONIGERR_UNDEFINED_NAME_REFERENCE;
}
SET_ENCLOSE_STATUS(cn->target, NST_CALLED);
BIT_STATUS_ON_AT(env->bt_mem_start, cn->group_num);
cn->unset_addr_list = env->unset_addr_list;
}
#ifdef USE_NAMED_GROUP
#ifdef USE_PERL_SUBEXP_CALL
/* group_num is 0 and the name is empty: resolve through nodes[0] */
/* NOTE(review): presumably a call of the whole pattern - confirm */
else if (cn->name == cn->name_end) {
goto set_call_attr;
}
#endif
/* call by name */
else {
int *refs;
int n = onig_name_to_group_numbers(env->reg, cn->name, cn->name_end,
&refs);
if (n <= 0) {
onig_scan_env_set_error_string(env,
ONIGERR_UNDEFINED_NAME_REFERENCE, cn->name, cn->name_end);
return ONIGERR_UNDEFINED_NAME_REFERENCE;
}
/* a name bound to several groups is only callable if the syntax allows it */
else if (n > 1 &&
! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME_CALL)) {
onig_scan_env_set_error_string(env,
ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL, cn->name, cn->name_end);
return ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL;
}
else {
/* use the first group number returned for the name */
cn->group_num = refs[0];
goto set_call_attr;
}
}
#endif
}
break;
case NT_ANCHOR:
{
AnchorNode* an = NANCHOR(node);
/* only look-ahead and look-behind anchors are descended into */
switch (an->type) {
case ANCHOR_PREC_READ:
case ANCHOR_PREC_READ_NOT:
case ANCHOR_LOOK_BEHIND:
case ANCHOR_LOOK_BEHIND_NOT:
r = setup_subexp_call(an->target, env);
break;
}
}
break;
default:
break;
}
return r;
}
#endif
/* Distribute a look-behind anchor over the alternatives it encloses.
     (?<=A|B)  ==>  (?<=A)|(?<=B)
     (?<!A|B)  ==>  (?<!A)(?<!B)
   `node` is the look-behind anchor whose target is an alternation.  After
   the call `node` is the head of the alternation (or, for the negative
   form, of a list) and every branch is wrapped in its own anchor of the
   original type.  Returns 0, or the value CHECK_NULL_RETURN_MEMERR returns
   when an anchor node cannot be allocated.
*/
static int
divide_look_behind_alternatives(Node* node)
{
  Node *alt_head, *first_branch, *cur, *wrapper;
  int kind = NANCHOR(node)->type;

  alt_head     = NANCHOR(node)->target;
  first_branch = NCAR(alt_head);

  /* Exchange contents: `node` becomes the alternation head and `alt_head`
     becomes the anchor, which is re-used to wrap the first branch. */
  swap_node(node, alt_head);
  NCAR(node) = alt_head;
  NANCHOR(alt_head)->target = first_branch;

  /* Wrap each remaining branch in a fresh anchor of the same kind. */
  for (cur = NCDR(node); cur != NULL_NODE; cur = NCDR(cur)) {
    wrapper = onig_node_new_anchor(kind);
    CHECK_NULL_RETURN_MEMERR(wrapper);
    NANCHOR(wrapper)->target = NCAR(cur);
    NCAR(cur) = wrapper;
  }

  if (kind == ANCHOR_LOOK_BEHIND_NOT) {
    /* negative look-behind: alternation -> list */
    for (cur = node; cur != NULL_NODE; cur = NCDR(cur)) {
      SET_NTYPE(cur, NT_LIST);
    }
  }
  return 0;
}
/* Compute the character length of a look-behind target and store it in the
   anchor.  A target of variable length is rejected with
   ONIGERR_INVALID_LOOK_BEHIND_PATTERN, except when only the top-level
   alternatives differ in length and the syntax allows it; in that case the
   anchor is split per alternative.  Any other non-zero result of
   get_char_length_tree() is returned unchanged. */
static int
setup_look_behind(Node* node, regex_t* reg, ScanEnv* env)
{
  int char_len;
  AnchorNode* anchor = NANCHOR(node);
  int r = get_char_length_tree(anchor->target, reg, &char_len);

  if (r == 0) {
    anchor->char_len = char_len;
    return 0;
  }
  if (r == GET_CHAR_LEN_VARLEN)
    return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
  if (r != GET_CHAR_LEN_TOP_ALT_VARLEN)
    return r;

  if (! IS_SYNTAX_BV(env->syntax, ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND))
    return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;

  return divide_look_behind_alternatives(node);
}
/* Refine `node` using the node that follows it (`next_node`).
 *
 * NT_QTFR that is greedy with an infinite upper bound:
 *   - under USE_QTFR_PEEK_NEXT, the head value node of next_node is stored
 *     in qn->next_head_exact (skipped when its first byte is '\0');
 *   - when lower <= 1, the target is a simple node type and
 *     is_not_included() holds for the two head value nodes, the quantifier
 *     is wrapped in an ENCLOSE_STOP_BACKTRACK node flagged
 *     NST_STOP_BT_SIMPLE_REPEAT;
 *   - unless ONIG_DONT_OPTIMIZE is defined: when the node is still a
 *     quantifier, in_root is non-zero, the target is NT_CANY and multiline
 *     is off, the node is turned into a list headed by an
 *     ANCHOR_ANYCHAR_STAR anchor.
 * NT_ENCLOSE: in_root is cleared; for ENCLOSE_MEMORY the processing is
 *   retried on the enclosed target.
 *
 * Returns 0 on success; ONIGERR_MEMORY when the second list node cannot be
 * allocated (other allocation failures return through
 * CHECK_NULL_RETURN_MEMERR, presumably with the same code).
 */
static int
next_setup(Node* node, Node* next_node, int in_root, regex_t* reg)
{
int type;
retry:
type = NTYPE(node);
if (type == NT_QTFR) {
QtfrNode* qn = NQTFR(node);
if (qn->greedy && IS_REPEAT_INFINITE(qn->upper)) {
#ifdef USE_QTFR_PEEK_NEXT
Node* n = get_head_value_node(next_node, 1, reg);
/* '\0': for UTF-16BE etc... */
if (IS_NOT_NULL(n) && NSTR(n)->s[0] != '\0') {
qn->next_head_exact = n;
}
#endif
/* automatic possessification a*b ==> (?>a*)b */
if (qn->lower <= 1) {
int ttype = NTYPE(qn->target);
if (IS_NODE_TYPE_SIMPLE(ttype)) {
Node *x, *y;
x = get_head_value_node(qn->target, 0, reg);
if (IS_NOT_NULL(x)) {
y = get_head_value_node(next_node, 0, reg);
if (IS_NOT_NULL(y) && is_not_included(x, y, reg)) {
Node* en = onig_node_new_enclose(ENCLOSE_STOP_BACKTRACK);
CHECK_NULL_RETURN_MEMERR(en);
SET_ENCLOSE_STATUS(en, NST_STOP_BT_SIMPLE_REPEAT);
/* after the swap `node` holds the enclose and `en` holds the quantifier */
swap_node(node, en);
NENCLOSE(node)->target = en;
}
}
}
}
#ifndef ONIG_DONT_OPTIMIZE
/* `qn` was taken from `node` before the possible swap above; it is only */
/* dereferenced here after re-checking that `node` is still a quantifier. */
if (NTYPE(node) == NT_QTFR && /* the type may be changed by above block */
in_root && /* qn->lower == 0 && */
NTYPE(qn->target) == NT_CANY &&
! IS_MULTILINE(reg->options)) {
/* implicit anchor: /.*a/ ==> /(?:^|\G).*a/ */
Node *np;
np = onig_node_new_list(NULL_NODE, NULL_NODE);
CHECK_NULL_RETURN_MEMERR(np);
/* `node` becomes the empty list cell; `np` now holds the quantifier */
swap_node(node, np);
NCDR(node) = onig_node_new_list(np, NULL_NODE);
if (IS_NULL(NCDR(node))) {
onig_node_free(np);
return ONIGERR_MEMORY;
}
np = onig_node_new_anchor(ANCHOR_ANYCHAR_STAR); /* (?:^|\G) */
CHECK_NULL_RETURN_MEMERR(np);
NCAR(node) = np;
}
#endif
}
}
else if (type == NT_ENCLOSE) {
EncloseNode* en = NENCLOSE(node);
/* cleared for every enclose type, not only for memory groups */
in_root = 0;
if (en->type == ENCLOSE_MEMORY) {
node = en->target;
goto retry;
}
}
return 0;
}
/* Replace the contents of string node `node` with its case-folded form.
   The folded bytes are collected in a temporary heap buffer that starts at
   twice the source length and is doubled whenever it fills up.
   Returns the result of onig_node_str_set(), or ONIGERR_MEMORY when the
   buffer cannot be grown (the initial allocation failure is reported
   through CHECK_NULL_RETURN_MEMERR). */
static int
update_string_node_case_fold(regex_t* reg, Node *node)
{
  UChar fold[ONIGENC_MBC_CASE_FOLD_MAXLEN];
  UChar *src, *src_end;
  UChar *out, *out_end, *wp;
  int r, k, nfold;
  OnigDistance cap;
  StrNode* str = NSTR(node);

  src_end = str->end;
  cap = (src_end - str->s) * 2;
  out = (UChar* )xmalloc(cap);
  CHECK_NULL_RETURN_MEMERR(out);
  out_end = out + cap;
  wp = out;

  /* The fold call itself advances `src` past the consumed character. */
  for (src = str->s; src < src_end; ) {
    nfold = ONIGENC_MBC_CASE_FOLD(reg->enc, reg->case_fold_flag, &src, src_end, fold);
    for (k = 0; k < nfold; k++) {
      if (wp >= out_end) {
        UChar* grown = (UChar* )xrealloc(out, cap * 2);
        if (IS_NULL(grown)) {
          xfree(out);
          return ONIGERR_MEMORY;
        }
        out = grown;
        wp = out + cap;      /* write position == old end of buffer */
        cap *= 2;
        out_end = out + cap;
      }
      *wp++ = fold[k];
    }
  }

  r = onig_node_str_set(node, out, wp);
  xfree(out);
  return r;
}
/* Build a case-folded string node from the bytes [s, end) and hand it back
   through *rnode.  The new node is flagged as ambiguous and as "don't get
   opt info".  Returns 0 on success; on failure nothing is stored in *rnode
   and ONIGERR_MEMORY or the folding error is returned. */
static int
expand_case_fold_make_rem_string(Node** rnode, UChar *s, UChar *end,
                                 regex_t* reg)
{
  int r;
  Node *rem = onig_node_new_str(s, end);

  if (IS_NULL(rem)) return ONIGERR_MEMORY;

  r = update_string_node_case_fold(reg, rem);
  if (r == 0) {
    NSTRING_SET_AMBIG(rem);
    NSTRING_SET_DONT_GET_OPT_INFO(rem);
    *rnode = rem;
    return 0;
  }

  onig_node_free(rem);
  return r;
}
/* Return 1 when any of the `item_num` case-fold items either covers a byte
   length different from `slen` or consists of more than one code point;
   return 0 otherwise (including when there are no items). */
static int
is_case_fold_variable_len(int item_num, OnigCaseFoldCodeItem items[],
                          int slen)
{
  int k = 0;

  while (k < item_num) {
    if (items[k].byte_len != slen || items[k].code_len != 1)
      return 1;
    k++;
  }
  return 0;
}
/* Build the alternation that matches the character at `p` (byte length
   `slen`) or any of its `item_num` case-fold alternatives.

   Items whose byte_len equals `slen` are appended to a plain alternation.
   When at least one item has a different byte_len, the result is
       alt( list( alt(same-length...) ), alt(var-length item [+ rest]) ... )
   where "rest" is the case-folded remainder of the string up to `end`.

   On success the tree is stored in *rnode and the function returns
   1 if variable-length items were present, otherwise 0.
   On failure everything built so far is freed, *rnode is reset to
   NULL_NODE and a negative error code is returned: the code reported by
   the failing callee when it is negative, ONIGERR_MEMORY otherwise. */
static int
expand_case_fold_string_alt(int item_num, OnigCaseFoldCodeItem items[],
                            UChar *p, int slen, UChar *end,
                            regex_t* reg, Node **rnode)
{
  int r, i, j, len, varlen;
  int err = ONIGERR_MEMORY;  /* result for every plain allocation failure */
  Node *anode, *var_anode, *snode, *xnode, *an;
  UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];

  *rnode = var_anode = NULL_NODE;

  varlen = 0;
  for (i = 0; i < item_num; i++) {
    if (items[i].byte_len != slen) {
      varlen = 1;
      break;
    }
  }

  if (varlen != 0) {
    *rnode = var_anode = onig_node_new_alt(NULL_NODE, NULL_NODE);
    if (IS_NULL(var_anode)) return ONIGERR_MEMORY;

    xnode = onig_node_new_list(NULL, NULL);
    if (IS_NULL(xnode)) goto mem_err;
    NCAR(var_anode) = xnode;

    anode = onig_node_new_alt(NULL_NODE, NULL_NODE);
    if (IS_NULL(anode)) goto mem_err;
    NCAR(xnode) = anode;
  }
  else {
    *rnode = anode = onig_node_new_alt(NULL_NODE, NULL_NODE);
    if (IS_NULL(anode)) return ONIGERR_MEMORY;
  }

  /* first alternative: the original character itself */
  snode = onig_node_new_str(p, p + slen);
  if (IS_NULL(snode)) goto mem_err;

  NCAR(anode) = snode;

  for (i = 0; i < item_num; i++) {
    snode = onig_node_new_str(NULL, NULL);
    if (IS_NULL(snode)) goto mem_err;

    for (j = 0; j < items[i].code_len; j++) {
      len = ONIGENC_CODE_TO_MBC(reg->enc, items[i].code[j], buf);
      if (len < 0) {
        err = len;  /* report the encoding error, not "out of memory" */
        goto mem_err2;
      }

      r = onig_node_str_cat(snode, buf, buf + len);
      if (r != 0) {
        if (r < 0) err = r;
        goto mem_err2;
      }
    }

    an = onig_node_new_alt(NULL_NODE, NULL_NODE);
    if (IS_NULL(an)) {
      goto mem_err2;
    }

    if (items[i].byte_len != slen) {
      Node *rem;
      UChar *q = p + items[i].byte_len;

      if (q < end) {
        r = expand_case_fold_make_rem_string(&rem, q, end, reg);
        if (r != 0) {
          if (r < 0) err = r;
          onig_node_free(an);
          goto mem_err2;
        }

        xnode = onig_node_list_add(NULL_NODE, snode);
        if (IS_NULL(xnode)) {
          onig_node_free(an);
          onig_node_free(rem);
          goto mem_err2;
        }
        if (IS_NULL(onig_node_list_add(xnode, rem))) {
          onig_node_free(an);
          onig_node_free(xnode);  /* xnode already owns snode */
          onig_node_free(rem);
          goto mem_err;
        }

        NCAR(an) = xnode;
      }
      else {
        NCAR(an) = snode;
      }

      NCDR(var_anode) = an;
      var_anode = an;
    }
    else {
      NCAR(an) = snode;

      NCDR(anode) = an;
      anode = an;
    }
  }

  return varlen;

 mem_err2:
  onig_node_free(snode);

 mem_err:
  onig_node_free(*rnode);
  *rnode = NULL_NODE;  /* do not hand a dangling pointer back to the caller */

  return err;
}
/* Expand string node `node` for case-insensitive matching.

   The string is scanned character by character.  Characters without
   variable-length case-fold alternatives are collected into string nodes
   that are case-folded and flagged ambiguous; each character that has such
   alternatives is replaced by an alternation built by
   expand_case_fold_string_alt().  Expansion stops once the product of the
   alternative counts exceeds THRESHOLD_CASE_FOLD_ALT_FOR_EXPANSION; the
   unprocessed tail is then appended as one folded string.  On success the
   resulting tree replaces the contents of `node`.

   Ownership while building: `top_root` owns everything that has been
   linked; as long as `top_root` is NULL, `prev_node` (if any) is the only
   node built so far and is owned directly.  `prev_node` is reset to
   NULL_NODE wherever it has been released, so that the common error exit
   can free exactly one of the two.

   Returns 0 on success (also when nothing needs to be done) or a negative
   error code; `node` is left untouched on failure. */
static int
expand_case_fold_string(Node* node, regex_t* reg)
{
#define THRESHOLD_CASE_FOLD_ALT_FOR_EXPANSION 8

  int r, n, len, alt_num;
  int varlen = 0;
  UChar *start, *end, *p;
  Node *top_root, *root, *snode, *prev_node;
  OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM];
  StrNode* sn = NSTR(node);

  if (NSTRING_IS_AMBIG(node)) return 0;

  start = sn->s;
  end   = sn->end;
  if (start >= end) return 0;

  r = 0;
  top_root = root = prev_node = snode = NULL_NODE;
  alt_num = 1;
  p = start;
  while (p < end) {
    n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(reg->enc, reg->case_fold_flag,
                                           p, end, items);
    if (n < 0) {
      r = n;
      goto err;
    }

    len = enclen(reg->enc, p, end);

    varlen = is_case_fold_variable_len(n, items, len);
    if (n == 0 || varlen == 0) {
      /* plain character: append it to the current string node */
      if (IS_NULL(snode)) {
        if (IS_NULL(root) && IS_NOT_NULL(prev_node)) {
          top_root = root = onig_node_list_add(NULL_NODE, prev_node);
          if (IS_NULL(root)) {
            onig_node_free(prev_node);
            prev_node = NULL_NODE;
            goto mem_err;
          }
        }

        prev_node = snode = onig_node_new_str(NULL, NULL);
        if (IS_NULL(snode)) goto mem_err;
        if (IS_NOT_NULL(root)) {
          if (IS_NULL(onig_node_list_add(root, snode))) {
            onig_node_free(snode);
            prev_node = snode = NULL_NODE;
            goto mem_err;
          }
        }
      }

      r = onig_node_str_cat(snode, p, p + len);
      if (r != 0) goto err;
    }
    else {
      alt_num *= (n + 1);
      if (alt_num > THRESHOLD_CASE_FOLD_ALT_FOR_EXPANSION) break;

      /* close the pending string node; a folding failure is an error,
         not something to continue past with an unfolded string */
      if (IS_NOT_NULL(snode)) {
        r = update_string_node_case_fold(reg, snode);
        if (r != 0) goto err;
        NSTRING_SET_AMBIG(snode);
      }
      if (IS_NULL(root) && IS_NOT_NULL(prev_node)) {
        top_root = root = onig_node_list_add(NULL_NODE, prev_node);
        if (IS_NULL(root)) {
          onig_node_free(prev_node);
          prev_node = snode = NULL_NODE;
          goto mem_err;
        }
      }

      r = expand_case_fold_string_alt(n, items, p, len, end, reg, &prev_node);
      if (r < 0) {
        /* the callee has already released whatever it stored */
        prev_node = NULL_NODE;
        goto err;
      }
      if (r == 1) {
        if (IS_NULL(root)) {
          top_root = prev_node;
        }
        else {
          if (IS_NULL(onig_node_list_add(root, prev_node))) {
            onig_node_free(prev_node);
            prev_node = NULL_NODE;
            goto mem_err;
          }
        }

        /* following nodes go into the list inside the new alternation */
        root = NCAR(prev_node);
      }
      else { /* r == 0 */
        if (IS_NOT_NULL(root)) {
          if (IS_NULL(onig_node_list_add(root, prev_node))) {
            onig_node_free(prev_node);
            prev_node = NULL_NODE;
            goto mem_err;
          }
        }
      }

      snode = NULL_NODE;
    }

    p += len;
  }

  if (IS_NOT_NULL(snode)) {
    r = update_string_node_case_fold(reg, snode);
    if (r != 0) goto err;
    NSTRING_SET_AMBIG(snode);
  }

  if (p < end) {
    /* expansion was cut short: append the rest as one folded string */
    Node *srem;

    r = expand_case_fold_make_rem_string(&srem, p, end, reg);
    if (r != 0) goto mem_err;

    if (IS_NOT_NULL(prev_node) && IS_NULL(root)) {
      top_root = root = onig_node_list_add(NULL_NODE, prev_node);
      if (IS_NULL(root)) {
        onig_node_free(srem);
        onig_node_free(prev_node);
        prev_node = NULL_NODE;
        goto mem_err;
      }
    }

    if (IS_NULL(root)) {
      prev_node = srem;
    }
    else {
      if (IS_NULL(onig_node_list_add(root, srem))) {
        onig_node_free(srem);
        goto mem_err;
      }
    }
  }

  /* ending */
  top_root = (IS_NOT_NULL(top_root) ? top_root : prev_node);
  swap_node(node, top_root);
  onig_node_free(top_root);
  return 0;

 mem_err:
  r = ONIGERR_MEMORY;

 err:
  if (IS_NOT_NULL(top_root))
    onig_node_free(top_root);
  else
    onig_node_free(prev_node);  /* lone node that was never linked */
  return r;
}
#ifdef USE_COMBINATION_EXPLOSION_CHECK
#define CEC_THRES_NUM_BIG_REPEAT 512
#define CEC_INFINITE_NUM 0x7fffffff
#define CEC_IN_INFINITE_REPEAT (1<<0)
#define CEC_IN_FINITE_REPEAT (1<<1)
#define CEC_CONT_BIG_REPEAT (1<<2)
static int
setup_comb_exp_check(Node* node, int state, ScanEnv* env)
{
int type;
int r = state;
type = NTYPE(node);
switch (type) {
case NT_LIST:
{
Node* prev = NULL_NODE;
do {
r = setup_comb_exp_check(NCAR(node), r, env);
prev = NCAR(node);
} while (r >= 0 && IS_NOT_NULL(node = NCDR(node)));
}
break;
case NT_ALT:
{
int ret;
do {
ret = setup_comb_exp_check(NCAR(node), state, env);
r |= ret;
} while (ret >= 0 && IS_NOT_NULL(node = NCDR(node)));
}
break;
case NT_QTFR:
{
int child_state = state;
int add_state = 0;
QtfrNode* qn = NQTFR(node);
Node* target = qn->target;
int var_num;
if (! IS_REPEAT_INFINITE(qn->upper)) {
if (qn->upper > 1) {
/* {0,1}, {1,1} are allowed */
child_state |= CEC_IN_FINITE_REPEAT;
/* check (a*){n,m}, (a+){n,m} => (a*){n,n}, (a+){n,n} */
if (env->backrefed_mem == 0) {
if (NTYPE(qn->target) == NT_ENCLOSE) {
EncloseNode* en = NENCLOSE(qn->target);
if (en->type == ENCLOSE_MEMORY) {
if (NTYPE(en->target) == NT_QTFR) {
QtfrNode* q = NQTFR(en->target);
if (IS_REPEAT_INFINITE(q->upper)
&& q->greedy == qn->greedy) {
qn->upper = (qn->lower == 0 ? 1 : qn->lower);
if (qn->upper == 1)
child_state = state;
}
}
}
}
}
}
}
if (state & CEC_IN_FINITE_REPEAT) {
qn->comb_exp_check_num = -1;
}
else {
if (IS_REPEAT_INFINITE(qn->upper)) {
var_num = CEC_INFINITE_NUM;
child_state |= CEC_IN_INFINITE_REPEAT;
}
else {
var_num = qn->upper - qn->lower;
}
if (var_num >= CEC_THRES_NUM_BIG_REPEAT)
add_state |= CEC_CONT_BIG_REPEAT;
if (((state & CEC_IN_INFINITE_REPEAT) != 0 && var_num != 0) ||
((state & CEC_CONT_BIG_REPEAT) != 0 &&
var_num >= CEC_THRES_NUM_BIG_REPEAT)) {
if (qn->comb_exp_check_num == 0) {
env->num_comb_exp_check++;
qn->comb_exp_check_num = env->num_comb_exp_check;
if (env->curr_max_regnum > env->comb_exp_max_regnum)
env->comb_exp_max_regnum = env->curr_max_regnum;
}
}
}
r = setup_comb_exp_check(target, child_state, env);
r |= add_state;
}
break;
case NT_ENCLOSE:
{
EncloseNode* en = NENCLOSE(node);
switch (en->type) {
case ENCLOSE_MEMORY:
{
if (env->curr_max_regnum < en->regnum)
env->curr_max_regnum = en->regnum;
r = setup_comb_exp_check(en->target, state, env);
}
break;
default:
r = setup_comb_exp_check(en->target, state, env);
break;
}
}
break;
#ifdef USE_SUBEXP_CALL
case NT_CALL:
if (IS_CALL_RECURSION(NCALL(node)))
env->has_recursion = 1;
else
r = setup_comb_exp_check(NCALL(node)->target, state, env);
break;
#endif
default:
break;
}
return r;
}
#endif
/* context bits for the `state` argument of setup_tree() */
#define IN_ALT (1<<0)
#define IN_NOT (1<<1)
#define IN_REPEAT (1<<2)
#define IN_VAR_REPEAT (1<<3)
#define IN_ROOT (1<<4)
/* setup_tree does the following work.
1. check empty loop. (set qn->target_empty_info)
2. expand ignore-case in char class.
3. set memory status bit flags. (reg->mem_stats)
4. set qn->head_exact for [push, exact] -> [push_or_jump_exact1, exact].
5. find invalid patterns in look-behind.
6. expand repeated string.
*/
/* Parameters:
 *   node  - subtree to process; it may be rewritten in place.
 *   reg   - regex being compiled; reg->options is temporarily replaced
 *           while an option enclose is processed.
 *   state - IN_* bits describing the enclosing context.  IN_ROOT is split
 *           off into `in_root` on entry and only handed on by the list
 *           and option-enclose cases.
 *   env   - scan environment; its backrefed_mem, bt_mem_start and
 *           bt_mem_end bit sets are updated.
 * Returns 0 on success or a non-zero error code (the recursion loops stop
 * at the first non-zero result).
 */
static int
setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env)
{
int type;
int r = 0;
int in_root = state & IN_ROOT;
state &= ~IN_ROOT;
restart:
type = NTYPE(node);
switch (type) {
case NT_LIST:
{
Node* prev = NULL_NODE;
int prev_in_root = 0;
/* only the first list element is processed with IN_ROOT set */
state |= in_root;
do {
r = setup_tree(NCAR(node), reg, state, env);
/* once both neighbours are set up, refine the previous element */
if (IS_NOT_NULL(prev) && r == 0) {
r = next_setup(prev, NCAR(node), prev_in_root, reg);
}
prev = NCAR(node);
prev_in_root = state & IN_ROOT;
state &= ~IN_ROOT;
} while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
}
break;
case NT_ALT:
do {
r = setup_tree(NCAR(node), reg, (state | IN_ALT), env);
} while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
break;
case NT_CCLASS:
break;
case NT_STR:
/* raw strings are never case-expanded */
if (IS_IGNORECASE(reg->options) && !NSTRING_IS_RAW(node)) {
r = expand_case_fold_string(node, reg);
}
break;
case NT_CTYPE:
case NT_CANY:
break;
#ifdef USE_SUBEXP_CALL
case NT_CALL:
break;
#endif
case NT_BREF:
{
int i;
int* p;
Node** nodes = SCANENV_MEM_NODES(env);
BRefNode* br = NBREF(node);
p = BACKREFS_P(br);
/* validate every referenced group number and mark the group as */
/* back-referenced */
for (i = 0; i < br->back_num; i++) {
if (p[i] > env->num_mem) return ONIGERR_INVALID_BACKREF;
BIT_STATUS_ON_AT(env->backrefed_mem, p[i]);
BIT_STATUS_ON_AT(env->bt_mem_start, p[i]);
#ifdef USE_BACKREF_WITH_LEVEL
if (IS_BACKREF_NEST_LEVEL(br)) {
BIT_STATUS_ON_AT(env->bt_mem_end, p[i]);
}
#endif
SET_ENCLOSE_STATUS(nodes[p[i]], NST_MEM_BACKREFED);
}
}
break;
case NT_QTFR:
{
OnigDistance d;
QtfrNode* qn = NQTFR(node);
Node* target = qn->target;
if ((state & IN_REPEAT) != 0) {
qn->state |= NST_IN_REPEAT;
}
/* empty-loop check: a target whose minimum match length is 0 */
if (IS_REPEAT_INFINITE(qn->upper) || qn->upper >= 1) {
r = get_min_match_length(target, &d, env);
if (r) break;
if (d == 0) {
qn->target_empty_info = NQ_TARGET_IS_EMPTY;
#ifdef USE_MONOMANIAC_CHECK_CAPTURES_IN_ENDLESS_REPEAT
r = quantifiers_memory_node_info(target);
if (r < 0) break;
if (r > 0) {
qn->target_empty_info = r;
}
#endif
#if 0
r = get_max_match_length(target, &d, env);
if (r == 0 && d == 0) {
/* ()* ==> ()?, ()+ ==> () */
qn->upper = 1;
if (qn->lower > 1) qn->lower = 1;
if (NTYPE(target) == NT_STR) {
qn->upper = qn->lower = 0; /* /(?:)+/ ==> // */
}
}
#endif
}
}
state |= IN_REPEAT;
if (qn->lower != qn->upper)
state |= IN_VAR_REPEAT;
r = setup_tree(target, reg, state, env);
if (r) break;
/* expand string */
/* a string target repeated at least twice is unrolled into one longer */
/* string, as long as the result stays within EXPAND_STRING_MAX_LENGTH */
#define EXPAND_STRING_MAX_LENGTH 100
if (NTYPE(target) == NT_STR) {
if (qn->lower > 1) {
int i, n = qn->lower;
OnigDistance len = NSTRING_LEN(target);
StrNode* sn = NSTR(target);
Node* np;
np = onig_node_new_str(sn->s, sn->end);
if (IS_NULL(np)) return ONIGERR_MEMORY;
NSTR(np)->flag = sn->flag;
/* after the loop `i` is the number of copies held by np */
for (i = 1; i < n && (i+1) * len <= EXPAND_STRING_MAX_LENGTH; i++) {
r = onig_node_str_cat(np, sn->s, sn->end);
if (r) {
onig_node_free(np);
return r;
}
}
if (i < qn->upper || IS_REPEAT_INFINITE(qn->upper)) {
/* repetitions remain: node becomes list(np, quantifier with the */
/* bounds reduced by i) */
Node *np1, *np2;
qn->lower -= i;
if (! IS_REPEAT_INFINITE(qn->upper))
qn->upper -= i;
np1 = onig_node_new_list(np, NULL);
if (IS_NULL(np1)) {
onig_node_free(np);
return ONIGERR_MEMORY;
}
swap_node(np1, node);
np2 = onig_node_list_add(node, np1);
if (IS_NULL(np2)) {
onig_node_free(np1);
return ONIGERR_MEMORY;
}
}
else {
/* fully unrolled: the quantifier is replaced by the string */
swap_node(np, node);
onig_node_free(np);
}
break; /* break case NT_QTFR: */
}
}
#ifdef USE_OP_PUSH_OR_JUMP_EXACT
if (qn->greedy && (qn->target_empty_info != 0)) {
if (NTYPE(target) == NT_QTFR) {
/* take over head_exact from a directly nested quantifier */
QtfrNode* tqn = NQTFR(target);
if (IS_NOT_NULL(tqn->head_exact)) {
qn->head_exact = tqn->head_exact;
tqn->head_exact = NULL;
}
}
else {
qn->head_exact = get_head_value_node(qn->target, 1, reg);
}
}
#endif
}
break;
case NT_ENCLOSE:
{
EncloseNode* en = NENCLOSE(node);
switch (en->type) {
case ENCLOSE_OPTION:
{
/* the group's options are in effect only while its target is processed */
OnigOptionType options = reg->options;
state |= in_root;
reg->options = NENCLOSE(node)->option;
r = setup_tree(NENCLOSE(node)->target, reg, state, env);
reg->options = options;
}
break;
case ENCLOSE_MEMORY:
if ((state & (IN_ALT | IN_NOT | IN_VAR_REPEAT)) != 0) {
BIT_STATUS_ON_AT(env->bt_mem_start, en->regnum);
/* SET_ENCLOSE_STATUS(node, NST_MEM_IN_ALT_NOT); */
}
r = setup_tree(en->target, reg, state, env);
break;
case ENCLOSE_STOP_BACKTRACK:
{
Node* target = en->target;
r = setup_tree(target, reg, state, env);
if (NTYPE(target) == NT_QTFR) {
QtfrNode* tqn = NQTFR(target);
if (IS_REPEAT_INFINITE(tqn->upper) && tqn->lower <= 1 &&
tqn->greedy != 0) { /* (?>a*), a*+ etc... */
int qtype = NTYPE(tqn->target);
if (IS_NODE_TYPE_SIMPLE(qtype))
SET_ENCLOSE_STATUS(node, NST_STOP_BT_SIMPLE_REPEAT);
}
}
}
break;
case ENCLOSE_CONDITION:
#ifdef USE_NAMED_GROUP
/* same rejection rule as for numbered backrefs/calls when only named */
/* groups capture */
if (! IS_ENCLOSE_NAME_REF(NENCLOSE(node)) &&
env->num_named > 0 &&
IS_SYNTAX_BV(env->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
!ONIG_IS_OPTION_ON(env->option, ONIG_OPTION_CAPTURE_GROUP)) {
return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
}
#endif
r = setup_tree(NENCLOSE(node)->target, reg, state, env);
break;
}
}
break;
case NT_ANCHOR:
{
AnchorNode* an = NANCHOR(node);
switch (an->type) {
case ANCHOR_PREC_READ:
r = setup_tree(an->target, reg, state, env);
break;
case ANCHOR_PREC_READ_NOT:
r = setup_tree(an->target, reg, (state | IN_NOT), env);
break;
/* allowed node types in look-behind */
#define ALLOWED_TYPE_IN_LB \
( BIT_NT_LIST | BIT_NT_ALT | BIT_NT_STR | BIT_NT_CCLASS | BIT_NT_CTYPE | \
BIT_NT_CANY | BIT_NT_ANCHOR | BIT_NT_ENCLOSE | BIT_NT_QTFR | BIT_NT_CALL )
#define ALLOWED_ENCLOSE_IN_LB ( ENCLOSE_MEMORY | ENCLOSE_OPTION )
#define ALLOWED_ENCLOSE_IN_LB_NOT ENCLOSE_OPTION
#define ALLOWED_ANCHOR_IN_LB \
( ANCHOR_LOOK_BEHIND | ANCHOR_LOOK_BEHIND_NOT | ANCHOR_BEGIN_LINE | \
ANCHOR_END_LINE | ANCHOR_BEGIN_BUF | ANCHOR_BEGIN_POSITION | ANCHOR_KEEP | \
ANCHOR_WORD_BOUND | ANCHOR_NOT_WORD_BOUND | \
ANCHOR_WORD_BEGIN | ANCHOR_WORD_END )
#define ALLOWED_ANCHOR_IN_LB_NOT \
( ANCHOR_LOOK_BEHIND | ANCHOR_LOOK_BEHIND_NOT | ANCHOR_BEGIN_LINE | \
ANCHOR_END_LINE | ANCHOR_BEGIN_BUF | ANCHOR_BEGIN_POSITION | ANCHOR_KEEP | \
ANCHOR_WORD_BOUND | ANCHOR_NOT_WORD_BOUND | \
ANCHOR_WORD_BEGIN | ANCHOR_WORD_END )
case ANCHOR_LOOK_BEHIND:
{
r = check_type_tree(an->target, ALLOWED_TYPE_IN_LB,
ALLOWED_ENCLOSE_IN_LB, ALLOWED_ANCHOR_IN_LB);
if (r < 0) return r;
if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
r = setup_look_behind(node, reg, env);
if (r != 0) return r;
/* setup_look_behind() may have turned `node` into an alternation; */
/* in that case process the rewritten node from the top */
if (NTYPE(node) != NT_ANCHOR) goto restart;
r = setup_tree(an->target, reg, state, env);
}
break;
case ANCHOR_LOOK_BEHIND_NOT:
{
r = check_type_tree(an->target, ALLOWED_TYPE_IN_LB,
ALLOWED_ENCLOSE_IN_LB_NOT, ALLOWED_ANCHOR_IN_LB_NOT);
if (r < 0) return r;
if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
r = setup_look_behind(node, reg, env);
if (r != 0) return r;
/* as above: restart when the anchor was split per alternative */
if (NTYPE(node) != NT_ANCHOR) goto restart;
r = setup_tree(an->target, reg, (state | IN_NOT), env);
}
break;
}
}
break;
default:
break;
}
return r;
}
#ifndef USE_SUNDAY_QUICK_SEARCH
/* set skip map for Boyer-Moore search */
/* Build the skip table for the byte string [s, end).
 *
 * When the string is shorter than ONIG_CHAR_TABLE_SIZE the table is written
 * to skip[]; otherwise an int table is allocated into *int_skip (unless one
 * is already present) and filled instead.  Every entry starts as the string
 * length; each byte of every character except the last position is then
 * given its distance to the last byte.  With ignore_case set, the bytes of
 * each single-code-point case-fold alternative receive the same distance.
 *
 * Returns 0 on success, 1 when a case-fold alternative has a different
 * length than the original character, ONIGERR_MEMORY on allocation failure.
 * A negative result of ONIGENC_GET_CASE_FOLD_CODES_BY_STR is not checked.
 * NOTE(review): `len - 1` is unsigned arithmetic, so this presumably relies
 * on callers never passing an empty string -- confirm.
 */
static int
set_bm_skip(UChar* s, UChar* end, regex_t* reg,
UChar skip[], int** int_skip, int ignore_case)
{
OnigDistance i, len;
int clen, flen, n, j, k;
UChar *p, buf[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM][ONIGENC_MBC_CASE_FOLD_MAXLEN];
OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM];
OnigEncoding enc = reg->enc;
len = end - s;
if (len < ONIG_CHAR_TABLE_SIZE) {
for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) skip[i] = (UChar )len;
/* n stays 0 (no fold alternatives) unless ignore_case is set */
n = 0;
for (i = 0; i < len - 1; i += clen) {
p = s + i;
if (ignore_case)
n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, reg->case_fold_flag,
p, end, items);
clen = enclen(enc, p, end);
/* encode every fold alternative into buf[j] */
for (j = 0; j < n; j++) {
if ((items[j].code_len != 1) || (items[j].byte_len != clen))
return 1; /* different length isn't supported. */
flen = ONIGENC_CODE_TO_MBC(enc, items[j].code[0], buf[j]);
if (flen != clen)
return 1; /* different length isn't supported. */
}
for (j = 0; j < clen; j++) {
skip[s[i + j]] = (UChar )(len - 1 - i - j);
for (k = 0; k < n; k++) {
skip[buf[k][j]] = (UChar )(len - 1 - i - j);
}
}
}
}
else {
/* same algorithm as above, using the int table */
if (IS_NULL(*int_skip)) {
*int_skip = (int* )xmalloc(sizeof(int) * ONIG_CHAR_TABLE_SIZE);
if (IS_NULL(*int_skip)) return ONIGERR_MEMORY;
}
for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) (*int_skip)[i] = (int )len;
n = 0;
for (i = 0; i < len - 1; i += clen) {
p = s + i;
if (ignore_case)
n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, reg->case_fold_flag,
p, end, items);
clen = enclen(enc, p, end);
for (j = 0; j < n; j++) {
if ((items[j].code_len != 1) || (items[j].byte_len != clen))
return 1; /* different length isn't supported. */
flen = ONIGENC_CODE_TO_MBC(enc, items[j].code[0], buf[j]);
if (flen != clen)
return 1; /* different length isn't supported. */
}
for (j = 0; j < clen; j++) {
(*int_skip)[s[i + j]] = (int )(len - 1 - i - j);
for (k = 0; k < n; k++) {
(*int_skip)[buf[k][j]] = (int )(len - 1 - i - j);
}
}
}
}
return 0;
}
#else /* USE_SUNDAY_QUICK_SEARCH */
/* set skip map for Sunday's quick search */
/* Same interface and return values as the Boyer-Moore variant above.
 * Differences visible here: every entry starts as len + 1, all characters
 * including the last one are visited, and each byte is given its distance
 * to the position just past the end of the string (len - i - j).
 * NOTE(review): in the UChar branch `len + 1` is narrowed to UChar, which
 * would wrap to 0 for len == 255; presumably callers pass much shorter
 * strings -- confirm.
 */
static int
set_bm_skip(UChar* s, UChar* end, regex_t* reg,
UChar skip[], int** int_skip, int ignore_case)
{
OnigDistance i, len;
int clen, flen, n, j, k;
UChar *p, buf[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM][ONIGENC_MBC_CASE_FOLD_MAXLEN];
OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM];
OnigEncoding enc = reg->enc;
len = end - s;
if (len < ONIG_CHAR_TABLE_SIZE) {
for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) skip[i] = (UChar )(len + 1);
/* n stays 0 (no fold alternatives) unless ignore_case is set */
n = 0;
for (i = 0; i < len; i += clen) {
p = s + i;
if (ignore_case)
n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, reg->case_fold_flag,
p, end, items);
clen = enclen(enc, p, end);
/* encode every fold alternative into buf[j] */
for (j = 0; j < n; j++) {
if ((items[j].code_len != 1) || (items[j].byte_len != clen))
return 1; /* different length isn't supported. */
flen = ONIGENC_CODE_TO_MBC(enc, items[j].code[0], buf[j]);
if (flen != clen)
return 1; /* different length isn't supported. */
}
for (j = 0; j < clen; j++) {
skip[s[i + j]] = (UChar )(len - i - j);
for (k = 0; k < n; k++) {
skip[buf[k][j]] = (UChar )(len - i - j);
}
}
}
}
else {
/* same algorithm as above, using the int table */
if (IS_NULL(*int_skip)) {
*int_skip = (int* )xmalloc(sizeof(int) * ONIG_CHAR_TABLE_SIZE);
if (IS_NULL(*int_skip)) return ONIGERR_MEMORY;
}
for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) (*int_skip)[i] = (int )(len + 1);
n = 0;
for (i = 0; i < len; i += clen) {
p = s + i;
if (ignore_case)
n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, reg->case_fold_flag,
p, end, items);
clen = enclen(enc, p, end);
for (j = 0; j < n; j++) {
if ((items[j].code_len != 1) || (items[j].byte_len != clen))
return 1; /* different length isn't supported. */
flen = ONIGENC_CODE_TO_MBC(enc, items[j].code[0], buf[j]);
if (flen != clen)
return 1; /* different length isn't supported. */
}
for (j = 0; j < clen; j++) {
(*int_skip)[s[i + j]] = (int )(len - i - j);
for (k = 0; k < n; k++) {
(*int_skip)[buf[k][j]] = (int )(len - i - j);
}
}
}
}
return 0;
}
#endif /* USE_SUNDAY_QUICK_SEARCH */
/* capacity in bytes of the exact-string buffer OptExactInfo.s */
#define OPT_EXACT_MAXLEN 24
/* a [min, max] range of byte lengths */
typedef struct {
OnigDistance min; /* min byte length */
OnigDistance max; /* max byte length */
} MinMaxLen;
/* context carried through the optimizer */
typedef struct {
MinMaxLen mmd;
OnigEncoding enc;
OnigOptionType options;
OnigCaseFoldType case_fold_flag;
ScanEnv* scan_env;
} OptEnv;
/* two sets of ANCHOR_* bits; add_opt_anc_info() decides, via */
/* is_left_anchor(), which of the two a given anchor bit goes into */
typedef struct {
int left_anchor;
int right_anchor;
} OptAncInfo;
/* candidate exact string: s[0..len-1] holds the bytes */
typedef struct {
MinMaxLen mmd; /* info position */
OptAncInfo anc;
int reach_end;
int ignore_case; /* -1: unset, 0: case sensitive, 1: ignore case */
int len;
UChar s[OPT_EXACT_MAXLEN];
} OptExactInfo;
/* candidate byte map: map[c] is set to 1 for every byte value added, and */
/* `value` accumulates map_position_value() of the bytes set */
typedef struct {
MinMaxLen mmd; /* info position */
OptAncInfo anc;
int value; /* weighted value */
UChar map[ONIG_CHAR_TABLE_SIZE];
} OptMapInfo;
/* all optimization information collected for one node */
typedef struct {
MinMaxLen len;
OptAncInfo anc;
OptExactInfo exb; /* boundary */
OptExactInfo exm; /* middle */
OptExactInfo expr; /* prec read (?=...) */
OptMapInfo map; /* boundary */
} NodeOptInfo;
/* Return the weight of byte value `i` as taken from ByteValTable.
   Values outside the table yield 4.  Value 0 yields 20 instead of its
   table entry when the encoding's minimum character length exceeds one
   byte. */
static int
map_position_value(OnigEncoding enc, int i)
{
  static const short int ByteValTable[] = {
     5,  1,  1,  1,  1,  1,  1,  1,  1, 10, 10,  1,  1, 10,  1,  1,
     1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
    12,  4,  7,  4,  4,  4,  4,  4,  4,  5,  5,  5,  5,  5,  5,  5,
     6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  5,  5,  5,  5,  5,  5,
     5,  6,  6,  6,  6,  7,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
     6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  5,  6,  5,  5,  5,
     5,  6,  6,  6,  6,  7,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
     6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  5,  5,  5,  5,  1
  };

  if (! (i < numberof(ByteValTable)))
    return 4;   /* Take it easy. */

  if (i == 0 && ONIGENC_MBC_MINLEN(enc) > 1)
    return 20;

  return (int )ByteValTable[i];
}
/* Score a position range by its width (max - min): 0 when the maximum is
   infinite, the table entry 1000 / (width + 1) for widths covered by the
   table, and 1 for anything wider. */
static int
distance_value(MinMaxLen* mm)
{
  /* 1000 / (min-max-dist + 1) */
  static const short int dist_vals[] = {
    1000,  500,  333,  250,  200,  167,  143,  125,  111,  100,
      91,   83,   77,   71,   67,   63,   59,   56,   53,   50,
      48,   45,   43,   42,   40,   38,   37,   36,   34,   33,
      32,   31,   30,   29,   29,   28,   27,   26,   26,   25,
      24,   24,   23,   23,   22,   22,   21,   21,   20,   20,
      20,   19,   19,   19,   18,   18,   18,   17,   17,   17,
      16,   16,   16,   16,   15,   15,   15,   15,   14,   14,
      14,   14,   14,   14,   13,   13,   13,   13,   13,   13,
      12,   12,   12,   12,   12,   12,   11,   11,   11,   11,
      11,   11,   11,   11,   11,   10,   10,   10,   10,   10
  };
  OnigDistance spread;

  if (mm->max == ONIG_INFINITE_DISTANCE) return 0;

  spread = mm->max - mm->min;
  return (spread < numberof(dist_vals)) ? (int )dist_vals[spread] : 1;
}
/* Compare two candidates with base values v1 / v2 located at d1 / d2.
   Returns 1 when the second candidate is preferable, -1 when the first is,
   and 0 on a tie.  A non-positive v2 always loses; otherwise a
   non-positive v1 always loses.  Base values are weighted by
   distance_value(); equal weights are decided in favour of the smaller
   minimum position. */
static int
comp_distance_value(MinMaxLen* d1, MinMaxLen* d2, int v1, int v2)
{
  int w1, w2;

  if (v2 <= 0) return -1;
  if (v1 <= 0) return  1;

  w1 = v1 * distance_value(d1);
  w2 = v2 * distance_value(d2);

  if (w1 != w2)
    return (w2 > w1) ? 1 : -1;
  if (d1->min != d2->min)
    return (d2->min < d1->min) ? 1 : -1;
  return 0;
}
/* Return 1 when both ranges have identical min and max, otherwise 0. */
static int
is_equal_mml(MinMaxLen* a, MinMaxLen* b)
{
  if (a->min != b->min) return 0;
  if (a->max != b->max) return 0;
  return 1;
}
/* Store the bounds `min` and `max` in *mml. */
static void
set_mml(MinMaxLen* mml, OnigDistance min, OnigDistance max)
{
  mml->max = max;
  mml->min = min;
}
/* Reset *mml to the range [0, 0]. */
static void
clear_mml(MinMaxLen* mml)
{
  mml->max = 0;
  mml->min = 0;
}
/* Copy both bounds of *from into *to. */
static void
copy_mml(MinMaxLen* to, MinMaxLen* from)
{
  *to = *from;
}
/* Add the bounds of *from to the bounds of *to, using distance_add() for
   both the minimum and the maximum. */
static void
add_mml(MinMaxLen* to, MinMaxLen* from)
{
  OnigDistance lo = distance_add(to->min, from->min);
  OnigDistance hi = distance_add(to->max, from->max);

  to->min = lo;
  to->max = hi;
}
/* Compiled out by the "#if 0" below: adds the single length `len` to both */
/* bounds of *to using distance_add(). */
#if 0
static void
add_len_mml(MinMaxLen* to, OnigDistance len)
{
to->min = distance_add(to->min, len);
to->max = distance_add(to->max, len);
}
#endif
/* Widen *to so that it also covers *from: the result is the smaller of the
   two minimums and the larger of the two maximums. */
static void
alt_merge_mml(MinMaxLen* to, MinMaxLen* from)
{
  if (from->min < to->min) to->min = from->min;
  if (from->max > to->max) to->max = from->max;
}
/* Copy the whole optimizer environment *from into *to by struct */
/* assignment, so every member of OptEnv is copied. */
static void
copy_opt_env(OptEnv* to, OptEnv* from)
{
*to = *from;
}
/* Remove every anchor bit from both the left and the right set. */
static void
clear_opt_anc_info(OptAncInfo* anc)
{
  anc->right_anchor = 0;
  anc->left_anchor  = 0;
}
/* Copy both anchor sets of *from into *to by struct assignment. */
static void
copy_opt_anc_info(OptAncInfo* to, OptAncInfo* from)
{
*to = *from;
}
/* Compute the anchor info of the concatenation "left right" into *to.
   The left anchors are those of `left`, plus those of `right` when
   left_len is 0.  The right anchors are those of `right`, plus those of
   `left` when right_len is 0; when right_len is not 0 only the
   ANCHOR_PREC_READ_NOT bit of `left` is carried over. */
static void
concat_opt_anc_info(OptAncInfo* to, OptAncInfo* left, OptAncInfo* right,
                    OnigDistance left_len, OnigDistance right_len)
{
  clear_opt_anc_info(to);

  to->left_anchor = left->left_anchor;
  if (left_len == 0)
    to->left_anchor |= right->left_anchor;

  to->right_anchor = right->right_anchor;
  to->right_anchor |= (right_len == 0)
                        ? left->right_anchor
                        : (left->right_anchor & ANCHOR_PREC_READ_NOT);
}
/* Return 0 for the anchors that are kept in the right-hand set (end of
   buffer, semi end of buffer, end of line, and both prec-read forms) and
   1 for every other value. */
static int
is_left_anchor(int anc)
{
  switch (anc) {
  case ANCHOR_END_BUF:
  case ANCHOR_SEMI_END_BUF:
  case ANCHOR_END_LINE:
  case ANCHOR_PREC_READ:
  case ANCHOR_PREC_READ_NOT:
    return 0;

  default:
    return 1;
  }
}
/* Return 1 when any bit of `anc` is present in the left or the right
   anchor set of *to, otherwise 0. */
static int
is_set_opt_anc_info(OptAncInfo* to, int anc)
{
  int present = (to->left_anchor | to->right_anchor) & anc;

  return (present != 0) ? 1 : 0;
}
/* Set anchor bit(s) `anc` in the set chosen by is_left_anchor(). */
static void
add_opt_anc_info(OptAncInfo* to, int anc)
{
  int *side = is_left_anchor(anc) ? &to->left_anchor : &to->right_anchor;

  *side |= anc;
}
/* Clear anchor bit(s) `anc` in the set chosen by is_left_anchor(). */
static void
remove_opt_anc_info(OptAncInfo* to, int anc)
{
  int *side = is_left_anchor(anc) ? &to->left_anchor : &to->right_anchor;

  *side &= ~anc;
}
/* Merge for alternation: keep in *to only the anchor bits that are also
   present in *add (intersection, separately for each side). */
static void
alt_merge_opt_anc_info(OptAncInfo* to, OptAncInfo* add)
{
  to->right_anchor = to->right_anchor & add->right_anchor;
  to->left_anchor  = to->left_anchor  & add->left_anchor;
}
/* Return 1 when the exact-string buffer holds OPT_EXACT_MAXLEN bytes or
   more, otherwise 0. */
static int
is_full_opt_exact_info(OptExactInfo* ex)
{
  if (ex->len < OPT_EXACT_MAXLEN)
    return 0;
  return 1;
}
/* Reset *ex to "no exact string": empty buffer, zero position range, no
   anchors, reach_end off and ignore_case unset (-1). */
static void
clear_opt_exact_info(OptExactInfo* ex)
{
  ex->s[0] = '\0';
  ex->len  = 0;
  ex->ignore_case = -1;   /* unset */
  ex->reach_end   = 0;
  clear_opt_anc_info(&ex->anc);
  clear_mml(&ex->mmd);
}
/* Copy the complete exact-string info *from into *to by struct */
/* assignment, including the byte buffer s[]. */
static void
copy_opt_exact_info(OptExactInfo* to, OptExactInfo* from)
{
*to = *from;
}
/* Append the exact string of *add to the one in *to.
   Nothing is done when both have a case mode set and the modes differ; an
   unset mode in *to is taken over from *add.  Only whole characters are
   copied, and copying stops before a character that would exceed
   OPT_EXACT_MAXLEN.  reach_end is inherited from *add only when all of its
   bytes were copied; otherwise it is cleared together with the right
   anchors. */
static void
concat_opt_exact_info(OptExactInfo* to, OptExactInfo* add, OnigEncoding enc)
{
  int pos, k, clen;
  UChar *src, *src_end;
  OptAncInfo merged;

  if (to->ignore_case < 0) {
    to->ignore_case = add->ignore_case;
  }
  else if (to->ignore_case != add->ignore_case) {
    return ;  /* avoid */
  }

  src     = add->s;
  src_end = src + add->len;
  pos     = to->len;
  while (src < src_end) {
    clen = enclen(enc, src, src_end);
    if (pos + clen > OPT_EXACT_MAXLEN) break;
    for (k = 0; k < clen && src < src_end; k++)
      to->s[pos++] = *src++;
  }

  to->len = pos;
  to->reach_end = (src == src_end ? add->reach_end : 0);

  concat_opt_anc_info(&merged, &to->anc, &add->anc, 1, 1);
  if (! to->reach_end) merged.right_anchor = 0;
  copy_opt_anc_info(&to->anc, &merged);
}
/* Append the bytes [s, end) to the exact string in *to, one whole character
   at a time, stopping when the buffer is full or the next character would
   not fit into OPT_EXACT_MAXLEN bytes.  `raw` is not used. */
static void
concat_opt_exact_info_str(OptExactInfo* to, UChar* s, UChar* end,
                          int raw ARG_UNUSED, OnigEncoding enc)
{
  int pos, k, clen;
  UChar *src;

  pos = to->len;
  src = s;
  while (src < end && pos < OPT_EXACT_MAXLEN) {
    clen = enclen(enc, src, end);
    if (pos + clen > OPT_EXACT_MAXLEN) break;
    for (k = 0; k < clen && src < end; k++)
      to->s[pos++] = *src++;
  }

  to->len = pos;
}
/* Merge for alternation: reduce the exact string in *to to the prefix it
   shares with *add, compared character by character.
   *to is cleared when either string is empty or when the two position
   ranges differ.  reach_end survives only if *add reaches the end too and
   both strings were matched completely; without it the right anchors are
   dropped.  The ignore_case modes are combined (an unset mode takes the
   other's value). */
static void
alt_merge_opt_exact_info(OptExactInfo* to, OptExactInfo* add, OptEnv* env)
{
  int common, k, clen;

  if (add->len == 0 || to->len == 0 ||
      ! is_equal_mml(&to->mmd, &add->mmd)) {
    clear_opt_exact_info(to);
    return ;
  }

  common = 0;
  while (common < to->len && common < add->len) {
    if (to->s[common] != add->s[common]) break;

    /* compare the trailing bytes of a multi-byte character */
    clen = enclen(env->enc, to->s + common, to->s + to->len);
    for (k = 1; k < clen; k++) {
      if (to->s[common + k] != add->s[common + k]) break;
    }
    if (k < clen) break;

    common += clen;
  }

  if (! add->reach_end || common < add->len || common < to->len) {
    to->reach_end = 0;
  }
  to->len = common;

  if (to->ignore_case < 0) {
    to->ignore_case = add->ignore_case;
  }
  else if (add->ignore_case >= 0) {
    to->ignore_case |= add->ignore_case;
  }

  alt_merge_opt_anc_info(&to->anc, &add->anc);
  if (! to->reach_end) to->anc.right_anchor = 0;
}
/* Keep in *now the better of the two exact-string candidates.
   An empty *alt never replaces *now; an empty *now is always replaced.
   Otherwise each candidate gets a score (its length, or a value derived
   from map_position_value() when both are at most 2 bytes long), which is
   doubled unless the candidate is case-insensitive, and
   comp_distance_value() decides. */
static void
select_opt_exact_info(OnigEncoding enc, OptExactInfo* now, OptExactInfo* alt)
{
  int score_now = now->len;
  int score_alt = alt->len;

  if (score_alt == 0)
    return ;

  if (score_now == 0) {
    copy_opt_exact_info(now, alt);
    return ;
  }

  if (score_now <= 2 && score_alt <= 2) {
    /* ByteValTable[x] is big value --> low price */
    /* hence each candidate is scored with the table value of the other */
    score_alt = map_position_value(enc, now->s[0]);
    score_now = map_position_value(enc, alt->s[0]);

    if (now->len > 1) score_now += 5;
    if (alt->len > 1) score_alt += 5;
  }

  if (now->ignore_case <= 0) score_now *= 2;
  if (alt->ignore_case <= 0) score_alt *= 2;

  if (comp_distance_value(&now->mmd, &alt->mmd, score_now, score_alt) > 0)
    copy_opt_exact_info(now, alt);
}
/* Reset *map to the empty state by copying an all-zero template over it:
   zero position range, no anchors, value 0 and no byte marked. */
static void
clear_opt_map_info(OptMapInfo* map)
{
  /* map[] entries not listed explicitly are zero-initialized as well */
  static const OptMapInfo clean_info = {
    {0, 0}, {0, 0}, 0, {0}
  };

  xmemcpy(map, &clean_info, sizeof(OptMapInfo));
}
/* Copy the complete map info *from into *to by struct assignment, */
/* including the whole map[] table. */
static void
copy_opt_map_info(OptMapInfo* to, OptMapInfo* from)
{
*to = *from;
}
/* Mark byte `c` in the map.  A byte that is newly marked adds its
   map_position_value() to map->value; a byte already marked changes
   nothing. */
static void
add_char_opt_map_info(OptMapInfo* map, UChar c, OnigEncoding enc)
{
  if (map->map[c] != 0)
    return ;

  map->map[c] = 1;
  map->value += map_position_value(enc, c);
}
/*
 * Mark the first byte of the string at `p` and, for every single-character
 * case-fold alternative reported by the encoding, the first byte of that
 * alternative's encoded form.
 *
 * Returns 0 on success, or the negative value reported by
 * ONIGENC_GET_CASE_FOLD_CODES_BY_STR.
 */
static int
add_char_amb_opt_map_info(OptMapInfo* map, UChar* p, UChar* end,
                          OnigEncoding enc, OnigCaseFoldType case_fold_flag)
{
  OnigCaseFoldCodeItem folds[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM];
  UChar encoded[ONIGENC_CODE_TO_MBC_MAXLEN];
  int count;
  int k = 0;

  add_char_opt_map_info(map, p[0], enc);

  /* multi-character folds are switched off for this lookup */
  case_fold_flag = DISABLE_CASE_FOLD_MULTI_CHAR(case_fold_flag);
  count = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, case_fold_flag, p, end, folds);
  if (count < 0)
    return count;

  while (k < count) {
    ONIGENC_CODE_TO_MBC(enc, folds[k].code[0], encoded);
    add_char_opt_map_info(map, encoded[0], enc);
    k++;
  }

  return 0;
}
/*
 * Keep in `now` the better of two map candidates.  A map whose value is 0
 * counts as "no candidate": an empty `alt` is ignored, an empty `now` is
 * replaced.  Otherwise the scores are z / value and comp_distance_value()
 * decides (positive result => take `alt`).
 */
static void
select_opt_map_info(OptMapInfo* now, OptMapInfo* alt)
{
  const int z = 1<<15; /* 32768: something big value */

  if (alt->value == 0)
    return;

  if (now->value != 0) {
    int now_score = z / now->value;
    int alt_score = z / alt->value;

    if (comp_distance_value(&now->mmd, &alt->mmd, now_score, alt_score) <= 0)
      return;
  }

  copy_opt_map_info(now, alt);
}
/*
 * Compare an exact-string candidate against a map candidate.
 * Returns -1 at once when the map has no positive value; otherwise returns
 * whatever comp_distance_value() yields for the two computed scores.
 */
static int
comp_opt_exact_or_map_info(OptExactInfo* e, OptMapInfo* m)
{
#define COMP_EM_BASE 20
  int case_weight;
  int exact_score, map_score;

  if (m->value <= 0)
    return -1;

  /* a string flagged ignore-case (ignore_case > 0) gets half the weight */
  case_weight = (e->ignore_case > 0) ? 1 : 2;
  exact_score = COMP_EM_BASE * e->len * case_weight;
  map_score   = COMP_EM_BASE * 5 * 2 / m->value;

  return comp_distance_value(&e->mmd, &m->mmd, exact_score, map_score);
}
/*
 * Merge the map of an alternative branch (`add`) into `to`.
 *
 * If `to` is already empty it stays empty.  If `add` is empty, or its
 * minimum distance lies beyond `to`'s maximum, `to` is cleared.  Otherwise
 * the byte sets are united, the value is recomputed from the united set and
 * the distance ranges and anchors are merged.
 */
static void
alt_merge_opt_map_info(OnigEncoding enc, OptMapInfo* to, OptMapInfo* add)
{
  int idx;
  int total = 0;

  if (to->value == 0)
    return;

  if (add->value == 0 || to->mmd.max < add->mmd.min) {
    clear_opt_map_info(to);
    return;
  }

  alt_merge_mml(&to->mmd, &add->mmd);

  for (idx = 0; idx < ONIG_CHAR_TABLE_SIZE; idx++) {
    if (add->map[idx])
      to->map[idx] = 1;

    if (! to->map[idx])
      continue;
    total += map_position_value(enc, idx);
  }

  to->value = total;
  alt_merge_opt_anc_info(&to->anc, &add->anc);
}
/*
 * Copy the distance range `mmd` into the exb, expr and map candidates
 * of `opt`.
 */
static void
set_bound_node_opt_info(NodeOptInfo* opt, MinMaxLen* mmd)
{
  MinMaxLen* dst[3];
  int k;

  dst[0] = &opt->exb.mmd;
  dst[1] = &opt->expr.mmd;
  dst[2] = &opt->map.mmd;

  for (k = 0; k < 3; k++)
    copy_mml(dst[k], mmd);
}
/*
 * Reset every component of a NodeOptInfo: length range, anchors, the three
 * exact-string candidates (exb, exm, expr) and the byte map.
 */
static void
clear_node_opt_info(NodeOptInfo* opt)
{
  /* length and anchors */
  clear_mml(&(opt->len));
  clear_opt_anc_info(&(opt->anc));

  /* exact-string candidates */
  clear_opt_exact_info(&(opt->exb));
  clear_opt_exact_info(&(opt->exm));
  clear_opt_exact_info(&(opt->expr));

  /* byte map */
  clear_opt_map_info(&(opt->map));
}
/* Copy the whole node optimization info *from into *to (plain struct assignment). */
static void
copy_node_opt_info(NodeOptInfo* to, NodeOptInfo* from)
{
*to = *from;
}
/*
 * Fold the optimization info of the next element of a concatenation (`add`)
 * into the info accumulated for everything to its left (`to`).
 *
 * NOTE: `add` is modified as well: its exb/map anchors may be updated and
 * its exb is cleared once it has been appended to one of `to`'s strings.
 */
static void
concat_left_node_opt_info(OnigEncoding enc, NodeOptInfo* to, NodeOptInfo* add)
{
int exb_reach, exm_reach;
OptAncInfo tanc;
/* combine the anchors of both sides via a temporary, then store into `to` */
concat_opt_anc_info(&tanc, &to->anc, &add->anc, to->len.max, add->len.max);
copy_opt_anc_info(&to->anc, &tanc);
/* nothing with length precedes `add` (to->len.max == 0): push the anchors
   collected so far onto add's leading exact string */
if (add->exb.len > 0 && to->len.max == 0) {
concat_opt_anc_info(&tanc, &to->anc, &add->exb.anc,
to->len.max, add->len.max);
copy_opt_anc_info(&add->exb.anc, &tanc);
}
/* same idea for add's map, but only the left anchor and only when the map
   sits at maximum distance 0 */
if (add->map.value > 0 && to->len.max == 0) {
if (add->map.mmd.max == 0)
add->map.anc.left_anchor |= to->anc.left_anchor;
}
/* remember the reach_end flags before they are reset below */
exb_reach = to->exb.reach_end;
exm_reach = to->exm.reach_end;
/* once `add` can consume input, to's strings no longer reach the end */
if (add->len.max != 0)
to->exb.reach_end = to->exm.reach_end = 0;
/* extend whichever of to's strings reached the end (exb first, else exm)
   with add's leading string, then drop that string from `add` */
if (add->exb.len > 0) {
if (exb_reach) {
concat_opt_exact_info(&to->exb, &add->exb, enc);
clear_opt_exact_info(&add->exb);
}
else if (exm_reach) {
concat_opt_exact_info(&to->exm, &add->exb, enc);
clear_opt_exact_info(&add->exb);
}
}
/* keep the best "middle" string among to->exm, add->exb and add->exm */
select_opt_exact_info(enc, &to->exm, &add->exb);
select_opt_exact_info(enc, &to->exm, &add->exm);
if (to->expr.len > 0) {
if (add->len.max > 0) {
/* cap the expr length at add's maximum length before offering it */
if (to->expr.len > (int )add->len.max)
to->expr.len = (int )add->len.max;
/* expr at maximum distance 0 competes with exb, otherwise with exm */
if (to->expr.mmd.max == 0)
select_opt_exact_info(enc, &to->exb, &to->expr);
else
select_opt_exact_info(enc, &to->exm, &to->expr);
}
}
else if (add->expr.len > 0) {
copy_opt_exact_info(&to->expr, &add->expr);
}
select_opt_map_info(&to->map, &add->map);
/* finally accumulate the length range */
add_mml(&to->len, &add->len);
}
/*
 * Merge the optimization info of one alternative branch (`add`) into the
 * info accumulated for the previous branches (`to`), component by component.
 */
static void
alt_merge_node_opt_info(NodeOptInfo* to, NodeOptInfo* add, OptEnv* env)
{
  /* anchors */
  alt_merge_opt_anc_info(&(to->anc), &(add->anc));

  /* exact-string candidates */
  alt_merge_opt_exact_info(&(to->exb),  &(add->exb),  env);
  alt_merge_opt_exact_info(&(to->exm),  &(add->exm),  env);
  alt_merge_opt_exact_info(&(to->expr), &(add->expr), env);

  /* byte map */
  alt_merge_opt_map_info(env->enc, &(to->map), &(add->map));

  /* length range */
  alt_merge_mml(&(to->len), &(add->len));
}
/* How many times one ENCLOSE_MEMORY node is analysed in full (counted in
   en->opt_count) before optimize_node_left() falls back to length info only. */
#define MAX_NODE_OPT_INFO_REF_COUNT 5
/*
 * Recursively collect search-optimization info for `node` into `opt`:
 * match length range, anchors, exact-string candidates and the byte map.
 *
 * `opt` is cleared first and its candidates are positioned at env->mmd.
 * `env` supplies the encoding, the active options, the case fold flag and
 * the scan environment.
 *
 * Returns 0 on success, the non-zero result of a failing callee, or
 * ONIGERR_TYPE_BUG for a node type that is not handled here.
 */
static int
optimize_node_left(Node* node, NodeOptInfo* opt, OptEnv* env)
{
int type;
int r = 0;
clear_node_opt_info(opt);
set_bound_node_opt_info(opt, &env->mmd);
type = NTYPE(node);
switch (type) {
case NT_LIST:
{
/* concatenation: analyse each element with a private copy of the
   environment whose distance grows by the length of each element */
OptEnv nenv;
NodeOptInfo nopt;
Node* nd = node;
copy_opt_env(&nenv, env);
do {
r = optimize_node_left(NCAR(nd), &nopt, &nenv);
if (r == 0) {
add_mml(&nenv.mmd, &nopt.len);
concat_left_node_opt_info(env->enc, opt, &nopt);
}
} while (r == 0 && IS_NOT_NULL(nd = NCDR(nd)));
}
break;
case NT_ALT:
{
/* alternation: the first branch is copied, later ones are merged in */
NodeOptInfo nopt;
Node* nd = node;
do {
r = optimize_node_left(NCAR(nd), &nopt, env);
if (r == 0) {
if (nd == node) copy_node_opt_info(opt, &nopt);
else alt_merge_node_opt_info(opt, &nopt, env);
}
} while ((r == 0) && IS_NOT_NULL(nd = NCDR(nd)));
}
break;
case NT_STR:
{
StrNode* sn = NSTR(node);
OnigDistance slen = sn->end - sn->s;
int is_raw = NSTRING_IS_RAW(node);
if (! NSTRING_IS_AMBIG(node)) {
/* non-ambiguous string: exact candidate plus its first byte in the map */
concat_opt_exact_info_str(&opt->exb, sn->s, sn->end,
is_raw, env->enc);
opt->exb.ignore_case = 0;
if (slen > 0) {
add_char_opt_map_info(&opt->map, *(sn->s), env->enc);
}
set_mml(&opt->len, slen, slen);
}
else {
OnigDistance max;
if (NSTRING_IS_DONT_GET_OPT_INFO(node)) {
/* no candidates are recorded; only a maximum length is derived from the
   character count and the encoding's maximum character length */
int n = onigenc_strlen(env->enc, sn->s, sn->end);
max = ONIGENC_MBC_MAXLEN_DIST(env->enc) * n;
}
else {
concat_opt_exact_info_str(&opt->exb, sn->s, sn->end,
is_raw, env->enc);
opt->exb.ignore_case = 1;
if (slen > 0) {
r = add_char_amb_opt_map_info(&opt->map, sn->s, sn->end,
env->enc, env->case_fold_flag);
/* this break leaves the switch: length and reach_end stay unset */
if (r != 0) break;
}
max = slen;
}
set_mml(&opt->len, slen, max);
}
/* the whole string fitted into the exact candidate */
if ((OnigDistance )opt->exb.len == slen)
opt->exb.reach_end = 1;
}
break;
case NT_CCLASS:
{
int i, z;
CClassNode* cc = NCCLASS(node);
/* no need to check ignore case. (set in setup_tree()) */
if (IS_NOT_NULL(cc->mbuf) || IS_NCCLASS_NOT(cc)) {
/* multibyte ranges or a negated class: only the length range is known */
OnigDistance min = ONIGENC_MBC_MINLEN(env->enc);
OnigDistance max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
set_mml(&opt->len, min, max);
}
else {
/* plain bitset class: every member byte goes into the map */
for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
z = BITSET_AT(cc->bs, i);
if ((z && !IS_NCCLASS_NOT(cc)) || (!z && IS_NCCLASS_NOT(cc))) {
add_char_opt_map_info(&opt->map, (UChar )i, env->enc);
}
}
set_mml(&opt->len, 1, 1);
}
}
break;
case NT_CTYPE:
{
int i, min, max;
int maxcode;
max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
if (max == 1) {
/* maximum character length 1: the map can be filled byte by byte */
min = 1;
maxcode = NCTYPE(node)->ascii_range ? 0x80 : SINGLE_BYTE_SIZE;
switch (NCTYPE(node)->ctype) {
case ONIGENC_CTYPE_WORD:
if (NCTYPE(node)->not != 0) {
/* negated: non-word bytes plus every byte at or above maxcode */
for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
if (! ONIGENC_IS_CODE_WORD(env->enc, i) || i >= maxcode) {
add_char_opt_map_info(&opt->map, (UChar )i, env->enc);
}
}
}
else {
for (i = 0; i < maxcode; i++) {
if (ONIGENC_IS_CODE_WORD(env->enc, i)) {
add_char_opt_map_info(&opt->map, (UChar )i, env->enc);
}
}
}
break;
}
}
else {
min = ONIGENC_MBC_MINLEN(env->enc);
}
set_mml(&opt->len, min, max);
}
break;
case NT_CANY:
{
/* any character: only the length range is known */
OnigDistance min = ONIGENC_MBC_MINLEN(env->enc);
OnigDistance max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
set_mml(&opt->len, min, max);
}
break;
case NT_ANCHOR:
switch (NANCHOR(node)->type) {
case ANCHOR_BEGIN_BUF:
case ANCHOR_BEGIN_POSITION:
case ANCHOR_BEGIN_LINE:
case ANCHOR_END_BUF:
case ANCHOR_SEMI_END_BUF:
case ANCHOR_END_LINE:
case ANCHOR_LOOK_BEHIND: /* just for (?<=x).* */
case ANCHOR_PREC_READ_NOT: /* just for (?!x).* */
add_opt_anc_info(&opt->anc, NANCHOR(node)->type);
break;
case ANCHOR_PREC_READ:
{
/* look-ahead: the target's best string becomes the expr candidate
   (never reaching the end) and its map is taken over if it has one */
NodeOptInfo nopt;
r = optimize_node_left(NANCHOR(node)->target, &nopt, env);
if (r == 0) {
if (nopt.exb.len > 0)
copy_opt_exact_info(&opt->expr, &nopt.exb);
else if (nopt.exm.len > 0)
copy_opt_exact_info(&opt->expr, &nopt.exm);
opt->expr.reach_end = 0;
if (nopt.map.value > 0)
copy_opt_map_info(&opt->map, &nopt.map);
}
}
break;
case ANCHOR_LOOK_BEHIND_NOT:
break;
}
break;
case NT_BREF:
{
int i;
int* backs;
OnigDistance min, max, tmin, tmax;
Node** nodes = SCANENV_MEM_NODES(env->scan_env);
BRefNode* br = NBREF(node);
if (br->state & NST_RECURSION) {
set_mml(&opt->len, 0, ONIG_INFINITE_DISTANCE);
break;
}
/* length range = smallest minimum and largest maximum over all the
   groups this back reference may refer to */
backs = BACKREFS_P(br);
r = get_min_match_length(nodes[backs[0]], &min, env->scan_env);
if (r != 0) break;
r = get_max_match_length(nodes[backs[0]], &max, env->scan_env);
if (r != 0) break;
for (i = 1; i < br->back_num; i++) {
r = get_min_match_length(nodes[backs[i]], &tmin, env->scan_env);
if (r != 0) break;
r = get_max_match_length(nodes[backs[i]], &tmax, env->scan_env);
if (r != 0) break;
if (min > tmin) min = tmin;
if (max < tmax) max = tmax;
}
/* the breaks above only leave the for loop, hence the re-check of r */
if (r == 0) set_mml(&opt->len, min, max);
}
break;
#ifdef USE_SUBEXP_CALL
case NT_CALL:
if (IS_CALL_RECURSION(NCALL(node)))
set_mml(&opt->len, 0, ONIG_INFINITE_DISTANCE);
else {
/* analyse the call target under the target group's own options */
OnigOptionType save = env->options;
env->options = NENCLOSE(NCALL(node)->target)->option;
r = optimize_node_left(NCALL(node)->target, opt, env);
env->options = save;
}
break;
#endif
case NT_QTFR:
{
int i;
OnigDistance min, max;
NodeOptInfo nopt;
QtfrNode* qn = NQTFR(node);
r = optimize_node_left(qn->target, &nopt, env);
if (r) break;
if (/*qn->lower == 0 &&*/ IS_REPEAT_INFINITE(qn->upper)) {
/* greedy unbounded repeat of "any char" at distance 0 */
if (env->mmd.max == 0 &&
NTYPE(qn->target) == NT_CANY && qn->greedy) {
if (IS_MULTILINE(env->options))
/* implicit anchor: /.*a/ ==> /\A.*a/ */
add_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR_ML);
else
add_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR);
}
}
else {
if (qn->lower > 0) {
/* at least one iteration is mandatory: start from the target's info */
copy_node_opt_info(opt, &nopt);
if (nopt.exb.len > 0) {
if (nopt.exb.reach_end) {
/* append further mandatory copies of the string until it is full */
for (i = 2; i <= qn->lower &&
! is_full_opt_exact_info(&opt->exb); i++) {
concat_opt_exact_info(&opt->exb, &nopt.exb, env->enc);
}
if (i < qn->lower) {
opt->exb.reach_end = 0;
}
}
}
if (qn->lower != qn->upper) {
opt->exb.reach_end = 0;
opt->exm.reach_end = 0;
}
if (qn->lower > 1)
opt->exm.reach_end = 0;
}
}
/* length range of the repetition as a whole */
min = distance_multiply(nopt.len.min, qn->lower);
if (IS_REPEAT_INFINITE(qn->upper))
max = (nopt.len.max > 0 ? ONIG_INFINITE_DISTANCE : 0);
else
max = distance_multiply(nopt.len.max, qn->upper);
set_mml(&opt->len, min, max);
}
break;
case NT_ENCLOSE:
{
EncloseNode* en = NENCLOSE(node);
switch (en->type) {
case ENCLOSE_OPTION:
{
/* analyse the target under the group's options, then restore */
OnigOptionType save = env->options;
env->options = en->option;
r = optimize_node_left(en->target, opt, env);
env->options = save;
}
break;
case ENCLOSE_MEMORY:
#ifdef USE_SUBEXP_CALL
en->opt_count++;
if (en->opt_count > MAX_NODE_OPT_INFO_REF_COUNT) {
/* visited too often: use only the lengths stored on the node (when
   flagged as fixed), otherwise 0 .. infinite */
OnigDistance min, max;
min = 0;
max = ONIG_INFINITE_DISTANCE;
if (IS_ENCLOSE_MIN_FIXED(en)) min = en->min_len;
if (IS_ENCLOSE_MAX_FIXED(en)) max = en->max_len;
set_mml(&opt->len, min, max);
}
else
#endif
{
r = optimize_node_left(en->target, opt, env);
/* drop the implicit anychar-star anchor when this group is back-referenced */
if (is_set_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR_MASK)) {
if (BIT_STATUS_AT(env->scan_env->backrefed_mem, en->regnum))
remove_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR_MASK);
}
}
break;
case ENCLOSE_STOP_BACKTRACK:
case ENCLOSE_CONDITION:
r = optimize_node_left(en->target, opt, env);
break;
}
}
break;
default:
#ifdef ONIG_DEBUG
fprintf(stderr, "optimize_node_left: undefined node type %d\n",
NTYPE(node));
#endif
r = ONIGERR_TYPE_BUG;
break;
}
return r;
}
/*
 * Install the exact-string candidate `e` as the search optimization of `reg`.
 *
 * The string is copied into a freshly allocated reg->exact.  A skip table
 * is built with set_bm_skip() when the string has at least 3 bytes, or at
 * least 2 bytes and the encoding allows reverse matching on it.
 *
 * Returns 0 on success (including an empty `e`, which changes nothing), a
 * memory error when the copy cannot be allocated, or the set_bm_skip()
 * error for a case-sensitive string.  For an ignore-case string a
 * set_bm_skip() failure only downgrades to ONIG_OPTIMIZE_EXACT_IC.
 */
static int
set_optimize_exact_info(regex_t* reg, OptExactInfo* e)
{
  int r;
  int allow_reverse;
  int want_skip_table;

  if (e->len == 0) return 0;

  reg->exact = (UChar* )xmalloc(e->len);
  CHECK_NULL_RETURN_MEMERR(reg->exact);
  xmemcpy(reg->exact, e->s, e->len);
  reg->exact_end = reg->exact + e->len;

  allow_reverse =
    ONIGENC_IS_ALLOWED_REVERSE_MATCH(reg->enc, reg->exact, reg->exact_end);
  want_skip_table = (e->len >= 3 || (e->len >= 2 && allow_reverse));

  if (! want_skip_table) {
    reg->optimize = (e->ignore_case > 0
                     ? ONIG_OPTIMIZE_EXACT_IC : ONIG_OPTIMIZE_EXACT);
  }
  else if (e->ignore_case > 0) {
    r = set_bm_skip(reg->exact, reg->exact_end, reg,
                    reg->map, &(reg->int_map), 1);
    if (r != 0) {
      reg->optimize = ONIG_OPTIMIZE_EXACT_IC;
    }
    else {
      reg->optimize = (allow_reverse != 0
                       ? ONIG_OPTIMIZE_EXACT_BM_IC
                       : ONIG_OPTIMIZE_EXACT_BM_NOT_REV_IC);
    }
  }
  else {
    r = set_bm_skip(reg->exact, reg->exact_end, reg,
                    reg->map, &(reg->int_map), 0);
    if (r) return r;

    reg->optimize = (allow_reverse != 0
                     ? ONIG_OPTIMIZE_EXACT_BM : ONIG_OPTIMIZE_EXACT_BM_NOT_REV);
  }

  reg->dmin = e->mmd.min;
  reg->dmax = e->mmd.max;

  if (reg->dmin != ONIG_INFINITE_DISTANCE) {
    reg->threshold_len = (int )(reg->dmin + (reg->exact_end - reg->exact));
  }

  return 0;
}
/*
 * Install the byte map `m` as the search optimization of `reg`: copy the
 * table, select ONIG_OPTIMIZE_MAP and take over the distance range.
 */
static void
set_optimize_map_info(regex_t* reg, OptMapInfo* m)
{
  int idx = 0;

  while (idx < ONIG_CHAR_TABLE_SIZE) {
    reg->map[idx] = m->map[idx];
    idx++;
  }

  reg->optimize = ONIG_OPTIMIZE_MAP;
  reg->dmin     = m->mmd.min;
  reg->dmax     = m->mmd.max;

  if (reg->dmin != ONIG_INFINITE_DISTANCE) {
    reg->threshold_len = (int )(reg->dmin + 1);
  }
}
/*
 * Add to reg->sub_anchor the begin-line bit of the left anchor and the
 * end-line bit of the right anchor of `anc`.
 */
static void
set_sub_anchor(regex_t* reg, OptAncInfo* anc)
{
  reg->sub_anchor |= (anc->left_anchor  & ANCHOR_BEGIN_LINE)
                   | (anc->right_anchor & ANCHOR_END_LINE);
}
#if defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH)
static void print_optimize_info(FILE* f, regex_t* reg);
#endif
/*
 * Analyse the parse tree with optimize_node_left() and store the chosen
 * search optimization (anchors plus either an exact string or a byte map)
 * in `reg`.
 *
 * Returns the error of optimize_node_left() or set_optimize_exact_info(),
 * otherwise 0.
 */
static int
set_optimize_info_from_tree(Node* node, regex_t* reg, ScanEnv* scan_env)
{
  int r;
  int use_exact = 0;
  int use_map = 0;
  NodeOptInfo opt;
  OptEnv env;

  env.enc            = reg->enc;
  env.options        = reg->options;
  env.case_fold_flag = reg->case_fold_flag;
  env.scan_env       = scan_env;
  clear_mml(&env.mmd);

  r = optimize_node_left(node, &opt, &env);
  if (r) return r;

  reg->anchor = opt.anc.left_anchor & (ANCHOR_BEGIN_BUF |
        ANCHOR_BEGIN_POSITION | ANCHOR_ANYCHAR_STAR | ANCHOR_ANYCHAR_STAR_ML |
        ANCHOR_LOOK_BEHIND);

  reg->anchor |= opt.anc.right_anchor & (ANCHOR_END_BUF | ANCHOR_SEMI_END_BUF |
        ANCHOR_PREC_READ_NOT);

  if (reg->anchor & (ANCHOR_END_BUF | ANCHOR_SEMI_END_BUF)) {
    reg->anchor_dmin = opt.len.min;
    reg->anchor_dmax = opt.len.max;
  }

  /* decide which kind of optimization to install */
  if (opt.exb.len > 0 || opt.exm.len > 0) {
    select_opt_exact_info(reg->enc, &opt.exb, &opt.exm);
    if (opt.map.value > 0 &&
        comp_opt_exact_or_map_info(&opt.exb, &opt.map) > 0)
      use_map = 1;
    else
      use_exact = 1;
  }
  else if (opt.map.value > 0) {
    use_map = 1;
  }

  if (use_exact) {
    r = set_optimize_exact_info(reg, &opt.exb);
    set_sub_anchor(reg, &opt.exb.anc);
  }
  else if (use_map) {
    set_optimize_map_info(reg, &opt.map);
    set_sub_anchor(reg, &opt.map.anc);
  }
  else {
    /* neither a string nor a map: only line anchors can be recorded */
    reg->sub_anchor |= opt.anc.left_anchor & ANCHOR_BEGIN_LINE;
    if (opt.len.max == 0)
      reg->sub_anchor |= opt.anc.right_anchor & ANCHOR_END_LINE;
  }

#if defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH)
  print_optimize_info(stderr, reg);
#endif
  return r;
}
/*
 * Reset every optimization-related field of `reg` and release the exact
 * string buffer if one is allocated.
 */
static void
clear_optimize_info(regex_t* reg)
{
  if (IS_NOT_NULL(reg->exact)) {
    xfree(reg->exact);
    reg->exact = (UChar* )NULL;
  }
  reg->exact_end = (UChar* )NULL;

  reg->optimize      = ONIG_OPTIMIZE_NONE;
  reg->threshold_len = 0;

  reg->anchor      = 0;
  reg->anchor_dmin = 0;
  reg->anchor_dmax = 0;
  reg->sub_anchor  = 0;
}
#ifdef ONIG_DEBUG
/*
 * Debug helper: print the pattern bytes `s` .. `end` between slashes,
 * followed by the encoding name.  For encodings whose minimum character
 * length exceeds 1, code points >= 0x80 are shown in hexadecimal.
 */
static void print_enc_string(FILE* fp, OnigEncoding enc,
                             const UChar *s, const UChar *end)
{
  const UChar *cur = s;

  fprintf(fp, "\nPATTERN: /");

  if (ONIGENC_MBC_MINLEN(enc) > 1) {
    for (; cur < end; cur += enclen(enc, cur, end)) {
      OnigCodePoint code = ONIGENC_MBC_TO_CODE(enc, cur, end);

      if (code < 0x80)
        fputc((int )code, fp);
      else
        fprintf(fp, " 0x%04x ", (int )code);
    }
  }
  else {
    for (; cur < end; cur++)
      fputc((int )*cur, fp);
  }

  fprintf(fp, "/ (%s)\n", enc->name);
}
#endif /* ONIG_DEBUG */
#if defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH)
/*
 * Debug helper: print the range a-b, writing "inf" for
 * ONIG_INFINITE_DISTANCE and the parenthesised number otherwise.
 */
static void
print_distance_range(FILE* f, OnigDistance a, OnigDistance b)
{
  OnigDistance bound[2];
  int k;

  bound[0] = a;
  bound[1] = b;

  for (k = 0; k < 2; k++) {
    if (k == 1)
      fputs("-", f);

    if (bound[k] == ONIG_INFINITE_DISTANCE)
      fputs("inf", f);
    else
      fprintf(f, "(%"PRIuPTR")", bound[k]);
  }
}
/*
 * Debug helper: print the names of the anchor bits set in `anchor` as a
 * comma separated list inside square brackets.
 */
static void
print_anchor(FILE* f, int anchor)
{
  /* listed in output order */
  static const struct {
    int         bit;
    const char* label;
  } names[] = {
    { ANCHOR_BEGIN_BUF,       "begin-buf"       },
    { ANCHOR_BEGIN_LINE,      "begin-line"      },
    { ANCHOR_BEGIN_POSITION,  "begin-pos"       },
    { ANCHOR_END_BUF,         "end-buf"         },
    { ANCHOR_SEMI_END_BUF,    "semi-end-buf"    },
    { ANCHOR_END_LINE,        "end-line"        },
    { ANCHOR_ANYCHAR_STAR,    "anychar-star"    },
    { ANCHOR_ANYCHAR_STAR_ML, "anychar-star-ml" }
  };
  int printed = 0;
  int k;

  fprintf(f, "[");
  for (k = 0; k < (int )(sizeof(names) / sizeof(names[0])); k++) {
    if ((anchor & names[k].bit) == 0)
      continue;

    if (printed) fprintf(f, ", ");
    printed = 1;
    fprintf(f, "%s", names[k].label);
  }
  fprintf(f, "]");
}
/*
 * Debug helper: dump the optimization settings of `reg` to `f`: the kind
 * of optimization, the anchors (with the end-anchor distance range when
 * one is set), the sub anchors, and then either the exact string or the
 * byte map.
 */
static void
print_optimize_info(FILE* f, regex_t* reg)
{
  /* indexed by reg->optimize */
  static const char* on[] = { "NONE", "EXACT", "EXACT_BM", "EXACT_BM_NOT_REV",
                              "EXACT_IC", "MAP",
                              "EXACT_BM_IC", "EXACT_BM_NOT_REV_IC" };

  fprintf(f, "optimize: %s\n", on[reg->optimize]);
  fprintf(f, " anchor: "); print_anchor(f, reg->anchor);
  if ((reg->anchor & ANCHOR_END_BUF_MASK) != 0)
    print_distance_range(f, reg->anchor_dmin, reg->anchor_dmax);
  fprintf(f, "\n");

  if (reg->optimize) {
    fprintf(f, " sub anchor: "); print_anchor(f, reg->sub_anchor);
    fprintf(f, "\n");
  }
  fprintf(f, "\n");

  if (reg->exact) {
    UChar *p;

    fprintf(f, "exact: [");
    for (p = reg->exact; p < reg->exact_end; p++) {
      fputc(*p, f);
    }
    fprintf(f, "]: length: %"PRIdPTR"\n", (reg->exact_end - reg->exact));
  }
  /* reg->optimize is an enumerated kind (it indexes on[] above), not a set
     of flags, so it has to be compared for equality rather than masked */
  else if (reg->optimize == ONIG_OPTIMIZE_MAP) {
    int c, i, n = 0;

    for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++)
      if (reg->map[i]) n++;

    fprintf(f, "map: n=%d\n", n);
    if (n > 0) {
      c = 0;
      fputc('[', f);
      for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) {
        if (reg->map[i] != 0) {
          if (c > 0) fputs(", ", f);
          c++;
          /* printable bytes of single-byte encodings are shown as
             characters, everything else as a decimal number */
          if (ONIGENC_MBC_MAXLEN(reg->enc) == 1 &&
              ONIGENC_IS_CODE_PRINT(reg->enc, (OnigCodePoint )i))
            fputc(i, f);
          else
            fprintf(f, "%d", i);
        }
      }
      fprintf(f, "]\n");
    }
  }
}
#endif /* ONIG_DEBUG_COMPILE || ONIG_DEBUG_MATCH */
/*
 * Release everything owned by `reg` (byte code, exact string, skip tables,
 * repeat ranges, the chained regex and, with USE_NAMED_GROUP, the name
 * table) but not the regex_t object itself.  A NULL `reg` is ignored.
 */
extern void
onig_free_body(regex_t* reg)
{
  if (! IS_NOT_NULL(reg))
    return;

  if (IS_NOT_NULL(reg->p))                xfree(reg->p);
  if (IS_NOT_NULL(reg->exact))            xfree(reg->exact);
  if (IS_NOT_NULL(reg->int_map))          xfree(reg->int_map);
  if (IS_NOT_NULL(reg->int_map_backward)) xfree(reg->int_map_backward);
  if (IS_NOT_NULL(reg->repeat_range))     xfree(reg->repeat_range);

  /* the chained regex is a complete object: free it with onig_free() */
  if (IS_NOT_NULL(reg->chain))            onig_free(reg->chain);

#ifdef USE_NAMED_GROUP
  onig_names_free(reg);
#endif
}
/* Release the contents of `reg` and then `reg` itself.  NULL is ignored. */
extern void
onig_free(regex_t* reg)
{
  if (! IS_NOT_NULL(reg))
    return;

  onig_free_body(reg);
  xfree(reg);
}
/*
 * Size in bytes accounted to `reg`: the regex_t itself plus each buffer it
 * currently holds, including (recursively) the chained regex.
 * Returns 0 for a NULL `reg`.
 */
size_t
onig_memsize(const regex_t *reg)
{
  size_t total;

  if (IS_NULL(reg)) return 0;

  total = sizeof(regex_t);
  if (IS_NOT_NULL(reg->p))
    total += reg->alloc;
  if (IS_NOT_NULL(reg->exact))
    total += reg->exact_end - reg->exact;
  if (IS_NOT_NULL(reg->int_map))
    total += sizeof(int) * ONIG_CHAR_TABLE_SIZE;
  if (IS_NOT_NULL(reg->int_map_backward))
    total += sizeof(int) * ONIG_CHAR_TABLE_SIZE;
  if (IS_NOT_NULL(reg->repeat_range))
    total += reg->repeat_range_alloc * sizeof(OnigRepeatRange);
  if (IS_NOT_NULL(reg->chain))
    total += onig_memsize(reg->chain);

  return total;
}
/*
 * Size in bytes accounted to a region: the OnigRegion object plus one
 * beg/end pair for each allocated slot.  Returns 0 for a NULL `regs`.
 */
size_t
onig_region_memsize(const OnigRegion *regs)
{
  size_t per_slot;

  if (IS_NULL(regs)) return 0;

  per_slot = sizeof(*regs->beg) + sizeof(*regs->end);
  return sizeof(*regs) + regs->allocated * per_slot;
}
/*
 * Replace the contents of `to` with those of `from`: mark `to` as being
 * modified, free what it owns, copy the whole regex_t over it and free the
 * `from` object (its buffers now belong to `to`).
 */
#define REGEX_TRANSFER(to,from) do {\
(to)->state = ONIG_STATE_MODIFY;\
onig_free_body(to);\
xmemcpy(to, from, sizeof(regex_t));\
xfree(from);\
} while (0)
/*
 * Move the compiled regex `from` into `to` (see REGEX_TRANSFER), bracketed
 * by THREAD_ATOMIC_START / THREAD_ATOMIC_END.  `from` is freed and must not
 * be used afterwards.
 */
extern void
onig_transfer(regex_t* to, regex_t* from)
{
THREAD_ATOMIC_START;
REGEX_TRANSFER(to, from);
THREAD_ATOMIC_END;
}
/*
 * Advance `reg` (which must be a modifiable lvalue) along its ->chain links
 * until it designates the last regex of the chain.
 */
#define REGEX_CHAIN_HEAD(reg) do {\
while (IS_NOT_NULL((reg)->chain)) {\
(reg) = (reg)->chain;\
}\
} while (0)
/*
 * Append `add` to the end of the chain that starts at `to`, bracketed by
 * THREAD_ATOMIC_START / THREAD_ATOMIC_END.
 */
extern void
onig_chain_link_add(regex_t* to, regex_t* add)
{
  regex_t* tail = to;

  THREAD_ATOMIC_START;
  REGEX_CHAIN_HEAD(tail);
  tail->chain = add;
  THREAD_ATOMIC_END;
}
/*
 * Collapse the chain hanging off `reg`: the last regex of the chain is
 * unlinked and transferred into `reg` itself (REGEX_TRANSFER frees what
 * `reg` owned before, including the rest of the chain).
 * Does nothing when `reg` has no chain.
 */
extern void
onig_chain_reduce(regex_t* reg)
{
  regex_t *last   = reg->chain;
  regex_t *before = reg;

  if (! IS_NOT_NULL(last))
    return;

  reg->state = ONIG_STATE_MODIFY;

  /* walk to the final element, remembering its predecessor */
  for (; IS_NOT_NULL(last->chain); last = last->chain)
    before = last;

  before->chain = (regex_t* )NULL;
  REGEX_TRANSFER(reg, last);
}
#ifdef ONIG_DEBUG_COMPILE
static void print_compiled_byte_code_list P_((FILE* f, regex_t* reg));
#endif
#ifdef ONIG_DEBUG_PARSE_TREE
static void print_tree P_((FILE* f, Node* node));
#endif
/*
 * Compile the pattern `pattern` .. `pattern_end` into `reg`.
 *
 * Steps: parse into a tree, resolve named-group and subexp-call
 * bookkeeping, set up the tree, collect optimization info, emit byte code
 * terminated by OP_END and fix up call addresses.
 *
 * `einfo` may be NULL; when given, einfo->par is cleared on entry and the
 * error details recorded by the scan environment are filled in on failure.
 * `sourcefile` / `sourceline` are stored in the scan environment.
 *
 * Returns 0 on success or a non-zero error code.  On the error paths
 * (labels err_unset / err) the parse tree and the temporary lists are
 * released; buffers already attached to `reg` stay attached.
 */
extern int
onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end,
             OnigErrorInfo* einfo, const char *sourcefile, int sourceline)
{
#define COMPILE_INIT_SIZE 20

  int r;
  OnigDistance init_size;
  /* Start from NULL so the error path never hands an indeterminate pointer
     to onig_node_free(); assumes onig_node_free() accepts NULL, which the
     existing err path already appears to rely on - TODO confirm. */
  Node* root = (Node* )NULL;
  ScanEnv scan_env = {0};
#ifdef USE_SUBEXP_CALL
  UnsetAddrList uslist;
#endif

  if (IS_NOT_NULL(einfo)) einfo->par = (UChar* )NULL;

  scan_env.sourcefile = sourcefile;
  scan_env.sourceline = sourceline;
  reg->state = ONIG_STATE_COMPILING;

#ifdef ONIG_DEBUG
  print_enc_string(stderr, reg->enc, pattern, pattern_end);
#endif

  /* allocate the byte code buffer on first use, otherwise just rewind it */
  if (reg->alloc == 0) {
    init_size = (pattern_end - pattern) * 2;
    if (init_size <= 0) init_size = COMPILE_INIT_SIZE;
    r = BBUF_INIT(reg, init_size);
    if (r != 0) goto end;
  }
  else
    reg->used = 0;

  reg->num_mem            = 0;
  reg->num_repeat         = 0;
  reg->num_null_check     = 0;
  reg->repeat_range_alloc = 0;
  reg->repeat_range       = (OnigRepeatRange* )NULL;
#ifdef USE_COMBINATION_EXPLOSION_CHECK
  reg->num_comb_exp_check = 0;
#endif

  r = onig_parse_make_tree(&root, pattern, pattern_end, reg, &scan_env);
  if (r != 0) goto err;

#ifdef ONIG_DEBUG_PARSE_TREE
# if 0
  fprintf(stderr, "ORIGINAL PARSE TREE:\n");
  print_tree(stderr, root);
# endif
#endif

#ifdef USE_NAMED_GROUP
  /* mixed use named group and no-named group */
  if (scan_env.num_named > 0 &&
      IS_SYNTAX_BV(scan_env.syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
      !ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_CAPTURE_GROUP)) {
    if (scan_env.num_named != scan_env.num_mem)
      r = disable_noname_group_capture(&root, reg, &scan_env);
    else
      r = numbered_ref_check(root);

    if (r != 0) goto err;
  }
#endif

#ifdef USE_SUBEXP_CALL
  if (scan_env.num_call > 0) {
    r = unset_addr_list_init(&uslist, scan_env.num_call);
    if (r != 0) goto err;
    scan_env.unset_addr_list = &uslist;
    r = setup_subexp_call(root, &scan_env);
    if (r != 0) goto err_unset;
    r = subexp_recursive_check_trav(root, &scan_env);
    if (r  < 0) goto err_unset;
    r = subexp_inf_recursive_check_trav(root, &scan_env);
    if (r != 0) goto err_unset;

    reg->num_call = scan_env.num_call;
  }
  else
    reg->num_call = 0;
#endif

  r = setup_tree(root, reg, IN_ROOT, &scan_env);
  if (r != 0) goto err_unset;

#ifdef ONIG_DEBUG_PARSE_TREE
  print_tree(stderr, root);
#endif

  reg->capture_history  = scan_env.capture_history;
  reg->bt_mem_start     = scan_env.bt_mem_start;
  reg->bt_mem_start    |= reg->capture_history;
  if (IS_FIND_CONDITION(reg->options))
    BIT_STATUS_ON_ALL(reg->bt_mem_end);
  else {
    reg->bt_mem_end  = scan_env.bt_mem_end;
    reg->bt_mem_end |= reg->capture_history;
  }

#ifdef USE_COMBINATION_EXPLOSION_CHECK
  if (scan_env.backrefed_mem == 0
#ifdef USE_SUBEXP_CALL
      || scan_env.num_call == 0
#endif
      ) {
    setup_comb_exp_check(root, 0, &scan_env);
#ifdef USE_SUBEXP_CALL
    if (scan_env.has_recursion != 0) {
      scan_env.num_comb_exp_check = 0;
    }
    else
#endif
    if (scan_env.comb_exp_max_regnum > 0) {
      int i;
      for (i = 1; i <= scan_env.comb_exp_max_regnum; i++) {
        if (BIT_STATUS_AT(scan_env.backrefed_mem, i) != 0) {
          scan_env.num_comb_exp_check = 0;
          break;
        }
      }
    }
  }

  reg->num_comb_exp_check = scan_env.num_comb_exp_check;
#endif

  clear_optimize_info(reg);
#ifndef ONIG_DONT_OPTIMIZE
  r = set_optimize_info_from_tree(root, reg, &scan_env);
  if (r != 0) goto err_unset;
#endif

  if (IS_NOT_NULL(scan_env.mem_nodes_dynamic)) {
    xfree(scan_env.mem_nodes_dynamic);
    scan_env.mem_nodes_dynamic = (Node** )NULL;
  }

  r = compile_tree(root, reg);
  if (r == 0) {
    r = add_opcode(reg, OP_END);
    /* Do not let a failure to append OP_END be overwritten by the result of
       unset_addr_list_fix() below, and do not run the success epilogue with
       unterminated byte code.  err_unset releases uslist when it exists. */
    if (r != 0) goto err_unset;
#ifdef USE_SUBEXP_CALL
    if (scan_env.num_call > 0) {
      r = unset_addr_list_fix(&uslist, reg);
      unset_addr_list_end(&uslist);
      if (r) goto err;
    }
#endif

    if ((reg->num_repeat != 0) || (reg->bt_mem_end != 0))
      reg->stack_pop_level = STACK_POP_LEVEL_ALL;
    else {
      if (reg->bt_mem_start != 0)
        reg->stack_pop_level = STACK_POP_LEVEL_MEM_START;
      else
        reg->stack_pop_level = STACK_POP_LEVEL_FREE;
    }
  }
#ifdef USE_SUBEXP_CALL
  else if (scan_env.num_call > 0) {
    unset_addr_list_end(&uslist);
  }
#endif
  onig_node_free(root);

#ifdef ONIG_DEBUG_COMPILE
#ifdef USE_NAMED_GROUP
  onig_print_names(stderr, reg);
#endif
  print_compiled_byte_code_list(stderr, reg);
#endif

 end:
  reg->state = ONIG_STATE_NORMAL;
  return r;

 err_unset:
#ifdef USE_SUBEXP_CALL
  if (scan_env.num_call > 0) {
    unset_addr_list_end(&uslist);
  }
#endif
 err:
  /* hand the error position recorded during parsing to the caller */
  if (IS_NOT_NULL(scan_env.error)) {
    if (IS_NOT_NULL(einfo)) {
      einfo->enc     = scan_env.enc;
      einfo->par     = scan_env.error;
      einfo->par_end = scan_env.error_end;
    }
  }

  onig_node_free(root);
  if (IS_NOT_NULL(scan_env.mem_nodes_dynamic))
      xfree(scan_env.mem_nodes_dynamic);
  return r;
}
#ifdef USE_RECOMPILE_API
/*
 * Compile a new regex from the given pattern and install it in place of
 * `reg`: transferred at once when `reg` is in the normal state, otherwise
 * appended to reg's chain.
 *
 * Returns 0 on success or the error reported by onig_new(), in which case
 * `reg` is left untouched.
 */
extern int
onig_recompile(regex_t* reg, const UChar* pattern, const UChar* pattern_end,
               OnigOptionType option, OnigEncoding enc, OnigSyntaxType* syntax,
               OnigErrorInfo* einfo)
{
  regex_t *fresh;
  int r = onig_new(&fresh, pattern, pattern_end, option, enc, syntax, einfo);

  if (r) return r;

  if (ONIG_STATE(reg) != ONIG_STATE_NORMAL)
    onig_chain_link_add(reg, fresh);
  else
    onig_transfer(reg, fresh);

  return 0;
}
#endif
/* Non-zero once onig_init() has run; reset to 0 by onig_end(). */
static int onig_inited = 0;
/*
 * Initialize the caller-provided regex object `reg` before compilation.
 *
 * Runs onig_init() on first use, validates the arguments, merges the
 * syntax's default options into `option` and resets the fields that the
 * free routines inspect.
 *
 * Returns 0 on success, or ONIGERR_INVALID_ARGUMENT (NULL reg),
 * ONIGERR_DEFAULT_ENCODING_IS_NOT_SET (undefined encoding) or
 * ONIGERR_INVALID_COMBINATION_OF_OPTIONS (both capture-group options set).
 * On error no field of `reg` has been written.
 */
extern int
onig_reg_init(regex_t* reg, OnigOptionType option,
              OnigCaseFoldType case_fold_flag,
              OnigEncoding enc, const OnigSyntaxType* syntax)
{
  int negate_singleline;

  if (! onig_inited)
    onig_init();

  if (IS_NULL(reg))
    return ONIGERR_INVALID_ARGUMENT;

  if (ONIGENC_IS_UNDEF(enc))
    return ONIGERR_DEFAULT_ENCODING_IS_NOT_SET;

  if ((option & (ONIG_OPTION_DONT_CAPTURE_GROUP|ONIG_OPTION_CAPTURE_GROUP))
      == (ONIG_OPTION_DONT_CAPTURE_GROUP|ONIG_OPTION_CAPTURE_GROUP)) {
    return ONIGERR_INVALID_COMBINATION_OF_OPTIONS;
  }

  (reg)->state = ONIG_STATE_MODIFY;

  /* the negate flag is read from the caller's options only, before the
     syntax defaults are merged in */
  negate_singleline = ((option & ONIG_OPTION_NEGATE_SINGLELINE) != 0);
  option |= syntax->options;
  if (negate_singleline)
    option &= ~ONIG_OPTION_SINGLELINE;

  (reg)->enc              = enc;
  (reg)->options          = option;
  (reg)->syntax           = syntax;
  (reg)->optimize         = 0;
  (reg)->exact            = (UChar* )NULL;
  (reg)->int_map          = (int* )NULL;
  (reg)->int_map_backward = (int* )NULL;
  (reg)->chain            = (regex_t* )NULL;

  (reg)->p                = (UChar* )NULL;
  (reg)->alloc            = 0;
  (reg)->used             = 0;
  (reg)->name_table       = (void* )NULL;

  /* onig_free_body() frees repeat_range when it is non-NULL, and
     onig_compile() can fail before it resets the field, so it must not be
     left indeterminate here */
  (reg)->repeat_range       = (OnigRepeatRange* )NULL;
  (reg)->repeat_range_alloc = 0;

  (reg)->case_fold_flag   = case_fold_flag;
  return 0;
}
/*
 * Initialize and compile into a regex object supplied by the caller
 * (no allocation of the regex_t itself).  Uses the default case fold flag.
 * Returns 0 on success or the error of onig_reg_init() / onig_compile().
 */
extern int
onig_new_without_alloc(regex_t* reg, const UChar* pattern,
          const UChar* pattern_end, OnigOptionType option, OnigEncoding enc,
          OnigSyntaxType* syntax, OnigErrorInfo* einfo)
{
  int r = onig_reg_init(reg, option, ONIGENC_CASE_FOLD_DEFAULT, enc, syntax);

  if (r) return r;

  return onig_compile(reg, pattern, pattern_end, einfo, NULL, 0);
}
/*
 * Allocate, initialize and compile a new regex object.
 *
 * On success returns 0 and stores the new object in *reg.  On failure
 * returns ONIGERR_MEMORY (allocation failed, *reg is NULL) or the error of
 * onig_reg_init() / onig_compile(); the object is released and *reg is set
 * to NULL.
 */
extern int
onig_new(regex_t** reg, const UChar* pattern, const UChar* pattern_end,
         OnigOptionType option, OnigEncoding enc, const OnigSyntaxType* syntax,
         OnigErrorInfo* einfo)
{
  int r;

  *reg = (regex_t* )xmalloc(sizeof(regex_t));
  if (IS_NULL(*reg)) return ONIGERR_MEMORY;

  r = onig_reg_init(*reg, option, ONIGENC_CASE_FOLD_DEFAULT, enc, syntax);
  if (r != 0) {
    /* onig_reg_init() returns its errors before writing any field, so the
       block still holds indeterminate bytes: onig_free() would test and
       free garbage pointers.  Release only the raw allocation. */
    xfree(*reg);
    *reg = NULL;
    return r;
  }

  r = onig_compile(*reg, pattern, pattern_end, einfo, NULL, 0);
  if (r != 0) {
    onig_free(*reg);
    *reg = NULL;
  }
  return r;
}
/*
 * One-time library initialization: sets the onig_inited flag and
 * initializes the encoding layer (plus statistics when
 * ONIG_DEBUG_STATISTICS is defined).  Later calls do nothing.
 * Always returns 0.
 */
extern int
onig_init(void)
{
  if (onig_inited == 0) {
    THREAD_SYSTEM_INIT;
    THREAD_ATOMIC_START;

    onig_inited = 1;

    onigenc_init();

#ifdef ONIG_DEBUG_STATISTICS
    onig_statistics_init();
#endif

    THREAD_ATOMIC_END;
  }
  return 0;
}
/* Head of the singly linked list of callbacks registered with
   onig_add_end_call(); the list is consumed by exec_end_call_list(). */
static OnigEndCallListItemType* EndCallTop;
/*
 * Register `func` to be called by exec_end_call_list().  The new entry is
 * pushed at the head of the list.  If the list item cannot be allocated
 * the request is silently dropped.
 */
extern void onig_add_end_call(void (*func)(void))
{
  OnigEndCallListItemType* node;

  node = (OnigEndCallListItemType* )xmalloc(sizeof(*node));
  if (node != 0) {
    node->func = func;
    node->next = EndCallTop;
    EndCallTop = node;
  }
}
static void
exec_end_call_list(void)
{
OnigEndCallListItemType* prev;
void (*func)(void);
while (EndCallTop != 0) {
func = EndCallTop->func;
(*func)();
prev = EndCallTop;
EndCallTop = EndCallTop->next;
xfree(prev);
}
}
/*
 * Library shutdown: run the registered end callbacks, perform the optional
 * clean-ups selected at build time and clear the onig_inited flag so that
 * onig_init() runs again on next use.  Always returns 0.
 */
extern int
onig_end(void)
{
THREAD_ATOMIC_START;
/* callbacks registered through onig_add_end_call() */
exec_end_call_list();
#ifdef ONIG_DEBUG_STATISTICS
onig_print_statistics(stderr);
#endif
#ifdef USE_SHARED_CCLASS_TABLE
onig_free_shared_cclass_table();
#endif
#ifdef USE_PARSE_TREE_NODE_RECYCLE
onig_free_node_list();
#endif
onig_inited = 0;
THREAD_ATOMIC_END;
THREAD_SYSTEM_END;
return 0;
}
/*
 * Binary search for `code` in the code range table at `p`.
 *
 * As read here, the table is a count n (fetched with GET_CODE_POINT)
 * followed by n pairs of OnigCodePoint values, each pair being the low and
 * high end of a range; the search relies on the pairs being in ascending
 * order.  Returns 1 when `code` lies inside one of the ranges, else 0.
 */
extern int
onig_is_in_code_range(const UChar* p, OnigCodePoint code)
{
  OnigCodePoint count, *ranges;
  OnigCodePoint lo, hi, mid;

  GET_CODE_POINT(count, p);
  ranges = (OnigCodePoint* )p;
  ranges++;   /* step over the count */

  /* find the first range whose upper end is >= code */
  lo = 0;
  hi = count;
  while (lo < hi) {
    mid = (lo + hi) >> 1;
    if (code > ranges[mid * 2 + 1])
      lo = mid + 1;
    else
      hi = mid;
  }

  if (lo < count && code >= ranges[lo * 2])
    return 1;
  return 0;
}
/*
 * Test whether `code`, whose encoded length is `elen`, belongs to the
 * character class `cc`.  Single-byte codes below SINGLE_BYTE_SIZE are
 * looked up in the bitset, all others in the multibyte range buffer (if
 * any).  The result is inverted for a negated class.
 */
extern int
onig_is_code_in_cc_len(int elen, OnigCodePoint code, CClassNode* cc)
{
  int found;

  if (elen <= 1 && code < SINGLE_BYTE_SIZE) {
    found = (BITSET_AT(cc->bs, code) == 0 ? 0 : 1);
  }
  else if (IS_NULL(cc->mbuf)) {
    found = 0;
  }
  else {
    found = (onig_is_in_code_range(cc->mbuf->p, code) != 0 ? 1 : 0);
  }

  return IS_NCCLASS_NOT(cc) ? !found : found;
}
/*
 * Test whether `code` belongs to the character class `cc` under encoding
 * `enc`.  The encoded length handed to onig_is_code_in_cc_len() is 2 for
 * encodings whose minimum character length exceeds 1, otherwise the length
 * the encoding reports for `code`.
 */
extern int
onig_is_code_in_cc(OnigEncoding enc, OnigCodePoint code, CClassNode* cc)
{
  int len = (ONIGENC_MBC_MINLEN(enc) > 1)
              ? 2
              : ONIGENC_CODE_TO_MBCLEN(enc, code);

  return onig_is_code_in_cc_len(len, code, cc);
}
#ifdef ONIG_DEBUG
/* arguments type */
/* Operand kinds used in the third column of OnigOpInfo below.  The byte
   code printer decodes every kind except ARG_SPECIAL generically;
   ARG_SPECIAL opcodes are decoded opcode by opcode. */
#define ARG_SPECIAL -1
#define ARG_NON 0
#define ARG_RELADDR 1
#define ARG_ABSADDR 2
#define ARG_LENGTH 3
#define ARG_MEMNUM 4
#define ARG_OPTION 5
#define ARG_STATE_CHECK 6
/* Debug table: one { opcode, printable name, operand kind } entry per
   opcode.  The entry with opcode -1 terminates the table; op2name() and
   op2arg_type() stop their scan there. */
OnigOpInfoType OnigOpInfo[] = {
{ OP_FINISH, "finish", ARG_NON },
{ OP_END, "end", ARG_NON },
{ OP_EXACT1, "exact1", ARG_SPECIAL },
{ OP_EXACT2, "exact2", ARG_SPECIAL },
{ OP_EXACT3, "exact3", ARG_SPECIAL },
{ OP_EXACT4, "exact4", ARG_SPECIAL },
{ OP_EXACT5, "exact5", ARG_SPECIAL },
{ OP_EXACTN, "exactn", ARG_SPECIAL },
{ OP_EXACTMB2N1, "exactmb2-n1", ARG_SPECIAL },
{ OP_EXACTMB2N2, "exactmb2-n2", ARG_SPECIAL },
{ OP_EXACTMB2N3, "exactmb2-n3", ARG_SPECIAL },
{ OP_EXACTMB2N, "exactmb2-n", ARG_SPECIAL },
{ OP_EXACTMB3N, "exactmb3n" , ARG_SPECIAL },
{ OP_EXACTMBN, "exactmbn", ARG_SPECIAL },
{ OP_EXACT1_IC, "exact1-ic", ARG_SPECIAL },
{ OP_EXACTN_IC, "exactn-ic", ARG_SPECIAL },
{ OP_CCLASS, "cclass", ARG_SPECIAL },
{ OP_CCLASS_MB, "cclass-mb", ARG_SPECIAL },
{ OP_CCLASS_MIX, "cclass-mix", ARG_SPECIAL },
{ OP_CCLASS_NOT, "cclass-not", ARG_SPECIAL },
{ OP_CCLASS_MB_NOT, "cclass-mb-not", ARG_SPECIAL },
{ OP_CCLASS_MIX_NOT, "cclass-mix-not", ARG_SPECIAL },
{ OP_CCLASS_NODE, "cclass-node", ARG_SPECIAL },
{ OP_ANYCHAR, "anychar", ARG_NON },
{ OP_ANYCHAR_ML, "anychar-ml", ARG_NON },
{ OP_ANYCHAR_STAR, "anychar*", ARG_NON },
{ OP_ANYCHAR_ML_STAR, "anychar-ml*", ARG_NON },
{ OP_ANYCHAR_STAR_PEEK_NEXT, "anychar*-peek-next", ARG_SPECIAL },
{ OP_ANYCHAR_ML_STAR_PEEK_NEXT, "anychar-ml*-peek-next", ARG_SPECIAL },
{ OP_WORD, "word", ARG_NON },
{ OP_NOT_WORD, "not-word", ARG_NON },
{ OP_WORD_BOUND, "word-bound", ARG_NON },
{ OP_NOT_WORD_BOUND, "not-word-bound", ARG_NON },
{ OP_WORD_BEGIN, "word-begin", ARG_NON },
{ OP_WORD_END, "word-end", ARG_NON },
{ OP_ASCII_WORD, "ascii-word", ARG_NON },
{ OP_NOT_ASCII_WORD, "not-ascii-word", ARG_NON },
{ OP_ASCII_WORD_BOUND, "ascii-word-bound", ARG_NON },
{ OP_NOT_ASCII_WORD_BOUND,"not-ascii-word-bound", ARG_NON },
{ OP_ASCII_WORD_BEGIN, "ascii-word-begin", ARG_NON },
{ OP_ASCII_WORD_END, "ascii-word-end", ARG_NON },
{ OP_BEGIN_BUF, "begin-buf", ARG_NON },
{ OP_END_BUF, "end-buf", ARG_NON },
{ OP_BEGIN_LINE, "begin-line", ARG_NON },
{ OP_END_LINE, "end-line", ARG_NON },
{ OP_SEMI_END_BUF, "semi-end-buf", ARG_NON },
{ OP_BEGIN_POSITION, "begin-position", ARG_NON },
{ OP_BEGIN_POS_OR_LINE, "begin-pos-or-line", ARG_NON },
{ OP_BACKREF1, "backref1", ARG_NON },
{ OP_BACKREF2, "backref2", ARG_NON },
{ OP_BACKREFN, "backrefn", ARG_MEMNUM },
{ OP_BACKREFN_IC, "backrefn-ic", ARG_SPECIAL },
{ OP_BACKREF_MULTI, "backref_multi", ARG_SPECIAL },
{ OP_BACKREF_MULTI_IC, "backref_multi-ic", ARG_SPECIAL },
{ OP_BACKREF_WITH_LEVEL, "backref_at_level", ARG_SPECIAL },
{ OP_MEMORY_START_PUSH, "mem-start-push", ARG_MEMNUM },
{ OP_MEMORY_START, "mem-start", ARG_MEMNUM },
{ OP_MEMORY_END_PUSH, "mem-end-push", ARG_MEMNUM },
{ OP_MEMORY_END_PUSH_REC, "mem-end-push-rec", ARG_MEMNUM },
{ OP_MEMORY_END, "mem-end", ARG_MEMNUM },
{ OP_MEMORY_END_REC, "mem-end-rec", ARG_MEMNUM },
{ OP_SET_OPTION_PUSH, "set-option-push", ARG_OPTION },
{ OP_SET_OPTION, "set-option", ARG_OPTION },
{ OP_KEEP, "keep", ARG_NON },
{ OP_FAIL, "fail", ARG_NON },
{ OP_JUMP, "jump", ARG_RELADDR },
{ OP_PUSH, "push", ARG_RELADDR },
{ OP_POP, "pop", ARG_NON },
{ OP_PUSH_OR_JUMP_EXACT1, "push-or-jump-e1", ARG_SPECIAL },
{ OP_PUSH_IF_PEEK_NEXT, "push-if-peek-next", ARG_SPECIAL },
{ OP_REPEAT, "repeat", ARG_SPECIAL },
{ OP_REPEAT_NG, "repeat-ng", ARG_SPECIAL },
{ OP_REPEAT_INC, "repeat-inc", ARG_MEMNUM },
{ OP_REPEAT_INC_NG, "repeat-inc-ng", ARG_MEMNUM },
{ OP_REPEAT_INC_SG, "repeat-inc-sg", ARG_MEMNUM },
{ OP_REPEAT_INC_NG_SG, "repeat-inc-ng-sg", ARG_MEMNUM },
{ OP_NULL_CHECK_START, "null-check-start", ARG_MEMNUM },
{ OP_NULL_CHECK_END, "null-check-end", ARG_MEMNUM },
{ OP_NULL_CHECK_END_MEMST,"null-check-end-memst", ARG_MEMNUM },
{ OP_NULL_CHECK_END_MEMST_PUSH,"null-check-end-memst-push", ARG_MEMNUM },
{ OP_PUSH_POS, "push-pos", ARG_NON },
{ OP_POP_POS, "pop-pos", ARG_NON },
{ OP_PUSH_POS_NOT, "push-pos-not", ARG_RELADDR },
{ OP_FAIL_POS, "fail-pos", ARG_NON },
{ OP_PUSH_STOP_BT, "push-stop-bt", ARG_NON },
{ OP_POP_STOP_BT, "pop-stop-bt", ARG_NON },
{ OP_LOOK_BEHIND, "look-behind", ARG_SPECIAL },
{ OP_PUSH_LOOK_BEHIND_NOT, "push-look-behind-not", ARG_SPECIAL },
{ OP_FAIL_LOOK_BEHIND_NOT, "fail-look-behind-not", ARG_NON },
{ OP_CALL, "call", ARG_ABSADDR },
{ OP_RETURN, "return", ARG_NON },
{ OP_CONDITION, "condition", ARG_SPECIAL },
{ OP_STATE_CHECK_PUSH, "state-check-push", ARG_SPECIAL },
{ OP_STATE_CHECK_PUSH_OR_JUMP, "state-check-push-or-jump", ARG_SPECIAL },
{ OP_STATE_CHECK, "state-check", ARG_STATE_CHECK },
{ OP_STATE_CHECK_ANYCHAR_STAR, "state-check-anychar*", ARG_STATE_CHECK },
{ OP_STATE_CHECK_ANYCHAR_ML_STAR,
"state-check-anychar-ml*", ARG_STATE_CHECK },
{ -1, "", ARG_NON }
};
/*
 * Map an opcode to its printable mnemonic by scanning OnigOpInfo up to
 * the terminating entry (the one whose opcode is negative).
 * Returns "" when the opcode is not listed.
 */
static const char*
op2name(int opcode)
{
  int idx = 0;

  while (OnigOpInfo[idx].opcode >= 0) {
    if (OnigOpInfo[idx].opcode == opcode) {
      return OnigOpInfo[idx].name;
    }
    idx++;
  }
  return "";
}
/*
 * Map an opcode to the ARG_* operand kind recorded for it in OnigOpInfo.
 * Opcodes that are not listed are reported as ARG_SPECIAL.
 */
static int
op2arg_type(int opcode)
{
  int idx = 0;

  while (OnigOpInfo[idx].opcode >= 0) {
    if (OnigOpInfo[idx].opcode == opcode) {
      return OnigOpInfo[idx].arg_type;
    }
    idx++;
  }
  return ARG_SPECIAL;
}
#ifdef ONIG_DEBUG_PARSE_TREE
/*
 * Write `indent` space characters to f.
 * Nothing is written when indent is zero or negative.
 */
static void
Indent(FILE* f, int indent)
{
  int remaining = indent;

  while (remaining > 0) {
    putc(' ', f);
    remaining--;
  }
}
#endif /* ONIG_DEBUG_PARSE_TREE */
/*
 * Write ":" followed by the first `len` bytes at s to f.
 * The bytes are emitted raw; a non-positive len prints only the colon.
 */
static void
p_string(FILE* f, ptrdiff_t len, UChar* s)
{
  ptrdiff_t k;

  fputs(":", f);
  for (k = 0; k < len; k++) {
    fputc(s[k], f);
  }
}
/*
 * Write ":<len>:" to f, followed by len * mb_len raw bytes taken from s.
 */
static void
p_len_string(FILE* f, LengthType len, int mb_len, UChar* s)
{
  int total = len * mb_len;
  int k;

  fprintf(f, ":%d:", len);
  for (k = 0; k < total; k++) {
    fputc(s[k], f);
  }
}
/*
 * Print one compiled bytecode instruction to f as "[name<operands>]".
 *
 *   bp     points at the opcode byte of the instruction to print.
 *   bpend  end of the code buffer; only handed to enclen() when decoding
 *          OP_EXACT1_IC.
 *   nextp  if not NULL, receives the address just past what was decoded.
 *   enc    encoding; only used to size the operand of OP_EXACT1_IC.
 *
 * Opcodes whose operand is one of the simple ARG_* kinds (as reported by
 * op2arg_type()) are decoded generically; ARG_SPECIAL opcodes are decoded
 * case by case.  An opcode that is not recognised is reported on stderr.
 *
 * Operands are fetched with the GET_*_INC accessors rather than by casting
 * bp to a wider pointer type, because operands follow a one-byte opcode and
 * are therefore not necessarily aligned for their type.
 */
extern void
onig_print_compiled_byte_code(FILE* f, UChar* bp, UChar* bpend, UChar** nextp,
                              OnigEncoding enc)
{
  int i, n, arg_type;
  RelAddrType addr;
  LengthType len;
  MemNumType mem;
  StateCheckNumType scn;
  OnigCodePoint code;
  UChar *q;

  fprintf(f, "[%s", op2name(*bp));
  arg_type = op2arg_type(*bp);
  if (arg_type != ARG_SPECIAL) {
    bp++;
    switch (arg_type) {
    case ARG_NON:
      break;
    case ARG_RELADDR:
      GET_RELADDR_INC(addr, bp);
      /* "%+d" always prints exactly one sign, so a backward offset is
         shown as "(-N)" instead of "(+-N)". */
      fprintf(f, ":(%+d)", addr);
      break;
    case ARG_ABSADDR:
      GET_ABSADDR_INC(addr, bp);
      fprintf(f, ":(%d)", addr);
      break;
    case ARG_LENGTH:
      GET_LENGTH_INC(len, bp);
      fprintf(f, ":%d", len);
      break;
    case ARG_MEMNUM:
      GET_MEMNUM_INC(mem, bp);
      fprintf(f, ":%d", mem);
      break;
    case ARG_OPTION:
      {
        OnigOptionType option;

        GET_OPTION_INC(option, bp);
        fprintf(f, ":%d", option);
      }
      break;
    case ARG_STATE_CHECK:
      /* NOTE(review): still a direct typed read of a possibly unaligned
         operand; switch to the StateCheckNumType accessor if the project
         provides one. */
      scn = *((StateCheckNumType* )bp);
      bp += SIZE_STATE_CHECK_NUM;
      fprintf(f, ":%d", scn);
      break;
    }
  }
  else {
    switch (*bp++) {
    case OP_EXACT1:
    case OP_ANYCHAR_STAR_PEEK_NEXT:
    case OP_ANYCHAR_ML_STAR_PEEK_NEXT:
      p_string(f, 1, bp++); break;
    case OP_EXACT2:
      p_string(f, 2, bp); bp += 2; break;
    case OP_EXACT3:
      p_string(f, 3, bp); bp += 3; break;
    case OP_EXACT4:
      p_string(f, 4, bp); bp += 4; break;
    case OP_EXACT5:
      p_string(f, 5, bp); bp += 5; break;
    case OP_EXACTN:
      GET_LENGTH_INC(len, bp);
      p_len_string(f, len, 1, bp);
      bp += len;
      break;

    case OP_EXACTMB2N1:
      p_string(f, 2, bp); bp += 2; break;
    case OP_EXACTMB2N2:
      p_string(f, 4, bp); bp += 4; break;
    case OP_EXACTMB2N3:
      p_string(f, 6, bp); bp += 6; break;
    case OP_EXACTMB2N:
      GET_LENGTH_INC(len, bp);
      p_len_string(f, len, 2, bp);
      bp += len * 2;
      break;
    case OP_EXACTMB3N:
      GET_LENGTH_INC(len, bp);
      p_len_string(f, len, 3, bp);
      bp += len * 3;
      break;
    case OP_EXACTMBN:
      {
        int mb_len;

        GET_LENGTH_INC(mb_len, bp);
        GET_LENGTH_INC(len, bp);
        fprintf(f, ":%d:%d:", mb_len, len);
        n = len * mb_len;
        while (n-- > 0) { fputc(*bp++, f); }
      }
      break;

    case OP_EXACT1_IC:
      /* the operand is a single character; enclen() gives its byte length */
      len = enclen(enc, bp, bpend);
      p_string(f, len, bp);
      bp += len;
      break;
    case OP_EXACTN_IC:
      GET_LENGTH_INC(len, bp);
      p_len_string(f, len, 1, bp);
      bp += len;
      break;

    case OP_CCLASS:
      n = bitset_on_num((BitSetRef )bp);
      bp += SIZE_BITSET;
      fprintf(f, ":%d", n);
      break;

    case OP_CCLASS_NOT:
      n = bitset_on_num((BitSetRef )bp);
      bp += SIZE_BITSET;
      fprintf(f, ":%d", n);
      break;

    case OP_CCLASS_MB:
    case OP_CCLASS_MB_NOT:
      GET_LENGTH_INC(len, bp);
      q = bp;
#ifndef PLATFORM_UNALIGNED_WORD_ACCESS
      ALIGNMENT_RIGHT(q);
#endif
      GET_CODE_POINT(code, q);
      bp += len;
      fprintf(f, ":%d:%d", (int )code, len);
      break;

    case OP_CCLASS_MIX:
    case OP_CCLASS_MIX_NOT:
      n = bitset_on_num((BitSetRef )bp);
      bp += SIZE_BITSET;
      GET_LENGTH_INC(len, bp);
      q = bp;
#ifndef PLATFORM_UNALIGNED_WORD_ACCESS
      ALIGNMENT_RIGHT(q);
#endif
      GET_CODE_POINT(code, q);
      bp += len;
      fprintf(f, ":%d:%d:%d", n, (int )code, len);
      break;

    case OP_CCLASS_NODE:
      {
        CClassNode *cc;

        GET_POINTER_INC(cc, bp);
        n = bitset_on_num(cc->bs);
        fprintf(f, ":%"PRIuPTR":%d", (uintptr_t )cc, n);
      }
      break;

    case OP_BACKREFN_IC:
      GET_MEMNUM_INC(mem, bp);
      fprintf(f, ":%d", mem);
      break;

    case OP_BACKREF_MULTI_IC:
    case OP_BACKREF_MULTI:
      fputs(" ", f);
      GET_LENGTH_INC(len, bp);
      for (i = 0; i < len; i++) {
        GET_MEMNUM_INC(mem, bp);
        if (i > 0) fputs(", ", f);
        fprintf(f, "%d", mem);
      }
      break;

    case OP_BACKREF_WITH_LEVEL:
      {
        OnigOptionType option;
        LengthType level;

        GET_OPTION_INC(option, bp);
        fprintf(f, ":%d", option);
        GET_LENGTH_INC(level, bp);
        fprintf(f, ":%d", level);

        fputs(" ", f);
        GET_LENGTH_INC(len, bp);
        for (i = 0; i < len; i++) {
          GET_MEMNUM_INC(mem, bp);
          if (i > 0) fputs(", ", f);
          fprintf(f, "%d", mem);
        }
      }
      break;

    case OP_REPEAT:
    case OP_REPEAT_NG:
      {
        GET_MEMNUM_INC(mem, bp);
        GET_RELADDR_INC(addr, bp);
        fprintf(f, ":%d:%d", mem, addr);
      }
      break;

    case OP_PUSH_OR_JUMP_EXACT1:
    case OP_PUSH_IF_PEEK_NEXT:
      GET_RELADDR_INC(addr, bp);
      fprintf(f, ":(%d)", addr);
      p_string(f, 1, bp);
      bp += 1;
      break;

    case OP_LOOK_BEHIND:
      GET_LENGTH_INC(len, bp);
      fprintf(f, ":%d", len);
      break;

    case OP_PUSH_LOOK_BEHIND_NOT:
      GET_RELADDR_INC(addr, bp);
      GET_LENGTH_INC(len, bp);
      fprintf(f, ":%d:(%d)", len, addr);
      break;

    case OP_STATE_CHECK_PUSH:
    case OP_STATE_CHECK_PUSH_OR_JUMP:
      /* NOTE(review): scn is still read through a pointer cast, see the
         ARG_STATE_CHECK case above. */
      scn = *((StateCheckNumType* )bp);
      bp += SIZE_STATE_CHECK_NUM;
      GET_RELADDR_INC(addr, bp);
      fprintf(f, ":%d:(%d)", scn, addr);
      break;

    case OP_CONDITION:
      GET_MEMNUM_INC(mem, bp);
      GET_RELADDR_INC(addr, bp);
      fprintf(f, ":%d:(%d)", mem, addr);
      break;

    default:
      /* Report the opcode byte that the switch just consumed, but leave
         bp past it: rewinding here would hand the caller the same address
         back through nextp, and a loop driven by nextp would never
         advance. */
      fprintf(stderr, "onig_print_compiled_byte_code: undefined code %d\n",
              bp[-1]);
    }
  }
  fputs("]", f);
  if (nextp) *nextp = bp;
}
#ifdef ONIG_DEBUG_COMPILE
/*
 * Dump the whole compiled program of reg to f.
 *
 * Prints the code length first, then every instruction in the range
 * [reg->p, reg->p + reg->used) via onig_print_compiled_byte_code(), each
 * one prefixed with its byte offset from reg->p.  A new output line is
 * started before every fifth instruction.
 */
static void
print_compiled_byte_code_list(FILE* f, regex_t* reg)
{
  int ncode;
  UChar* bp = reg->p;
  UChar* end = reg->p + reg->used;

  fprintf(f, "code length: %d", reg->used);

  ncode = -1;
  while (bp < end) {
    /* A pointer difference has type ptrdiff_t, which is not `long` on
       every platform; convert explicitly so the argument matches %ld. */
    long offset = (long )(bp - reg->p);

    ncode++;
    if (ncode % 5 == 0)
      fprintf(f, "\n%ld:", offset);
    else
      fprintf(f, " %ld:", offset);
    /* the callee advances bp to the next instruction through &bp */
    onig_print_compiled_byte_code(f, bp, end, &bp, reg->enc);
  }

  fprintf(f, "\n");
}
#endif /* ONIG_DEBUG_COMPILE */
#ifdef ONIG_DEBUG_PARSE_TREE
/*
 * Recursively print the parse tree rooted at `node` to f.
 *
 * f      - output stream; flushed before returning.
 * node   - tree node to print.  A NULL node is reported as an error and
 *          the process is terminated with exit(0).
 * indent - number of spaces written before this node's line; children are
 *          printed with indent + 3.
 *
 * Each node is printed as "<kind:address>" followed by kind-specific
 * details.  List, alternation, quantifier and enclose nodes end their own
 * line and then print their children; every other kind gets its newline
 * from the common tail at the bottom of the function.
 */
void
print_indent_tree(FILE* f, Node* node, int indent)
{
/* container_p is set for anchor kinds whose target subtree must be printed */
int i, type, container_p = 0;
/* extra indentation applied to each nesting level */
int add = 3;
UChar* p;
Indent(f, indent);
if (IS_NULL(node)) {
fprintf(f, "ERROR: null node!!!\n");
exit (0);
}
type = NTYPE(node);
switch (type) {
case NT_LIST:
case NT_ALT:
if (NTYPE(node) == NT_LIST)
fprintf(f, "<list:%"PRIxPTR">\n", (intptr_t )node);
else
fprintf(f, "<alt:%"PRIxPTR">\n", (intptr_t )node);
/* print the first element, then follow the cdr chain for the rest */
print_indent_tree(f, NCAR(node), indent + add);
while (IS_NOT_NULL(node = NCDR(node))) {
/* every cell of the chain must have the same type as the head */
if (NTYPE(node) != type) {
fprintf(f, "ERROR: list/alt right is not a cons. %d\n", NTYPE(node));
exit(0);
}
print_indent_tree(f, NCAR(node), indent + add);
}
break;
case NT_STR:
fprintf(f, "<string%s:%"PRIxPTR">",
(NSTRING_IS_RAW(node) ? "-raw" : ""), (intptr_t )node);
/* bytes in 0x20..0x7e are printed as-is, all others as " 0xNN" */
for (p = NSTR(node)->s; p < NSTR(node)->end; p++) {
if (*p >= 0x20 && *p < 0x7f)
fputc(*p, f);
else {
fprintf(f, " 0x%02x", *p);
}
}
break;
case NT_CCLASS:
fprintf(f, "<cclass:%"PRIxPTR">", (intptr_t )node);
if (IS_NCCLASS_NOT(NCCLASS(node))) fputs("not ", f);
if (NCCLASS(node)->mbuf) {
BBuf* bbuf = NCCLASS(node)->mbuf;
OnigCodePoint* data = (OnigCodePoint*)bbuf->p;
OnigCodePoint* end = (OnigCodePoint*)(bbuf->p + bbuf->used);
/* the first code point is printed in decimal (presumably the number of
   ranges -- TODO confirm), the rest are consumed as pairs and printed
   as hexadecimal "from-to" ranges */
fprintf(f, "%d", *data++);
for (; data < end; data+=2) {
fprintf(f, ",");
fprintf(f, "%04x-%04x", data[0], data[1]);
}
}
break;
case NT_CTYPE:
fprintf(f, "<ctype:%"PRIxPTR"> ", (intptr_t )node);
/* only the word ctype is handled; anything else terminates the process */
switch (NCTYPE(node)->ctype) {
case ONIGENC_CTYPE_WORD:
if (NCTYPE(node)->not != 0)
fputs("not word", f);
else
fputs("word", f);
break;
default:
fprintf(f, "ERROR: undefined ctype.\n");
exit(0);
}
break;
case NT_CANY:
fprintf(f, "<anychar:%"PRIxPTR">", (intptr_t )node);
break;
case NT_ANCHOR:
fprintf(f, "<anchor:%"PRIxPTR"> ", (intptr_t )node);
/* the four look-ahead / look-behind kinds set container_p so that their
   target is printed after the switch; the other kinds only print a label */
switch (NANCHOR(node)->type) {
case ANCHOR_BEGIN_BUF: fputs("begin buf", f); break;
case ANCHOR_END_BUF: fputs("end buf", f); break;
case ANCHOR_BEGIN_LINE: fputs("begin line", f); break;
case ANCHOR_END_LINE: fputs("end line", f); break;
case ANCHOR_SEMI_END_BUF: fputs("semi end buf", f); break;
case ANCHOR_BEGIN_POSITION: fputs("begin position", f); break;
case ANCHOR_ANYCHAR_STAR: fputs("begin position/line", f); break;
case ANCHOR_WORD_BOUND: fputs("word bound", f); break;
case ANCHOR_NOT_WORD_BOUND: fputs("not word bound", f); break;
#ifdef USE_WORD_BEGIN_END
case ANCHOR_WORD_BEGIN: fputs("word begin", f); break;
case ANCHOR_WORD_END: fputs("word end", f); break;
#endif
case ANCHOR_PREC_READ: fputs("prec read", f); container_p = TRUE; break;
case ANCHOR_PREC_READ_NOT: fputs("prec read not", f); container_p = TRUE; break;
case ANCHOR_LOOK_BEHIND: fputs("look_behind", f); container_p = TRUE; break;
case ANCHOR_LOOK_BEHIND_NOT: fputs("look_behind_not",f); container_p = TRUE; break;
case ANCHOR_KEEP: fputs("keep",f); break;
default:
/* unlike the other error paths this one does not exit */
fprintf(f, "ERROR: undefined anchor type.\n");
break;
}
break;
case NT_BREF:
{
/* note: this `p` shadows the function-level UChar* p inside this block */
int* p;
BRefNode* br = NBREF(node);
p = BACKREFS_P(br);
fprintf(f, "<backref:%"PRIxPTR">", (intptr_t )node);
/* comma-separated list of the back_num entries */
for (i = 0; i < br->back_num; i++) {
if (i > 0) fputs(", ", f);
fprintf(f, "%d", p[i]);
}
}
break;
#ifdef USE_SUBEXP_CALL
case NT_CALL:
{
CallNode* cn = NCALL(node);
fprintf(f, "<call:%"PRIxPTR">", (intptr_t )node);
/* prints ":" followed by the bytes between cn->name and cn->name_end */
p_string(f, cn->name_end - cn->name, cn->name);
}
break;
#endif
case NT_QTFR:
/* "{lower,upper}", followed by "?" when the quantifier is not greedy */
fprintf(f, "<quantifier:%"PRIxPTR">{%d,%d}%s\n", (intptr_t )node,
NQTFR(node)->lower, NQTFR(node)->upper,
(NQTFR(node)->greedy ? "" : "?"));
print_indent_tree(f, NQTFR(node)->target, indent + add);
break;
case NT_ENCLOSE:
fprintf(f, "<enclose:%"PRIxPTR"> ", (intptr_t )node);
/* unknown enclose types print no label but the target is still printed */
switch (NENCLOSE(node)->type) {
case ENCLOSE_OPTION:
fprintf(f, "option:%d", NENCLOSE(node)->option);
break;
case ENCLOSE_MEMORY:
fprintf(f, "memory:%d", NENCLOSE(node)->regnum);
break;
case ENCLOSE_STOP_BACKTRACK:
fprintf(f, "stop-bt");
break;
case ENCLOSE_CONDITION:
fprintf(f, "condition:%d", NENCLOSE(node)->regnum);
break;
default:
break;
}
fprintf(f, "\n");
print_indent_tree(f, NENCLOSE(node)->target, indent + add);
break;
default:
fprintf(f, "print_indent_tree: undefined node type %d\n", NTYPE(node));
break;
}
/* list, alt, quantifier and enclose nodes already ended their line above */
if (type != NT_LIST && type != NT_ALT && type != NT_QTFR &&
type != NT_ENCLOSE)
fprintf(f, "\n");
/* anchors flagged as containers print their target one level deeper */
if (container_p) print_indent_tree(f, NANCHOR(node)->target, indent + add);
fflush(f);
}
/*
 * Print the parse tree rooted at node to f, with the root at column 0.
 */
static void
print_tree(FILE* f, Node* node)
{
  const int root_indent = 0;

  print_indent_tree(f, node, root_indent);
}
#endif /* ONIG_DEBUG_PARSE_TREE */
#endif /* ONIG_DEBUG */