sortix--sortix/libc/regex/regcomp.cpp

/*******************************************************************************

    Copyright(C) Jonas 'Sortie' Termansen 2014, 2015.

    This file is part of the Sortix C Library.

    The Sortix C Library is free software: you can redistribute it and/or modify
    it under the terms of the GNU Lesser General Public License as published by
    the Free Software Foundation, either version 3 of the License, or (at your
    option) any later version.

    The Sortix C Library is distributed in the hope that it will be useful, but
    WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
    License for more details.

    You should have received a copy of the GNU Lesser General Public License
    along with the Sortix C Library. If not, see <http://www.gnu.org/licenses/>.

    regex/regcomp.cpp
    Regular expression compiler.

*******************************************************************************/

#include <assert.h>
#include <errno.h>
#include <inttypes.h>
#include <limits.h>
#include <regex.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

// Parser position saved when a subexpression is opened, so that re_parse can
// resume the enclosing expression when the subexpression is closed. The records
// form a stack linked through next, with the innermost open subexpression first.
struct re_parse_subexpr
{
	// Record of the enclosing open subexpression, or NULL if there is none.
	struct re_parse_subexpr* next;
	// Slot in which the subexpression node itself was stored.
	struct re** prev_next_ptr;
	// The primary_next_ptr that was in effect when the subexpression was opened.
	struct re** primary_next_ptr;
};

// Parser state that regcomp sets up and re_parse updates.
struct re_parse
{
	// Stack of currently open subexpressions, innermost first.
	struct re_parse_subexpr* subexpr;
	// Index that the next opened subexpression will be given.
	size_t subexpr_num;
};

// Tells whether a backslash followed by c is an escape sequence that the parser
// accepts in basic regular expression mode: the special characters and the
// decimal digits.
static inline bool re_basic_well_defined_escape(char c)
{
	if ( '0' <= c && c <= '9' )
		return true;
	switch ( c )
	{
	case '\\': case '(': case ')': case '{': case '}':
	case '.': case '*': case '[': case ']': case '^':
	case '$': case '+': case '?': case '|':
		return true;
	default:
		return false;
	}
}

// Tells whether a backslash followed by c is an escape sequence that the parser
// accepts in extended regular expression mode: only the special characters.
static inline bool re_extended_well_defined_escape(char c)
{
	switch ( c )
	{
	case '\\': case '(': case ')': case '{': case '}':
	case '.': case '*': case '[': case ']': case '^':
	case '$': case '+': case '?': case '|':
		return true;
	default:
		return false;
	}
}

// Releases the expression tree rooted at re by placing it in a temporary
// regex_t and handing that object to regfree.
static inline void re_free(struct re* re)
{
	regex_t temporary;
	memset(&temporary, 0, sizeof(temporary));
	temporary.re = re;
	pthread_mutex_init(&temporary.re_lock, NULL);
	regfree(&temporary);
}

static inline int re_parse(struct re_parse* parse,
                           struct re** restrict prev_next_ptr,
                           const char* restrict pattern,
                           int cflags)
{
	*prev_next_ptr = NULL;

	bool is_extended = cflags & REG_EXTENDED;
	bool is_basic = !is_extended;

	struct re** primary_next_ptr = prev_next_ptr;
	struct re* re;

	size_t pattern_index = 0;
	//size_t alternative_begun_at = pattern_index;
	while ( true )
	{
		size_t c_pattern_index = pattern_index++;
		char c = pattern[c_pattern_index];

		if ( c == '\0' )
		{
			if ( parse->subexpr )
				return REG_EPAREN;
			return 0;
		}

		bool escaped = false;
		if ( c == '\\' )
		{
			c_pattern_index = pattern_index++;
			c = pattern[c_pattern_index];
			if ( c == '\0' )
				return REG_BADPAT;
			if ( is_basic && !re_basic_well_defined_escape(c) )
				return REG_BADPAT;
			if ( is_extended && !re_extended_well_defined_escape(c) )
				return REG_BADPAT;
			escaped = true;
		}

		bool escaped_for_basic = (is_basic && escaped) ||
		                         (is_extended && !escaped);

		if ( escaped_for_basic && c == ')' )
		{
			struct re_parse_subexpr* subexpr = parse->subexpr;
			if ( !subexpr )
				return REG_EPAREN;
			*prev_next_ptr = NULL;
			prev_next_ptr = subexpr->prev_next_ptr;
			primary_next_ptr = subexpr->primary_next_ptr;
			//alternative_begun_at = subexpr->alternative_begun_at;
			parse->subexpr = subexpr->next;
			free(subexpr);
			re = *prev_next_ptr;
			goto subexpression_done;
		}

		// TODO: Properly reject anchors in the basic regular expression cases
		//       where they aren't appropriate. Mind that we implement the
		//       extension where all ERE features are available in BRE mode if
		//       accessed through backslashes.
		//if ( !escaped && c == '^' &&
		//     (0 < parse->subexpr_depth || c_pattern_index != alternative_begun_at) )
		//	return REG_BADRPT;
		//if ( !escaped && c == '$' &&
		//     (0 < parse->subexpr_depth || pattern[pattern_index] != '0') )
		//	return REG_BADRPT;
		if ( !escaped && c == '*' )
			return REG_BADRPT;
		if ( escaped_for_basic && c == '{' )
			return REG_BADBR;
		if ( (is_basic && escaped && c == '+') ||
		     (is_extended && !escaped && c == '+') )
			return REG_BADBR;
		if ( (is_basic && escaped && c == '?') ||
		     (is_extended && !escaped && c == '?') )
			return REG_BADBR;

		if ( !(re = (struct re*) calloc(1, sizeof(struct re))) )
			return REG_ESPACE;

		if ( escaped_for_basic && c == '|' )
		{
			re->re_type = RE_TYPE_ALTERNATIVE;
			re->re_next_owner = *primary_next_ptr;
			re->re_split.re_owner = NULL;
			*primary_next_ptr = re;
			prev_next_ptr = primary_next_ptr = &re->re_split.re_owner;
			continue;
		}
		// TODO: Check if this anchor logic is the right one. This uses them as
		//       special characters in BRE mode in cases they shouldn't be.
		else if ( !escaped && c == '^' )
		{
			re->re_type = RE_TYPE_BOL;
			*prev_next_ptr = re;
			prev_next_ptr = &re->re_next_owner;
			continue;
		}
		else if ( !escaped && c == '$' )
		{
			re->re_type = RE_TYPE_EOL;
			*prev_next_ptr = re;
			prev_next_ptr = &re->re_next_owner;
			continue;
		}
		else if ( escaped_for_basic && c == '(' )
		{
			re->re_type = RE_TYPE_SUBEXPRESSION;
			re->re_subexpression.index = parse->subexpr_num++;
			re->re_subexpression.re_owner = NULL;
			*prev_next_ptr = re;
			struct re* end = (struct re*) calloc(1, sizeof(struct re));
			if ( !end )
				return REG_ESPACE;
			end->re_type = RE_TYPE_SUBEXPRESSION_END;
			end->re_subexpression.index = re->re_subexpression.index;
			re->re_next_owner = end;
			struct re_parse_subexpr* subexpr = (struct re_parse_subexpr*)
				calloc(sizeof(struct re_parse_subexpr), 1);
			if ( !subexpr )
				return REG_ESPACE;
			subexpr->prev_next_ptr = prev_next_ptr;
			subexpr->primary_next_ptr = primary_next_ptr;
			//subexpr->alternative_begun_at = alternative_begun_at;
			subexpr->next = parse->subexpr;
			parse->subexpr = subexpr;
			prev_next_ptr = &re->re_subexpression.re_owner;
			primary_next_ptr = &re->re_subexpression.re_owner;
			//alternative_begun_at = pattern_index;
			continue;
		}
		// TODO: This is not properly implemented.
		// TODO: This is not properly unicode-aware.
		else if ( c == '[' )
		{
			re->re_type = RE_TYPE_SET;
			bool negate = false;
			if ( pattern[pattern_index] == '^' )
			{
				pattern_index += 1;
				negate = true;
			}
			while ( pattern[pattern_index] != ']' )
			{
				if ( pattern[pattern_index] == '\0' )
					return free(re), REG_EBRACK;
				// TODO: This is wrong and fragile.
				unsigned char c_from;
				unsigned char c_to;
				if ( pattern[pattern_index + 1] == '-' )
				{
					c_from = (unsigned char) pattern[pattern_index + 0];
					c_to = (unsigned char) pattern[pattern_index + 2];
					pattern_index += 3;
				}
				else
				{
					c_from = (unsigned char) pattern[pattern_index + 0];
					c_to = (unsigned char) pattern[pattern_index + 0];
					pattern_index += 1;
				}
				for ( unsigned int uc = c_from; uc <= c_to; uc++ )
				{
					size_t byte_index = uc / 8;
					size_t bit_index = uc % 8;
					re->re_set.set[byte_index] |= (1 << bit_index);
				}
			}
			if ( negate )
			{
				for ( size_t i = 0; i < 32; i++ )
					re->re_set.set[i] = ~re->re_set.set[i];
			}
			if ( pattern[pattern_index++] != ']' )
				return free(re), REG_EBRACK;
		}
		else if ( escaped && ('0' <= c && c <= '9') )
		{
			// TODO: This isn't implemented yet (not part of ERE).
			return free(re), REG_BADPAT;
		}
		else if ( !escaped && c == '.' )
			re->re_type = RE_TYPE_ANY_CHAR;
		else
		{
			re->re_type = RE_TYPE_CHAR;
			re->re_char.c = c;
		}

		*prev_next_ptr = re;

subexpression_done:
		if ( (is_basic && pattern[pattern_index + 0] == '\\' &&
		                  pattern[pattern_index + 1] == '{') ||
		     (is_extended && pattern[pattern_index] == '{' ) )
		{
			pattern_index += is_extended ? 1 : 2;
			if ( pattern[pattern_index] < '0' ||
			     pattern[pattern_index] > '9' )
				return REG_BADBR;
			uintmax_t repeat_min;
			uintmax_t repeat_max;
			const char* value;
			const char* value_end;
			int saved_errno = errno;
			value = (char*) (pattern + pattern_index);
			repeat_min = strtoumax((char*) value, (char**) &value_end, 10);
			int parse_errno = errno;
			errno = saved_errno;
			if ( parse_errno == ERANGE || SIZE_MAX < repeat_min )
				return REG_BADBR;
			pattern_index += value_end - value;
			if ( pattern[pattern_index] == ',' )
			{
				repeat_max = SIZE_MAX;
				pattern_index += 1;
				if ( pattern[pattern_index] >= '0' &&
				     pattern[pattern_index] <= '9' )
				{
					saved_errno = errno;
					value = (char*) (pattern + pattern_index);
					repeat_max = strtoumax((char*) value, (char**) &value_end, 10);
					parse_errno = errno;
					errno = saved_errno;
					if ( parse_errno == ERANGE || SIZE_MAX < repeat_max )
						return  REG_BADBR;
					if ( repeat_max < repeat_min )
						return REG_BADBR;
					pattern_index += value_end - value;
				}
			}
			else
			{
				repeat_max = repeat_min;
			}
			if ( (is_basic && pattern[pattern_index++] != '\\') ||
			     pattern[pattern_index++] != '}' )
				return REG_BADBR;
			struct re* re_repetition = (struct re*) calloc(1, sizeof(struct re));
			if ( !re_repetition )
				return REG_ESPACE;
			re_repetition->re_type = RE_TYPE_REPETITION;
			re_repetition->re_repetition.re = re;
			re_repetition->re_repetition.min = (size_t) repeat_min;
			re_repetition->re_repetition.max = (size_t) repeat_max;
			*prev_next_ptr = re_repetition;
			re = re_repetition;
		}
		else if ( pattern[pattern_index] == '*' )
		{
			pattern_index += 1;
			struct re* re_repetition = (struct re*) calloc(1, sizeof(struct re));
			if ( !re_repetition )
				return REG_ESPACE;
			re_repetition->re_type = RE_TYPE_REPETITION;
			re_repetition->re_repetition.re = re;
			re_repetition->re_repetition.min = 0;
			re_repetition->re_repetition.max = SIZE_MAX;
			*prev_next_ptr = re_repetition;
			re = re_repetition;
		}
		else if ( (is_basic && pattern[pattern_index + 0] == '\\' &&
		                       pattern[pattern_index + 1] == '?') ||
		          (is_extended && pattern[pattern_index] == '?' ) )
		{
			pattern_index += is_extended ? 1 : 2;
			struct re* re_repetition = (struct re*) calloc(1, sizeof(struct re));
			if ( !re_repetition )
				return REG_ESPACE;
			re_repetition->re_type = RE_TYPE_REPETITION;
			re_repetition->re_repetition.re = re;
			re_repetition->re_repetition.min = 0;
			re_repetition->re_repetition.max = 1;
			*prev_next_ptr = re_repetition;
			re = re_repetition;
		}
		else if ( (is_basic && pattern[pattern_index + 0] == '\\' &&
		                       pattern[pattern_index + 1] == '+') ||
		          (is_extended && pattern[pattern_index] == '+' ) )
		{
			pattern_index += is_extended ? 1 : 2;
			struct re* re_repetition = (struct re*) calloc(1, sizeof(struct re));
			if ( !re_repetition )
				return REG_ESPACE;
			re_repetition->re_type = RE_TYPE_REPETITION;
			re_repetition->re_repetition.re = re;
			re_repetition->re_repetition.min = 1;
			re_repetition->re_repetition.max = SIZE_MAX;
			*prev_next_ptr = re_repetition;
			re = re_repetition;
		}

		if ( re->re_type == RE_TYPE_SUBEXPRESSION )
			re = re->re_next_owner; // RE_TYPE_SUBEXPRESSION_END.

		prev_next_ptr = &re->re_next_owner;
	}
}

static inline bool re_duplicate(struct re* templ, struct re** re_ptr)
{
	struct re* copy;
	struct re* parent_templ = NULL;
	struct re* parent_copy = NULL;
	while ( true )
	{
		if ( !templ )
		{
			if ( parent_templ )
			{
				templ = parent_templ;
				copy = parent_copy;
				parent_templ = templ->re_upcoming_state_next;
				parent_copy = copy->re_upcoming_state_next;
				templ = templ->re_next_owner;
				re_ptr = &copy->re_next_owner;
				continue;
			}
			return *re_ptr = NULL, true;
		}
		if ( !(copy = (struct re*) calloc(1, sizeof(struct re))) )
			return false;
		*re_ptr = copy;
		copy->re_type = templ->re_type;
		if ( templ->re_type == RE_TYPE_BOL )
			;
		else if ( templ->re_type == RE_TYPE_BOL )
			;
		else if ( templ->re_type == RE_TYPE_CHAR )
			copy->re_char.c = templ->re_char.c;
		else if ( templ->re_type == RE_TYPE_ANY_CHAR )
			;
		else if ( templ->re_type == RE_TYPE_SET )
			memcpy(copy->re_set.set, templ->re_set.set, 32);
		else if ( templ->re_type == RE_TYPE_SUBEXPRESSION )
		{
			copy->re_subexpression.index = templ->re_subexpression.index;
			templ->re_upcoming_state_next = parent_templ;
			copy->re_upcoming_state_next = parent_copy;
			parent_templ = templ;
			parent_copy = copy;
			templ = templ->re_subexpression.re_owner;
			re_ptr = &copy->re_subexpression.re_owner;
			continue;
		}
		else if ( templ->re_type == RE_TYPE_SUBEXPRESSION_END )
			copy->re_subexpression.index = templ->re_subexpression.index;
		else if ( templ->re_type == RE_TYPE_ALTERNATIVE ||
			      templ->re_type == RE_TYPE_OPTIONAL ||
			      templ->re_type == RE_TYPE_LOOP )
		{
			templ->re_upcoming_state_next = parent_templ;
			copy->re_upcoming_state_next = parent_copy;
			parent_templ = templ;
			parent_copy = copy;
			templ = templ->re_split.re_owner;
			re_ptr = &copy->re_split.re_owner;
			continue;
		}
		else if ( templ->re_type == RE_TYPE_REPETITION )
		{
			copy->re_repetition.min = templ->re_repetition.min;
			copy->re_repetition.max = templ->re_repetition.max;
			templ->re_upcoming_state_next = parent_templ;
			copy->re_upcoming_state_next = parent_copy;
			parent_templ = templ;
			parent_copy = copy;
			templ = templ->re_split.re;
			re_ptr = &copy->re_split.re;
			continue;
		}
		else
			assert(false);
		templ = templ->re_next_owner;
		re_ptr = &copy->re_next_owner;
	}
}

static inline bool re_repetition(struct re* templ,
                                 struct re** re_ptr,
                                 size_t min,
                                 size_t max,
                                 struct re* after)
{
	while ( true )
	{
		if ( !max )
			return *re_ptr = after, true;
		struct re* copy = (struct re*) calloc(1, sizeof(struct re));
		if ( !copy )
			return false;
		*re_ptr = copy;
		copy->re_type = templ->re_type;
		if ( templ->re_type == RE_TYPE_BOL )
			;
		else if ( templ->re_type == RE_TYPE_BOL )
			;
		else if ( templ->re_type == RE_TYPE_CHAR )
			copy->re_char.c = templ->re_char.c;
		else if ( templ->re_type == RE_TYPE_ANY_CHAR )
			;
		else if ( templ->re_type == RE_TYPE_SET )
			memcpy(copy->re_set.set, templ->re_set.set, 32);
		else if ( templ->re_type == RE_TYPE_SUBEXPRESSION )
		{
			copy->re_subexpression.index = templ->re_subexpression.index;
			if ( !re_duplicate(templ->re_subexpression.re_owner,
				              &copy->re_subexpression.re_owner) )
				return false;
			struct re* templ_end = templ->re_next_owner;
			assert(templ_end && templ_end->re_type == RE_TYPE_SUBEXPRESSION_END);
			struct re* end = (struct re*) calloc(1, sizeof(struct re));
			if ( !end )
				return false;
			end->re_type = RE_TYPE_SUBEXPRESSION_END;
			end->re_subexpression.index = templ_end->re_subexpression.index;
			copy->re_next_owner = end;
		}
		else
			assert(false);
		if ( 1 <= min )
		{
			while ( copy->re_next_owner )
				copy = copy->re_next_owner;
			re_ptr = &copy->re_next_owner;
			if ( max != SIZE_MAX )
				max--;
			min--;
		}
		else if ( max < SIZE_MAX )
		{
			struct re* wrap = (struct re*) calloc(1, sizeof(struct re));
			if ( !wrap )
				return false;
			wrap->re_type = RE_TYPE_OPTIONAL;
			wrap->re_split.re_owner = copy;
			*re_ptr = wrap;
			re_ptr = &wrap->re_next_owner;
			max--;
		}
		else
		{
			struct re* wrap = (struct re*) calloc(1, sizeof(struct re));
			if ( !wrap )
				return false;
			wrap->re_type = RE_TYPE_LOOP;
			wrap->re_split.re_owner = copy;
			*re_ptr = wrap;
			re_ptr = &wrap->re_next_owner;
			max = 0;
		}
	}
}

// Rewrites the parsed tree at *re_ptr in place so that it contains no
// RE_TYPE_REPETITION nodes, and adds the number of remaining nodes to
// *state_count_ptr.
//
// An empty tree is first replaced by a single RE_TYPE_BOL node. Each repetition
// node is replaced by its expansion from re_repetition, which is then itself
// visited. The walk is iterative: the re_current_state_prev field of a node
// whose children are being visited stores the slot of the enclosing parent.
//
// Returns true on success and false if an allocation fails. On failure the tree
// at *re_ptr is still a consistent tree that owns all of its nodes.
static inline bool re_transform(struct re** re_ptr, size_t* state_count_ptr)
{
	if ( !*re_ptr )
	{
		struct re* re;
		if ( !(re = (struct re*) calloc(1, sizeof(struct re))) )
			return false;
		re->re_type = RE_TYPE_BOL;
		*re_ptr = re;
	}

	struct re** parent_ptr = NULL;
	while ( *re_ptr )
	{
		struct re* re = *re_ptr;

		if ( re->re_type == RE_TYPE_REPETITION )
		{
			struct re* templ = re->re_repetition.re;
			size_t min = re->re_repetition.min;
			size_t max = re->re_repetition.max;
			struct re* after = re->re_next_owner;
			struct re* replacement = NULL;
			// Detach the rest of the chain so that freeing the repetition
			// node below does not free it too.
			re->re_next_owner = NULL;
			if ( !re_repetition(templ, &replacement, min, max, after) )
			{
				// Discard the partial expansion and put the rest of the
				// chain back, so nothing is lost or freed twice.
				re_free(replacement);
				re->re_next_owner = after;
				return false;
			}
			re_free(re);
			*re_ptr = replacement;
			// Visit the replacement itself, which may be empty or may
			// contain further repetitions.
			continue;
		}

		(*state_count_ptr)++;

		if ( re->re_type == RE_TYPE_SUBEXPRESSION &&
		     re->re_subexpression.re_owner )
		{
			re->re_current_state_prev = (struct re*) parent_ptr;
			parent_ptr = re_ptr;
			re_ptr = &re->re_subexpression.re_owner;
			continue;
		}

		if ( (re->re_type == RE_TYPE_ALTERNATIVE ||
		      re->re_type == RE_TYPE_OPTIONAL ||
		      re->re_type == RE_TYPE_LOOP) && re->re_split.re_owner )
		{
			re->re_current_state_prev = (struct re*) parent_ptr;
			parent_ptr = re_ptr;
			re_ptr = &re->re_split.re_owner;
			continue;
		}

		// Move on to the successor. At the end of a child chain, climb back
		// up until a parent with a successor is found.
		re_ptr = &re->re_next_owner;
		while ( !*re_ptr && parent_ptr )
		{
			re_ptr = parent_ptr;
			parent_ptr = (struct re**) (*re_ptr)->re_current_state_prev;
			re_ptr = &(*re_ptr)->re_next_owner;
		}
	}

	return true;
}

// Walks the tree rooted at re and, for every node visited, points re_matches
// at that node's own slice of the matches array (matches_per_state entries per
// node, in visiting order) and fills in the non-owning links re_next and
// re_split.re from the owning links re_next_owner and re_split.re_owner.
//
// The walk is iterative. While the children of a node are visited, that node's
// re_current_state_prev, re_current_state_next and re_upcoming_state_next
// fields hold the saved parent, the saved parent_link and the node to resume
// at, respectively.
//
// *state_count_ptr is incremented once per visited node.
static inline void re_control_flow(struct re* re,
                                   regmatch_t* matches,
                                   size_t matches_per_state,
                                   size_t* state_count_ptr)
{
	// Innermost node whose children are being visited and that has to be
	// returned to afterwards.
	struct re* parent = NULL;
	// Node that a chain without an owned successor is linked to.
	struct re* parent_link = NULL;
	while ( re )
	{
		size_t re_index = (*state_count_ptr)++;
		size_t offset = re_index * matches_per_state;
		re->re_matches = matches + offset;

		if ( re->re_type == RE_TYPE_ALTERNATIVE )
		{
			// A missing branch is linked straight to parent_link.
			if ( !re->re_split.re_owner )
				re->re_split.re = parent_link;
			if ( !re->re_next_owner )
				re->re_next = parent_link;
			if ( re->re_split.re_owner && re->re_next_owner )
			{
				// Both branches exist: visit the re_split.re_owner branch
				// first and remember to come back for re_next_owner.
				re->re_next = re->re_next_owner;
				re->re_current_state_prev = parent;
				re->re_current_state_next = parent_link;
				re->re_upcoming_state_next = re->re_next_owner;
				parent = re;
				re = re->re_split.re = re->re_split.re_owner;
			}
			else if ( re->re_split.re_owner )
				re = re->re_split.re = re->re_split.re_owner;
			else if ( re->re_next_owner )
				re = re->re_next = re->re_next_owner;
			else if ( parent )
			{
				// Neither branch exists: resume at the saved continuation of
				// the innermost parent.
				re = parent;
				parent = re->re_current_state_prev;
				parent_link = re->re_current_state_next;
				re = re->re_upcoming_state_next;
			}
			else
				re = NULL;
			continue;
		}

		// The last node of a nested chain continues at parent_link.
		if ( !re->re_next_owner && parent_link )
			re->re_next = parent_link;
		else
			re->re_next = re->re_next_owner;

		if ( re->re_type == RE_TYPE_LOOP || re->re_type == RE_TYPE_OPTIONAL )
		{
			// re_next becomes the owned inner chain and re_split.re what
			// comes after the node.
			struct re* inner = re->re_split.re_owner;
			struct re* after = re->re_next;
			re->re_split.re = after;
			re->re_next = inner;
			if ( re->re_next_owner )
			{
				re->re_current_state_prev = parent;
				re->re_current_state_next = parent_link;
				re->re_upcoming_state_next = after;
				parent = re;
			}
			// The end of a loop's inner chain links back to the loop node;
			// the end of an optional's inner chain links to what follows.
			if ( re->re_type == RE_TYPE_LOOP )
				parent_link = re;
			else
				parent_link = after;
			re = inner;
			continue;
		}

		if ( re->re_type == RE_TYPE_SUBEXPRESSION )
		{
			if ( re->re_subexpression.re_owner )
			{
				// Visit the contents first; their end links to what followed
				// the subexpression node.
				re->re_current_state_prev = parent;
				re->re_current_state_next = parent_link;
				re->re_upcoming_state_next = re->re_next_owner;
				parent = re;
				parent_link = re->re_next;
				re->re_next = re->re_subexpression.re_owner;
				re = re->re_subexpression.re_owner;
				continue;
			}
		}

		// At the end of a nested chain, go back to the innermost parent and
		// continue with its owned successor.
		if ( !re->re_next_owner && parent )
		{
			re = parent;
			parent = re->re_current_state_prev;
			parent_link = re->re_current_state_next;
		}

		re = re->re_next_owner;
	}
}

// Compiles pattern into regex according to cflags.
//
// The pattern is parsed into a tree, repetitions are expanded, the match
// storage is allocated with one group of entries per node, and the control flow
// links between the nodes are filled in. Returns 0 on success; otherwise
// regfree is called on regex and a REG_* error code is returned.
extern "C"
int regcomp(regex_t* restrict regex,
            const char* restrict pattern,
            int cflags)
{
	// TODO: Verify cflags.
	// TODO: Implement REG_ICASE.
	// TODO: Implement REG_NOSUB.
	// TODO: Implement REG_NEWLINE.
	memset(regex, 0, sizeof(*regex));
	pthread_mutex_init(&regex->re_lock, NULL);
	regex->re_cflags = cflags;

	struct re_parse parse;
	memset(&parse, 0, sizeof(parse));
	parse.subexpr_num = 1;
	int parse_error = re_parse(&parse, &regex->re, pattern, cflags);

	// Release any subexpression records that the parse left on the stack.
	for ( struct re_parse_subexpr* entry = parse.subexpr; entry; )
	{
		struct re_parse_subexpr* following = entry->next;
		free(entry);
		entry = following;
	}
	parse.subexpr = NULL;

	if ( parse_error != 0 )
	{
		regfree(regex);
		return parse_error;
	}

	size_t state_count = 0;
	if ( !re_transform(&regex->re, &state_count) )
	{
		regfree(regex);
		return REG_ESPACE;
	}

	// Every node gets parse.subexpr_num match entries.
	size_t matches_length;
	if ( __builtin_mul_overflow(parse.subexpr_num, state_count, &matches_length) )
	{
		regfree(regex);
		return REG_ESPACE;
	}
	regex->re_matches = (regmatch_t*)
		reallocarray(NULL, matches_length, sizeof(regmatch_t));
	if ( !regex->re_matches )
	{
		regfree(regex);
		return REG_ESPACE;
	}

	size_t state_recount = 0;
	re_control_flow(regex->re, regex->re_matches, parse.subexpr_num, &state_recount);
	assert(state_count == state_recount);

	if ( (cflags & REG_NOSUB) == 0 )
		regex->re_nsub = parse.subexpr_num - 1;
	return 0;
}