Add regex(3).

2015-12-15 23:43:34 +01:00 · 2015-12-15 23:43:34 +01:00 · 75808c637d
parent 5a3e181975
commit 75808c637d
7 changed files with 1338 additions and 6 deletions
--- a/libc/Makefile
+++ b/libc/Makefile
@ -62,6 +62,10 @@ malloc/heap_init.o \
 malloc/__heap_lock.o \
 malloc/__heap_unlock.o \
 malloc/__heap_verify.o \
+regex/regcomp.o \
+regex/regerror.o \
+regex/regexec.o \
+regex/regfree.o \
 signal/sigaddset.o \
 signal/sigandset.o \
 signal/sigdelset.o \
--- a/libc/include/regex.h
+++ b/libc/include/regex.h
@ -0,0 +1,177 @@
+/*******************************************************************************
+
+    Copyright(C) Jonas 'Sortie' Termansen 2014, 2015.
+
+    This file is part of the Sortix C Library.
+
+    The Sortix C Library is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Lesser General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or (at your
+    option) any later version.
+
+    The Sortix C Library is distributed in the hope that it will be useful, but
+    WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+    or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+    License for more details.
+
+    You should have received a copy of the GNU Lesser General Public License
+    along with the Sortix C Library. If not, see <http://www.gnu.org/licenses/>.
+
+    regex.h
+    Regular expressions.
+
+*******************************************************************************/
+
+#ifndef _REGEX_H
+#define _REGEX_H
+
+#include <sys/cdefs.h>
+
+#include <sys/__/types.h>
+
+#if defined(__is_sortix_libc)
+#include <pthread.h>
+#else
+#include <__/pthread.h>
+#endif
+
+#ifndef __size_t_defined
+#define __size_t_defined
+#define __need_size_t
+#include <stddef.h>
+#endif
+
+typedef __ssize_t regoff_t;
+
+typedef struct
+{
+	regoff_t rm_so;
+	regoff_t rm_eo;
+} regmatch_t;
+
+#if defined(__is_sortix_libc)
+/* Kinds of nodes making up a compiled regular expression (libc-internal). */
+enum re_type
+{
+	RE_TYPE_BOL, /* The '^' anchor. */
+	RE_TYPE_EOL, /* The '$' anchor. */
+	RE_TYPE_CHAR, /* A single literal character (struct re_char). */
+	RE_TYPE_ANY_CHAR, /* The '.' wildcard. */
+	RE_TYPE_SET, /* A bracket expression (struct re_set). */
+	RE_TYPE_SUBEXPRESSION, /* Start of a group (struct re_subexpression). */
+	RE_TYPE_SUBEXPRESSION_END, /* End marker paired with a group start. */
+	RE_TYPE_ALTERNATIVE, /* A '|' split (struct re_split). */
+	RE_TYPE_OPTIONAL, /* Zero-or-one wrapper made when expanding repetitions. */
+	RE_TYPE_LOOP, /* Zero-or-more wrapper made when expanding repetitions. */
+	RE_TYPE_REPETITION, /* Counted repetition emitted by the parser and expanded
+	                       away by the compiler (struct re_repetition). */
+	/* TODO: Back-references. */
+};
+
+struct re;
+
+/* Payload of a RE_TYPE_CHAR node: the literal character to match. */
+struct re_char
+{
+	char c;
+};
+
+/* Payload of a RE_TYPE_SET node: a 256-bit membership bitmap indexed by
+   unsigned char value, where value v is bit (v % 8) of byte (v / 8). */
+struct re_set
+{
+	unsigned char set[32];
+};
+
+/* Payload of RE_TYPE_SUBEXPRESSION and RE_TYPE_SUBEXPRESSION_END nodes. */
+struct re_subexpression
+{
+	struct re* re_owner; /* Owned first node of the group's contents. */
+	size_t index; /* Subexpression number; the parser numbers them from 1. */
+};
+
+/* Payload of RE_TYPE_ALTERNATIVE, RE_TYPE_OPTIONAL and RE_TYPE_LOOP nodes. */
+struct re_split
+{
+	struct re* re; /* Non-owning branch target filled in after parsing. */
+	struct re* re_owner; /* Owned first node of the branch. */
+};
+
+/* Payload of a RE_TYPE_REPETITION node: repeat `re` between `min` and `max`
+   times, where a `max` of SIZE_MAX means there is no upper bound. */
+struct re_repetition
+{
+	struct re* re; /* The node being repeated. */
+	size_t min;
+	size_t max;
+};
+
+/* One node of a compiled regular expression. Nodes are tied together by the
+   owning *_owner links built by the parser; the compiler then adds non-owning
+   links describing where matching continues. */
+struct re
+{
+	enum re_type re_type; /* Selects the active member of the union below. */
+	union
+	{
+		struct re_char re_char;
+		struct re_set re_set;
+		struct re_subexpression re_subexpression;
+		struct re_split re_split;
+		struct re_repetition re_repetition;
+	};
+	struct re* re_next; /* Non-owning successor, set after parsing. */
+	struct re* re_next_owner; /* Owned node following this one. */
+	/* The three links below are state list links for regexec(); the compiler
+	   also borrows them as scratch storage while walking the tree. */
+	struct re* re_current_state_prev;
+	struct re* re_current_state_next;
+	struct re* re_upcoming_state_next;
+	unsigned char re_is_currently_done;
+	unsigned char re_is_current;
+	unsigned char re_is_upcoming;
+	regmatch_t* re_matches; /* This node's slice of regex_t's re_matches. */
+};
+#endif
+
+/* A compiled regular expression. Only re_nsub is public; inside libc the
+   remaining members have usable names, while other code sees reserved-name
+   members of the same types in the same order. */
+typedef struct
+{
+	size_t re_nsub; /* Number of parenthesized subexpressions. */
+#if defined(__is_sortix_libc)
+	pthread_mutex_t re_lock;
+	struct re* re; /* Root node of the compiled expression. */
+	regmatch_t* re_matches; /* Match storage shared out among the nodes. */
+	size_t re_state_count;
+	int re_cflags; /* The cflags given to regcomp(). */
+#else
+	__pthread_mutex_t __re_lock;
+	void* __re;
+	regmatch_t* __re_matches;
+	size_t __re_state_count;
+	int __re_cflags;
+#endif
+} regex_t;
+
+/* Flags for the cflags parameter of regcomp(). */
+#define REG_EXTENDED (1 << 0)
+#define REG_ICASE (1 << 1)
+#define REG_NOSUB (1 << 2)
+#define REG_NEWLINE (1 << 3)
+
+/* Flags for the eflags parameter of regexec(). */
+#define REG_NOTBOL (1 << 0)
+#define REG_NOTEOL (1 << 1)
+
+/* Error codes; regerror() translates each of them into a message. */
+#define REG_NOMATCH 1
+#define REG_BADPAT 2
+#define REG_ECOLLATE 3
+#define REG_ECTYPE 4
+#define REG_EESCAPE 5
+#define REG_ESUBREG 6
+#define REG_EBRACK 7
+#define REG_EPAREN 8
+#define REG_EBRACE 9
+#define REG_BADBR 10
+#define REG_ERANGE 11
+#define REG_ESPACE 12
+#define REG_BADRPT 13
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Compile a pattern into a regex_t; returns 0 or a REG_* error code. */
+int regcomp(regex_t* __restrict, const char* __restrict, int);
+/* Copy the message for an error code into a buffer; returns the size needed
+   to hold the whole message including its terminating NUL. */
+size_t regerror(int, const regex_t* __restrict, char* __restrict, size_t);
+/* Match a string against a compiled regex_t. */
+int regexec(const regex_t* __restrict, const char* __restrict, size_t,
+            regmatch_t* __restrict, int);
+/* Release the resources held by a regex_t. */
+void regfree(regex_t*);
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif
--- a/libc/regex/regcomp.cpp
+++ b/libc/regex/regcomp.cpp
@ -0,0 +1,727 @@
+/*******************************************************************************
+
+    Copyright(C) Jonas 'Sortie' Termansen 2014, 2015.
+
+    This file is part of the Sortix C Library.
+
+    The Sortix C Library is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Lesser General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or (at your
+    option) any later version.
+
+    The Sortix C Library is distributed in the hope that it will be useful, but
+    WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+    or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+    License for more details.
+
+    You should have received a copy of the GNU Lesser General Public License
+    along with the Sortix C Library. If not, see <http://www.gnu.org/licenses/>.
+
+    regex/regcomp.cpp
+    Regular expression compiler.
+
+*******************************************************************************/
+
+#include <assert.h>
+#include <errno.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <regex.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+// Parser state saved when a subexpression is opened and restored when it is
+// closed. Entries form a stack through `next`, innermost first.
+struct re_parse_subexpr
+{
+	struct re_parse_subexpr* next; // The enclosing open subexpression, if any.
+	struct re** prev_next_ptr; // Saved link slot holding the subexpression.
+	struct re** primary_next_ptr; // Saved start of the enclosing alternative.
+};
+
+// Bookkeeping shared between regcomp() and re_parse().
+struct re_parse
+{
+	struct re_parse_subexpr* subexpr; // Stack of currently open subexpressions.
+	size_t subexpr_num; // Index given to the next subexpression opened.
+};
+
+// Whether a backslash followed by c is accepted in a basic regular expression:
+// any of the special characters, or a decimal digit.
+static inline bool re_basic_well_defined_escape(char c)
+{
+	switch ( c )
+	{
+	case '\\': case '(': case ')': case '{': case '}':
+	case '.': case '*': case '[': case ']': case '^':
+	case '$': case '+': case '?': case '|':
+		return true;
+	default:
+		return '0' <= c && c <= '9';
+	}
+}
+
+// Whether a backslash followed by c is accepted in an extended regular
+// expression: only the special characters qualify.
+static inline bool re_extended_well_defined_escape(char c)
+{
+	switch ( c )
+	{
+	case '\\': case '(': case ')': case '{': case '}':
+	case '.': case '*': case '[': case ']': case '^':
+	case '$': case '+': case '?': case '|':
+		return true;
+	default:
+		return false;
+	}
+}
+
+// Release the node tree rooted at re by placing it in a temporary, otherwise
+// empty regex_t and handing that to regfree().
+static inline void re_free(struct re* re)
+{
+	regex_t temporary;
+	memset(&temporary, 0, sizeof(temporary));
+	temporary.re = re;
+	pthread_mutex_init(&temporary.re_lock, NULL);
+	regfree(&temporary);
+}
+
+// Parse the decimal number starting at pattern[*index_ptr], which the caller
+// has verified to be a digit. On success the value is stored in *value_ptr and
+// *index_ptr is advanced past the digits. Returns false if the number doesn't
+// fit in a size_t. The caller's errno is preserved.
+static inline bool re_parse_bound(const char* pattern,
+                                  size_t* index_ptr,
+                                  size_t* value_ptr)
+{
+	const char* begin = pattern + *index_ptr;
+	char* end;
+	int saved_errno = errno;
+	// strtoumax only sets errno on failure, so clear it first or an ERANGE left
+	// over from earlier code would be mistaken for an overflow here.
+	errno = 0;
+	uintmax_t value = strtoumax(begin, &end, 10);
+	int parse_errno = errno;
+	errno = saved_errno;
+	if ( parse_errno == ERANGE || SIZE_MAX < value )
+		return false;
+	*index_ptr += end - begin;
+	*value_ptr = (size_t) value;
+	return true;
+}
+
+// Parse pattern into a tree of nodes linked through the *_owner pointers and
+// store its root in *prev_next_ptr.
+//
+// parse carries the stack of open subexpressions and the next subexpression
+// index; entries still on the stack when this returns are for the caller to
+// free. cflags selects basic or extended syntax through REG_EXTENDED. As an
+// extension, the extended operators are available in basic mode when escaped
+// with a backslash.
+//
+// Returns 0 on success or a REG_* error code. On error every node that has
+// already been linked into the tree remains there for the caller to free.
+static inline int re_parse(struct re_parse* parse,
+                           struct re** restrict prev_next_ptr,
+                           const char* restrict pattern,
+                           int cflags)
+{
+	*prev_next_ptr = NULL;
+
+	bool is_extended = cflags & REG_EXTENDED;
+	bool is_basic = !is_extended;
+
+	// prev_next_ptr is the slot the next node is stored in, primary_next_ptr is
+	// the slot where the current list of alternatives begins.
+	struct re** primary_next_ptr = prev_next_ptr;
+	struct re* re;
+
+	size_t pattern_index = 0;
+	//size_t alternative_begun_at = pattern_index;
+	while ( true )
+	{
+		// Declared up here so the goto below doesn't jump past initializers.
+		bool repeated = false;
+		size_t repeat_min = 0;
+		size_t repeat_max = 0;
+
+		size_t c_pattern_index = pattern_index++;
+		char c = pattern[c_pattern_index];
+
+		if ( c == '\0' )
+		{
+			if ( parse->subexpr )
+				return REG_EPAREN;
+			return 0;
+		}
+
+		bool escaped = false;
+		if ( c == '\\' )
+		{
+			c_pattern_index = pattern_index++;
+			c = pattern[c_pattern_index];
+			// A backslash at the very end of the pattern has its own code.
+			if ( c == '\0' )
+				return REG_EESCAPE;
+			if ( is_basic && !re_basic_well_defined_escape(c) )
+				return REG_BADPAT;
+			if ( is_extended && !re_extended_well_defined_escape(c) )
+				return REG_BADPAT;
+			escaped = true;
+		}
+
+		// True if c is written the way an operator is written in this mode:
+		// backslash-escaped in basic mode, unescaped in extended mode.
+		bool escaped_for_basic = (is_basic && escaped) ||
+		                         (is_extended && !escaped);
+
+		if ( escaped_for_basic && c == ')' )
+		{
+			struct re_parse_subexpr* subexpr = parse->subexpr;
+			if ( !subexpr )
+				return REG_EPAREN;
+			// Terminate the group's contents and resume the enclosing list.
+			*prev_next_ptr = NULL;
+			prev_next_ptr = subexpr->prev_next_ptr;
+			primary_next_ptr = subexpr->primary_next_ptr;
+			//alternative_begun_at = subexpr->alternative_begun_at;
+			parse->subexpr = subexpr->next;
+			free(subexpr);
+			// The group as a whole is what a following repetition applies to.
+			re = *prev_next_ptr;
+			goto subexpression_done;
+		}
+
+		// TODO: Properly reject anchors in the basic regular expression cases
+		//       where they aren't appropriate. Mind that we implement the
+		//       extension where all ERE features are available in BRE mode if
+		//       accessed through backslashes.
+		//if ( !escaped && c == '^' &&
+		//     (0 < parse->subexpr_depth || c_pattern_index != alternative_begun_at) )
+		//	return REG_BADRPT;
+		//if ( !escaped && c == '$' &&
+		//     (0 < parse->subexpr_depth || pattern[pattern_index] != '0') )
+		//	return REG_BADRPT;
+
+		// Repetition operators are consumed right after the expression they
+		// apply to, so meeting one here means it had nothing to repeat.
+		if ( !escaped && c == '*' )
+			return REG_BADRPT;
+		if ( escaped_for_basic && c == '{' )
+			return REG_BADBR;
+		if ( escaped_for_basic && (c == '+' || c == '?') )
+			return REG_BADRPT;
+
+		if ( !(re = (struct re*) calloc(1, sizeof(struct re))) )
+			return REG_ESPACE;
+
+		if ( escaped_for_basic && c == '|' )
+		{
+			// Everything parsed so far in this list becomes one branch and
+			// whatever follows is parsed into the other branch.
+			re->re_type = RE_TYPE_ALTERNATIVE;
+			re->re_next_owner = *primary_next_ptr;
+			re->re_split.re_owner = NULL;
+			*primary_next_ptr = re;
+			prev_next_ptr = primary_next_ptr = &re->re_split.re_owner;
+			continue;
+		}
+		// TODO: Check if this anchor logic is the right one. This uses them as
+		//       special characters in BRE mode in cases they shouldn't be.
+		else if ( !escaped && c == '^' )
+		{
+			re->re_type = RE_TYPE_BOL;
+			*prev_next_ptr = re;
+			prev_next_ptr = &re->re_next_owner;
+			continue;
+		}
+		else if ( !escaped && c == '$' )
+		{
+			re->re_type = RE_TYPE_EOL;
+			*prev_next_ptr = re;
+			prev_next_ptr = &re->re_next_owner;
+			continue;
+		}
+		else if ( escaped_for_basic && c == '(' )
+		{
+			re->re_type = RE_TYPE_SUBEXPRESSION;
+			re->re_subexpression.index = parse->subexpr_num++;
+			re->re_subexpression.re_owner = NULL;
+			*prev_next_ptr = re;
+			// The end marker directly follows the start node; the contents
+			// hang off the start node's re_owner.
+			struct re* end = (struct re*) calloc(1, sizeof(struct re));
+			if ( !end )
+				return REG_ESPACE;
+			end->re_type = RE_TYPE_SUBEXPRESSION_END;
+			end->re_subexpression.index = re->re_subexpression.index;
+			re->re_next_owner = end;
+			struct re_parse_subexpr* subexpr = (struct re_parse_subexpr*)
+				calloc(sizeof(struct re_parse_subexpr), 1);
+			if ( !subexpr )
+				return REG_ESPACE;
+			subexpr->prev_next_ptr = prev_next_ptr;
+			subexpr->primary_next_ptr = primary_next_ptr;
+			//subexpr->alternative_begun_at = alternative_begun_at;
+			subexpr->next = parse->subexpr;
+			parse->subexpr = subexpr;
+			prev_next_ptr = &re->re_subexpression.re_owner;
+			primary_next_ptr = &re->re_subexpression.re_owner;
+			//alternative_begun_at = pattern_index;
+			continue;
+		}
+		// TODO: This is not properly implemented.
+		// TODO: This is not properly unicode-aware.
+		else if ( c == '[' )
+		{
+			re->re_type = RE_TYPE_SET;
+			bool negate = false;
+			if ( pattern[pattern_index] == '^' )
+			{
+				pattern_index += 1;
+				negate = true;
+			}
+			while ( pattern[pattern_index] != ']' )
+			{
+				if ( pattern[pattern_index] == '\0' )
+					return free(re), REG_EBRACK;
+				unsigned char c_from = (unsigned char) pattern[pattern_index];
+				unsigned char c_to = c_from;
+				size_t consumed = 1;
+				// A '-' only forms a range if an end point follows it. Before
+				// the closing bracket it is an ordinary character, and it must
+				// never make us step over the end of the pattern.
+				if ( pattern[pattern_index + 1] == '-' &&
+				     pattern[pattern_index + 2] != '\0' &&
+				     pattern[pattern_index + 2] != ']' )
+				{
+					c_to = (unsigned char) pattern[pattern_index + 2];
+					consumed = 3;
+				}
+				if ( c_to < c_from )
+					return free(re), REG_ERANGE;
+				pattern_index += consumed;
+				for ( unsigned int uc = c_from; uc <= c_to; uc++ )
+				{
+					size_t byte_index = uc / 8;
+					size_t bit_index = uc % 8;
+					re->re_set.set[byte_index] |= (1 << bit_index);
+				}
+			}
+			if ( negate )
+			{
+				for ( size_t i = 0; i < 32; i++ )
+					re->re_set.set[i] = ~re->re_set.set[i];
+			}
+			pattern_index++; // Skip the closing ']' the loop stopped at.
+		}
+		else if ( escaped && ('0' <= c && c <= '9') )
+		{
+			// TODO: This isn't implemented yet (not part of ERE).
+			return free(re), REG_BADPAT;
+		}
+		else if ( !escaped && c == '.' )
+			re->re_type = RE_TYPE_ANY_CHAR;
+		else
+		{
+			re->re_type = RE_TYPE_CHAR;
+			re->re_char.c = c;
+		}
+
+		*prev_next_ptr = re;
+
+subexpression_done:
+		// See if a repetition operator follows the expression just parsed.
+		if ( (is_basic && pattern[pattern_index + 0] == '\\' &&
+		                  pattern[pattern_index + 1] == '{') ||
+		     (is_extended && pattern[pattern_index] == '{' ) )
+		{
+			pattern_index += is_extended ? 1 : 2;
+			if ( pattern[pattern_index] < '0' ||
+			     pattern[pattern_index] > '9' )
+				return REG_BADBR;
+			if ( !re_parse_bound(pattern, &pattern_index, &repeat_min) )
+				return REG_BADBR;
+			repeat_max = repeat_min;
+			if ( pattern[pattern_index] == ',' )
+			{
+				pattern_index += 1;
+				// No number after the comma means no upper bound.
+				repeat_max = SIZE_MAX;
+				if ( '0' <= pattern[pattern_index] &&
+				     pattern[pattern_index] <= '9' )
+				{
+					if ( !re_parse_bound(pattern, &pattern_index, &repeat_max) )
+						return REG_BADBR;
+					if ( repeat_max < repeat_min )
+						return REG_BADBR;
+				}
+			}
+			if ( (is_basic && pattern[pattern_index++] != '\\') ||
+			     pattern[pattern_index++] != '}' )
+				return REG_BADBR;
+			repeated = true;
+		}
+		else if ( pattern[pattern_index] == '*' )
+		{
+			pattern_index += 1;
+			repeat_min = 0;
+			repeat_max = SIZE_MAX;
+			repeated = true;
+		}
+		else if ( (is_basic && pattern[pattern_index + 0] == '\\' &&
+		                       pattern[pattern_index + 1] == '?') ||
+		          (is_extended && pattern[pattern_index] == '?' ) )
+		{
+			pattern_index += is_extended ? 1 : 2;
+			repeat_min = 0;
+			repeat_max = 1;
+			repeated = true;
+		}
+		else if ( (is_basic && pattern[pattern_index + 0] == '\\' &&
+		                       pattern[pattern_index + 1] == '+') ||
+		          (is_extended && pattern[pattern_index] == '+' ) )
+		{
+			pattern_index += is_extended ? 1 : 2;
+			repeat_min = 1;
+			repeat_max = SIZE_MAX;
+			repeated = true;
+		}
+
+		if ( repeated )
+		{
+			// Put a repetition node in the expression's place in the list; it
+			// takes ownership of the expression.
+			struct re* wrapper = (struct re*) calloc(1, sizeof(struct re));
+			if ( !wrapper )
+				return REG_ESPACE;
+			wrapper->re_type = RE_TYPE_REPETITION;
+			wrapper->re_repetition.re = re;
+			wrapper->re_repetition.min = repeat_min;
+			wrapper->re_repetition.max = repeat_max;
+			*prev_next_ptr = wrapper;
+			re = wrapper;
+		}
+
+		if ( re->re_type == RE_TYPE_SUBEXPRESSION )
+			re = re->re_next_owner; // RE_TYPE_SUBEXPRESSION_END.
+
+		prev_next_ptr = &re->re_next_owner;
+	}
+}
+
+// Deep-copy the list of nodes starting at templ, including everything the
+// nodes own, and store the copy in *re_ptr.
+//
+// The copy is made iteratively. Whenever a node with children is entered, the
+// node and its copy are pushed onto a stack threaded through the
+// re_upcoming_state_next members of both trees, so that member is clobbered
+// in the template as well as in the copy.
+//
+// Returns true on success. Returns false if an allocation failed, in which
+// case the nodes copied so far remain linked from the original *re_ptr.
+static inline bool re_duplicate(struct re* templ, struct re** re_ptr)
+{
+	struct re* copy;
+	struct re* parent_templ = NULL;
+	struct re* parent_copy = NULL;
+	while ( true )
+	{
+		if ( !templ )
+		{
+			// End of a list: resume after the innermost parent, or finish if
+			// this was the outermost list.
+			if ( parent_templ )
+			{
+				templ = parent_templ;
+				copy = parent_copy;
+				parent_templ = templ->re_upcoming_state_next;
+				parent_copy = copy->re_upcoming_state_next;
+				templ = templ->re_next_owner;
+				re_ptr = &copy->re_next_owner;
+				continue;
+			}
+			return *re_ptr = NULL, true;
+		}
+		if ( !(copy = (struct re*) calloc(1, sizeof(struct re))) )
+			return false;
+		*re_ptr = copy;
+		copy->re_type = templ->re_type;
+		switch ( templ->re_type )
+		{
+		case RE_TYPE_BOL:
+		case RE_TYPE_EOL:
+		case RE_TYPE_ANY_CHAR:
+			// These have no payload.
+			break;
+		case RE_TYPE_CHAR:
+			copy->re_char.c = templ->re_char.c;
+			break;
+		case RE_TYPE_SET:
+			memcpy(copy->re_set.set, templ->re_set.set, 32);
+			break;
+		case RE_TYPE_SUBEXPRESSION:
+			copy->re_subexpression.index = templ->re_subexpression.index;
+			templ->re_upcoming_state_next = parent_templ;
+			copy->re_upcoming_state_next = parent_copy;
+			parent_templ = templ;
+			parent_copy = copy;
+			templ = templ->re_subexpression.re_owner;
+			re_ptr = &copy->re_subexpression.re_owner;
+			continue;
+		case RE_TYPE_SUBEXPRESSION_END:
+			copy->re_subexpression.index = templ->re_subexpression.index;
+			break;
+		case RE_TYPE_ALTERNATIVE:
+		case RE_TYPE_OPTIONAL:
+		case RE_TYPE_LOOP:
+			templ->re_upcoming_state_next = parent_templ;
+			copy->re_upcoming_state_next = parent_copy;
+			parent_templ = templ;
+			parent_copy = copy;
+			templ = templ->re_split.re_owner;
+			re_ptr = &copy->re_split.re_owner;
+			continue;
+		case RE_TYPE_REPETITION:
+			copy->re_repetition.min = templ->re_repetition.min;
+			copy->re_repetition.max = templ->re_repetition.max;
+			templ->re_upcoming_state_next = parent_templ;
+			copy->re_upcoming_state_next = parent_copy;
+			parent_templ = templ;
+			parent_copy = copy;
+			templ = templ->re_repetition.re;
+			re_ptr = &copy->re_repetition.re;
+			continue;
+		default:
+			assert(false);
+		}
+		templ = templ->re_next_owner;
+		re_ptr = &copy->re_next_owner;
+	}
+}
+
+// Expand "templ repeated between min and max times" into plain nodes stored
+// in *re_ptr and followed by after. A max of SIZE_MAX means no upper bound.
+//
+// The expansion consists of min mandatory copies of templ, followed by either
+// one RE_TYPE_OPTIONAL wrapped copy per remaining permitted repetition, or a
+// single RE_TYPE_LOOP wrapped copy if the repetition is unbounded. templ
+// itself is only read and is not made part of the result.
+//
+// Returns true on success. Returns false if an allocation failed, in which
+// case the nodes built so far remain linked from the original *re_ptr and
+// after has not been linked in.
+static inline bool re_repetition(struct re* templ,
+                                 struct re** re_ptr,
+                                 size_t min,
+                                 size_t max,
+                                 struct re* after)
+{
+	while ( true )
+	{
+		if ( !max )
+			return *re_ptr = after, true;
+		struct re* copy = (struct re*) calloc(1, sizeof(struct re));
+		if ( !copy )
+			return false;
+		*re_ptr = copy;
+		copy->re_type = templ->re_type;
+		switch ( templ->re_type )
+		{
+		case RE_TYPE_BOL:
+		case RE_TYPE_EOL:
+		case RE_TYPE_ANY_CHAR:
+			// These have no payload.
+			break;
+		case RE_TYPE_CHAR:
+			copy->re_char.c = templ->re_char.c;
+			break;
+		case RE_TYPE_SET:
+			memcpy(copy->re_set.set, templ->re_set.set, 32);
+			break;
+		case RE_TYPE_SUBEXPRESSION:
+		{
+			// A group is its start node, the contents it owns, and the end
+			// marker that follows it.
+			copy->re_subexpression.index = templ->re_subexpression.index;
+			if ( !re_duplicate(templ->re_subexpression.re_owner,
+			                   &copy->re_subexpression.re_owner) )
+				return false;
+			struct re* templ_end = templ->re_next_owner;
+			assert(templ_end && templ_end->re_type == RE_TYPE_SUBEXPRESSION_END);
+			struct re* end = (struct re*) calloc(1, sizeof(struct re));
+			if ( !end )
+				return false;
+			end->re_type = RE_TYPE_SUBEXPRESSION_END;
+			end->re_subexpression.index = templ_end->re_subexpression.index;
+			copy->re_next_owner = end;
+			break;
+		}
+		default:
+			assert(false);
+		}
+		if ( 1 <= min )
+		{
+			// Mandatory copy: the remainder goes after its last node.
+			while ( copy->re_next_owner )
+				copy = copy->re_next_owner;
+			re_ptr = &copy->re_next_owner;
+			if ( max != SIZE_MAX )
+				max--;
+			min--;
+		}
+		else if ( max < SIZE_MAX )
+		{
+			// One of a bounded number of optional copies.
+			struct re* wrap = (struct re*) calloc(1, sizeof(struct re));
+			if ( !wrap )
+				return false;
+			wrap->re_type = RE_TYPE_OPTIONAL;
+			wrap->re_split.re_owner = copy;
+			*re_ptr = wrap;
+			re_ptr = &wrap->re_next_owner;
+			max--;
+		}
+		else
+		{
+			// Unbounded: a single looping copy covers every further
+			// repetition, so this is the last node to add.
+			struct re* wrap = (struct re*) calloc(1, sizeof(struct re));
+			if ( !wrap )
+				return false;
+			wrap->re_type = RE_TYPE_LOOP;
+			wrap->re_split.re_owner = copy;
+			*re_ptr = wrap;
+			re_ptr = &wrap->re_next_owner;
+			max = 0;
+		}
+	}
+}
+
+// Rewrite the tree at *re_ptr so it no longer contains RE_TYPE_REPETITION
+// nodes, expanding each of them with re_repetition(), and add the number of
+// nodes in the resulting tree to *state_count_ptr. A tree that is empty, or
+// becomes empty through expansion, is replaced with a single RE_TYPE_BOL
+// placeholder node.
+//
+// The walk is iterative: when descending into a node's children, the address
+// of the slot holding that node is pushed onto a stack threaded through the
+// re_current_state_prev members, which are clobbered as a result.
+//
+// Returns true on success. Returns false if an allocation failed, in which
+// case the tree is left intact for the caller to free.
+static inline bool re_transform(struct re** re_ptr, size_t* state_count_ptr)
+{
+	struct re** root_ptr = re_ptr;
+	struct re** parent_ptr = NULL;
+	while ( true )
+	{
+		// At the end of a list, climb to the node after the parent. This must
+		// also happen when a list ends because a repetition expanded to
+		// nothing, or the rest of the tree would go unvisited and uncounted.
+		while ( !*re_ptr && parent_ptr )
+		{
+			re_ptr = parent_ptr;
+			parent_ptr = (struct re**) (*re_ptr)->re_current_state_prev;
+			re_ptr = &(*re_ptr)->re_next_owner;
+		}
+		if ( !*re_ptr )
+			break;
+
+		struct re* re = *re_ptr;
+
+		if ( re->re_type == RE_TYPE_REPETITION )
+		{
+			struct re* templ = re->re_repetition.re;
+			size_t min = re->re_repetition.min;
+			size_t max = re->re_repetition.max;
+			struct re* after = re->re_next_owner;
+			struct re* replacement = NULL;
+			// Detach the tail so freeing the repetition node below doesn't
+			// free the nodes that follow it.
+			re->re_next_owner = NULL;
+			if ( !re_repetition(templ, &replacement, min, max, after) )
+			{
+				// Discard the partial expansion and restore the tail so the
+				// caller can free the whole tree as usual.
+				re->re_next_owner = after;
+				if ( replacement )
+					re_free(replacement);
+				return false;
+			}
+			re_free(re);
+			// Visit the replacement next: it is what gets counted, and it may
+			// itself contain repetitions copied from inside a subexpression.
+			*re_ptr = replacement;
+			continue;
+		}
+
+		(*state_count_ptr)++;
+
+		if ( re->re_type == RE_TYPE_SUBEXPRESSION &&
+		     re->re_subexpression.re_owner )
+		{
+			re->re_current_state_prev = (struct re*) parent_ptr;
+			parent_ptr = re_ptr;
+			re_ptr = &re->re_subexpression.re_owner;
+			continue;
+		}
+
+		if ( (re->re_type == RE_TYPE_ALTERNATIVE ||
+		      re->re_type == RE_TYPE_OPTIONAL ||
+		      re->re_type == RE_TYPE_LOOP) && re->re_split.re_owner )
+		{
+			re->re_current_state_prev = (struct re*) parent_ptr;
+			parent_ptr = re_ptr;
+			re_ptr = &re->re_split.re_owner;
+			continue;
+		}
+
+		re_ptr = &re->re_next_owner;
+	}
+
+	if ( !*root_ptr )
+	{
+		struct re* re;
+		if ( !(re = (struct re*) calloc(1, sizeof(struct re))) )
+			return false;
+		re->re_type = RE_TYPE_BOL;
+		*root_ptr = re;
+		(*state_count_ptr)++;
+	}
+
+	return true;
+}
+
+// Visit every node of the tree at re, assign each its slice of the matches
+// array, and derive the non-owning links (re_next and re_split.re) from the
+// owning ones (re_next_owner and re_split.re_owner).
+//
+// Each visited node is numbered from *state_count_ptr, which is incremented
+// per node, and given the matches_per_state entries of matches that start at
+// its number times matches_per_state.
+//
+// The walk is iterative and keeps its stack in the nodes themselves: when a
+// node is pushed, re_current_state_prev holds the previous parent,
+// re_current_state_next the previous parent_link, and, where the node is
+// resumed through it, re_upcoming_state_next the node to continue at.
+static inline void re_control_flow(struct re* re,
+                                   regmatch_t* matches,
+                                   size_t matches_per_state,
+                                   size_t* state_count_ptr)
+{
+	// parent is the innermost node to resume at once the current list ends.
+	// parent_link is what the last node of the current list is linked to.
+	struct re* parent = NULL;
+	struct re* parent_link = NULL;
+	while ( re )
+	{
+		size_t re_index = (*state_count_ptr)++;
+		size_t offset = re_index * matches_per_state;
+		re->re_matches = matches + offset;
+
+		if ( re->re_type == RE_TYPE_ALTERNATIVE )
+		{
+			// The two branches are re_split.re_owner and re_next_owner. A
+			// missing branch is linked straight to parent_link.
+			if ( !re->re_split.re_owner )
+				re->re_split.re = parent_link;
+			if ( !re->re_next_owner )
+				re->re_next = parent_link;
+			if ( re->re_split.re_owner && re->re_next_owner )
+			{
+				// Both branches exist: walk the split branch now and push
+				// this node so the other branch is walked afterwards.
+				re->re_next = re->re_next_owner;
+				re->re_current_state_prev = parent;
+				re->re_current_state_next = parent_link;
+				re->re_upcoming_state_next = re->re_next_owner;
+				parent = re;
+				re = re->re_split.re = re->re_split.re_owner;
+			}
+			else if ( re->re_split.re_owner )
+				re = re->re_split.re = re->re_split.re_owner;
+			else if ( re->re_next_owner )
+				re = re->re_next = re->re_next_owner;
+			else if ( parent )
+			{
+				// Neither branch exists: pop and continue where the parent
+				// recorded.
+				re = parent;
+				parent = re->re_current_state_prev;
+				parent_link = re->re_current_state_next;
+				re = re->re_upcoming_state_next;
+			}
+			else
+				re = NULL;
+			continue;
+		}
+
+		// The last node of a list continues at parent_link if there is one.
+		if ( !re->re_next_owner && parent_link )
+			re->re_next = parent_link;
+		else
+			re->re_next = re->re_next_owner;
+
+		if ( re->re_type == RE_TYPE_LOOP || re->re_type == RE_TYPE_OPTIONAL )
+		{
+			// re_next enters the wrapped expression while re_split.re skips
+			// over it to whatever follows.
+			struct re* inner = re->re_split.re_owner;
+			struct re* after = re->re_next;
+			re->re_split.re = after;
+			re->re_next = inner;
+			if ( re->re_next_owner )
+			{
+				re->re_current_state_prev = parent;
+				re->re_current_state_next = parent_link;
+				re->re_upcoming_state_next = after;
+				parent = re;
+			}
+			// The end of a loop's body leads back to the loop node, the end
+			// of an optional's body leads on to what follows.
+			if ( re->re_type == RE_TYPE_LOOP )
+				parent_link = re;
+			else
+				parent_link = after;
+			re = inner;
+			continue;
+		}
+
+		if ( re->re_type == RE_TYPE_SUBEXPRESSION )
+		{
+			if ( re->re_subexpression.re_owner )
+			{
+				// Enter the contents, whose last node is linked to what
+				// follows the start node.
+				re->re_current_state_prev = parent;
+				re->re_current_state_next = parent_link;
+				re->re_upcoming_state_next = re->re_next_owner;
+				parent = re;
+				parent_link = re->re_next;
+				re->re_next = re->re_subexpression.re_owner;
+				re = re->re_subexpression.re_owner;
+				continue;
+			}
+		}
+
+		// At the end of a list, pop the parent and carry on after it.
+		if ( !re->re_next_owner && parent )
+		{
+			re = parent;
+			parent = re->re_current_state_prev;
+			parent_link = re->re_current_state_next;
+		}
+
+		re = re->re_next_owner;
+	}
+}
+
+// Compile pattern into regex according to cflags. Returns 0 on success, or a
+// REG_* error code after having released regex again.
+//
+// Compilation happens in three steps: the pattern is parsed into a tree, the
+// repetitions in the tree are expanded while the nodes are counted, and match
+// storage is then allocated and shared out as the nodes are linked up.
+extern "C"
+int regcomp(regex_t* restrict regex,
+            const char* restrict pattern,
+            int cflags)
+{
+	// TODO: Verify cflags.
+	// TODO: Implement REG_ICASE.
+	// TODO: Implement REG_NOSUB.
+	// TODO: Implement REG_NEWLINE.
+	memset(regex, 0, sizeof(*regex));
+	pthread_mutex_init(&regex->re_lock, NULL);
+	regex->re_cflags = cflags;
+
+	struct re_parse parse;
+	memset(&parse, 0, sizeof(parse));
+	parse.subexpr_num = 1;
+	int result = re_parse(&parse, &regex->re, pattern, cflags);
+
+	// Subexpressions left open by a failed parse are still on the stack.
+	struct re_parse_subexpr* open = parse.subexpr;
+	while ( open )
+	{
+		struct re_parse_subexpr* enclosing = open->next;
+		free(open);
+		open = enclosing;
+	}
+	parse.subexpr = NULL;
+
+	if ( result != 0 )
+	{
+		regfree(regex);
+		return result;
+	}
+
+	size_t state_count = 0;
+	if ( !re_transform(&regex->re, &state_count) )
+	{
+		regfree(regex);
+		return REG_ESPACE;
+	}
+
+	// Every node gets one regmatch_t per subexpression, plus one extra.
+	size_t matches_length;
+	if ( __builtin_mul_overflow(parse.subexpr_num, state_count, &matches_length) )
+	{
+		regfree(regex);
+		return REG_ESPACE;
+	}
+	regex->re_matches = (regmatch_t*)
+		reallocarray(NULL, matches_length, sizeof(regmatch_t));
+	if ( !regex->re_matches )
+	{
+		regfree(regex);
+		return REG_ESPACE;
+	}
+
+	size_t state_recount = 0;
+	re_control_flow(regex->re, regex->re_matches, parse.subexpr_num, &state_recount);
+	assert(state_count == state_recount);
+
+	if ( !(cflags & REG_NOSUB) )
+		regex->re_nsub = parse.subexpr_num - 1;
+	return 0;
+}
--- a/libc/regex/regerror.cpp
+++ b/libc/regex/regerror.cpp
@ -0,0 +1,56 @@
+/*******************************************************************************
+
+    Copyright(C) Jonas 'Sortie' Termansen 2014, 2015.
+
+    This file is part of the Sortix C Library.
+
+    The Sortix C Library is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Lesser General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or (at your
+    option) any later version.
+
+    The Sortix C Library is distributed in the hope that it will be useful, but
+    WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+    or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+    License for more details.
+
+    You should have received a copy of the GNU Lesser General Public License
+    along with the Sortix C Library. If not, see <http://www.gnu.org/licenses/>.
+
+    regex/regerror.cpp
+    Regular expression error reporting.
+
+*******************************************************************************/
+
+#include <regex.h>
+#include <stdio.h>
+#include <string.h>
+
+// Copy the message describing errnum into errbuf, truncated to fit
+// errbuf_size bytes, and return the size needed to hold the whole message
+// including its terminating NUL. Nothing is copied if errbuf_size is zero.
+// regex is not used.
+extern "C"
+size_t regerror(int errnum,
+                const regex_t* restrict regex,
+                char* restrict errbuf,
+                size_t errbuf_size)
+{
+	(void) regex;
+
+	static const struct
+	{
+		int code;
+		const char* text;
+	} known[] =
+	{
+		{ REG_NOMATCH, "Regular expression does not match" },
+		{ REG_BADPAT, "Invalid regular expression" },
+		{ REG_ECOLLATE, "Invalid collating element referenced" },
+		{ REG_ECTYPE, "Invalid character class type referenced" },
+		{ REG_EESCAPE, "Trailing <backslash> character in pattern" },
+		{ REG_ESUBREG, "Number in \\digit invalid or in error" },
+		{ REG_EBRACK, "\"[]\" imbalance" },
+		{ REG_EPAREN, "\"\\(\\)\" or \"()\" imbalance" },
+		{ REG_EBRACE, "\"\\{\\}\" imbalance" },
+		{ REG_BADBR, "Content of \"\\{\\}\" invalid: not a number, number too large, more than two numbers, first larger than second" },
+		{ REG_ERANGE, "Invalid endpoint in range expression" },
+		{ REG_ESPACE, "Out of memory" },
+		{ REG_BADRPT, "'?', '*', or '+' not preceded by valid regular expression" },
+	};
+
+	// Codes missing from the table get the generic message.
+	const char* text = "Unknown regular expression error";
+	for ( size_t i = 0; i < sizeof(known) / sizeof(known[0]); i++ )
+	{
+		if ( known[i].code == errnum )
+		{
+			text = known[i].text;
+			break;
+		}
+	}
+
+	if ( errbuf_size != 0 )
+		strlcpy(errbuf, text, errbuf_size);
+	return strlen(text) + 1;
+}
--- a/libc/regex/regexec.cpp
+++ b/libc/regex/regexec.cpp
@ -0,0 +1,253 @@
+/*******************************************************************************
+
+    Copyright(C) Jonas 'Sortie' Termansen 2014, 2015.
+
+    This file is part of the Sortix C Library.
+
+    The Sortix C Library is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Lesser General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or (at your
+    option) any later version.
+
+    The Sortix C Library is distributed in the hope that it will be useful, but
+    WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+    or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+    License for more details.
+
+    You should have received a copy of the GNU Lesser General Public License
+    along with the Sortix C Library. If not, see <http://www.gnu.org/licenses/>.
+
+    regex/regexec.cpp
+    Regular expression execution.
+
+*******************************************************************************/
+
+#include <assert.h>
+#include <regex.h>
+#include <pthread.h>
+
+// Schedules new_state for processing at the current input position, directly
+// after the state being processed. The expansion site must provide state,
+// match, current_states, current_states_last and nmatch. The new_state
+// argument is evaluated several times, so it must be free of side effects.
+//
+// A NULL new_state is treated as a match: match is set, every state queued
+// after state has its re_is_current flag cleared, and the current list is cut
+// off after state.
+//
+// Otherwise nothing happens if new_state is already in the current list and
+// has already been processed at this position. If not, new_state is unlinked
+// from the current list if it was already in it, inserted right after state,
+// flagged as current, not done and not upcoming, and given a copy of the first
+// nmatch entries of state's re_matches.
+#define QUEUE_CURRENT_STATE(new_state) \
+{ \
+	if ( !new_state ) \
+	{ \
+		match = true; \
+		for ( struct re* re = state->re_current_state_next; \
+		      re; \
+		      re = re->re_current_state_next ) \
+			re->re_is_current = 0; \
+		state->re_current_state_next = NULL; \
+		current_states_last = state; \
+	} \
+	else if ( !(new_state->re_is_current && new_state->re_is_currently_done) ) \
+	{ \
+		if ( new_state->re_is_current ) \
+		{ \
+			if ( new_state->re_current_state_prev ) \
+				new_state->re_current_state_prev->re_current_state_next = \
+					new_state->re_current_state_next; \
+			else \
+				current_states = new_state->re_current_state_next; \
+			if ( new_state->re_current_state_next ) \
+				new_state->re_current_state_next->re_current_state_prev = \
+					new_state->re_current_state_prev; \
+			else \
+				current_states_last = new_state->re_current_state_prev; \
+		} \
+		new_state->re_current_state_prev = state; \
+		new_state->re_current_state_next = state->re_current_state_next; \
+		if ( state->re_current_state_next ) \
+			state->re_current_state_next->re_current_state_prev = new_state; \
+		else \
+			current_states_last = new_state; \
+		state->re_current_state_next = new_state; \
+		new_state->re_is_currently_done = 0; \
+		new_state->re_is_current = 1; \
+		new_state->re_is_upcoming = 0; \
+		for ( size_t m = 0; m < nmatch; m++ ) \
+			new_state->re_matches[m] = state->re_matches[m]; \
+	} \
+} \
+
+// Schedules new_state for processing at the next input position, used by the
+// states that consume the current character. The expansion site must provide
+// state, match, consumed_char, current_states_last, upcoming_states,
+// upcoming_states_last and nmatch. The new_state argument is evaluated several
+// times, so it must be free of side effects.
+//
+// A NULL new_state is treated as a match that includes the current character:
+// consumed_char and match are set, every state queued after state has its
+// re_is_current flag cleared, and the current list is cut off after state.
+//
+// Otherwise, unless new_state is already in the upcoming list, it is appended
+// to the end of that list, flagged as upcoming, and given a copy of the first
+// nmatch entries of state's re_matches.
+#define QUEUE_UPCOMING_STATE(new_state) \
+{ \
+	if ( !new_state ) \
+	{ \
+		consumed_char = true; \
+		match = true; \
+		for ( struct re* re = state->re_current_state_next; \
+		      re; \
+		      re = re->re_current_state_next ) \
+			re->re_is_current = 0; \
+		state->re_current_state_next = NULL; \
+		current_states_last = state; \
+	} \
+	else if ( !new_state->re_is_upcoming ) \
+	{ \
+		if ( !upcoming_states ) \
+			upcoming_states = new_state; \
+		if ( upcoming_states_last ) \
+			upcoming_states_last->re_upcoming_state_next = new_state; \
+		upcoming_states_last = new_state; \
+		new_state->re_upcoming_state_next = NULL; \
+		new_state->re_is_upcoming = 1; \
+		for ( size_t m = 0; m < nmatch; m++ ) \
+			new_state->re_matches[m] = state->re_matches[m]; \
+	} \
+} \
+
+// Matches the NUL-terminated string against a compiled regular expression by
+// simulating its states one input position at a time. The states in the
+// current list are processed against the character at position i; states
+// reached by consuming that character are collected in the upcoming list,
+// which becomes the current list for position i + 1.
+//
+// All nmatch entries of pmatch are first set to -1. On a match, the first
+// min(nmatch, re_nsub + 1) entries receive the offsets recorded by the
+// matching state, entry 0 being the whole match. nmatch is treated as 0 if the
+// expression was compiled with REG_NOSUB. REG_NOTBOL and REG_NOTEOL in eflags
+// keep the BOL and EOL states from matching.
+//
+// Returns 0 if a match was found and REG_NOMATCH otherwise.
+//
+// The bookkeeping of the search is stored in the nodes of the compiled
+// expression, so the const qualifier is cast away and the whole search runs
+// with regex->re_lock held.
+extern "C"
+int regexec(const regex_t* restrict regex_const,
+            const char* restrict string,
+            size_t nmatch,
+            regmatch_t* restrict pmatch,
+            int eflags)
+{
+	// TODO: Sanitize eflags.
+
+	regex_t* regex = (regex_t*) regex_const;
+	pthread_mutex_lock(&regex->re_lock);
+
+	if ( regex->re_cflags & REG_NOSUB )
+		nmatch = 0;
+
+	// Every entry the caller asked for is marked unused before nmatch is
+	// clamped to the number of entries the expression can fill in.
+	for ( size_t i = 0; i < nmatch; i++ )
+	{
+		pmatch[i].rm_so = -1;
+		pmatch[i].rm_eo = -1;
+	}
+
+	if ( regex->re_nsub + 1 < nmatch )
+		nmatch = regex->re_nsub + 1;
+
+	int result = REG_NOMATCH;
+
+	struct re* current_states = NULL;
+	struct re* current_states_last = NULL;
+	struct re* upcoming_states = NULL;
+	struct re* upcoming_states_last = NULL;
+
+	regex->re->re_is_current = 0;
+
+	for ( size_t i = 0; true; i++ )
+	{
+		// As long as nothing has matched yet, start an attempt at this
+		// position by appending the first state to the end of the current
+		// list, with the whole match beginning at i.
+		if ( !regex->re->re_is_current && result == REG_NOMATCH )
+		{
+			if ( current_states_last )
+				current_states_last->re_current_state_next = regex->re;
+			else
+				current_states = regex->re;
+			regex->re->re_current_state_prev = current_states_last;
+			regex->re->re_current_state_next = NULL;
+			current_states_last = regex->re;
+			regex->re->re_is_currently_done = 0;
+			regex->re->re_is_current = 1;
+			regex->re->re_is_upcoming = 0;
+			for ( size_t m = 0; m < nmatch; m++ )
+			{
+				regex->re->re_matches[m].rm_so = m == 0 ? i : -1;
+				regex->re->re_matches[m].rm_eo = -1;
+			}
+		}
+		char c = string[i];
+		// The QUEUE_* macros may insert states after the one being processed
+		// or cut the list off after it, so the list changes during this loop.
+		for ( struct re* state = current_states;
+		      state;
+		      state = state->re_current_state_next )
+		{
+			bool match = false;
+			bool consumed_char = false;
+			if ( state->re_type == RE_TYPE_BOL )
+			{
+				if ( !(eflags & REG_NOTBOL) )
+					QUEUE_CURRENT_STATE(state->re_next);
+			}
+			else if ( state->re_type == RE_TYPE_EOL )
+			{
+				if ( !(eflags & REG_NOTEOL) && c == '\0' )
+					QUEUE_CURRENT_STATE(state->re_next);
+			}
+			else if ( state->re_type == RE_TYPE_CHAR )
+			{
+				if ( c != '\0' && state->re_char.c == c )
+					QUEUE_UPCOMING_STATE(state->re_next);
+			}
+			else if ( state->re_type == RE_TYPE_ANY_CHAR )
+			{
+				if ( c != '\0' )
+					QUEUE_UPCOMING_STATE(state->re_next);
+			}
+			else if ( state->re_type == RE_TYPE_SET )
+			{
+				// The set is a bitmap with one bit per unsigned char value.
+				unsigned char uc = c;
+				if ( c != '\0' && (state->re_set.set[uc / 8] & (1 << (uc % 8))) )
+					QUEUE_UPCOMING_STATE(state->re_next);
+			}
+			else if ( state->re_type == RE_TYPE_SUBEXPRESSION )
+			{
+				size_t index = state->re_subexpression.index;
+				state->re_matches[index].rm_so = i;
+				QUEUE_CURRENT_STATE(state->re_next);
+			}
+			else if ( state->re_type == RE_TYPE_SUBEXPRESSION_END )
+			{
+				size_t index = state->re_subexpression.index;
+				state->re_matches[index].rm_eo = i;
+				QUEUE_CURRENT_STATE(state->re_next);
+			}
+			else if ( state->re_type == RE_TYPE_ALTERNATIVE ||
+			          state->re_type == RE_TYPE_OPTIONAL ||
+			          state->re_type == RE_TYPE_LOOP )
+			{
+				// Both branches are followed without consuming input.
+				QUEUE_CURRENT_STATE(state->re_split.re);
+				QUEUE_CURRENT_STATE(state->re_next);
+			}
+			state->re_is_currently_done = 1;
+			if ( match )
+			{
+				// The match ends after the current character if the state
+				// consumed it, and before it otherwise.
+				state->re_matches[0].rm_eo = i + consumed_char;
+				for ( size_t m = 0; m < nmatch; m++ )
+					pmatch[m] = state->re_matches[m];
+				result = 0;
+				if ( nmatch == 0 )
+					break;
+			}
+		}
+
+		// This position is finished, so no state is current anymore.
+		for ( struct re* re = current_states; re; re = re->re_current_state_next )
+			re->re_is_current = 0;
+
+		// With no offsets to report, the first match found is sufficient.
+		if ( nmatch == 0 && result == 0 )
+		{
+			for ( struct re* re = upcoming_states; re; re = re->re_upcoming_state_next )
+				re->re_is_upcoming = 0;
+			break;
+		}
+
+		// The upcoming list becomes the current list for the next position
+		// and is relinked through the current list pointers.
+		current_states = upcoming_states;
+		if ( current_states )
+			current_states->re_current_state_prev = NULL;
+		current_states_last = upcoming_states_last;
+		for ( struct re* re = current_states; re; re = re->re_current_state_next )
+		{
+			re->re_is_currently_done = 0;
+			re->re_is_current = 1;
+			re->re_is_upcoming = 0;
+			re->re_current_state_next = re->re_upcoming_state_next;
+			if ( re->re_current_state_next )
+				re->re_current_state_next->re_current_state_prev = re;
+		}
+		upcoming_states = NULL;
+		upcoming_states_last = NULL;
+
+		// Only position 0 can be the beginning of the line.
+		eflags |= REG_NOTBOL;
+
+		// A match was found and no remaining state could extend it.
+		if ( current_states == NULL && result == 0 )
+			break;
+
+		if ( c == '\0' )
+			break;
+	}
+
+	pthread_mutex_unlock(&regex->re_lock);
+
+	return result;
+}
--- a/libc/regex/regfree.cpp
+++ b/libc/regex/regfree.cpp
@ -0,0 +1,72 @@
+/*******************************************************************************
+
+    Copyright(C) Jonas 'Sortie' Termansen 2014, 2015.
+
+    This file is part of the Sortix C Library.
+
+    The Sortix C Library is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Lesser General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or (at your
+    option) any later version.
+
+    The Sortix C Library is distributed in the hope that it will be useful, but
+    WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+    or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+    License for more details.
+
+    You should have received a copy of the GNU Lesser General Public License
+    along with the Sortix C Library. If not, see <http://www.gnu.org/licenses/>.
+
+    regex/regfree.cpp
+    Regular expression freeing.
+
+*******************************************************************************/
+
+#include <regex.h>
+#include <stdlib.h>
+
+// Releases what a compiled regular expression owns: the nodes reached from
+// regex->re through the owning pointers, the regex->re_matches array, and the
+// lock.
+//
+// The traversal is iterative. Before descending into a chain owned by a node
+// (re_subexpression.re_owner, re_split.re_owner or re_repetition.re), the node
+// is pushed onto a stack threaded through its re_next field, and its owning
+// pointer is cleared so that the node is deleted rather than descended into
+// again once it is popped.
+extern "C" void regfree(regex_t* regex)
+{
+	struct re* parent = NULL;
+	struct re* re = regex->re;
+	while ( re )
+	{
+		if ( re->re_type == RE_TYPE_SUBEXPRESSION && re->re_subexpression.re_owner )
+		{
+			re->re_next = parent;
+			parent = re;
+			re = parent->re_subexpression.re_owner;
+			parent->re_subexpression.re_owner = NULL;
+			continue;
+		}
+		if ( (re->re_type == RE_TYPE_ALTERNATIVE ||
+		      re->re_type == RE_TYPE_OPTIONAL ||
+		      re->re_type == RE_TYPE_LOOP) &&
+		     re->re_split.re_owner )
+		{
+			re->re_next = parent;
+			parent = re;
+			re = parent->re_split.re_owner;
+			parent->re_split.re_owner = NULL;
+			continue;
+		}
+		if ( re->re_type == RE_TYPE_REPETITION && re->re_repetition.re )
+		{
+			re->re_next = parent;
+			parent = re;
+			re = parent->re_repetition.re;
+			parent->re_repetition.re = NULL;
+			continue;
+		}
+		// Nothing is left below this node: delete it and continue with the
+		// next node of the owned chain, or with the node on top of the stack
+		// if the chain ended here.
+		struct re* todelete = re;
+		re = re->re_next_owner;
+		if ( !re && parent )
+		{
+			re = parent;
+			parent = re->re_next;
+		}
+		free(todelete);
+	}
+	free(regex->re_matches);
+	pthread_mutex_destroy(&regex->re_lock);
+}
--- a/utils/expr.cpp
+++ b/utils/expr.cpp
@ -24,6 +24,7 @@
 #include <error.h>
 #include <inttypes.h>
 #include <locale.h>
+#include <regex.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
@ -43,6 +44,14 @@ char* strdup_or_die(const char* str)
 	return result;
 }

+// Duplicates at most n bytes of str into a newly allocated, NUL-terminated
+// string. If the allocation fails, the failure is reported through error()
+// with exit status 2, so callers never have to handle a NULL result.
+char* strndup_or_die(const char* str, size_t n)
+{
+	char* result = strndup(str, n);
+	// strndup reports failure through its return value, so that is what has
+	// to be checked here, not the input pointer.
+	if ( !result )
+		error(2, errno, "strndup");
+	return result;
+}
+
 char* print_intmax_or_die(intmax_t value)
 {
 	char value_string[sizeof(intmax_t) * 3];
@ -282,16 +291,50 @@ char* evaluate_mod(const char* a, const char* b)
 	return evaluate_integer_function(a, b, integer_mod);
 }

-// TODO: Implement regular expression pattern matching!
+// Matches the string a against the regular expression b, which is compiled
+// with regcomp without any flags. If b does not compile, the message from
+// regerror is reported through error() with exit status 2; the message is
+// fetched into a heap buffer if it does not fit in the local one.
+//
+// A match only counts if it begins at offset 0 of a. In that case the result
+// is the text matched by the first subexpression if it took part in the match,
+// and the end offset of the whole match printed as a number otherwise. Without
+// such a match, the result is the empty string if the expression contains
+// subexpressions and "0" if it does not.
 char* evaluate_match(const char* a, const char* b)
 {
-	size_t b_length = strlen(b);
-	for ( size_t i = 0; i < b_length; i++ )
+	regex_t regex;
+	int status = regcomp(&regex, b, 0);
+	if ( status != 0 )
 	{
-		if ( b[i] != a[i] )
-			return strdup_or_die("0");
+		char errbuf[256];
+		const char* errmsg = errbuf;
+		char* erralloc = NULL;
+		size_t errbuf_needed;
+		if ( sizeof(errbuf) < (errbuf_needed = regerror(status, &regex, errbuf,
+		                                                sizeof(errbuf))) )
+		{
+			if ( (erralloc = (char*) malloc(errbuf_needed)) )
+			{
+				errmsg = erralloc;
+				regerror(status, &regex, erralloc, errbuf_needed);
 			}
-	return print_intmax_or_die((intmax_t) strlen(a));
+		}
+		error(2, 0, "compiling regular expression: %s", errmsg);
+		free(erralloc);
+	}
+
+	char* result;
+
+	regmatch_t rm[2];
+	if ( regexec(&regex, a, 2, rm, 0) == 0 && rm[0].rm_so == 0 )
+	{
+		if ( 0 <= rm[1].rm_so )
+			result = strndup_or_die(a + rm[1].rm_so, rm[1].rm_eo - rm[1].rm_so);
+		else
+			result = print_intmax_or_die(rm[0].rm_eo);
+	}
+	else
+	{
+		if ( 0 < regex.re_nsub )
+			result = strdup_or_die("");
+		else
+			result = strdup_or_die("0");
+	}
+
+	regfree(&regex);
+
+	return result;
 }

 struct binary_operator