rewrite the regex pattern parser in regcomp

The new code is a bit simpler and the generated code is about 1KB smaller (on i386). The basic design was kept, including the internal interfaces; TNFA generation was not touched.

The old TRE parser had various issues:

- [^aa-z]: negated overlapping ranges in a bracket expression were handled incorrectly (e.g. [^aa-z] was handled as [^a] instead of [^a-z]).
- a{,2}: a missing lower bound in a counted repetition should be an error, but it was accepted with broken semantics: a{,2} was treated as a{0,3}. The new parser rejects it.
- a{999,}: a large min count was not rejected (a{5000,} failed with REG_ESPACE due to reaching a stack limit). The new parser enforces the RE_DUP_MAX limit.
- \xff: regcomp used to accept a pattern with illegal sequences in it (it treated them as an empty expression, so p\xffq matched pq). The new parser rejects such patterns with REG_BADPAT or REG_ERANGE.
- [^b-fD-H] with REG_ICASE: the old parser turned this into [^b-fB-F] because of the negated overlapping range issue (see above); the new parser treats it as [^b-hB-H]. POSIX seems to require [^d-fD-F], but practical implementations do case-folding first and negate the character set later instead of the other way around. (Supporting the POSIX way efficiently would require significant changes, so it was left as is. It is unclear if any application actually expects the POSIX behaviour; this issue is raised on the austingroup tracker: http://austingroupbugs.net/view.php?id=872 ).

Another case-insensitive matching issue is that Unicode case folding rules can group more than two characters together, while towupper and towlower can only work for a pair of upper and lower case characters. This is a limitation of POSIX, so it is not fixed.

Invalid bracket and brace expressions may return different error codes now (REG_ERANGE instead of REG_EBRACK, or REG_BADBR instead of REG_EBRACE); otherwise the new parser should be compatible with the old one.

regcomp should be able to handle arbitrary pattern input if the pattern length is limited; the only exception is the use of large repetition counts (e.g. (a{255}){255}), which require an exponential amount of memory, and there is no easy workaround.

rewrite the regex pattern parser in regcomp
The new code is a bit simpler and the generated code is about 1KB smaller (on i386). The basic design was kept, including the internal interfaces; TNFA generation was not touched.

The old TRE parser had various issues:

- [^aa-z]: negated overlapping ranges in a bracket expression were handled incorrectly (e.g. [^aa-z] was handled as [^a] instead of [^a-z]).
- a{,2}: a missing lower bound in a counted repetition should be an error, but it was accepted with broken semantics: a{,2} was treated as a{0,3}. The new parser rejects it.
- a{999,}: a large min count was not rejected (a{5000,} failed with REG_ESPACE due to reaching a stack limit). The new parser enforces the RE_DUP_MAX limit.
- \xff: regcomp used to accept a pattern with illegal sequences in it (it treated them as an empty expression, so p\xffq matched pq). The new parser rejects such patterns with REG_BADPAT or REG_ERANGE.
- [^b-fD-H] with REG_ICASE: the old parser turned this into [^b-fB-F] because of the negated overlapping range issue (see above); the new parser treats it as [^b-hB-H]. POSIX seems to require [^d-fD-F], but practical implementations do case-folding first and negate the character set later instead of the other way around. (Supporting the POSIX way efficiently would require significant changes, so it was left as is. It is unclear if any application actually expects the POSIX behaviour; this issue is raised on the austingroup tracker: http://austingroupbugs.net/view.php?id=872 ).

Another case-insensitive matching issue is that Unicode case folding rules can group more than two characters together, while towupper and towlower can only work for a pair of upper and lower case characters. This is a limitation of POSIX, so it is not fixed.

Invalid bracket and brace expressions may return different error codes now (REG_ERANGE instead of REG_EBRACK, or REG_BADBR instead of REG_EBRACE); otherwise the new parser should be compatible with the old one.

regcomp should be able to handle arbitrary pattern input if the pattern length is limited; the only exception is the use of large repetition counts (e.g. (a{255}){255}), which require an exponential amount of memory, and there is no easy workaround.
ec1aed0a · Szabolcs Nagy · bd082916 · ec1aed0a
显示空白变更内容
内联并排

Showing changes with 634 additions and 1081 deletions

src/regex/regcomp.c src/regex/regcomp.c +634 -1081

未找到文件。
--- a/src/regex/regcomp.c
+++ b/src/regex/regcomp.c
@@ -34,6 +34,7 @@
 #include <regex.h>
 #include <limits.h>
 #include <stdint.h>
+#include <ctype.h>
 #include "tre.h"
@@ -135,39 +136,17 @@ typedef struct {
  tre_ast_node_t *right;
 } tre_union_t;
-static tre_ast_node_t *
-tre_ast_new_node(tre_mem_t mem, tre_ast_type_t type, size_t size);
-static tre_ast_node_t *
-tre_ast_new_literal(tre_mem_t mem, int code_min, int code_max, int position);
-static tre_ast_node_t *
-tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg, int min, int max,
-		 int minimal);
-static tre_ast_node_t *
-tre_ast_new_union(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right);
 static tre_ast_node_t *
-tre_ast_new_catenation(tre_mem_t mem, tre_ast_node_t *left,
+tre_ast_new_node(tre_mem_t mem, int type, void *obj)
-		       tre_ast_node_t *right);
-static tre_ast_node_t *
-tre_ast_new_node(tre_mem_t mem, tre_ast_type_t type, size_t size)
 {
-  tre_ast_node_t *node;
+	tre_ast_node_t *node = tre_mem_calloc(mem, sizeof *node);
+	if (!node || !obj)
-  node = tre_mem_calloc(mem, sizeof(*node));
+		return 0;
-  if (!node)
+	node->obj = obj;
-    return NULL;
-  node->obj = tre_mem_calloc(mem, size);
-  if (!node->obj)
-    return NULL;
 	node->type = type;
 	node->nullable = -1;
 	node->submatch_id = -1;
 	return node;
 }
@@ -177,34 +156,31 @@ tre_ast_new_literal(tre_mem_t mem, int code_min, int code_max, int position)
 	tre_ast_node_t *node;
 	tre_literal_t *lit;
-  node = tre_ast_new_node(mem, LITERAL, sizeof(tre_literal_t));
+	lit = tre_mem_calloc(mem, sizeof *lit);
+	node = tre_ast_new_node(mem, LITERAL, lit);
 	if (!node)
-    return NULL;
+		return 0;
-  lit = node->obj;
 	lit->code_min = code_min;
 	lit->code_max = code_max;
 	lit->position = position;
 	return node;
 }
 static tre_ast_node_t *
-tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg, int min, int max,
+tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg, int min, int max, int minimal)
-		 int minimal)
 {
 	tre_ast_node_t *node;
 	tre_iteration_t *iter;
-  node = tre_ast_new_node(mem, ITERATION, sizeof(tre_iteration_t));
+	iter = tre_mem_calloc(mem, sizeof *iter);
+	node = tre_ast_new_node(mem, ITERATION, iter);
 	if (!node)
-    return NULL;
+		return 0;
-  iter = node->obj;
 	iter->arg = arg;
 	iter->min = min;
 	iter->max = max;
 	iter->minimal = minimal;
 	node->num_submatches = arg->num_submatches;
 	return node;
 }
@@ -212,30 +188,35 @@ static tre_ast_node_t *
 tre_ast_new_union(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right)
 {
 	tre_ast_node_t *node;
+	tre_union_t *un;
-  node = tre_ast_new_node(mem, UNION, sizeof(tre_union_t));
+	if (!left)
-  if (node == NULL)
+		return right;
-    return NULL;
+	un = tre_mem_calloc(mem, sizeof *un);
-  ((tre_union_t *)node->obj)->left = left;
+	node = tre_ast_new_node(mem, UNION, un);
-  ((tre_union_t *)node->obj)->right = right;
+	if (!node || !right)
+		return 0;
+	un->left = left;
+	un->right = right;
 	node->num_submatches = left->num_submatches + right->num_submatches;
 	return node;
 }
 static tre_ast_node_t *
-tre_ast_new_catenation(tre_mem_t mem, tre_ast_node_t *left,
+tre_ast_new_catenation(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right)
-		       tre_ast_node_t *right)
 {
 	tre_ast_node_t *node;
+	tre_catenation_t *cat;
-  node = tre_ast_new_node(mem, CATENATION, sizeof(tre_catenation_t));
+	if (!left)
-  if (node == NULL)
+		return right;
-    return NULL;
+	cat = tre_mem_calloc(mem, sizeof *cat);
-  ((tre_catenation_t *)node->obj)->left = left;
+	node = tre_ast_new_node(mem, CATENATION, cat);
-  ((tre_catenation_t *)node->obj)->right = right;
+	if (!node)
+		return 0;
+	cat->left = left;
+	cat->right = right;
 	node->num_submatches = left->num_submatches + right->num_submatches;
 	return node;
 }
@@ -416,1073 +397,650 @@ typedef struct {
 	tre_mem_t mem;
 	/* Stack used for keeping track of regexp syntax. */
 	tre_stack_t *stack;
-  /* The parse result. */
+	/* The parsed node after a parse function returns. */
-  tre_ast_node_t *result;
+	tre_ast_node_t *n;
-  /* The regexp to parse and its length. */
+	/* Position in the regexp pattern after a parse function returns. */
+	const char *s;
+	/* The first character of the regexp. */
 	const char *re;
-  /* The first character of the entire regexp. */
-  const char *re_start;
 	/* Current submatch ID. */
 	int submatch_id;
 	/* Current position (number of literal). */
 	int position;
 	/* The highest back reference or -1 if none seen so far. */
 	int max_backref;
-  /* This flag is set if the regexp uses approximate matching. */
-  int have_approx;
 	/* Compilation flags. */
 	int cflags;
-  /* If this flag is set the top-level submatch is not captured. */
-  int nofirstsub;
 } tre_parse_ctx_t;
-/* Parses a wide character regexp pattern into a syntax tree.  This parser
-   handles both syntaxes (BRE and ERE), including the TRE extensions. */
-static reg_errcode_t
-tre_parse(tre_parse_ctx_t *ctx);
-/*
-  This parser is just a simple recursive descent parser for POSIX.2
-  regexps.  The parser supports both the obsolete default syntax and
-  the "extended" syntax, and some nonstandard extensions.
-*/
-/* Characters with special meanings in regexp syntax. */
-#define CHAR_PIPE	   '|'
-#define CHAR_LPAREN	   '('
-#define CHAR_RPAREN	   ')'
-#define CHAR_LBRACE	   '{'
-#define CHAR_RBRACE	   '}'
-#define CHAR_LBRACKET	   '['
-#define CHAR_RBRACKET	   ']'
-#define CHAR_MINUS	   '-'
-#define CHAR_STAR	   '*'
-#define CHAR_QUESTIONMARK  '?'
-#define CHAR_PLUS	   '+'
-#define CHAR_PERIOD	   '.'
-#define CHAR_COLON	   ':'
-#define CHAR_EQUAL	   '='
-#define CHAR_COMMA	   ','
-#define CHAR_CARET	   '^'
-#define CHAR_DOLLAR	   '$'
-#define CHAR_BACKSLASH	   '\\'
-#define CHAR_HASH	   '#'
-#define CHAR_TILDE	   '~'
 /* Some macros for expanding \w, \s, etc. */
-static const struct tre_macro_struct {
+static const struct {
-  const char c;
+	char c;
 	const char *expansion;
-} tre_macros[] =
+} tre_macros[] = {
-  { {'t', "\t"},	   {'n', "\n"},		   {'r', "\r"},
+	{'t', "\t"}, {'n', "\n"}, {'r', "\r"},
 	{'f', "\f"}, {'a', "\a"}, {'e', "\033"},
 	{'w', "[[:alnum:]_]"}, {'W', "[^[:alnum:]_]"}, {'s', "[[:space:]]"},
 	{'S', "[^[:space:]]"}, {'d', "[[:digit:]]"}, {'D', "[^[:digit:]]"},
-    { 0, NULL }
+	{ 0, 0 }
-  };
+};
 /* Looks up the macro character at the start of the given string in
    `tre_macros' and returns its expansion, or 0 if there is no match. */
-static const char *
+static const char *tre_expand_macro(const char *s)
-tre_expand_macro(const char *regex)
 {
 	int i;
+	for (i = 0; tre_macros[i].c && tre_macros[i].c != *s; i++);
-  if (!*regex)
-    return 0;
-  for (i = 0; tre_macros[i].expansion && tre_macros[i].c != *regex; i++);
 	return tre_macros[i].expansion;
 }
-static reg_errcode_t
+static int
-tre_new_item(tre_mem_t mem, int min, int max, int *i, int *max_i,
+tre_compare_lit(const void *a, const void *b)
-	 tre_ast_node_t ***items)
 {
-  reg_errcode_t status;
+	const tre_literal_t *const *la = a;
-  tre_ast_node_t **array = *items;
+	const tre_literal_t *const *lb = b;
-  /* Allocate more space if necessary. */
+	/* assumes the range of valid code_min is < INT_MAX */
-  if (*i >= *max_i)
+	return la[0]->code_min - lb[0]->code_min;
-    {
-      tre_ast_node_t **new_items;
-      /* If the array is already 1024 items large, give up -- there's
-	 probably an error in the regexp (e.g. not a '\0' terminated
-	 string and missing ']') */
-      if (*max_i > 1024)
-	return REG_ESPACE;
-      *max_i *= 2;
-      new_items = xrealloc(array, sizeof(*array) * *max_i);
-      if (new_items == NULL)
-	return REG_ESPACE;
-      *items = array = new_items;
-    }
-  array[*i] = tre_ast_new_literal(mem, min, max, -1);
-  status = array[*i] == NULL ? REG_ESPACE : REG_OK;
-  (*i)++;
-  return status;
 }
+struct literals {
+	tre_mem_t mem;
+	tre_literal_t **a;
+	int len;
+	int cap;
+};
-static int
+static tre_literal_t *tre_new_lit(struct literals *p)
-tre_compare_items(const void *a, const void *b)
 {
-  const tre_ast_node_t *node_a = *(tre_ast_node_t * const *)a;
+	tre_literal_t **a;
-  const tre_ast_node_t *node_b = *(tre_ast_node_t * const *)b;
+	if (p->len >= p->cap) {
-  tre_literal_t *l_a = node_a->obj, *l_b = node_b->obj;
+		if (p->cap >= 1<<15)
-  int a_min = l_a->code_min, b_min = l_b->code_min;
+			return 0;
+		p->cap *= 2;
+		a = xrealloc(p->a, p->cap * sizeof *p->a);
+		if (!a)
+			return 0;
+		p->a = a;
+	}
+	a = p->a + p->len++;
+	*a = tre_mem_calloc(p->mem, sizeof **a);
+	return *a;
+}
-  if (a_min < b_min)
+static int add_icase_literals(struct literals *ls, int min, int max)
+{
+	tre_literal_t *lit;
+	int b, e, c;
+	for (c=min; c<=max; ) {
+		/* assumes islower(c) and isupper(c) are exclusive
+		   and toupper(c)!=c if islower(c).
+		   multiple opposite case characters are not supported */
+		if (tre_islower(c)) {
+			b = e = tre_toupper(c);
+			for (c++, e++; c<=max; c++, e++)
+				if (tre_toupper(c) != e) break;
+		} else if (tre_isupper(c)) {
+			b = e = tre_tolower(c);
+			for (c++, e++; c<=max; c++, e++)
+				if (tre_tolower(c) != e) break;
+		} else {
+			c++;
+			continue;
+		}
+		lit = tre_new_lit(ls);
+		if (!lit)
 			return -1;
-  else if (a_min > b_min)
+		lit->code_min = b;
-    return 1;
+		lit->code_max = e-1;
-  else
+		lit->position = -1;
+	}
 	return 0;
 }
-/* Maximum number of character classes that can occur in a negated bracket
-   expression.	*/
-#define MAX_NEG_CLASSES 64
-/* Maximum length of character class names. */
+/* Maximum number of character classes in a negated bracket expression. */
-#define MAX_CLASS_NAME
+#define MAX_NEG_CLASSES 64
-static reg_errcode_t
+struct neg {
-tre_parse_bracket_items(tre_parse_ctx_t *ctx, int negate,
+	int negate;
-			tre_ctype_t neg_classes[], int *num_neg_classes,
+	int len;
-			tre_ast_node_t ***items, int *num_items,
+	tre_ctype_t a[MAX_NEG_CLASSES];
-			int *items_size)
+};
-{
-  const char *re = ctx->re;
-  reg_errcode_t status = REG_OK;
-  tre_ctype_t class = (tre_ctype_t)0;
-  int i = *num_items;
-  int max_i = *items_size;
-  int skip;
-  /* Build an array of the items in the bracket expression. */
+// TODO: parse bracket into a set of non-overlapping [lo,hi] ranges
-  while (status == REG_OK)
-    {
-      skip = 0;
-      if (!*re)
-	{
-	  status = REG_EBRACK;
-	}
-      else if (*re == CHAR_RBRACKET && re > ctx->re)
-	{
-	  re++;
-	  break;
-	}
-      else
-	{
-	  tre_cint_t min = 0, max = 0;
-	  wchar_t wc;
-	  int clen = mbtowc(&wc, re, -1);
-	  if (clen<0) clen=1, wc=WEOF;
+/*
+bracket grammar:
+Bracket  =  '[' List ']'  |  '[^' List ']'
+List     =  Term  |  List Term
+Term     =  Char  |  Range  |  Chclass  |  Eqclass
+Range    =  Char '-' Char  |  Char '-' '-'
+Char     =  Coll  |  coll_single
+Meta     =  ']'  |  '-'
+Coll     =  '[.' coll_single '.]'  |  '[.' coll_multi '.]'  |  '[.' Meta '.]'
+Eqclass  =  '[=' coll_single '=]'  |  '[=' coll_multi '=]'
+Chclass  =  '[:' class ':]'
+coll_single is a single char collating element but it can be
+ '-' only at the beginning or end of a List and
+ ']' only at the beginning of a List and
+ '^' anywhere except after the opening '['
+*/
-	  class = (tre_ctype_t)0;
+static reg_errcode_t parse_bracket_terms(tre_parse_ctx_t *ctx, const char *s, struct literals *ls, struct neg *neg)
-	  if (*(re + clen) == CHAR_MINUS && *(re + clen + 1) != CHAR_RBRACKET)
+{
-	    {
+	const char *start = s;
-	      min = wc;
+	tre_ctype_t class;
-	      re += clen+1;
+	int min, max;
-	      clen = mbtowc(&wc, re, -1);
+	wchar_t wc;
-	      if (clen<0) clen=1, wc=WEOF;
-	      max = wc;
-	      re += clen;
-	      /* XXX - Should use collation order instead of encoding values
-		 in character ranges. */
-	      if (min > max)
-		status = REG_ERANGE;
-	    }
-	  else if (*re == CHAR_LBRACKET && *(re + 1) == CHAR_PERIOD)
-	    status = REG_ECOLLATE;
-	  else if (*re == CHAR_LBRACKET && *(re + 1) == CHAR_EQUAL)
-	    status = REG_ECOLLATE;
-	  else if (*re == CHAR_LBRACKET && *(re + 1) == CHAR_COLON)
-	    {
-	      char tmp_str[64];
-	      const char *endptr = re + 2;
 	int len;
-	      while (*endptr && *endptr != CHAR_COLON)
-		endptr++;
+	for (;;) {
-	      if (*endptr)
+		class = 0;
-		{
+		len = mbtowc(&wc, s, -1);
-		  len = MIN(endptr - re - 2, 63);
+		if (len <= 0)
-		  strncpy(tmp_str, re + 2, len);
+			return *s ? REG_BADPAT : REG_EBRACK;
-		  tmp_str[len] = '\0';
+		if (*s == ']' && s != start) {
-		  class = tre_ctype(tmp_str);
+			ctx->s = s+1;
-		  if (!class)
+			return REG_OK;
-		    status = REG_ECTYPE;
-		  re = endptr + 2;
 		}
-	      else
+		if (*s == '-' && s != start && s[1] != ']' &&
-		status = REG_ECTYPE;
+		    /* extension: [a-z--@] is accepted as [a-z]|[--@] */
+		    (s[1] != '-' || s[2] == ']'))
+			return REG_ERANGE;
+		if (*s == '[' && (s[1] == '.' || s[1] == '='))
+			/* collating symbols and equivalence classes are not supported */
+			return REG_ECOLLATE;
+		if (*s == '[' && s[1] == ':') {
+			char tmp[CHARCLASS_NAME_MAX+1];
+			s += 2;
+			for (len=0; len < CHARCLASS_NAME_MAX && s[len]; len++) {
+				if (s[len] == ':') {
+					memcpy(tmp, s, len);
+					tmp[len] = 0;
+					class = tre_ctype(tmp);
+					break;
+				}
+			}
+			if (!class || s[len+1] != ']')
+				return REG_ECTYPE;
 			min = 0;
 			max = TRE_CHAR_MAX;
-	    }
+			s += len+2;
-	  else
+		} else {
-	    {
-	      if (*re == CHAR_MINUS && *(re + 1) != CHAR_RBRACKET
-		  && ctx->re != re)
-		/* Two ranges are not allowed to share and endpoint. */
-		status = REG_ERANGE;
 			min = max = wc;
-	      re += clen;
+			s += len;
+			if (*s == '-' && s[1] != ']') {
+				s++;
+				len = mbtowc(&wc, s, -1);
+				max = wc;
+				/* XXX - Should use collation order instead of
+				   encoding values in character ranges. */
+				if (len <= 0 || min > max)
+					return REG_ERANGE;
+				s += len;
 			}
-	  if (status != REG_OK)
-	    break;
-	  if (class && negate)
-	    if (*num_neg_classes >= MAX_NEG_CLASSES)
-	      status = REG_ESPACE;
-	    else
-	      neg_classes[(*num_neg_classes)++] = class;
-	  else if (!skip)
-	    {
-	      status = tre_new_item(ctx->mem, min, max, &i, &max_i, items);
-	      if (status != REG_OK)
-		break;
-	      ((tre_literal_t*)((*items)[i-1])->obj)->class = class;
 		}
-	  /* Add opposite-case counterpoints if REG_ICASE is present.
+		if (class && neg->negate) {
-	     This is broken if there are more than two "same" characters. */
+			if (neg->len >= MAX_NEG_CLASSES)
-	  if (ctx->cflags & REG_ICASE && !class && status == REG_OK && !skip)
+				return REG_ESPACE;
-	    {
+			neg->a[neg->len++] = class;
-	      tre_cint_t cmin, ccurr;
+		} else  {
+			tre_literal_t *lit = tre_new_lit(ls);
-	      while (min <= max)
+			if (!lit)
-		{
+				return REG_ESPACE;
-		  if (tre_islower(min))
+			lit->code_min = min;
-		    {
+			lit->code_max = max;
-		      cmin = ccurr = tre_toupper(min++);
+			lit->class = class;
-		      while (tre_islower(min) && tre_toupper(min) == ccurr + 1
+			lit->position = -1;
-			     && min <= max)
-			ccurr = tre_toupper(min++);
+			/* Add opposite-case codepoints if REG_ICASE is present.
-		      status = tre_new_item(ctx->mem, cmin, ccurr,
+			   It seems that POSIX requires that bracket negation
-					    &i, &max_i, items);
+			   should happen before case-folding, but most practical
-		    }
+			   implementations do it the other way around. Changing
-		  else if (tre_isupper(min))
+			   the order would need efficient representation of
-		    {
+			   case-fold ranges and bracket range sets even with
-		      cmin = ccurr = tre_tolower(min++);
+			   simple patterns so this is ok for now. */
-		      while (tre_isupper(min) && tre_tolower(min) == ccurr + 1
+			if (ctx->cflags & REG_ICASE && !class)
-			     && min <= max)
+				if (add_icase_literals(ls, min, max))
-			ccurr = tre_tolower(min++);
+					return REG_ESPACE;
-		      status = tre_new_item(ctx->mem, cmin, ccurr,
-					    &i, &max_i, items);
-		    }
-		  else min++;
-		  if (status != REG_OK)
-		    break;
-		}
-	      if (status != REG_OK)
-		break;
-	    }
 		}
 	}
-  *num_items = i;
-  *items_size = max_i;
-  ctx->re = re;
-  return status;
 }
-static reg_errcode_t
+static reg_errcode_t parse_bracket(tre_parse_ctx_t *ctx, const char *s)
-tre_parse_bracket(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
 {
-  tre_ast_node_t *node = NULL;
+	int i, max, min, negmax, negmin;
-  int negate = 0;
+	tre_ast_node_t *node = 0, *n;
-  reg_errcode_t status = REG_OK;
+	tre_ctype_t *nc = 0;
-  tre_ast_node_t **items, *u, *n;
+	tre_literal_t *lit;
-  int i = 0, j, max_i = 32, curr_max, curr_min;
+	struct literals ls;
-  tre_ctype_t neg_classes[MAX_NEG_CLASSES];
+	struct neg neg;
-  int num_neg_classes = 0;
+	reg_errcode_t err;
-  /* Start off with an array of `max_i' elements. */
+	ls.mem = ctx->mem;
-  items = xmalloc(sizeof(*items) * max_i);
+	ls.len = 0;
-  if (items == NULL)
+	ls.cap = 32;
+	ls.a = xmalloc(ls.cap * sizeof *ls.a);
+	if (!ls.a)
 		return REG_ESPACE;
+	neg.len = 0;
+	neg.negate = *s == '^';
+	if (neg.negate)
+		s++;
-  if (*ctx->re == CHAR_CARET)
+	err = parse_bracket_terms(ctx, s, &ls, &neg);
-    {
+	if (err != REG_OK)
-      negate = 1;
-      ctx->re++;
-    }
-  status = tre_parse_bracket_items(ctx, negate, neg_classes, &num_neg_classes,
-				   &items, &i, &max_i);
-  if (status != REG_OK)
 		goto parse_bracket_done;
+	if (neg.negate) {
 		/* Sort the array if we need to negate it. */
-  if (negate)
+		qsort(ls.a, ls.len, sizeof *ls.a, tre_compare_lit);
-    qsort(items, (unsigned)i, sizeof(*items), tre_compare_items);
+		/* extra lit for the last negated range */
+		lit = tre_new_lit(&ls);
-  curr_max = curr_min = 0;
+		if (!lit) {
-  /* Build a union of the items in the array, negated if necessary. */
+			err = REG_ESPACE;
-  for (j = 0; j < i && status == REG_OK; j++)
+			goto parse_bracket_done;
-    {
-      int min, max;
-      tre_literal_t *l = items[j]->obj;
-      min = l->code_min;
-      max = l->code_max;
-      if (negate)
-	{
-	  if (min < curr_max)
-	    {
-	      /* Overlap. */
-	      curr_max = MAX(max + 1, curr_max);
-	      l = NULL;
-	    }
-	  else
-	    {
-	      /* No overlap. */
-	      curr_max = min - 1;
-	      if (curr_max >= curr_min)
-		{
-		  l->code_min = curr_min;
-		  l->code_max = curr_max;
-		}
-	      else
-		{
-		  l = NULL;
-		}
-	      curr_min = curr_max = max + 1;
-	    }
-	}
-      if (l != NULL)
-	{
-	  int k;
-	  l->position = ctx->position;
-	  if (num_neg_classes > 0)
-	    {
-	      l->neg_classes = tre_mem_alloc(ctx->mem,
-					     (sizeof(*l->neg_classes)
-					      * (num_neg_classes + 1)));
-	      if (l->neg_classes == NULL)
-		{
-		  status = REG_ESPACE;
-		  break;
-		}
-	      for (k = 0; k < num_neg_classes; k++)
-		l->neg_classes[k] = neg_classes[k];
-	      l->neg_classes[k] = (tre_ctype_t)0;
 		}
-	  else
+		lit->code_min = TRE_CHAR_MAX+1;
-	    l->neg_classes = NULL;
+		lit->code_max = TRE_CHAR_MAX+1;
-	  if (node == NULL)
+		lit->position = -1;
-	    node = items[j];
+		/* negated classes */
-	  else
+		if (neg.len) {
-	    {
+			nc = tre_mem_alloc(ctx->mem, (neg.len+1)*sizeof *neg.a);
-	      u = tre_ast_new_union(ctx->mem, node, items[j]);
+			if (!nc) {
-	      if (u == NULL)
+				err = REG_ESPACE;
-		status = REG_ESPACE;
+				goto parse_bracket_done;
-	      node = u;
 			}
+			memcpy(nc, neg.a, neg.len*sizeof *neg.a);
+			nc[neg.len] = 0;
 		}
 	}
-  if (status != REG_OK)
+	/* Build a union of the items in the array, negated if necessary. */
-    goto parse_bracket_done;
+	negmax = negmin = 0;
+	for (i = 0; i < ls.len; i++) {
-  if (negate)
+		lit = ls.a[i];
-    {
+		min = lit->code_min;
-      int k;
+		max = lit->code_max;
-      n = tre_ast_new_literal(ctx->mem, curr_min, TRE_CHAR_MAX, ctx->position);
+		if (neg.negate) {
-      if (n == NULL)
+			if (min <= negmin) {
-	status = REG_ESPACE;
+				/* Overlap. */
-      else
+				negmin = MAX(max + 1, negmin);
-	{
+				continue;
-	  tre_literal_t *l = n->obj;
-	  if (num_neg_classes > 0)
-	    {
-	      l->neg_classes = tre_mem_alloc(ctx->mem,
-					     (sizeof(*l->neg_classes)
-					      * (num_neg_classes + 1)));
-	      if (l->neg_classes == NULL)
-		{
-		  status = REG_ESPACE;
-		  goto parse_bracket_done;
 			}
-	      for (k = 0; k < num_neg_classes; k++)
+			negmax = min - 1;
-		l->neg_classes[k] = neg_classes[k];
+			lit->code_min = negmin;
-	      l->neg_classes[k] = (tre_ctype_t)0;
+			lit->code_max = negmax;
-	    }
+			negmin = max + 1;
-	  else
-	    l->neg_classes = NULL;
-	  if (node == NULL)
-	    node = n;
-	  else
-	    {
-	      u = tre_ast_new_union(ctx->mem, node, n);
-	      if (u == NULL)
-		status = REG_ESPACE;
-	      node = u;
 		}
+		lit->position = ctx->position;
+		lit->neg_classes = nc;
+		n = tre_ast_new_node(ctx->mem, LITERAL, lit);
+		node = tre_ast_new_union(ctx->mem, node, n);
+		if (!node) {
+			err = REG_ESPACE;
+			break;
 		}
 	}
-  if (status != REG_OK)
+parse_bracket_done:
-    goto parse_bracket_done;
+	xfree(ls.a);
-#ifdef TRE_DEBUG
-  tre_ast_print(node);
-#endif /* TRE_DEBUG */
- parse_bracket_done:
-  xfree(items);
 	ctx->position++;
-  *result = node;
+	ctx->n = node;
-  return status;
+	return err;
 }
+static const char *parse_dup_count(const char *s, int *n)
-/* Parses a positive decimal integer.  Returns -1 if the string does not
-   contain a valid number. */
-static int
-tre_parse_int(const char **regex)
 {
-  int num = -1;
+	*n = -1;
-  const char *r = *regex;
+	if (!isdigit(*s))
-  while (*r-'0'<10U)
+		return s;
-    {
+	*n = 0;
-      if (num < 0)
+	for (;;) {
-	num = 0;
+		*n = 10 * *n + (*s - '0');
-      num = num * 10 + *r - '0';
+		s++;
-      r++;
+		if (!isdigit(*s) || *n > RE_DUP_MAX)
+			break;
 	}
-  *regex = r;
+	return s;
-  return num;
 }
+static reg_errcode_t parse_dup(tre_parse_ctx_t *ctx, const char *s)
-static reg_errcode_t
-tre_parse_bound(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
 {
 	int min, max;
-  const char *r = ctx->re;
-  int minimal = 0;
-  /* Parse number (minimum repetition count). */
-  min = -1;
-  if (*r >= '0' && *r <= '9') {
-    min = tre_parse_int(&r);
-  }
-  /* Parse comma and second number (maximum repetition count). */
+	s = parse_dup_count(s, &min);
+	if (*s == ',')
+		s = parse_dup_count(s+1, &max);
+	else
 		max = min;
-  if (*r == CHAR_COMMA)
-    {
-      r++;
-      max = tre_parse_int(&r);
-    }
-  /* Check that the repeat counts are sane. */
+	if (
-  if ((max >= 0 && min > max) || max > RE_DUP_MAX)
+		(max < min && max >= 0) ||
+		max > RE_DUP_MAX ||
+		min > RE_DUP_MAX ||
+		min < 0 ||
+		(!(ctx->cflags & REG_EXTENDED) && *s++ != '\\') ||
+		*s++ != '}'
+	)
 		return REG_BADBR;
-  /* Missing }. */
-  if (!*r)
-    return REG_EBRACE;
-  /* Empty contents of {}. */
-  if (r == ctx->re)
-    return REG_BADBR;
-  /* Parse the ending '}' or '\}'.*/
-  if (ctx->cflags & REG_EXTENDED)
-    {
-      if (*r != CHAR_RBRACE)
-	return REG_BADBR;
-      r++;
-    }
-  else
-    {
-      if (*r != CHAR_BACKSLASH || *(r + 1) != CHAR_RBRACE)
-	return REG_BADBR;
-      r += 2;
-    }
-  /* Create the AST node(s). */
 	if (min == 0 && max == 0)
-    {
+		ctx->n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
-      *result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
-      if (*result == NULL)
-	return REG_ESPACE;
-    }
 	else
-    {
+		ctx->n = tre_ast_new_iter(ctx->mem, ctx->n, min, max, 0);
-      if (min < 0 && max < 0)
+	if (!ctx->n)
-	/* Only approximate parameters set, no repetitions. */
-	min = max = 1;
-      *result = tre_ast_new_iter(ctx->mem, *result, min, max, minimal);
-      if (!*result)
 		return REG_ESPACE;
-    }
+	ctx->s = s;
-  ctx->re = r;
 	return REG_OK;
 }
-typedef enum {
+static int hexval(unsigned c)
-  PARSE_RE = 0,
-  PARSE_ATOM,
-  PARSE_MARK_FOR_SUBMATCH,
-  PARSE_BRANCH,
-  PARSE_PIECE,
-  PARSE_CATENATION,
-  PARSE_POST_CATENATION,
-  PARSE_UNION,
-  PARSE_POST_UNION,
-  PARSE_POSTFIX,
-  PARSE_RESTORE_CFLAGS
-} tre_parse_re_stack_symbol_t;
-static reg_errcode_t
-tre_parse(tre_parse_ctx_t *ctx)
 {
-  tre_ast_node_t *result = NULL;
+	if (c-'0'<10) return c-'0';
-  tre_parse_re_stack_symbol_t symbol;
+	c |= 32;
-  reg_errcode_t status = REG_OK;
+	if (c-'a'<6) return c-'a'+10;
-  tre_stack_t *stack = ctx->stack;
+	return -1;
-  int bottom = tre_stack_num_objects(stack);
+}
-  int depth = 0;
-  wchar_t wc;
-  int clen;
-  if (!ctx->nofirstsub)
-    {
-      STACK_PUSH(stack, int, ctx->submatch_id);
-      STACK_PUSH(stack, int, PARSE_MARK_FOR_SUBMATCH);
-      ctx->submatch_id++;
-    }
-  STACK_PUSH(stack, int, PARSE_RE);
-  ctx->re_start = ctx->re;
-  /* The following is basically just a recursive descent parser.  I use
-     an explicit stack instead of recursive functions mostly because of
-     two reasons: compatibility with systems which have an overflowable
-     call stack, and efficiency (both in lines of code and speed).  */
-  while (tre_stack_num_objects(stack) > bottom && status == REG_OK)
-    {
-      if (status != REG_OK)
-	break;
-      symbol = tre_stack_pop_int(stack);
-      switch (symbol)
-	{
-	case PARSE_RE:
-	  /* Parse a full regexp.  A regexp is one or more branches,
-	     separated by the union operator `|'. */
-	  if (ctx->cflags & REG_EXTENDED)
-	    STACK_PUSHX(stack, int, PARSE_UNION);
-	  STACK_PUSHX(stack, int, PARSE_BRANCH);
-	  break;
-	case PARSE_BRANCH:
-	  /* Parse a branch.  A branch is one or more pieces, concatenated.
-	     A piece is an atom possibly followed by a postfix operator. */
-	  STACK_PUSHX(stack, int, PARSE_CATENATION);
-	  STACK_PUSHX(stack, int, PARSE_PIECE);
-	  break;
-	case PARSE_PIECE:
-	  /* Parse a piece.  A piece is an atom possibly followed by one
-	     or more postfix operators. */
-	    STACK_PUSHX(stack, int, PARSE_POSTFIX);
-	  STACK_PUSHX(stack, int, PARSE_ATOM);
-	  break;
-	case PARSE_CATENATION:
-	  /* If the expression has not ended, parse another piece. */
-	  {
-	    tre_char_t c;
-	    if (!*ctx->re)
-	      break;
-	    c = *ctx->re;
-		if (ctx->cflags & REG_EXTENDED && c == CHAR_PIPE)
-		  break;
-		if ((ctx->cflags & REG_EXTENDED
-		     && c == CHAR_RPAREN && depth > 0)
-		    || (!(ctx->cflags & REG_EXTENDED)
-			&& (c == CHAR_BACKSLASH
-			    && *(ctx->re + 1) == CHAR_RPAREN)))
-		  {
-		    if (!(ctx->cflags & REG_EXTENDED) && depth == 0)
-		      status = REG_EPAREN;
-		    depth--;
-		    if (!(ctx->cflags & REG_EXTENDED))
-		      ctx->re += 2;
-		    break;
-		  }
-	      {
-		/* Default case, left associative concatenation. */
-		STACK_PUSHX(stack, int, PARSE_CATENATION);
-		STACK_PUSHX(stack, voidptr, result);
-		STACK_PUSHX(stack, int, PARSE_POST_CATENATION);
-		STACK_PUSHX(stack, int, PARSE_PIECE);
-	      }
-	    break;
-	  }
-	case PARSE_POST_CATENATION:
-	  {
-	    tre_ast_node_t *tree = tre_stack_pop_voidptr(stack);
-	    tre_ast_node_t *tmp_node;
-	    tmp_node = tre_ast_new_catenation(ctx->mem, tree, result);
-	    if (!tmp_node)
-	      return REG_ESPACE;
-	    result = tmp_node;
-	    break;
-	  }
-	case PARSE_UNION:
-	  switch (*ctx->re)
-	    {
-	    case CHAR_PIPE:
-	      STACK_PUSHX(stack, int, PARSE_UNION);
-	      STACK_PUSHX(stack, voidptr, result);
-	      STACK_PUSHX(stack, int, PARSE_POST_UNION);
-	      STACK_PUSHX(stack, int, PARSE_BRANCH);
-	      ctx->re++;
-	      break;
-	    case CHAR_RPAREN:
-	      ctx->re++;
-	      break;
-	    default:
-	      break;
-	    }
-	  break;
-	case PARSE_POST_UNION:
+static reg_errcode_t marksub(tre_parse_ctx_t *ctx, tre_ast_node_t *node, int subid)
-	  {
+{
-	    tre_ast_node_t *tmp_node;
+	if (node->submatch_id >= 0) {
-	    tre_ast_node_t *tree = tre_stack_pop_voidptr(stack);
+		tre_ast_node_t *n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
-	    tmp_node = tre_ast_new_union(ctx->mem, tree, result);
+		if (!n)
-	    if (!tmp_node)
 			return REG_ESPACE;
-	    result = tmp_node;
+		n = tre_ast_new_catenation(ctx->mem, n, node);
-	    break;
+		if (!n)
-	  }
-	case PARSE_POSTFIX:
-	  /* Parse postfix operators. */
-	  switch (*ctx->re)
-	    {
-	    case CHAR_PLUS:
-	    case CHAR_QUESTIONMARK:
-	      if (!(ctx->cflags & REG_EXTENDED))
-		break;
-		/*FALLTHROUGH*/
-	    case CHAR_STAR:
-	      {
-		tre_ast_node_t *tmp_node;
-		int minimal = 0;
-		int rep_min = 0;
-		int rep_max = -1;
-		if (*ctx->re == CHAR_PLUS)
-		  rep_min = 1;
-		if (*ctx->re == CHAR_QUESTIONMARK)
-		  rep_max = 1;
-		ctx->re++;
-		tmp_node = tre_ast_new_iter(ctx->mem, result, rep_min, rep_max,
-					    minimal);
-		if (tmp_node == NULL)
 			return REG_ESPACE;
-		result = tmp_node;
+		n->num_submatches = node->num_submatches;
-		STACK_PUSHX(stack, int, PARSE_POSTFIX);
+		node = n;
-	      }
-	      break;
-	    case CHAR_BACKSLASH:
-	      /* "\{" is special without REG_EXTENDED */
-	      if (!(ctx->cflags & REG_EXTENDED)
-		  && *(ctx->re + 1) == CHAR_LBRACE)
-		{
-		  ctx->re++;
-		  goto parse_brace;
-		}
-	      else
-		break;
-	    case CHAR_LBRACE:
-	      /* "{" is literal without REG_EXTENDED */
-	      if (!(ctx->cflags & REG_EXTENDED))
-		break;
-	    parse_brace:
-	      ctx->re++;
-	      status = tre_parse_bound(ctx, &result);
-	      if (status != REG_OK)
-		return status;
-	      STACK_PUSHX(stack, int, PARSE_POSTFIX);
-	      break;
 	}
-	  break;
+	node->submatch_id = subid;
+	node->num_submatches++;
+	ctx->n = node;
+	return REG_OK;
+}
-	case PARSE_ATOM:
+/*
-	  /* Parse an atom.  An atom is a regular expression enclosed in `()',
+BRE grammar:
-	     an empty set of `()', a bracket expression, `.', `^', `$',
+Regex  =  Branch  |  '^'  |  '$'  |  '^$'  |  '^' Branch  |  Branch '$'  |  '^' Branch '$'
-	     a `\' followed by a character, or a single character. */
+Branch =  Atom  |  Branch Atom
+Atom   =  char  |  quoted_char  |  '.'  |  Bracket  |  Atom Dup  |  '\(' Branch '\)'  |  back_ref
+Dup    =  '*'  |  '\{' Count '\}'  |  '\{' Count ',\}'  |  '\{' Count ',' Count '\}'
-	  switch (*ctx->re)
+(leading ^ and trailing $ in a sub expr may be an anchor or literal as well)
-	    {
-	    case CHAR_LPAREN:  /* parenthesized subexpression */
-	      if (ctx->cflags & REG_EXTENDED)
+ERE grammar:
-		{
+Regex  =  Branch  |  Regex '|' Branch
-		lparen:
+Branch =  Atom  |  Branch Atom
-		  depth++;
+Atom   =  char  |  quoted_char  |  '.'  |  Bracket  |  Atom Dup  |  '(' Regex ')'  |  '^'  |  '$'
-		    {
+Dup    =  '*'  |  '+'  |  '?'  |  '{' Count '}'  |  '{' Count ',}'  |  '{' Count ',' Count '}'
-		      ctx->re++;
-		      /* First parse a whole RE, then mark the resulting tree
-			 for submatching. */
-		      STACK_PUSHX(stack, int, ctx->submatch_id);
-		      STACK_PUSHX(stack, int, PARSE_MARK_FOR_SUBMATCH);
-		      STACK_PUSHX(stack, int, PARSE_RE);
-		      ctx->submatch_id++;
-		    }
-		}
-	      else
-		goto parse_literal;
-	      break;
-	    case CHAR_LBRACKET: /* bracket expression */
+(a*+?, ^*, $+, \X, {, (|a) are unspecified)
-	      ctx->re++;
+*/
-	      status = tre_parse_bracket(ctx, &result);
-	      if (status != REG_OK)
-		return status;
-	      break;
-	    case CHAR_BACKSLASH:
-	      /* If this is "\(" or "\)" chew off the backslash and
-		 try again. */
-	      if (!(ctx->cflags & REG_EXTENDED) && *(ctx->re + 1) == CHAR_LPAREN)
-		{
-		  ctx->re++;
-		  goto lparen;
-		}
-	      if (!(ctx->cflags & REG_EXTENDED) && *(ctx->re + 1) == CHAR_RPAREN)
-		{
-		  goto empty_atom;
-		}
-	      /* If a macro is used, parse the expanded macro recursively. */
-	      {
-		const char *buf = tre_expand_macro(ctx->re + 1);
-		if (buf)
-		  {
-		    tre_parse_ctx_t subctx;
-		    memcpy(&subctx, ctx, sizeof(subctx));
-		    subctx.re = buf;
-		    subctx.nofirstsub = 1;
-		    status = tre_parse(&subctx);
-		    if (status != REG_OK)
-		      return status;
-		    ctx->re += 2;
-		    ctx->position = subctx.position;
-		    result = subctx.result;
-		    break;
-		  }
-	      }
-	      if (!ctx->re[1])
+static reg_errcode_t parse_atom(tre_parse_ctx_t *ctx, const char *s)
-		/* Trailing backslash. */
+{
+	int len, ere = ctx->cflags & REG_EXTENDED;
+	const char *p;
+	tre_ast_node_t *node;
+	wchar_t wc;
+	switch (*s) {
+	case '[':
+		return parse_bracket(ctx, s+1);
+	case '\\':
+		p = tre_expand_macro(s+1);
+		if (p) {
+			/* assume \X expansion is a single atom */
+			reg_errcode_t err = parse_atom(ctx, p);
+			ctx->s = s+2;
+			return err;
+		}
+		/* extensions: \b, \B, \<, \>, \xHH, \x{HHHH} */
+		switch (*++s) {
+		case 0:
 			return REG_EESCAPE;
-	      ctx->re++;
-	      switch (*ctx->re)
-		{
 		case 'b':
-		  result = tre_ast_new_literal(ctx->mem, ASSERTION,
+			node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_WB, -1);
-					       ASSERT_AT_WB, -1);
-		  ctx->re++;
 			break;
 		case 'B':
-		  result = tre_ast_new_literal(ctx->mem, ASSERTION,
+			node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_WB_NEG, -1);
-					       ASSERT_AT_WB_NEG, -1);
-		  ctx->re++;
 			break;
 		case '<':
-		  result = tre_ast_new_literal(ctx->mem, ASSERTION,
+			node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_BOW, -1);
-					       ASSERT_AT_BOW, -1);
-		  ctx->re++;
 			break;
 		case '>':
-		  result = tre_ast_new_literal(ctx->mem, ASSERTION,
+			node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_EOW, -1);
-					       ASSERT_AT_EOW, -1);
-		  ctx->re++;
 			break;
 		case 'x':
-		  ctx->re++;
+			s++;
-		  if (ctx->re[0] != CHAR_LBRACE)
+			int i, v = 0, c;
-		    {
+			len = 2;
-		      /* 8 bit hex char. */
+			if (*s == '{') {
-		      char tmp[3] = {0, 0, 0};
+				len = 8;
-		      long val;
+				s++;
+			}
-		      if (tre_isxdigit(ctx->re[0]))
+			for (i=0; i<len && v<0x110000; i++) {
-			{
+				c = hexval(s[i]);
-			  tmp[0] = (char)ctx->re[0];
+				if (c < 0) break;
-			  ctx->re++;
+				v = 16*v + c;
 			}
-		      if (tre_isxdigit(ctx->re[0]))
+			s += i;
-			{
+			if (len == 8) {
-			  tmp[1] = (char)ctx->re[0];
+				if (*s != '}')
-			  ctx->re++;
-			}
-		      val = strtol(tmp, NULL, 16);
-		      result = tre_ast_new_literal(ctx->mem, (int)val,
-						   (int)val, ctx->position);
-		      ctx->position++;
-		      break;
-		    }
-		  else if (*ctx->re)
-		    {
-		      /* Wide char. */
-		      char tmp[32];
-		      long val;
-		      int i = 0;
-		      ctx->re++;
-		      while (*ctx->re && i < sizeof tmp)
-			{
-			  if (ctx->re[0] == CHAR_RBRACE)
-			    break;
-			  if (tre_isxdigit(ctx->re[0]))
-			    {
-			      tmp[i] = (char)ctx->re[0];
-			      i++;
-			      ctx->re++;
-			      continue;
-			    }
 					return REG_EBRACE;
+				s++;
 			}
-		      ctx->re++;
+			node = tre_ast_new_literal(ctx->mem, v, v, ctx->position);
-		      tmp[i] = 0;
-		      val = strtol(tmp, NULL, 16);
-		      result = tre_ast_new_literal(ctx->mem, (int)val, (int)val,
-						   ctx->position);
 			ctx->position++;
+			s--;
 			break;
-		    }
-		  /*FALLTHROUGH*/
 		default:
-		  if (tre_isdigit(*ctx->re))
+			if (isdigit(*s)) {
-		    {
+				/* back reference */
-		      /* Back reference. */
+				int val = *s - '0';
-		      int val = *ctx->re - '0';
+				node = tre_ast_new_literal(ctx->mem, BACKREF, val, ctx->position);
-		      result = tre_ast_new_literal(ctx->mem, BACKREF, val,
-						   ctx->position);
-		      if (result == NULL)
-			return REG_ESPACE;
-		      ctx->position++;
 				ctx->max_backref = MAX(val, ctx->max_backref);
-		      ctx->re++;
+			} else {
+				/* extension: accept unknown escaped char
+				   as a literal */
+				node = tre_ast_new_literal(ctx->mem, *s, *s, ctx->position);
 			}
-		  else
-		    {
-		      /* Escaped character. */
-		      result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re,
-						   ctx->position);
 			ctx->position++;
-		      ctx->re++;
 		}
+		s++;
 		break;
-		}
+	case '.':
-	      if (result == NULL)
+		if (ctx->cflags & REG_NEWLINE) {
-		return REG_ESPACE;
+			tre_ast_node_t *tmp1, *tmp2;
-	      break;
+			tmp1 = tre_ast_new_literal(ctx->mem, 0, '\n'-1, ctx->position++);
+			tmp2 = tre_ast_new_literal(ctx->mem, '\n'+1, TRE_CHAR_MAX, ctx->position++);
-	    case CHAR_PERIOD:	 /* the any-symbol */
+			if (tmp1 && tmp2)
-	      if (ctx->cflags & REG_NEWLINE)
+				node = tre_ast_new_union(ctx->mem, tmp1, tmp2);
-		{
-		  tre_ast_node_t *tmp1;
-		  tre_ast_node_t *tmp2;
-		  tmp1 = tre_ast_new_literal(ctx->mem, 0, '\n' - 1,
-					     ctx->position);
-		  if (!tmp1)
-		    return REG_ESPACE;
-		  tmp2 = tre_ast_new_literal(ctx->mem, '\n' + 1, TRE_CHAR_MAX,
-					     ctx->position + 1);
-		  if (!tmp2)
-		    return REG_ESPACE;
-		  result = tre_ast_new_union(ctx->mem, tmp1, tmp2);
-		  if (!result)
-		    return REG_ESPACE;
-		  ctx->position += 2;
-		}
 			else
-		{
+				node = 0;
-		  result = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX,
+		} else {
-					       ctx->position);
+			node = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX, ctx->position++);
-		  if (!result)
-		    return REG_ESPACE;
-		  ctx->position++;
 		}
-	      ctx->re++;
+		s++;
 		break;
+	case '^':
-	    case CHAR_CARET:	 /* beginning of line assertion */
+		/* '^' has a special meaning everywhere in EREs, and at the beginning of a BRE. */
-	      /* '^' has a special meaning everywhere in EREs, and at
+		if (!ere && s != ctx->re)
-		 beginning of BRE. */
-	      if (ctx->cflags & REG_EXTENDED
-		  || ctx->re == ctx->re_start)
-		{
-		  if (!(ctx->cflags & REG_EXTENDED))
-		    STACK_PUSHX(stack, int, PARSE_CATENATION);
-		  result = tre_ast_new_literal(ctx->mem, ASSERTION,
-					       ASSERT_AT_BOL, -1);
-		  if (result == NULL)
-		    return REG_ESPACE;
-		  ctx->re++;
-		}
-	      else
-		goto parse_literal;
-	      break;
-	    case CHAR_DOLLAR:	 /* end of line assertion. */
-	      /* '$' is special everywhere in EREs, and in the end of the
-		 string in BREs. */
-	      if (ctx->cflags & REG_EXTENDED
-		  || !*(ctx->re + 1))
-		{
-		  result = tre_ast_new_literal(ctx->mem, ASSERTION,
-					       ASSERT_AT_EOL, -1);
-		  if (result == NULL)
-		    return REG_ESPACE;
-		  ctx->re++;
-		}
-	      else
 			goto parse_literal;
+		node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_BOL, -1);
+		s++;
 		break;
+	case '$':
-	    case CHAR_RPAREN:
+		/* '$' is special everywhere in EREs, and at the end of the string in BREs. */
-	      if (!depth)
+		if (!ere && s[1])
 			goto parse_literal;
-	    case CHAR_STAR:
+		node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_EOL, -1);
-	    case CHAR_PIPE:
+		s++;
-	    case CHAR_LBRACE:
+		break;
-	    case CHAR_PLUS:
+	case '*':
-	    case CHAR_QUESTIONMARK:
+	case '|':
-	      if (!(ctx->cflags & REG_EXTENDED))
+	case '{':
+	case '+':
+	case '?':
+		if (!ere)
 			goto parse_literal;
 	case 0:
-	    empty_atom:
+		node = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
-	      result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
-	      if (!result)
-		return REG_ESPACE;
 		break;
 	default:
-	    parse_literal:
+parse_literal:
+		len = mbtowc(&wc, s, -1);
-	      clen = mbtowc(&wc, ctx->re, -1);
+		if (len < 0)
-	      if (clen<0) clen=1, wc=WEOF;
+			return REG_BADPAT;
+		if (ctx->cflags & REG_ICASE && (tre_isupper(wc) || tre_islower(wc))) {
-	      /* Note that we can't use an tre_isalpha() test here, since there
+			tre_ast_node_t *tmp1, *tmp2;
-		 may be characters which are alphabetic but neither upper or
+			/* multiple opposite case characters are not supported */
-		 lower case. */
+			tmp1 = tre_ast_new_literal(ctx->mem, tre_toupper(wc), tre_toupper(wc), ctx->position);
-	      if (ctx->cflags & REG_ICASE
+			tmp2 = tre_ast_new_literal(ctx->mem, tre_tolower(wc), tre_tolower(wc), ctx->position);
-		  && (tre_isupper(wc) || tre_islower(wc)))
+			if (tmp1 && tmp2)
-		{
+				node = tre_ast_new_union(ctx->mem, tmp1, tmp2);
-		  tre_ast_node_t *tmp1;
-		  tre_ast_node_t *tmp2;
-		  /* XXX - Can there be more than one opposite-case
-		     counterpoints for some character in some locale?  Or
-		     more than two characters which all should be regarded
-		     the same character if case is ignored?  If yes, there
-		     does not seem to be a portable way to detect it.  I guess
-		     that at least for multi-character collating elements there
-		     could be several opposite-case counterpoints, but they
-		     cannot be supported portably anyway. */
-		  tmp1 = tre_ast_new_literal(ctx->mem, tre_toupper(wc),
-					     tre_toupper(wc),
-					     ctx->position);
-		  if (!tmp1)
-		    return REG_ESPACE;
-		  tmp2 = tre_ast_new_literal(ctx->mem, tre_tolower(wc),
-					     tre_tolower(wc),
-					     ctx->position);
-		  if (!tmp2)
-		    return REG_ESPACE;
-		  result = tre_ast_new_union(ctx->mem, tmp1, tmp2);
-		  if (!result)
-		    return REG_ESPACE;
-		}
 			else
-		{
+				node = 0;
-		  result = tre_ast_new_literal(ctx->mem, wc, wc,
+		} else {
-					       ctx->position);
+			node = tre_ast_new_literal(ctx->mem, wc, wc, ctx->position);
-		  if (!result)
-		    return REG_ESPACE;
 		}
 		ctx->position++;
-	      ctx->re += clen;
+		s += len;
 		break;
 	}
-	  break;
+	if (!node)
+		return REG_ESPACE;
+	ctx->n = node;
+	ctx->s = s;
+	return REG_OK;
+}
-	case PARSE_MARK_FOR_SUBMATCH:
+#define PUSHPTR(err, s, v) do { \
-	  {
+	if ((err = tre_stack_push_voidptr(s, v)) != REG_OK) \
-	    int submatch_id = tre_stack_pop_int(stack);
+		return err; \
+} while(0)
-	    if (result->submatch_id >= 0)
+#define PUSHINT(err, s, v) do { \
-	      {
+	if ((err = tre_stack_push_int(s, v)) != REG_OK) \
-		tre_ast_node_t *n, *tmp_node;
+		return err; \
-		n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
+} while(0)
-		if (n == NULL)
+static reg_errcode_t tre_parse(tre_parse_ctx_t *ctx)
+{
+	tre_ast_node_t *nbranch=0, *nunion=0;
+	int ere = ctx->cflags & REG_EXTENDED;
+	const char *s = ctx->re;
+	int subid = 0;
+	int depth = 0;
+	reg_errcode_t err;
+	tre_stack_t *stack = ctx->stack;
+	PUSHINT(err, stack, subid++);
+	for (;;) {
+		if ((!ere && *s == '\\' && s[1] == '(') ||
+		    (ere && *s == '(')) {
+			PUSHPTR(err, stack, nunion);
+			PUSHPTR(err, stack, nbranch);
+			PUSHINT(err, stack, subid++);
+			s++;
+			if (!ere)
+				s++;
+			depth++;
+			nbranch = nunion = 0;
+			continue;
+		}
+		if ((!ere && *s == '\\' && s[1] == ')') ||
+		    (ere && *s == ')' && depth)) {
+			ctx->n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
+			if (!ctx->n)
 				return REG_ESPACE;
-		tmp_node = tre_ast_new_catenation(ctx->mem, n, result);
+		} else {
-		if (tmp_node == NULL)
+			err = parse_atom(ctx, s);
+			if (err != REG_OK)
+				return err;
+			s = ctx->s;
+		}
+	parse_iter:
+		/* extension: repetitions are accepted after an empty node
+		   eg. (+), ^*, a$?, a|{2} */
+		switch (*s) {
+		case '+':
+		case '?':
+			if (!ere)
+				break;
+			/* fallthrough */
+		case '*':;
+			int min=0, max=-1;
+			if (*s == '+')
+				min = 1;
+			if (*s == '?')
+				max = 1;
+			s++;
+			ctx->n = tre_ast_new_iter(ctx->mem, ctx->n, min, max, 0);
+			if (!ctx->n)
 				return REG_ESPACE;
-		tmp_node->num_submatches = result->num_submatches;
+			/* extension: multiple consecutive *+?{,} is unspecified,
-		result = tmp_node;
+			   but (a+)+ has to be supported so accepting a++ makes
-	      }
+			   sense, note however that the RE_DUP_MAX limit can be
-	    result->submatch_id = submatch_id;
+			   circumvented: (a{255}){255} uses a lot of memory.. */
-	    result->num_submatches++;
+			goto parse_iter;
+		case '\\':
+			if (ere || s[1] != '{')
+				break;
+			s++;
+			goto parse_brace;
+		case '{':
+			if (!ere)
 				break;
+		parse_brace:
+			err = parse_dup(ctx, s+1);
+			if (err != REG_OK)
+				return err;
+			s = ctx->s;
+			goto parse_iter;
+		}
+		nbranch = tre_ast_new_catenation(ctx->mem, nbranch, ctx->n);
+		if ((ere && *s == '|') ||
+		    (ere && *s == ')' && depth) ||
+		    (!ere && *s == '\\' && s[1] == ')') ||
+		    !*s) {
+			/* extension: empty branches such as (), (|a), (a|) are unspecified;
+			   here they are not rejected but match the empty string */
+			int c = *s;
+			nunion = tre_ast_new_union(ctx->mem, nunion, nbranch);
+			nbranch = 0;
+			if (c != '|') {
+				if (c == '\\') {
+					if (!depth) return REG_EPAREN;
+					s+=2;
+				} else if (c == ')')
+					s++;
+				depth--;
+				err = marksub(ctx, nunion, tre_stack_pop_int(stack));
+				if (err != REG_OK)
+					return err;
+				if (!c && depth<0) {
+					ctx->submatch_id = subid;
+					return REG_OK;
 				}
+				if (!c || depth<0)
-	case PARSE_RESTORE_CFLAGS:
+					return REG_EPAREN;
-	  ctx->cflags = tre_stack_pop_int(stack);
+				nbranch = tre_stack_pop_voidptr(stack);
-	  break;
+				nunion = tre_stack_pop_voidptr(stack);
+				goto parse_iter;
-	default:
+			}
-	  assert(0);
+			s++;
-	  break;
 		}
 	}
-  /* Check for missing closing parentheses. */
-  if (depth > 0)
-    return REG_EPAREN;
-  if (status == REG_OK)
-    ctx->result = result;
-  return status;
 }
 /***********************************************************************
 from tre-compile.c
 ***********************************************************************/
@@ -3122,12 +2680,7 @@ regcomp(regex_t *restrict preg, const char *restrict regex, int cflags)
  if (errcode != REG_OK)
    ERROR_EXIT(errcode);
  preg->re_nsub = parse_ctx.submatch_id - 1;
-  tree = parse_ctx.result;
+  tree = parse_ctx.n;
-  /* Back references and approximate matching cannot currently be used
-     in the same regexp. */
-  if (parse_ctx.max_backref >= 0 && parse_ctx.have_approx)
-    ERROR_EXIT(REG_BADPAT);
 #ifdef TRE_DEBUG
  tre_ast_print(tree);
@@ -3142,7 +2695,7 @@ regcomp(regex_t *restrict preg, const char *restrict regex, int cflags)
  if (tnfa == NULL)
    ERROR_EXIT(REG_ESPACE);
  tnfa->have_backrefs = parse_ctx.max_backref >= 0;
-  tnfa->have_approx = parse_ctx.have_approx;
+  tnfa->have_approx = 0;
  tnfa->num_submatches = parse_ctx.submatch_id;
  /* Set up tags for submatch addressing.  If REG_NOSUB is set and the