提交 ec1aed0a 编写于 作者: S Szabolcs Nagy

rewrite the regex pattern parser in regcomp

The new code is a bit simpler and the generated code is about 1KB
smaller (on i386). The basic design was kept, including the internal
interfaces; TNFA generation was not touched.

The old tre parser had various issues:

[^aa-z]
negated overlapping ranges in a bracket expression were handled
incorrectly (e.g. [^aa-z] was handled as [^a] instead of [^a-z]).

a{,2}
a missing lower bound in a counted repetition should be an error,
but it was accepted with broken semantics: a{,2} was treated as
a{0,3}. The new parser rejects it.

a{999,}
a large min count was not rejected (a{5000,} failed with REG_ESPACE
due to reaching a stack limit). The new parser enforces the
RE_DUP_MAX limit.

\xff
regcomp used to accept a pattern with illegal sequences in it
(it treated them as an empty expression, so p\xffq matched pq). The new
parser rejects such patterns with REG_BADPAT or REG_ERANGE.

[^b-fD-H] with REG_ICASE
the old parser turned this into [^b-fB-F] because of the negated
overlapping range issue (see above). The new parser treats it
as [^b-hB-H]. POSIX seems to require [^d-fD-F], but practical
implementations do case-folding first and negate the character
set later instead of the other way around. (Supporting the POSIX
way efficiently would require significant changes, so it was left
as is. It is unclear if any application actually expects the
POSIX behaviour; this issue is raised on the austingroup tracker:
http://austingroupbugs.net/view.php?id=872 ).

Another case-insensitive matching issue is that Unicode case
folding rules can group more than two characters together, while
towupper and towlower can only work for a pair of upper and
lower case characters. This is a limitation of POSIX, so it is
not fixed.

Invalid bracket and brace expressions may return different error
codes now (REG_ERANGE instead of REG_EBRACK, or REG_BADBR instead
of REG_EBRACE); otherwise the new parser should be compatible with
the old one.

regcomp should be able to handle arbitrary pattern input if the
pattern length is limited. The only exception is the use of large
repetition counts (e.g. (a{255}){255}), which require an exponential
amount of memory, and there is no easy workaround.
上级 bd082916
...@@ -34,6 +34,7 @@ ...@@ -34,6 +34,7 @@
#include <regex.h> #include <regex.h>
#include <limits.h> #include <limits.h>
#include <stdint.h> #include <stdint.h>
#include <ctype.h>
#include "tre.h" #include "tre.h"
...@@ -135,39 +136,17 @@ typedef struct { ...@@ -135,39 +136,17 @@ typedef struct {
tre_ast_node_t *right; tre_ast_node_t *right;
} tre_union_t; } tre_union_t;
static tre_ast_node_t *
tre_ast_new_node(tre_mem_t mem, tre_ast_type_t type, size_t size);
static tre_ast_node_t *
tre_ast_new_literal(tre_mem_t mem, int code_min, int code_max, int position);
static tre_ast_node_t *
tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg, int min, int max,
int minimal);
static tre_ast_node_t *
tre_ast_new_union(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right);
static tre_ast_node_t * static tre_ast_node_t *
tre_ast_new_catenation(tre_mem_t mem, tre_ast_node_t *left, tre_ast_new_node(tre_mem_t mem, int type, void *obj)
tre_ast_node_t *right);
static tre_ast_node_t *
tre_ast_new_node(tre_mem_t mem, tre_ast_type_t type, size_t size)
{ {
tre_ast_node_t *node; tre_ast_node_t *node = tre_mem_calloc(mem, sizeof *node);
if (!node || !obj)
node = tre_mem_calloc(mem, sizeof(*node)); return 0;
if (!node) node->obj = obj;
return NULL;
node->obj = tre_mem_calloc(mem, size);
if (!node->obj)
return NULL;
node->type = type; node->type = type;
node->nullable = -1; node->nullable = -1;
node->submatch_id = -1; node->submatch_id = -1;
return node; return node;
} }
...@@ -177,34 +156,31 @@ tre_ast_new_literal(tre_mem_t mem, int code_min, int code_max, int position) ...@@ -177,34 +156,31 @@ tre_ast_new_literal(tre_mem_t mem, int code_min, int code_max, int position)
tre_ast_node_t *node; tre_ast_node_t *node;
tre_literal_t *lit; tre_literal_t *lit;
node = tre_ast_new_node(mem, LITERAL, sizeof(tre_literal_t)); lit = tre_mem_calloc(mem, sizeof *lit);
node = tre_ast_new_node(mem, LITERAL, lit);
if (!node) if (!node)
return NULL; return 0;
lit = node->obj;
lit->code_min = code_min; lit->code_min = code_min;
lit->code_max = code_max; lit->code_max = code_max;
lit->position = position; lit->position = position;
return node; return node;
} }
static tre_ast_node_t * static tre_ast_node_t *
tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg, int min, int max, tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg, int min, int max, int minimal)
int minimal)
{ {
tre_ast_node_t *node; tre_ast_node_t *node;
tre_iteration_t *iter; tre_iteration_t *iter;
node = tre_ast_new_node(mem, ITERATION, sizeof(tre_iteration_t)); iter = tre_mem_calloc(mem, sizeof *iter);
node = tre_ast_new_node(mem, ITERATION, iter);
if (!node) if (!node)
return NULL; return 0;
iter = node->obj;
iter->arg = arg; iter->arg = arg;
iter->min = min; iter->min = min;
iter->max = max; iter->max = max;
iter->minimal = minimal; iter->minimal = minimal;
node->num_submatches = arg->num_submatches; node->num_submatches = arg->num_submatches;
return node; return node;
} }
...@@ -212,30 +188,35 @@ static tre_ast_node_t * ...@@ -212,30 +188,35 @@ static tre_ast_node_t *
tre_ast_new_union(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right) tre_ast_new_union(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right)
{ {
tre_ast_node_t *node; tre_ast_node_t *node;
tre_union_t *un;
node = tre_ast_new_node(mem, UNION, sizeof(tre_union_t)); if (!left)
if (node == NULL) return right;
return NULL; un = tre_mem_calloc(mem, sizeof *un);
((tre_union_t *)node->obj)->left = left; node = tre_ast_new_node(mem, UNION, un);
((tre_union_t *)node->obj)->right = right; if (!node || !right)
return 0;
un->left = left;
un->right = right;
node->num_submatches = left->num_submatches + right->num_submatches; node->num_submatches = left->num_submatches + right->num_submatches;
return node; return node;
} }
static tre_ast_node_t * static tre_ast_node_t *
tre_ast_new_catenation(tre_mem_t mem, tre_ast_node_t *left, tre_ast_new_catenation(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right)
tre_ast_node_t *right)
{ {
tre_ast_node_t *node; tre_ast_node_t *node;
tre_catenation_t *cat;
node = tre_ast_new_node(mem, CATENATION, sizeof(tre_catenation_t)); if (!left)
if (node == NULL) return right;
return NULL; cat = tre_mem_calloc(mem, sizeof *cat);
((tre_catenation_t *)node->obj)->left = left; node = tre_ast_new_node(mem, CATENATION, cat);
((tre_catenation_t *)node->obj)->right = right; if (!node)
return 0;
cat->left = left;
cat->right = right;
node->num_submatches = left->num_submatches + right->num_submatches; node->num_submatches = left->num_submatches + right->num_submatches;
return node; return node;
} }
...@@ -416,1073 +397,650 @@ typedef struct { ...@@ -416,1073 +397,650 @@ typedef struct {
tre_mem_t mem; tre_mem_t mem;
/* Stack used for keeping track of regexp syntax. */ /* Stack used for keeping track of regexp syntax. */
tre_stack_t *stack; tre_stack_t *stack;
/* The parse result. */ /* The parsed node after a parse function returns. */
tre_ast_node_t *result; tre_ast_node_t *n;
/* The regexp to parse and its length. */ /* Position in the regexp pattern after a parse function returns. */
const char *s;
/* The first character of the regexp. */
const char *re; const char *re;
/* The first character of the entire regexp. */
const char *re_start;
/* Current submatch ID. */ /* Current submatch ID. */
int submatch_id; int submatch_id;
/* Current position (number of literal). */ /* Current position (number of literal). */
int position; int position;
/* The highest back reference or -1 if none seen so far. */ /* The highest back reference or -1 if none seen so far. */
int max_backref; int max_backref;
/* This flag is set if the regexp uses approximate matching. */
int have_approx;
/* Compilation flags. */ /* Compilation flags. */
int cflags; int cflags;
/* If this flag is set the top-level submatch is not captured. */
int nofirstsub;
} tre_parse_ctx_t; } tre_parse_ctx_t;
/* Parses a wide character regexp pattern into a syntax tree. This parser
handles both syntaxes (BRE and ERE), including the TRE extensions. */
static reg_errcode_t
tre_parse(tre_parse_ctx_t *ctx);
/*
This parser is just a simple recursive descent parser for POSIX.2
regexps. The parser supports both the obsolete default syntax and
the "extended" syntax, and some nonstandard extensions.
*/
/* Characters with special meanings in regexp syntax. */
#define CHAR_PIPE '|'
#define CHAR_LPAREN '('
#define CHAR_RPAREN ')'
#define CHAR_LBRACE '{'
#define CHAR_RBRACE '}'
#define CHAR_LBRACKET '['
#define CHAR_RBRACKET ']'
#define CHAR_MINUS '-'
#define CHAR_STAR '*'
#define CHAR_QUESTIONMARK '?'
#define CHAR_PLUS '+'
#define CHAR_PERIOD '.'
#define CHAR_COLON ':'
#define CHAR_EQUAL '='
#define CHAR_COMMA ','
#define CHAR_CARET '^'
#define CHAR_DOLLAR '$'
#define CHAR_BACKSLASH '\\'
#define CHAR_HASH '#'
#define CHAR_TILDE '~'
/* Some macros for expanding \w, \s, etc. */ /* Some macros for expanding \w, \s, etc. */
static const struct tre_macro_struct { static const struct {
const char c; char c;
const char *expansion; const char *expansion;
} tre_macros[] = } tre_macros[] = {
{ {'t', "\t"}, {'n', "\n"}, {'r', "\r"}, {'t', "\t"}, {'n', "\n"}, {'r', "\r"},
{'f', "\f"}, {'a', "\a"}, {'e', "\033"}, {'f', "\f"}, {'a', "\a"}, {'e', "\033"},
{'w', "[[:alnum:]_]"}, {'W', "[^[:alnum:]_]"}, {'s', "[[:space:]]"}, {'w', "[[:alnum:]_]"}, {'W', "[^[:alnum:]_]"}, {'s', "[[:space:]]"},
{'S', "[^[:space:]]"}, {'d', "[[:digit:]]"}, {'D', "[^[:digit:]]"}, {'S', "[^[:space:]]"}, {'d', "[[:digit:]]"}, {'D', "[^[:digit:]]"},
{ 0, NULL } { 0, 0 }
}; };
/* Expands a macro delimited by `regex' and `regex_end' to `buf', which /* Expands a macro delimited by `regex' and `regex_end' to `buf', which
must have at least `len' items. Sets buf[0] to zero if the there must have at least `len' items. Sets buf[0] to zero if the there
is no match in `tre_macros'. */ is no match in `tre_macros'. */
static const char * static const char *tre_expand_macro(const char *s)
tre_expand_macro(const char *regex)
{ {
int i; int i;
for (i = 0; tre_macros[i].c && tre_macros[i].c != *s; i++);
if (!*regex)
return 0;
for (i = 0; tre_macros[i].expansion && tre_macros[i].c != *regex; i++);
return tre_macros[i].expansion; return tre_macros[i].expansion;
} }
static reg_errcode_t static int
tre_new_item(tre_mem_t mem, int min, int max, int *i, int *max_i, tre_compare_lit(const void *a, const void *b)
tre_ast_node_t ***items)
{ {
reg_errcode_t status; const tre_literal_t *const *la = a;
tre_ast_node_t **array = *items; const tre_literal_t *const *lb = b;
/* Allocate more space if necessary. */ /* assumes the range of valid code_min is < INT_MAX */
if (*i >= *max_i) return la[0]->code_min - lb[0]->code_min;
{
tre_ast_node_t **new_items;
/* If the array is already 1024 items large, give up -- there's
probably an error in the regexp (e.g. not a '\0' terminated
string and missing ']') */
if (*max_i > 1024)
return REG_ESPACE;
*max_i *= 2;
new_items = xrealloc(array, sizeof(*array) * *max_i);
if (new_items == NULL)
return REG_ESPACE;
*items = array = new_items;
}
array[*i] = tre_ast_new_literal(mem, min, max, -1);
status = array[*i] == NULL ? REG_ESPACE : REG_OK;
(*i)++;
return status;
} }
struct literals {
tre_mem_t mem;
tre_literal_t **a;
int len;
int cap;
};
static int static tre_literal_t *tre_new_lit(struct literals *p)
tre_compare_items(const void *a, const void *b)
{ {
const tre_ast_node_t *node_a = *(tre_ast_node_t * const *)a; tre_literal_t **a;
const tre_ast_node_t *node_b = *(tre_ast_node_t * const *)b; if (p->len >= p->cap) {
tre_literal_t *l_a = node_a->obj, *l_b = node_b->obj; if (p->cap >= 1<<15)
int a_min = l_a->code_min, b_min = l_b->code_min; return 0;
p->cap *= 2;
a = xrealloc(p->a, p->cap * sizeof *p->a);
if (!a)
return 0;
p->a = a;
}
a = p->a + p->len++;
*a = tre_mem_calloc(p->mem, sizeof **a);
return *a;
}
if (a_min < b_min) static int add_icase_literals(struct literals *ls, int min, int max)
{
tre_literal_t *lit;
int b, e, c;
for (c=min; c<=max; ) {
/* assumes islower(c) and isupper(c) are exclusive
and toupper(c)!=c if islower(c).
multiple opposite case characters are not supported */
if (tre_islower(c)) {
b = e = tre_toupper(c);
for (c++, e++; c<=max; c++, e++)
if (tre_toupper(c) != e) break;
} else if (tre_isupper(c)) {
b = e = tre_tolower(c);
for (c++, e++; c<=max; c++, e++)
if (tre_tolower(c) != e) break;
} else {
c++;
continue;
}
lit = tre_new_lit(ls);
if (!lit)
return -1; return -1;
else if (a_min > b_min) lit->code_min = b;
return 1; lit->code_max = e-1;
else lit->position = -1;
}
return 0; return 0;
} }
/* Maximum number of character classes that can occur in a negated bracket
expression. */
#define MAX_NEG_CLASSES 64
/* Maximum length of character class names. */ /* Maximum number of character classes in a negated bracket expression. */
#define MAX_CLASS_NAME #define MAX_NEG_CLASSES 64
static reg_errcode_t struct neg {
tre_parse_bracket_items(tre_parse_ctx_t *ctx, int negate, int negate;
tre_ctype_t neg_classes[], int *num_neg_classes, int len;
tre_ast_node_t ***items, int *num_items, tre_ctype_t a[MAX_NEG_CLASSES];
int *items_size) };
{
const char *re = ctx->re;
reg_errcode_t status = REG_OK;
tre_ctype_t class = (tre_ctype_t)0;
int i = *num_items;
int max_i = *items_size;
int skip;
/* Build an array of the items in the bracket expression. */ // TODO: parse bracket into a set of non-overlapping [lo,hi] ranges
while (status == REG_OK)
{
skip = 0;
if (!*re)
{
status = REG_EBRACK;
}
else if (*re == CHAR_RBRACKET && re > ctx->re)
{
re++;
break;
}
else
{
tre_cint_t min = 0, max = 0;
wchar_t wc;
int clen = mbtowc(&wc, re, -1);
if (clen<0) clen=1, wc=WEOF; /*
bracket grammar:
Bracket = '[' List ']' | '[^' List ']'
List = Term | List Term
Term = Char | Range | Chclass | Eqclass
Range = Char '-' Char | Char '-' '-'
Char = Coll | coll_single
Meta = ']' | '-'
Coll = '[.' coll_single '.]' | '[.' coll_multi '.]' | '[.' Meta '.]'
Eqclass = '[=' coll_single '=]' | '[=' coll_multi '=]'
Chclass = '[:' class ':]'
coll_single is a single char collating element but it can be
'-' only at the beginning or end of a List and
']' only at the beginning of a List and
'^' anywhere except after the opening '['
*/
class = (tre_ctype_t)0; static reg_errcode_t parse_bracket_terms(tre_parse_ctx_t *ctx, const char *s, struct literals *ls, struct neg *neg)
if (*(re + clen) == CHAR_MINUS && *(re + clen + 1) != CHAR_RBRACKET) {
{ const char *start = s;
min = wc; tre_ctype_t class;
re += clen+1; int min, max;
clen = mbtowc(&wc, re, -1); wchar_t wc;
if (clen<0) clen=1, wc=WEOF;
max = wc;
re += clen;
/* XXX - Should use collation order instead of encoding values
in character ranges. */
if (min > max)
status = REG_ERANGE;
}
else if (*re == CHAR_LBRACKET && *(re + 1) == CHAR_PERIOD)
status = REG_ECOLLATE;
else if (*re == CHAR_LBRACKET && *(re + 1) == CHAR_EQUAL)
status = REG_ECOLLATE;
else if (*re == CHAR_LBRACKET && *(re + 1) == CHAR_COLON)
{
char tmp_str[64];
const char *endptr = re + 2;
int len; int len;
while (*endptr && *endptr != CHAR_COLON)
endptr++; for (;;) {
if (*endptr) class = 0;
{ len = mbtowc(&wc, s, -1);
len = MIN(endptr - re - 2, 63); if (len <= 0)
strncpy(tmp_str, re + 2, len); return *s ? REG_BADPAT : REG_EBRACK;
tmp_str[len] = '\0'; if (*s == ']' && s != start) {
class = tre_ctype(tmp_str); ctx->s = s+1;
if (!class) return REG_OK;
status = REG_ECTYPE;
re = endptr + 2;
} }
else if (*s == '-' && s != start && s[1] != ']' &&
status = REG_ECTYPE; /* extension: [a-z--@] is accepted as [a-z]|[--@] */
(s[1] != '-' || s[2] == ']'))
return REG_ERANGE;
if (*s == '[' && (s[1] == '.' || s[1] == '='))
/* collating symbols and equivalence classes are not supported */
return REG_ECOLLATE;
if (*s == '[' && s[1] == ':') {
char tmp[CHARCLASS_NAME_MAX+1];
s += 2;
for (len=0; len < CHARCLASS_NAME_MAX && s[len]; len++) {
if (s[len] == ':') {
memcpy(tmp, s, len);
tmp[len] = 0;
class = tre_ctype(tmp);
break;
}
}
if (!class || s[len+1] != ']')
return REG_ECTYPE;
min = 0; min = 0;
max = TRE_CHAR_MAX; max = TRE_CHAR_MAX;
} s += len+2;
else } else {
{
if (*re == CHAR_MINUS && *(re + 1) != CHAR_RBRACKET
&& ctx->re != re)
/* Two ranges are not allowed to share and endpoint. */
status = REG_ERANGE;
min = max = wc; min = max = wc;
re += clen; s += len;
if (*s == '-' && s[1] != ']') {
s++;
len = mbtowc(&wc, s, -1);
max = wc;
/* XXX - Should use collation order instead of
encoding values in character ranges. */
if (len <= 0 || min > max)
return REG_ERANGE;
s += len;
} }
if (status != REG_OK)
break;
if (class && negate)
if (*num_neg_classes >= MAX_NEG_CLASSES)
status = REG_ESPACE;
else
neg_classes[(*num_neg_classes)++] = class;
else if (!skip)
{
status = tre_new_item(ctx->mem, min, max, &i, &max_i, items);
if (status != REG_OK)
break;
((tre_literal_t*)((*items)[i-1])->obj)->class = class;
} }
/* Add opposite-case counterpoints if REG_ICASE is present. if (class && neg->negate) {
This is broken if there are more than two "same" characters. */ if (neg->len >= MAX_NEG_CLASSES)
if (ctx->cflags & REG_ICASE && !class && status == REG_OK && !skip) return REG_ESPACE;
{ neg->a[neg->len++] = class;
tre_cint_t cmin, ccurr; } else {
tre_literal_t *lit = tre_new_lit(ls);
while (min <= max) if (!lit)
{ return REG_ESPACE;
if (tre_islower(min)) lit->code_min = min;
{ lit->code_max = max;
cmin = ccurr = tre_toupper(min++); lit->class = class;
while (tre_islower(min) && tre_toupper(min) == ccurr + 1 lit->position = -1;
&& min <= max)
ccurr = tre_toupper(min++); /* Add opposite-case codepoints if REG_ICASE is present.
status = tre_new_item(ctx->mem, cmin, ccurr, It seems that POSIX requires that bracket negation
&i, &max_i, items); should happen before case-folding, but most practical
} implementations do it the other way around. Changing
else if (tre_isupper(min)) the order would need efficient representation of
{ case-fold ranges and bracket range sets even with
cmin = ccurr = tre_tolower(min++); simple patterns so this is ok for now. */
while (tre_isupper(min) && tre_tolower(min) == ccurr + 1 if (ctx->cflags & REG_ICASE && !class)
&& min <= max) if (add_icase_literals(ls, min, max))
ccurr = tre_tolower(min++); return REG_ESPACE;
status = tre_new_item(ctx->mem, cmin, ccurr,
&i, &max_i, items);
}
else min++;
if (status != REG_OK)
break;
}
if (status != REG_OK)
break;
}
} }
} }
*num_items = i;
*items_size = max_i;
ctx->re = re;
return status;
} }
static reg_errcode_t static reg_errcode_t parse_bracket(tre_parse_ctx_t *ctx, const char *s)
tre_parse_bracket(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
{ {
tre_ast_node_t *node = NULL; int i, max, min, negmax, negmin;
int negate = 0; tre_ast_node_t *node = 0, *n;
reg_errcode_t status = REG_OK; tre_ctype_t *nc = 0;
tre_ast_node_t **items, *u, *n; tre_literal_t *lit;
int i = 0, j, max_i = 32, curr_max, curr_min; struct literals ls;
tre_ctype_t neg_classes[MAX_NEG_CLASSES]; struct neg neg;
int num_neg_classes = 0; reg_errcode_t err;
/* Start off with an array of `max_i' elements. */ ls.mem = ctx->mem;
items = xmalloc(sizeof(*items) * max_i); ls.len = 0;
if (items == NULL) ls.cap = 32;
ls.a = xmalloc(ls.cap * sizeof *ls.a);
if (!ls.a)
return REG_ESPACE; return REG_ESPACE;
neg.len = 0;
neg.negate = *s == '^';
if (neg.negate)
s++;
if (*ctx->re == CHAR_CARET) err = parse_bracket_terms(ctx, s, &ls, &neg);
{ if (err != REG_OK)
negate = 1;
ctx->re++;
}
status = tre_parse_bracket_items(ctx, negate, neg_classes, &num_neg_classes,
&items, &i, &max_i);
if (status != REG_OK)
goto parse_bracket_done; goto parse_bracket_done;
if (neg.negate) {
/* Sort the array if we need to negate it. */ /* Sort the array if we need to negate it. */
if (negate) qsort(ls.a, ls.len, sizeof *ls.a, tre_compare_lit);
qsort(items, (unsigned)i, sizeof(*items), tre_compare_items); /* extra lit for the last negated range */
lit = tre_new_lit(&ls);
curr_max = curr_min = 0; if (!lit) {
/* Build a union of the items in the array, negated if necessary. */ err = REG_ESPACE;
for (j = 0; j < i && status == REG_OK; j++) goto parse_bracket_done;
{
int min, max;
tre_literal_t *l = items[j]->obj;
min = l->code_min;
max = l->code_max;
if (negate)
{
if (min < curr_max)
{
/* Overlap. */
curr_max = MAX(max + 1, curr_max);
l = NULL;
}
else
{
/* No overlap. */
curr_max = min - 1;
if (curr_max >= curr_min)
{
l->code_min = curr_min;
l->code_max = curr_max;
}
else
{
l = NULL;
}
curr_min = curr_max = max + 1;
}
}
if (l != NULL)
{
int k;
l->position = ctx->position;
if (num_neg_classes > 0)
{
l->neg_classes = tre_mem_alloc(ctx->mem,
(sizeof(*l->neg_classes)
* (num_neg_classes + 1)));
if (l->neg_classes == NULL)
{
status = REG_ESPACE;
break;
}
for (k = 0; k < num_neg_classes; k++)
l->neg_classes[k] = neg_classes[k];
l->neg_classes[k] = (tre_ctype_t)0;
} }
else lit->code_min = TRE_CHAR_MAX+1;
l->neg_classes = NULL; lit->code_max = TRE_CHAR_MAX+1;
if (node == NULL) lit->position = -1;
node = items[j]; /* negated classes */
else if (neg.len) {
{ nc = tre_mem_alloc(ctx->mem, (neg.len+1)*sizeof *neg.a);
u = tre_ast_new_union(ctx->mem, node, items[j]); if (!nc) {
if (u == NULL) err = REG_ESPACE;
status = REG_ESPACE; goto parse_bracket_done;
node = u;
} }
memcpy(nc, neg.a, neg.len*sizeof *neg.a);
nc[neg.len] = 0;
} }
} }
if (status != REG_OK) /* Build a union of the items in the array, negated if necessary. */
goto parse_bracket_done; negmax = negmin = 0;
for (i = 0; i < ls.len; i++) {
if (negate) lit = ls.a[i];
{ min = lit->code_min;
int k; max = lit->code_max;
n = tre_ast_new_literal(ctx->mem, curr_min, TRE_CHAR_MAX, ctx->position); if (neg.negate) {
if (n == NULL) if (min <= negmin) {
status = REG_ESPACE; /* Overlap. */
else negmin = MAX(max + 1, negmin);
{ continue;
tre_literal_t *l = n->obj;
if (num_neg_classes > 0)
{
l->neg_classes = tre_mem_alloc(ctx->mem,
(sizeof(*l->neg_classes)
* (num_neg_classes + 1)));
if (l->neg_classes == NULL)
{
status = REG_ESPACE;
goto parse_bracket_done;
} }
for (k = 0; k < num_neg_classes; k++) negmax = min - 1;
l->neg_classes[k] = neg_classes[k]; lit->code_min = negmin;
l->neg_classes[k] = (tre_ctype_t)0; lit->code_max = negmax;
} negmin = max + 1;
else
l->neg_classes = NULL;
if (node == NULL)
node = n;
else
{
u = tre_ast_new_union(ctx->mem, node, n);
if (u == NULL)
status = REG_ESPACE;
node = u;
} }
lit->position = ctx->position;
lit->neg_classes = nc;
n = tre_ast_new_node(ctx->mem, LITERAL, lit);
node = tre_ast_new_union(ctx->mem, node, n);
if (!node) {
err = REG_ESPACE;
break;
} }
} }
if (status != REG_OK) parse_bracket_done:
goto parse_bracket_done; xfree(ls.a);
#ifdef TRE_DEBUG
tre_ast_print(node);
#endif /* TRE_DEBUG */
parse_bracket_done:
xfree(items);
ctx->position++; ctx->position++;
*result = node; ctx->n = node;
return status; return err;
} }
static const char *parse_dup_count(const char *s, int *n)
/* Parses a positive decimal integer. Returns -1 if the string does not
contain a valid number. */
static int
tre_parse_int(const char **regex)
{ {
int num = -1; *n = -1;
const char *r = *regex; if (!isdigit(*s))
while (*r-'0'<10U) return s;
{ *n = 0;
if (num < 0) for (;;) {
num = 0; *n = 10 * *n + (*s - '0');
num = num * 10 + *r - '0'; s++;
r++; if (!isdigit(*s) || *n > RE_DUP_MAX)
break;
} }
*regex = r; return s;
return num;
} }
static reg_errcode_t parse_dup(tre_parse_ctx_t *ctx, const char *s)
static reg_errcode_t
tre_parse_bound(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
{ {
int min, max; int min, max;
const char *r = ctx->re;
int minimal = 0;
/* Parse number (minimum repetition count). */
min = -1;
if (*r >= '0' && *r <= '9') {
min = tre_parse_int(&r);
}
/* Parse comma and second number (maximum repetition count). */ s = parse_dup_count(s, &min);
if (*s == ',')
s = parse_dup_count(s+1, &max);
else
max = min; max = min;
if (*r == CHAR_COMMA)
{
r++;
max = tre_parse_int(&r);
}
/* Check that the repeat counts are sane. */ if (
if ((max >= 0 && min > max) || max > RE_DUP_MAX) (max < min && max >= 0) ||
max > RE_DUP_MAX ||
min > RE_DUP_MAX ||
min < 0 ||
(!(ctx->cflags & REG_EXTENDED) && *s++ != '\\') ||
*s++ != '}'
)
return REG_BADBR; return REG_BADBR;
/* Missing }. */
if (!*r)
return REG_EBRACE;
/* Empty contents of {}. */
if (r == ctx->re)
return REG_BADBR;
/* Parse the ending '}' or '\}'.*/
if (ctx->cflags & REG_EXTENDED)
{
if (*r != CHAR_RBRACE)
return REG_BADBR;
r++;
}
else
{
if (*r != CHAR_BACKSLASH || *(r + 1) != CHAR_RBRACE)
return REG_BADBR;
r += 2;
}
/* Create the AST node(s). */
if (min == 0 && max == 0) if (min == 0 && max == 0)
{ ctx->n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
*result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
if (*result == NULL)
return REG_ESPACE;
}
else else
{ ctx->n = tre_ast_new_iter(ctx->mem, ctx->n, min, max, 0);
if (min < 0 && max < 0) if (!ctx->n)
/* Only approximate parameters set, no repetitions. */
min = max = 1;
*result = tre_ast_new_iter(ctx->mem, *result, min, max, minimal);
if (!*result)
return REG_ESPACE; return REG_ESPACE;
} ctx->s = s;
ctx->re = r;
return REG_OK; return REG_OK;
} }
typedef enum { static int hexval(unsigned c)
PARSE_RE = 0,
PARSE_ATOM,
PARSE_MARK_FOR_SUBMATCH,
PARSE_BRANCH,
PARSE_PIECE,
PARSE_CATENATION,
PARSE_POST_CATENATION,
PARSE_UNION,
PARSE_POST_UNION,
PARSE_POSTFIX,
PARSE_RESTORE_CFLAGS
} tre_parse_re_stack_symbol_t;
static reg_errcode_t
tre_parse(tre_parse_ctx_t *ctx)
{ {
tre_ast_node_t *result = NULL; if (c-'0'<10) return c-'0';
tre_parse_re_stack_symbol_t symbol; c |= 32;
reg_errcode_t status = REG_OK; if (c-'a'<6) return c-'a'+10;
tre_stack_t *stack = ctx->stack; return -1;
int bottom = tre_stack_num_objects(stack); }
int depth = 0;
wchar_t wc;
int clen;
if (!ctx->nofirstsub)
{
STACK_PUSH(stack, int, ctx->submatch_id);
STACK_PUSH(stack, int, PARSE_MARK_FOR_SUBMATCH);
ctx->submatch_id++;
}
STACK_PUSH(stack, int, PARSE_RE);
ctx->re_start = ctx->re;
/* The following is basically just a recursive descent parser. I use
an explicit stack instead of recursive functions mostly because of
two reasons: compatibility with systems which have an overflowable
call stack, and efficiency (both in lines of code and speed). */
while (tre_stack_num_objects(stack) > bottom && status == REG_OK)
{
if (status != REG_OK)
break;
symbol = tre_stack_pop_int(stack);
switch (symbol)
{
case PARSE_RE:
/* Parse a full regexp. A regexp is one or more branches,
separated by the union operator `|'. */
if (ctx->cflags & REG_EXTENDED)
STACK_PUSHX(stack, int, PARSE_UNION);
STACK_PUSHX(stack, int, PARSE_BRANCH);
break;
case PARSE_BRANCH:
/* Parse a branch. A branch is one or more pieces, concatenated.
A piece is an atom possibly followed by a postfix operator. */
STACK_PUSHX(stack, int, PARSE_CATENATION);
STACK_PUSHX(stack, int, PARSE_PIECE);
break;
case PARSE_PIECE:
/* Parse a piece. A piece is an atom possibly followed by one
or more postfix operators. */
STACK_PUSHX(stack, int, PARSE_POSTFIX);
STACK_PUSHX(stack, int, PARSE_ATOM);
break;
case PARSE_CATENATION:
/* If the expression has not ended, parse another piece. */
{
tre_char_t c;
if (!*ctx->re)
break;
c = *ctx->re;
if (ctx->cflags & REG_EXTENDED && c == CHAR_PIPE)
break;
if ((ctx->cflags & REG_EXTENDED
&& c == CHAR_RPAREN && depth > 0)
|| (!(ctx->cflags & REG_EXTENDED)
&& (c == CHAR_BACKSLASH
&& *(ctx->re + 1) == CHAR_RPAREN)))
{
if (!(ctx->cflags & REG_EXTENDED) && depth == 0)
status = REG_EPAREN;
depth--;
if (!(ctx->cflags & REG_EXTENDED))
ctx->re += 2;
break;
}
{
/* Default case, left associative concatenation. */
STACK_PUSHX(stack, int, PARSE_CATENATION);
STACK_PUSHX(stack, voidptr, result);
STACK_PUSHX(stack, int, PARSE_POST_CATENATION);
STACK_PUSHX(stack, int, PARSE_PIECE);
}
break;
}
case PARSE_POST_CATENATION:
{
tre_ast_node_t *tree = tre_stack_pop_voidptr(stack);
tre_ast_node_t *tmp_node;
tmp_node = tre_ast_new_catenation(ctx->mem, tree, result);
if (!tmp_node)
return REG_ESPACE;
result = tmp_node;
break;
}
case PARSE_UNION:
switch (*ctx->re)
{
case CHAR_PIPE:
STACK_PUSHX(stack, int, PARSE_UNION);
STACK_PUSHX(stack, voidptr, result);
STACK_PUSHX(stack, int, PARSE_POST_UNION);
STACK_PUSHX(stack, int, PARSE_BRANCH);
ctx->re++;
break;
case CHAR_RPAREN:
ctx->re++;
break;
default:
break;
}
break;
case PARSE_POST_UNION: static reg_errcode_t marksub(tre_parse_ctx_t *ctx, tre_ast_node_t *node, int subid)
{ {
tre_ast_node_t *tmp_node; if (node->submatch_id >= 0) {
tre_ast_node_t *tree = tre_stack_pop_voidptr(stack); tre_ast_node_t *n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
tmp_node = tre_ast_new_union(ctx->mem, tree, result); if (!n)
if (!tmp_node)
return REG_ESPACE; return REG_ESPACE;
result = tmp_node; n = tre_ast_new_catenation(ctx->mem, n, node);
break; if (!n)
}
case PARSE_POSTFIX:
/* Parse postfix operators. */
switch (*ctx->re)
{
case CHAR_PLUS:
case CHAR_QUESTIONMARK:
if (!(ctx->cflags & REG_EXTENDED))
break;
/*FALLTHROUGH*/
case CHAR_STAR:
{
tre_ast_node_t *tmp_node;
int minimal = 0;
int rep_min = 0;
int rep_max = -1;
if (*ctx->re == CHAR_PLUS)
rep_min = 1;
if (*ctx->re == CHAR_QUESTIONMARK)
rep_max = 1;
ctx->re++;
tmp_node = tre_ast_new_iter(ctx->mem, result, rep_min, rep_max,
minimal);
if (tmp_node == NULL)
return REG_ESPACE; return REG_ESPACE;
result = tmp_node; n->num_submatches = node->num_submatches;
STACK_PUSHX(stack, int, PARSE_POSTFIX); node = n;
}
break;
case CHAR_BACKSLASH:
/* "\{" is special without REG_EXTENDED */
if (!(ctx->cflags & REG_EXTENDED)
&& *(ctx->re + 1) == CHAR_LBRACE)
{
ctx->re++;
goto parse_brace;
}
else
break;
case CHAR_LBRACE:
/* "{" is literal without REG_EXTENDED */
if (!(ctx->cflags & REG_EXTENDED))
break;
parse_brace:
ctx->re++;
status = tre_parse_bound(ctx, &result);
if (status != REG_OK)
return status;
STACK_PUSHX(stack, int, PARSE_POSTFIX);
break;
} }
break; node->submatch_id = subid;
node->num_submatches++;
ctx->n = node;
return REG_OK;
}
case PARSE_ATOM: /*
/* Parse an atom. An atom is a regular expression enclosed in `()', BRE grammar:
an empty set of `()', a bracket expression, `.', `^', `$', Regex = Branch | '^' | '$' | '^$' | '^' Branch | Branch '$' | '^' Branch '$'
a `\' followed by a character, or a single character. */ Branch = Atom | Branch Atom
Atom = char | quoted_char | '.' | Bracket | Atom Dup | '\(' Branch '\)' | back_ref
Dup = '*' | '\{' Count '\}' | '\{' Count ',\}' | '\{' Count ',' Count '\}'
switch (*ctx->re) (leading ^ and trailing $ in a sub expr may be an anchor or literal as well)
{
case CHAR_LPAREN: /* parenthesized subexpression */
if (ctx->cflags & REG_EXTENDED) ERE grammar:
{ Regex = Branch | Regex '|' Branch
lparen: Branch = Atom | Branch Atom
depth++; Atom = char | quoted_char | '.' | Bracket | Atom Dup | '(' Regex ')' | '^' | '$'
{ Dup = '*' | '+' | '?' | '{' Count '}' | '{' Count ',}' | '{' Count ',' Count '}'
ctx->re++;
/* First parse a whole RE, then mark the resulting tree
for submatching. */
STACK_PUSHX(stack, int, ctx->submatch_id);
STACK_PUSHX(stack, int, PARSE_MARK_FOR_SUBMATCH);
STACK_PUSHX(stack, int, PARSE_RE);
ctx->submatch_id++;
}
}
else
goto parse_literal;
break;
case CHAR_LBRACKET: /* bracket expression */ (a*+?, ^*, $+, \X, {, (|a) are unspecified)
ctx->re++; */
status = tre_parse_bracket(ctx, &result);
if (status != REG_OK)
return status;
break;
case CHAR_BACKSLASH:
/* If this is "\(" or "\)" chew off the backslash and
try again. */
if (!(ctx->cflags & REG_EXTENDED) && *(ctx->re + 1) == CHAR_LPAREN)
{
ctx->re++;
goto lparen;
}
if (!(ctx->cflags & REG_EXTENDED) && *(ctx->re + 1) == CHAR_RPAREN)
{
goto empty_atom;
}
/* If a macro is used, parse the expanded macro recursively. */
{
const char *buf = tre_expand_macro(ctx->re + 1);
if (buf)
{
tre_parse_ctx_t subctx;
memcpy(&subctx, ctx, sizeof(subctx));
subctx.re = buf;
subctx.nofirstsub = 1;
status = tre_parse(&subctx);
if (status != REG_OK)
return status;
ctx->re += 2;
ctx->position = subctx.position;
result = subctx.result;
break;
}
}
if (!ctx->re[1]) static reg_errcode_t parse_atom(tre_parse_ctx_t *ctx, const char *s)
/* Trailing backslash. */ {
int len, ere = ctx->cflags & REG_EXTENDED;
const char *p;
tre_ast_node_t *node;
wchar_t wc;
switch (*s) {
case '[':
return parse_bracket(ctx, s+1);
case '\\':
p = tre_expand_macro(s+1);
if (p) {
/* assume \X expansion is a single atom */
reg_errcode_t err = parse_atom(ctx, p);
ctx->s = s+2;
return err;
}
/* extensions: \b, \B, \<, \>, \xHH \x{HHHH} */
switch (*++s) {
case 0:
return REG_EESCAPE; return REG_EESCAPE;
ctx->re++;
switch (*ctx->re)
{
case 'b': case 'b':
result = tre_ast_new_literal(ctx->mem, ASSERTION, node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_WB, -1);
ASSERT_AT_WB, -1);
ctx->re++;
break; break;
case 'B': case 'B':
result = tre_ast_new_literal(ctx->mem, ASSERTION, node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_WB_NEG, -1);
ASSERT_AT_WB_NEG, -1);
ctx->re++;
break; break;
case '<': case '<':
result = tre_ast_new_literal(ctx->mem, ASSERTION, node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_BOW, -1);
ASSERT_AT_BOW, -1);
ctx->re++;
break; break;
case '>': case '>':
result = tre_ast_new_literal(ctx->mem, ASSERTION, node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_EOW, -1);
ASSERT_AT_EOW, -1);
ctx->re++;
break; break;
case 'x': case 'x':
ctx->re++; s++;
if (ctx->re[0] != CHAR_LBRACE) int i, v = 0, c;
{ len = 2;
/* 8 bit hex char. */ if (*s == '{') {
char tmp[3] = {0, 0, 0}; len = 8;
long val; s++;
}
if (tre_isxdigit(ctx->re[0])) for (i=0; i<len && v<0x110000; i++) {
{ c = hexval(s[i]);
tmp[0] = (char)ctx->re[0]; if (c < 0) break;
ctx->re++; v = 16*v + c;
} }
if (tre_isxdigit(ctx->re[0])) s += i;
{ if (len == 8) {
tmp[1] = (char)ctx->re[0]; if (*s != '}')
ctx->re++;
}
val = strtol(tmp, NULL, 16);
result = tre_ast_new_literal(ctx->mem, (int)val,
(int)val, ctx->position);
ctx->position++;
break;
}
else if (*ctx->re)
{
/* Wide char. */
char tmp[32];
long val;
int i = 0;
ctx->re++;
while (*ctx->re && i < sizeof tmp)
{
if (ctx->re[0] == CHAR_RBRACE)
break;
if (tre_isxdigit(ctx->re[0]))
{
tmp[i] = (char)ctx->re[0];
i++;
ctx->re++;
continue;
}
return REG_EBRACE; return REG_EBRACE;
s++;
} }
ctx->re++; node = tre_ast_new_literal(ctx->mem, v, v, ctx->position);
tmp[i] = 0;
val = strtol(tmp, NULL, 16);
result = tre_ast_new_literal(ctx->mem, (int)val, (int)val,
ctx->position);
ctx->position++; ctx->position++;
s--;
break; break;
}
/*FALLTHROUGH*/
default: default:
if (tre_isdigit(*ctx->re)) if (isdigit(*s)) {
{ /* back reference */
/* Back reference. */ int val = *s - '0';
int val = *ctx->re - '0'; node = tre_ast_new_literal(ctx->mem, BACKREF, val, ctx->position);
result = tre_ast_new_literal(ctx->mem, BACKREF, val,
ctx->position);
if (result == NULL)
return REG_ESPACE;
ctx->position++;
ctx->max_backref = MAX(val, ctx->max_backref); ctx->max_backref = MAX(val, ctx->max_backref);
ctx->re++; } else {
/* extension: accept unknown escaped char
as a literal */
node = tre_ast_new_literal(ctx->mem, *s, *s, ctx->position);
} }
else
{
/* Escaped character. */
result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re,
ctx->position);
ctx->position++; ctx->position++;
ctx->re++;
} }
s++;
break; break;
} case '.':
if (result == NULL) if (ctx->cflags & REG_NEWLINE) {
return REG_ESPACE; tre_ast_node_t *tmp1, *tmp2;
break; tmp1 = tre_ast_new_literal(ctx->mem, 0, '\n'-1, ctx->position++);
tmp2 = tre_ast_new_literal(ctx->mem, '\n'+1, TRE_CHAR_MAX, ctx->position++);
case CHAR_PERIOD: /* the any-symbol */ if (tmp1 && tmp2)
if (ctx->cflags & REG_NEWLINE) node = tre_ast_new_union(ctx->mem, tmp1, tmp2);
{
tre_ast_node_t *tmp1;
tre_ast_node_t *tmp2;
tmp1 = tre_ast_new_literal(ctx->mem, 0, '\n' - 1,
ctx->position);
if (!tmp1)
return REG_ESPACE;
tmp2 = tre_ast_new_literal(ctx->mem, '\n' + 1, TRE_CHAR_MAX,
ctx->position + 1);
if (!tmp2)
return REG_ESPACE;
result = tre_ast_new_union(ctx->mem, tmp1, tmp2);
if (!result)
return REG_ESPACE;
ctx->position += 2;
}
else else
{ node = 0;
result = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX, } else {
ctx->position); node = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX, ctx->position++);
if (!result)
return REG_ESPACE;
ctx->position++;
} }
ctx->re++; s++;
break; break;
case '^':
case CHAR_CARET: /* beginning of line assertion */ /* '^' has a special meaning everywhere in EREs, and at beginning of BRE. */
/* '^' has a special meaning everywhere in EREs, and at if (!ere && s != ctx->re)
beginning of BRE. */
if (ctx->cflags & REG_EXTENDED
|| ctx->re == ctx->re_start)
{
if (!(ctx->cflags & REG_EXTENDED))
STACK_PUSHX(stack, int, PARSE_CATENATION);
result = tre_ast_new_literal(ctx->mem, ASSERTION,
ASSERT_AT_BOL, -1);
if (result == NULL)
return REG_ESPACE;
ctx->re++;
}
else
goto parse_literal;
break;
case CHAR_DOLLAR: /* end of line assertion. */
/* '$' is special everywhere in EREs, and at the end of the
string in BREs. */
if (ctx->cflags & REG_EXTENDED
|| !*(ctx->re + 1))
{
result = tre_ast_new_literal(ctx->mem, ASSERTION,
ASSERT_AT_EOL, -1);
if (result == NULL)
return REG_ESPACE;
ctx->re++;
}
else
goto parse_literal; goto parse_literal;
node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_BOL, -1);
s++;
break; break;
case '$':
case CHAR_RPAREN: /* '$' is special everywhere in EREs, and in the end of the string in BREs. */
if (!depth) if (!ere && s[1])
goto parse_literal; goto parse_literal;
case CHAR_STAR: node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_EOL, -1);
case CHAR_PIPE: s++;
case CHAR_LBRACE: break;
case CHAR_PLUS: case '*':
case CHAR_QUESTIONMARK: case '|':
if (!(ctx->cflags & REG_EXTENDED)) case '{':
case '+':
case '?':
if (!ere)
goto parse_literal; goto parse_literal;
case 0: case 0:
empty_atom: node = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
if (!result)
return REG_ESPACE;
break; break;
default: default:
parse_literal: parse_literal:
len = mbtowc(&wc, s, -1);
clen = mbtowc(&wc, ctx->re, -1); if (len < 0)
if (clen<0) clen=1, wc=WEOF; return REG_BADPAT;
if (ctx->cflags & REG_ICASE && (tre_isupper(wc) || tre_islower(wc))) {
/* Note that we can't use an tre_isalpha() test here, since there tre_ast_node_t *tmp1, *tmp2;
may be characters which are alphabetic but neither upper or /* multiple opposite case characters are not supported */
lower case. */ tmp1 = tre_ast_new_literal(ctx->mem, tre_toupper(wc), tre_toupper(wc), ctx->position);
if (ctx->cflags & REG_ICASE tmp2 = tre_ast_new_literal(ctx->mem, tre_tolower(wc), tre_tolower(wc), ctx->position);
&& (tre_isupper(wc) || tre_islower(wc))) if (tmp1 && tmp2)
{ node = tre_ast_new_union(ctx->mem, tmp1, tmp2);
tre_ast_node_t *tmp1;
tre_ast_node_t *tmp2;
/* XXX - Can there be more than one opposite-case
counterpart for some character in some locale? Or
more than two characters which should all be regarded as
the same character if case is ignored? If yes, there
does not seem to be a portable way to detect it. I guess
that at least for multi-character collating elements there
could be several opposite-case counterparts, but they
cannot be supported portably anyway. */
tmp1 = tre_ast_new_literal(ctx->mem, tre_toupper(wc),
tre_toupper(wc),
ctx->position);
if (!tmp1)
return REG_ESPACE;
tmp2 = tre_ast_new_literal(ctx->mem, tre_tolower(wc),
tre_tolower(wc),
ctx->position);
if (!tmp2)
return REG_ESPACE;
result = tre_ast_new_union(ctx->mem, tmp1, tmp2);
if (!result)
return REG_ESPACE;
}
else else
{ node = 0;
result = tre_ast_new_literal(ctx->mem, wc, wc, } else {
ctx->position); node = tre_ast_new_literal(ctx->mem, wc, wc, ctx->position);
if (!result)
return REG_ESPACE;
} }
ctx->position++; ctx->position++;
ctx->re += clen; s += len;
break; break;
} }
break; if (!node)
return REG_ESPACE;
ctx->n = node;
ctx->s = s;
return REG_OK;
}
case PARSE_MARK_FOR_SUBMATCH: #define PUSHPTR(err, s, v) do { \
{ if ((err = tre_stack_push_voidptr(s, v)) != REG_OK) \
int submatch_id = tre_stack_pop_int(stack); return err; \
} while(0)
if (result->submatch_id >= 0) #define PUSHINT(err, s, v) do { \
{ if ((err = tre_stack_push_int(s, v)) != REG_OK) \
tre_ast_node_t *n, *tmp_node; return err; \
n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); } while(0)
if (n == NULL)
static reg_errcode_t tre_parse(tre_parse_ctx_t *ctx)
{
tre_ast_node_t *nbranch=0, *nunion=0;
int ere = ctx->cflags & REG_EXTENDED;
const char *s = ctx->re;
int subid = 0;
int depth = 0;
reg_errcode_t err;
tre_stack_t *stack = ctx->stack;
PUSHINT(err, stack, subid++);
for (;;) {
if ((!ere && *s == '\\' && s[1] == '(') ||
(ere && *s == '(')) {
PUSHPTR(err, stack, nunion);
PUSHPTR(err, stack, nbranch);
PUSHINT(err, stack, subid++);
s++;
if (!ere)
s++;
depth++;
nbranch = nunion = 0;
continue;
}
if ((!ere && *s == '\\' && s[1] == ')') ||
(ere && *s == ')' && depth)) {
ctx->n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
if (!ctx->n)
return REG_ESPACE; return REG_ESPACE;
tmp_node = tre_ast_new_catenation(ctx->mem, n, result); } else {
if (tmp_node == NULL) err = parse_atom(ctx, s);
if (err != REG_OK)
return err;
s = ctx->s;
}
parse_iter:
/* extension: repetitions are accepted after an empty node;
e.g., (+), ^*, a$?, a|{2} */
switch (*s) {
case '+':
case '?':
if (!ere)
break;
/* fallthrough */
case '*':;
int min=0, max=-1;
if (*s == '+')
min = 1;
if (*s == '?')
max = 1;
s++;
ctx->n = tre_ast_new_iter(ctx->mem, ctx->n, min, max, 0);
if (!ctx->n)
return REG_ESPACE; return REG_ESPACE;
tmp_node->num_submatches = result->num_submatches; /* extension: multiple consecutive *+?{,} is unspecified,
result = tmp_node; but (a+)+ has to be supported so accepting a++ makes
} sense, note however that the RE_DUP_MAX limit can be
result->submatch_id = submatch_id; circumvented: (a{255}){255} uses a lot of memory.. */
result->num_submatches++; goto parse_iter;
case '\\':
if (ere || s[1] != '{')
break;
s++;
goto parse_brace;
case '{':
if (!ere)
break; break;
parse_brace:
err = parse_dup(ctx, s+1);
if (err != REG_OK)
return err;
s = ctx->s;
goto parse_iter;
}
nbranch = tre_ast_new_catenation(ctx->mem, nbranch, ctx->n);
if ((ere && *s == '|') ||
(ere && *s == ')' && depth) ||
(!ere && *s == '\\' && s[1] == ')') ||
!*s) {
/* extension: an empty branch, as in (), (|a), (a|), is unspecified;
here such branches are not rejected but match the empty string */
int c = *s;
nunion = tre_ast_new_union(ctx->mem, nunion, nbranch);
nbranch = 0;
if (c != '|') {
if (c == '\\') {
if (!depth) return REG_EPAREN;
s+=2;
} else if (c == ')')
s++;
depth--;
err = marksub(ctx, nunion, tre_stack_pop_int(stack));
if (err != REG_OK)
return err;
if (!c && depth<0) {
ctx->submatch_id = subid;
return REG_OK;
} }
if (!c || depth<0)
case PARSE_RESTORE_CFLAGS: return REG_EPAREN;
ctx->cflags = tre_stack_pop_int(stack); nbranch = tre_stack_pop_voidptr(stack);
break; nunion = tre_stack_pop_voidptr(stack);
goto parse_iter;
default: }
assert(0); s++;
break;
} }
} }
/* Check for missing closing parentheses. */
if (depth > 0)
return REG_EPAREN;
if (status == REG_OK)
ctx->result = result;
return status;
} }
/*********************************************************************** /***********************************************************************
from tre-compile.c from tre-compile.c
***********************************************************************/ ***********************************************************************/
...@@ -3122,12 +2680,7 @@ regcomp(regex_t *restrict preg, const char *restrict regex, int cflags) ...@@ -3122,12 +2680,7 @@ regcomp(regex_t *restrict preg, const char *restrict regex, int cflags)
if (errcode != REG_OK) if (errcode != REG_OK)
ERROR_EXIT(errcode); ERROR_EXIT(errcode);
preg->re_nsub = parse_ctx.submatch_id - 1; preg->re_nsub = parse_ctx.submatch_id - 1;
tree = parse_ctx.result; tree = parse_ctx.n;
/* Back references and approximate matching cannot currently be used
in the same regexp. */
if (parse_ctx.max_backref >= 0 && parse_ctx.have_approx)
ERROR_EXIT(REG_BADPAT);
#ifdef TRE_DEBUG #ifdef TRE_DEBUG
tre_ast_print(tree); tre_ast_print(tree);
...@@ -3142,7 +2695,7 @@ regcomp(regex_t *restrict preg, const char *restrict regex, int cflags) ...@@ -3142,7 +2695,7 @@ regcomp(regex_t *restrict preg, const char *restrict regex, int cflags)
if (tnfa == NULL) if (tnfa == NULL)
ERROR_EXIT(REG_ESPACE); ERROR_EXIT(REG_ESPACE);
tnfa->have_backrefs = parse_ctx.max_backref >= 0; tnfa->have_backrefs = parse_ctx.max_backref >= 0;
tnfa->have_approx = parse_ctx.have_approx; tnfa->have_approx = 0;
tnfa->num_submatches = parse_ctx.submatch_id; tnfa->num_submatches = parse_ctx.submatch_id;
/* Set up tags for submatch addressing. If REG_NOSUB is set and the /* Set up tags for submatch addressing. If REG_NOSUB is set and the
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册