提交 173e29aa 编写于 作者: T Tom Lane

Fix the general case of quantified regex back-references.

Cases where a back-reference is part of a larger subexpression that
is quantified have never worked in Spencer's regex engine, because
he used a compile-time transformation that neglected the need to
check the back-reference match in iterations before the last one.
(That was okay for capturing parens, and we still do it if the
regex has *only* capturing parens ... but it's not okay for backrefs.)

To make this work properly, we have to add an "iteration" node type
to the regex engine's vocabulary of sub-regex nodes.  Since this is a
moderately large change with a fair risk of introducing new bugs of its
own, apply to HEAD only, even though it's a fix for a longstanding bug.
上级 0c9e5d5e
......@@ -102,15 +102,15 @@ consists of a tree of sub-expressions ("subre"s). Leaf tree nodes are
either plain regular expressions (which are executed as DFAs in the manner
described above) or back-references (which try to match the input to some
previous substring). Non-leaf nodes are capture nodes (which save the
location of the substring currently matching their child node) or
concatenation or alternation nodes. At execution time, the executor
recursively scans the tree. At concatenation or alternation nodes,
it considers each possible alternative way of matching the input string,
ie each place where the string could be split for a concatenation, or each
child node for an alternation. It tries the next alternative if the match
fails according to the child nodes. This is exactly the sort of
backtracking search done by a traditional NFA regex engine. If there are
many tree levels it can get very slow.
location of the substring currently matching their child node),
concatenation, alternation, or iteration nodes. At execution time, the
executor recursively scans the tree. At concatenation, alternation, or
iteration nodes, it considers each possible alternative way of matching the
input string, that is, each place where the string could be split for a
concatenation or iteration, or each child node for an alternation. It
tries the next alternative if the match fails according to the child nodes.
This is exactly the sort of backtracking search done by a traditional NFA
regex engine. If there are many tree levels it can get very slow.
But all is not lost: we can still be smarter than the average pure NFA
engine. To do this, each subre node has an associated DFA, which
......
......@@ -1036,11 +1036,17 @@ parseqatom(struct vars * v,
/*----------
* Prepare a general-purpose state skeleton.
*
* ---> [s] ---prefix---> [begin] ---atom---> [end] ----rest---> [rp]
* / /
* [lp] ----> [s2] ----bypass---------------------
* In the no-backrefs case, we want this:
*
* where bypass is an empty, and prefix is some repetitions of atom
* [lp] ---> [s] ---prefix---> [begin] ---atom---> [end] ---rest---> [rp]
*
* where prefix is some repetitions of atom. In the general case we need
*
* [lp] ---> [s] ---iterator---> [s2] ---rest---> [rp]
*
* where the iterator wraps around [begin] ---atom---> [end]
*
* We make the s state here for both cases; s2 is made below if needed
*----------
*/
s = newstate(v->nfa); /* first, new endpoints for the atom */
......@@ -1051,11 +1057,9 @@ parseqatom(struct vars * v,
NOERR();
atom->begin = s;
atom->end = s2;
s = newstate(v->nfa); /* and spots for prefix and bypass */
s2 = newstate(v->nfa);
s = newstate(v->nfa); /* set up starting state */
NOERR();
EMPTYARC(lp, s);
EMPTYARC(lp, s2);
NOERR();
/* break remaining subRE into x{...} and what follows */
......@@ -1089,28 +1093,9 @@ parseqatom(struct vars * v,
}
/*
* It's quantifier time. If the atom is just a BACKREF, we'll let it deal
* with quantifiers internally. Otherwise, the first step is to turn
* x{0,...} into x{1,...}|empty
* It's quantifier time. If the atom is just a backref, we'll let it deal
* with quantifiers internally.
*/
if (m == 0 && atomtype != BACKREF)
{
EMPTYARC(s2, atom->end); /* the bypass */
assert(PREF(qprefer) != 0);
f = COMBINE(qprefer, atom->flags);
t = subre(v, '|', f, lp, atom->end);
NOERR();
t->left = atom;
t->right = subre(v, '|', PREF(f), s2, atom->end);
NOERR();
t->right->left = subre(v, '=', 0, s2, atom->end);
NOERR();
*atomp = t;
atomp = &t->left;
m = 1;
}
/* deal with the rest of the quantifier */
if (atomtype == BACKREF)
{
/* special case: backrefs have internal quantifiers */
......@@ -1120,17 +1105,25 @@ parseqatom(struct vars * v,
atom->min = (short) m;
atom->max = (short) n;
atom->flags |= COMBINE(qprefer, atom->flags);
/* rest of branch can be strung starting from atom->end */
s2 = atom->end;
}
else if (m == 1 && n == 1)
{
/* no/vacuous quantifier: done */
EMPTYARC(s, atom->begin); /* empty prefix */
/* rest of branch can be strung starting from atom->end */
s2 = atom->end;
}
else
else if (m > 0 && !(atom->flags & BACKR))
{
/*
* Turn x{m,n} into x{m-1,n-1}x, with capturing parens in only the
* second x
* If there are no backrefs involved, we can turn x{m,n} into
* x{m-1,n-1}x, with capturing parens in only the second x. This
* is valid because we only care about capturing matches from the
* final iteration of the quantifier. It's a win because we can
* implement the backref-free left side as a plain DFA node, since
* we don't really care where its submatches are.
*/
dupnfa(v->nfa, atom->begin, atom->end, s, atom->begin);
assert(m >= 1 && m != INFINITY && n >= 1);
......@@ -1142,16 +1135,36 @@ parseqatom(struct vars * v,
NOERR();
t->right = atom;
*atomp = t;
/* rest of branch can be strung starting from atom->end */
s2 = atom->end;
}
else
{
/* general case: need an iteration node */
s2 = newstate(v->nfa);
NOERR();
moveouts(v->nfa, atom->end, s2);
NOERR();
dupnfa(v->nfa, atom->begin, atom->end, s, s2);
repeat(v, s, s2, m, n);
f = COMBINE(qprefer, atom->flags);
t = subre(v, '*', f, s, s2);
NOERR();
t->min = (short) m;
t->max = (short) n;
t->left = atom;
*atomp = t;
/* rest of branch is to be strung from iteration's end state */
}
/* and finally, look after that postponed recursion */
t = top->right;
if (!(SEE('|') || SEE(stopper) || SEE(EOS)))
t->right = parsebranch(v, stopper, type, atom->end, rp, 1);
t->right = parsebranch(v, stopper, type, s2, rp, 1);
else
{
EMPTYARC(atom->end, rp);
t->right = subre(v, '=', 0, atom->end, rp);
EMPTYARC(s2, rp);
t->right = subre(v, '=', 0, s2, rp);
}
assert(SEE('|') || SEE(stopper) || SEE(EOS));
t->flags |= COMBINE(t->flags, t->right->flags);
......@@ -1214,6 +1227,9 @@ scannum(struct vars * v)
/*
* repeat - replicate subNFA for quantifiers
*
* The sub-NFA strung from lp to rp is modified to represent m to n
* repetitions of its initial contents.
*
* The duplication sequences used here are chosen carefully so that any
* pointers starting out pointing into the subexpression end up pointing into
* the last occurrence. (Note that it may not be strung between the same
......@@ -1229,7 +1245,7 @@ repeat(struct vars * v,
int n)
{
#define SOME 2
#define INF 3
#define INF 3
#define PAIR(x, y) ((x)*4 + (y))
#define REDUCE(x) ( ((x) == INFINITY) ? INF : (((x) > 1) ? SOME : (x)) )
const int rm = REDUCE(m);
......@@ -1603,7 +1619,7 @@ subre(struct vars * v,
v->treechain = ret;
}
assert(strchr("|.b(=", op) != NULL);
assert(strchr("=b|.*(", op) != NULL);
ret->op = op;
ret->flags = flags;
......
此差异已折叠。
......@@ -372,10 +372,28 @@ struct cnfa
/*
* subexpression tree
*
* "op" is one of:
* '=' plain regex without interesting substructure (implemented as DFA)
* 'b' back-reference (has no substructure either)
* '(' capture node: captures the match of its single child
* '.' concatenation: matches a match for left, then a match for right
* '|' alternation: matches a match for left or a match for right
* '*' iteration: matches some number of matches of its single child
*
* Note: the right child of an alternation must be another alternation or
* NULL; hence, an N-way branch requires N alternation nodes, not N-1 as you
* might expect. This could stand to be changed. Actually I'd rather see
* a single alternation node with N children, but that will take revising
* the representation of struct subre.
*
* Note: when a backref is directly quantified, we stick the min/max counts
* into the backref rather than plastering an iteration node on top. This is
* for efficiency: there is no need to search for possible division points.
*/
struct subre
{
char op; /* '|', '.' (concat), 'b' (backref), '(', '=' */
char op; /* see type codes above */
char flags;
#define LONGER 01 /* prefers longer match */
#define SHORTER 02 /* prefers shorter match */
......@@ -393,8 +411,8 @@ struct subre
#define COMBINE(f1, f2) (UP((f1)|(f2)) | PREF2(f1, f2))
short retry; /* index into retry memory */
int subno; /* subexpression number (for 'b' and '(') */
short min; /* min repetitions, for backref only */
short max; /* max repetitions, for backref only */
short min; /* min repetitions for iteration or backref */
short max; /* max repetitions for iteration or backref */
struct subre *left; /* left child, if any (also freelist chain) */
struct subre *right; /* right child, if any */
struct state *begin; /* outarcs from here... */
......
......@@ -34,3 +34,40 @@ select 'b' ~ '^([bc])\1*$' as t;
t
(1 row)
-- Test quantified backref within a larger expression
select 'abc abc abc' ~ '^(\w+)( \1)+$' as t;
t
---
t
(1 row)
select 'abc abd abc' ~ '^(\w+)( \1)+$' as f;
f
---
f
(1 row)
select 'abc abc abd' ~ '^(\w+)( \1)+$' as f;
f
---
f
(1 row)
select 'abc abc abc' ~ '^(.+)( \1)+$' as t;
t
---
t
(1 row)
select 'abc abd abc' ~ '^(.+)( \1)+$' as f;
f
---
f
(1 row)
select 'abc abc abd' ~ '^(.+)( \1)+$' as f;
f
---
f
(1 row)
......@@ -11,3 +11,11 @@ select 'ccc' ~ '^([bc])\1*$' as t;
select 'xxx' ~ '^([bc])\1*$' as f;
select 'bbc' ~ '^([bc])\1*$' as f;
select 'b' ~ '^([bc])\1*$' as t;
-- Test quantified backref within a larger expression
select 'abc abc abc' ~ '^(\w+)( \1)+$' as t;
select 'abc abd abc' ~ '^(\w+)( \1)+$' as f;
select 'abc abc abd' ~ '^(\w+)( \1)+$' as f;
select 'abc abc abc' ~ '^(.+)( \1)+$' as t;
select 'abc abd abc' ~ '^(.+)( \1)+$' as f;
select 'abc abc abd' ~ '^(.+)( \1)+$' as f;
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册