未验证 提交 11f02ad3 编写于 作者: O Olli Saarikivi 提交者: GitHub

Two-phase matching algorithm for NonBacktracking (#68199)

* Switch to 2-phase matching in NonBacktracking

First phase now finds the true match end position.
The implicit .* is now a lazy .*? to prioritize the earliest match.
The third phase is now run only for subcaptures and no longer needs to
find the match end position.
Remove counter optimization that no longer applies with OrderedOr.
Fix a problem in SymbolicRegexInfo where begin/end anchors were
marked as line anchors.
Also remove dead fields from SymbolicRegexInfo.
Fix captures not being handled for empty matches at start of input.

* Improve comments for NonBacktracking

Especially fix comments for the new 2-phase match generation algorithm.

* Add a test that fails on the earlier NonBacktracking implementation

* Avoid transitions to dead ends for capturing NFA
上级 c8be3f3d
......@@ -51,7 +51,7 @@ internal int FixedLength
/// <summary>If true then the state is a dead-end, rejects all inputs.</summary>
internal bool IsNothing => Node.IsNothing;
/// <summary>If true then state starts with a ^ or $ or \A or \z or \Z</summary>
/// <summary>If true then state starts with a ^ or $ or \Z</summary>
internal bool StartsWithLineAnchor => Node._info.StartsWithLineAnchor;
/// <summary>
......@@ -134,7 +134,9 @@ internal List<(DfaMatchingState<TSet> State, DerivativeEffect[] Effects)> NfaNex
// nextCharKind will be the PrevCharKind of the target state
// use an existing state instead if one exists already
// otherwise create a new id for it
list.Add((Node._builder.CreateState(node, nextCharKind, capturing: true), effects));
DfaMatchingState<TSet> state = Node._builder.CreateState(node, nextCharKind, capturing: true);
if (!state.IsDeadend)
list.Add((state, effects));
}
return list;
}
......
......@@ -191,8 +191,8 @@ internal DfaExplorer(SymbolicRegexMatcher<TSet> srm, bool nfa, bool addDotStar,
{
_builder = srm._builder;
uint startId = reverse ?
(srm._reversePattern._info.StartsWithLineAnchor ? CharKind.BeginningEnd : 0) :
(srm._pattern._info.StartsWithLineAnchor ? CharKind.BeginningEnd : 0);
(srm._reversePattern._info.StartsWithSomeAnchor ? CharKind.BeginningEnd : 0) :
(srm._pattern._info.StartsWithSomeAnchor ? CharKind.BeginningEnd : 0);
// Create the initial state
_initialState = _builder.CreateState(
......
......@@ -24,6 +24,7 @@ internal sealed class SymbolicRegexBuilder<TSet> where TSet : IComparable<TSet>
internal readonly SymbolicRegexNode<TSet> _nothing;
internal readonly SymbolicRegexNode<TSet> _anyChar;
internal readonly SymbolicRegexNode<TSet> _anyStar;
internal readonly SymbolicRegexNode<TSet> _anyStarLazy;
private SymbolicRegexNode<TSet>? _epsilon;
internal SymbolicRegexNode<TSet> Epsilon => _epsilon ??= SymbolicRegexNode<TSet>.CreateEpsilon(this);
......@@ -173,6 +174,7 @@ internal SymbolicRegexBuilder(ISolver<TSet> solver, CharSetSolver charSetSolver)
_nothing = SymbolicRegexNode<TSet>.CreateFalse(this);
_anyChar = SymbolicRegexNode<TSet>.CreateTrue(this);
_anyStar = SymbolicRegexNode<TSet>.CreateLoop(this, _anyChar, 0, int.MaxValue, isLazy: false);
_anyStarLazy = SymbolicRegexNode<TSet>.CreateLoop(this, _anyChar, 0, int.MaxValue, isLazy: true);
// --- initialize singletonCache ---
_singletonCache[_solver.Empty] = _nothing;
......
......@@ -11,17 +11,14 @@ namespace System.Text.RegularExpressions.Symbolic
private const uint IsLazyMask = 4;
private const uint CanBeNullableMask = 8;
private const uint ContainsSomeAnchorMask = 16;
private const uint ContainsLineAnchorMask = 32;
private const uint ContainsSomeCharacterMask = 64;
private const uint StartsWithBoundaryAnchorMask = 128;
private const uint StartsWithSomeAnchorMask = 32;
private readonly uint _info;
private SymbolicRegexInfo(uint i) => _info = i;
internal static SymbolicRegexInfo Create(bool isAlwaysNullable = false, bool canBeNullable = false, bool startsWithLineAnchor = false,
bool startsWithBoundaryAnchor = false, bool containsSomeAnchor = false,
bool containsLineAnchor = false, bool containsSomeCharacter = false, bool isLazy = true)
internal static SymbolicRegexInfo Create(bool isAlwaysNullable = false, bool canBeNullable = false,
bool startsWithLineAnchor = false, bool startsWithSomeAnchor = false, bool containsSomeAnchor = false, bool isLazy = true)
{
uint i = 0;
......@@ -35,31 +32,21 @@ namespace System.Text.RegularExpressions.Symbolic
}
}
if (startsWithLineAnchor || containsLineAnchor || startsWithBoundaryAnchor || containsSomeAnchor)
if (containsSomeAnchor || startsWithLineAnchor || startsWithSomeAnchor)
{
i |= ContainsSomeAnchorMask;
if (startsWithLineAnchor || containsLineAnchor)
{
i |= ContainsLineAnchorMask;
if (startsWithLineAnchor)
{
i |= StartsWithLineAnchorMask;
}
}
if (startsWithBoundaryAnchor)
if (startsWithLineAnchor || startsWithSomeAnchor)
{
i |= StartsWithBoundaryAnchorMask;
i |= StartsWithSomeAnchorMask;
}
}
if (containsSomeCharacter)
{
i |= ContainsSomeCharacterMask;
}
if (isLazy)
{
i |= IsLazyMask;
......@@ -72,18 +59,12 @@ namespace System.Text.RegularExpressions.Symbolic
public bool CanBeNullable => (_info & CanBeNullableMask) != 0;
public bool StartsWithSomeAnchor => (_info & (StartsWithLineAnchorMask | StartsWithBoundaryAnchorMask)) != 0;
public bool StartsWithLineAnchor => (_info & StartsWithLineAnchorMask) != 0;
public bool StartsWithBoundaryAnchor => (_info & StartsWithBoundaryAnchorMask) != 0;
public bool StartsWithSomeAnchor => (_info & StartsWithSomeAnchorMask) != 0;
public bool ContainsSomeAnchor => (_info & ContainsSomeAnchorMask) != 0;
public bool ContainsLineAnchor => (_info & ContainsLineAnchorMask) != 0;
public bool ContainsSomeCharacter => (_info & ContainsSomeCharacterMask) != 0;
public bool IsLazy => (_info & IsLazyMask) != 0;
public static SymbolicRegexInfo Or(params SymbolicRegexInfo[] infos)
......@@ -121,20 +102,14 @@ public static SymbolicRegexInfo And(params SymbolicRegexInfo[] infos)
return new SymbolicRegexInfo(i);
}
public static SymbolicRegexInfo Concat(SymbolicRegexInfo left_info, SymbolicRegexInfo right_info)
{
bool isNullable = left_info.IsNullable && right_info.IsNullable;
bool canBeNullable = left_info.CanBeNullable && right_info.CanBeNullable;
bool isLazy = left_info.IsLazy && right_info.IsLazy;
bool startsWithLineAnchor = left_info.StartsWithLineAnchor || (left_info.CanBeNullable && right_info.StartsWithLineAnchor);
bool startsWithBoundaryAnchor = left_info.StartsWithBoundaryAnchor || (left_info.CanBeNullable && right_info.StartsWithBoundaryAnchor);
bool containsSomeAnchor = left_info.ContainsSomeAnchor || right_info.ContainsSomeAnchor;
bool containsLineAnchor = left_info.ContainsLineAnchor || right_info.ContainsLineAnchor;
bool containsSomeCharacter = left_info.ContainsSomeCharacter || right_info.ContainsSomeCharacter;
return Create(isNullable, canBeNullable, startsWithLineAnchor, startsWithBoundaryAnchor, containsSomeAnchor, containsLineAnchor, containsSomeCharacter, isLazy);
}
public static SymbolicRegexInfo Concat(SymbolicRegexInfo left_info, SymbolicRegexInfo right_info) =>
Create(
isAlwaysNullable: left_info.IsNullable && right_info.IsNullable,
canBeNullable: left_info.CanBeNullable && right_info.CanBeNullable,
startsWithLineAnchor: left_info.StartsWithLineAnchor || (left_info.CanBeNullable && right_info.StartsWithLineAnchor),
startsWithSomeAnchor: left_info.StartsWithSomeAnchor || (left_info.CanBeNullable && right_info.StartsWithSomeAnchor),
containsSomeAnchor: left_info.ContainsSomeAnchor || right_info.ContainsSomeAnchor,
isLazy: left_info.IsLazy && right_info.IsLazy);
public static SymbolicRegexInfo Loop(SymbolicRegexInfo body_info, int lowerBound, bool isLazy)
{
......@@ -171,10 +146,7 @@ public static SymbolicRegexInfo Loop(SymbolicRegexInfo body_info, int lowerBound
Create(isAlwaysNullable: !info.CanBeNullable,
canBeNullable: !info.IsNullable,
startsWithLineAnchor: info.StartsWithLineAnchor,
startsWithBoundaryAnchor: info.StartsWithBoundaryAnchor,
containsSomeAnchor: info.ContainsSomeAnchor,
containsLineAnchor: info.ContainsLineAnchor,
containsSomeCharacter: info.ContainsSomeCharacter,
isLazy: info.IsLazy);
public override bool Equals(object? obj) => obj is SymbolicRegexInfo i && Equals(i);
......
......@@ -382,7 +382,7 @@ public bool IsNothing
Create(builder, SymbolicRegexNodeKind.Singleton, null, null, -1, -1, builder._solver.Empty, null, SymbolicRegexInfo.Create());
internal static SymbolicRegexNode<TSet> CreateTrue(SymbolicRegexBuilder<TSet> builder) =>
Create(builder, SymbolicRegexNodeKind.Singleton, null, null, -1, -1, builder._solver.Full, null, SymbolicRegexInfo.Create(containsSomeCharacter: true));
Create(builder, SymbolicRegexNodeKind.Singleton, null, null, -1, -1, builder._solver.Full, null, SymbolicRegexInfo.Create());
internal static SymbolicRegexNode<TSet> CreateFixedLengthMarker(SymbolicRegexBuilder<TSet> builder, int length) =>
Create(builder, SymbolicRegexNodeKind.FixedLengthMarker, null, null, length, -1, default, null, SymbolicRegexInfo.Create(isAlwaysNullable: true));
......@@ -399,19 +399,22 @@ internal static SymbolicRegexNode<TSet> CreateBeginEndAnchor(SymbolicRegexBuilde
SymbolicRegexNodeKind.BeginningAnchor or SymbolicRegexNodeKind.EndAnchor or
SymbolicRegexNodeKind.EndAnchorZ or SymbolicRegexNodeKind.EndAnchorZReverse or
SymbolicRegexNodeKind.EOLAnchor or SymbolicRegexNodeKind.BOLAnchor);
return Create(builder, kind, null, null, -1, -1, default, null, SymbolicRegexInfo.Create(startsWithLineAnchor: true, canBeNullable: true));
return Create(builder, kind, null, null, -1, -1, default, null, SymbolicRegexInfo.Create(startsWithSomeAnchor: true, canBeNullable: true,
startsWithLineAnchor: kind is
SymbolicRegexNodeKind.EndAnchorZ or SymbolicRegexNodeKind.EndAnchorZReverse or
SymbolicRegexNodeKind.EOLAnchor or SymbolicRegexNodeKind.BOLAnchor));
}
internal static SymbolicRegexNode<TSet> CreateBoundaryAnchor(SymbolicRegexBuilder<TSet> builder, SymbolicRegexNodeKind kind)
{
Debug.Assert(kind is SymbolicRegexNodeKind.BoundaryAnchor or SymbolicRegexNodeKind.NonBoundaryAnchor);
return Create(builder, kind, null, null, -1, -1, default, null, SymbolicRegexInfo.Create(startsWithBoundaryAnchor: true, canBeNullable: true));
return Create(builder, kind, null, null, -1, -1, default, null, SymbolicRegexInfo.Create(startsWithSomeAnchor: true, canBeNullable: true));
}
#endregion
internal static SymbolicRegexNode<TSet> CreateSingleton(SymbolicRegexBuilder<TSet> builder, TSet set) =>
Create(builder, SymbolicRegexNodeKind.Singleton, null, null, -1, -1, set, null, SymbolicRegexInfo.Create(containsSomeCharacter: !set.Equals(builder._solver.Empty)));
Create(builder, SymbolicRegexNodeKind.Singleton, null, null, -1, -1, set, null, SymbolicRegexInfo.Create());
internal static SymbolicRegexNode<TSet> CreateLoop(SymbolicRegexBuilder<TSet> builder, SymbolicRegexNode<TSet> body, int lower, int upper, bool isLazy)
{
......@@ -589,40 +592,6 @@ internal static SymbolicRegexNode<TSet> OrderedOr(SymbolicRegexBuilder<TSet> bui
Debug.Assert(left._kind != SymbolicRegexNodeKind.OrderedOr);
Debug.Assert(deduplicated);
// Apply the counter subsumption/combining optimization if possible
(SymbolicRegexNode<TSet> loop, SymbolicRegexNode<TSet> rest) = left.FirstCounterInfo();
if (loop != builder._nothing)
{
Debug.Assert(loop._kind == SymbolicRegexNodeKind.Loop && loop._left is not null);
(SymbolicRegexNode<TSet> otherLoop, SymbolicRegexNode<TSet> otherRest) = right.FirstCounterInfo();
if (otherLoop != builder._nothing && rest == otherRest)
{
// Found two adjacent counters with the same continuation, check that the loops are equivalent apart from bounds
// and that the bounds form a contiguous interval. Two integer intervals [x1,x2] and [y1,y2] overlap when
// x1 <= y2 and y1 <= x2. The union of intervals that just touch is still contiguous, e.g. [2,5] and [6,10] make
// [2,10], so the lower bounds are decremented by 1 in the check.
Debug.Assert(otherLoop._kind == SymbolicRegexNodeKind.Loop && otherLoop._left is not null);
if (loop._left == otherLoop._left && loop.IsLazy == otherLoop.IsLazy &&
loop._lower - 1 <= otherLoop._upper && otherLoop._lower - 1 <= loop._upper)
{
// Loops are equivalent apart from bounds, and the union of the bounds is a contiguous interval
// Build a new counter for the union of the ranges
SymbolicRegexNode<TSet> newCounter = CreateConcat(builder, CreateLoop(builder, loop._left,
Math.Min(loop._lower, otherLoop._lower), Math.Max(loop._upper, otherLoop._upper), loop.IsLazy), rest);
if (right._kind == SymbolicRegexNodeKind.OrderedOr)
{
// The right counter came from an or, so include the rest of that or
Debug.Assert(right._right is not null);
return OrderedOr(builder, newCounter, right._right, deduplicated: true);
}
else
{
return newCounter;
}
}
}
}
// Counter optimization did not apply, just build the or
return Create(builder, SymbolicRegexNodeKind.OrderedOr, left, right, -1, -1, default, null, SymbolicRegexInfo.Or(left._info, right._info));
}
......@@ -1052,6 +1021,8 @@ internal List<(SymbolicRegexNode<TSet>, DerivativeEffect[])> CreateNfaDerivative
private void AddTransitions(TSet elem, uint context, List<(SymbolicRegexNode<TSet>, DerivativeEffect[])> transitions,
List<SymbolicRegexNode<TSet>> continuation, Stack<DerivativeEffect>? effects, bool simulateBacktracking)
{
Debug.Assert(!_builder._solver.IsEmpty(elem), "False element or minterm should not make it into derivative construction.");
// Helper function for concatenating a head node and a list of continuation nodes. The continuation nodes
// are added in reverse order and the function below uses the list as a stack, so the nodes added to the
// stack first end up at the tail of the concatenation.
......
......@@ -754,6 +754,9 @@ static IEnumerable<(string Pattern, string Input, RegexOptions Options, int Begi
yield return (@".*?\dFo{2}", "This1Foo should 2fOo match", RegexOptions.IgnoreCase, 0, 26, true, "This1Foo");
yield return (@".*?\dfoo", "1fooThis1FOO should 1foo match", RegexOptions.IgnoreCase, 4, 9, true, "This1FOO");
// Earliest match, not match with earliest end
yield return (@".{5}Foo|Bar", "FooBarFoo", RegexOptions.None, 1, 8, true, "ooBarFoo");
if (!RegexHelpers.IsNonBacktracking(engine))
{
// RightToLeft
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册