未验证 提交 11f02ad3 编写于 作者: O Olli Saarikivi 提交者: GitHub

Two-phase matching algorithm for NonBacktracking (#68199)

* Switch to 2-phase matching in NonBacktracking

First phase now finds the true match end position.
The implicit .* is now a lazy .*? to prioritize the earliest match.
The third phase is now only run for subcaptures and no longer needs to
find the match end position.
Remove counter optimization that no longer applies with OrderedOr.
Fix a problem in SymbolicRegexInfo where begin/end anchors were
marked as line anchors.
Also remove dead fields from SymbolicRegexInfo.
Fix captures not being handled for empty matches at start of input.

* Improve comments for NonBacktracking

Especially fix comments for the new 2-phase match generation algorithm.

* Add a test that fails with the earlier NonBacktracking implementation

* Avoid transitions to deadends for capturing NFA
上级 c8be3f3d
...@@ -51,7 +51,7 @@ internal int FixedLength ...@@ -51,7 +51,7 @@ internal int FixedLength
/// <summary>If true then the state is a dead-end, rejects all inputs.</summary> /// <summary>If true then the state is a dead-end, rejects all inputs.</summary>
internal bool IsNothing => Node.IsNothing; internal bool IsNothing => Node.IsNothing;
/// <summary>If true then state starts with a ^ or $ or \A or \z or \Z</summary> /// <summary>If true then state starts with a ^ or $ or \Z</summary>
internal bool StartsWithLineAnchor => Node._info.StartsWithLineAnchor; internal bool StartsWithLineAnchor => Node._info.StartsWithLineAnchor;
/// <summary> /// <summary>
...@@ -134,7 +134,9 @@ internal List<(DfaMatchingState<TSet> State, DerivativeEffect[] Effects)> NfaNex ...@@ -134,7 +134,9 @@ internal List<(DfaMatchingState<TSet> State, DerivativeEffect[] Effects)> NfaNex
// nextCharKind will be the PrevCharKind of the target state // nextCharKind will be the PrevCharKind of the target state
// use an existing state instead if one exists already // use an existing state instead if one exists already
// otherwise create a new new id for it // otherwise create a new new id for it
list.Add((Node._builder.CreateState(node, nextCharKind, capturing: true), effects)); DfaMatchingState<TSet> state = Node._builder.CreateState(node, nextCharKind, capturing: true);
if (!state.IsDeadend)
list.Add((state, effects));
} }
return list; return list;
} }
......
...@@ -191,8 +191,8 @@ internal DfaExplorer(SymbolicRegexMatcher<TSet> srm, bool nfa, bool addDotStar, ...@@ -191,8 +191,8 @@ internal DfaExplorer(SymbolicRegexMatcher<TSet> srm, bool nfa, bool addDotStar,
{ {
_builder = srm._builder; _builder = srm._builder;
uint startId = reverse ? uint startId = reverse ?
(srm._reversePattern._info.StartsWithLineAnchor ? CharKind.BeginningEnd : 0) : (srm._reversePattern._info.StartsWithSomeAnchor ? CharKind.BeginningEnd : 0) :
(srm._pattern._info.StartsWithLineAnchor ? CharKind.BeginningEnd : 0); (srm._pattern._info.StartsWithSomeAnchor ? CharKind.BeginningEnd : 0);
// Create the initial state // Create the initial state
_initialState = _builder.CreateState( _initialState = _builder.CreateState(
......
...@@ -24,6 +24,7 @@ internal sealed class SymbolicRegexBuilder<TSet> where TSet : IComparable<TSet> ...@@ -24,6 +24,7 @@ internal sealed class SymbolicRegexBuilder<TSet> where TSet : IComparable<TSet>
internal readonly SymbolicRegexNode<TSet> _nothing; internal readonly SymbolicRegexNode<TSet> _nothing;
internal readonly SymbolicRegexNode<TSet> _anyChar; internal readonly SymbolicRegexNode<TSet> _anyChar;
internal readonly SymbolicRegexNode<TSet> _anyStar; internal readonly SymbolicRegexNode<TSet> _anyStar;
internal readonly SymbolicRegexNode<TSet> _anyStarLazy;
private SymbolicRegexNode<TSet>? _epsilon; private SymbolicRegexNode<TSet>? _epsilon;
internal SymbolicRegexNode<TSet> Epsilon => _epsilon ??= SymbolicRegexNode<TSet>.CreateEpsilon(this); internal SymbolicRegexNode<TSet> Epsilon => _epsilon ??= SymbolicRegexNode<TSet>.CreateEpsilon(this);
...@@ -173,6 +174,7 @@ internal SymbolicRegexBuilder(ISolver<TSet> solver, CharSetSolver charSetSolver) ...@@ -173,6 +174,7 @@ internal SymbolicRegexBuilder(ISolver<TSet> solver, CharSetSolver charSetSolver)
_nothing = SymbolicRegexNode<TSet>.CreateFalse(this); _nothing = SymbolicRegexNode<TSet>.CreateFalse(this);
_anyChar = SymbolicRegexNode<TSet>.CreateTrue(this); _anyChar = SymbolicRegexNode<TSet>.CreateTrue(this);
_anyStar = SymbolicRegexNode<TSet>.CreateLoop(this, _anyChar, 0, int.MaxValue, isLazy: false); _anyStar = SymbolicRegexNode<TSet>.CreateLoop(this, _anyChar, 0, int.MaxValue, isLazy: false);
_anyStarLazy = SymbolicRegexNode<TSet>.CreateLoop(this, _anyChar, 0, int.MaxValue, isLazy: true);
// --- initialize singletonCache --- // --- initialize singletonCache ---
_singletonCache[_solver.Empty] = _nothing; _singletonCache[_solver.Empty] = _nothing;
......
...@@ -11,17 +11,14 @@ namespace System.Text.RegularExpressions.Symbolic ...@@ -11,17 +11,14 @@ namespace System.Text.RegularExpressions.Symbolic
private const uint IsLazyMask = 4; private const uint IsLazyMask = 4;
private const uint CanBeNullableMask = 8; private const uint CanBeNullableMask = 8;
private const uint ContainsSomeAnchorMask = 16; private const uint ContainsSomeAnchorMask = 16;
private const uint ContainsLineAnchorMask = 32; private const uint StartsWithSomeAnchorMask = 32;
private const uint ContainsSomeCharacterMask = 64;
private const uint StartsWithBoundaryAnchorMask = 128;
private readonly uint _info; private readonly uint _info;
private SymbolicRegexInfo(uint i) => _info = i; private SymbolicRegexInfo(uint i) => _info = i;
internal static SymbolicRegexInfo Create(bool isAlwaysNullable = false, bool canBeNullable = false, bool startsWithLineAnchor = false, internal static SymbolicRegexInfo Create(bool isAlwaysNullable = false, bool canBeNullable = false,
bool startsWithBoundaryAnchor = false, bool containsSomeAnchor = false, bool startsWithLineAnchor = false, bool startsWithSomeAnchor = false, bool containsSomeAnchor = false, bool isLazy = true)
bool containsLineAnchor = false, bool containsSomeCharacter = false, bool isLazy = true)
{ {
uint i = 0; uint i = 0;
...@@ -35,31 +32,21 @@ namespace System.Text.RegularExpressions.Symbolic ...@@ -35,31 +32,21 @@ namespace System.Text.RegularExpressions.Symbolic
} }
} }
if (startsWithLineAnchor || containsLineAnchor || startsWithBoundaryAnchor || containsSomeAnchor) if (containsSomeAnchor || startsWithLineAnchor || startsWithSomeAnchor)
{ {
i |= ContainsSomeAnchorMask; i |= ContainsSomeAnchorMask;
if (startsWithLineAnchor || containsLineAnchor) if (startsWithLineAnchor)
{ {
i |= ContainsLineAnchorMask; i |= StartsWithLineAnchorMask;
if (startsWithLineAnchor)
{
i |= StartsWithLineAnchorMask;
}
} }
if (startsWithBoundaryAnchor) if (startsWithLineAnchor || startsWithSomeAnchor)
{ {
i |= StartsWithBoundaryAnchorMask; i |= StartsWithSomeAnchorMask;
} }
} }
if (containsSomeCharacter)
{
i |= ContainsSomeCharacterMask;
}
if (isLazy) if (isLazy)
{ {
i |= IsLazyMask; i |= IsLazyMask;
...@@ -72,18 +59,12 @@ namespace System.Text.RegularExpressions.Symbolic ...@@ -72,18 +59,12 @@ namespace System.Text.RegularExpressions.Symbolic
public bool CanBeNullable => (_info & CanBeNullableMask) != 0; public bool CanBeNullable => (_info & CanBeNullableMask) != 0;
public bool StartsWithSomeAnchor => (_info & (StartsWithLineAnchorMask | StartsWithBoundaryAnchorMask)) != 0;
public bool StartsWithLineAnchor => (_info & StartsWithLineAnchorMask) != 0; public bool StartsWithLineAnchor => (_info & StartsWithLineAnchorMask) != 0;
public bool StartsWithBoundaryAnchor => (_info & StartsWithBoundaryAnchorMask) != 0; public bool StartsWithSomeAnchor => (_info & StartsWithSomeAnchorMask) != 0;
public bool ContainsSomeAnchor => (_info & ContainsSomeAnchorMask) != 0; public bool ContainsSomeAnchor => (_info & ContainsSomeAnchorMask) != 0;
public bool ContainsLineAnchor => (_info & ContainsLineAnchorMask) != 0;
public bool ContainsSomeCharacter => (_info & ContainsSomeCharacterMask) != 0;
public bool IsLazy => (_info & IsLazyMask) != 0; public bool IsLazy => (_info & IsLazyMask) != 0;
public static SymbolicRegexInfo Or(params SymbolicRegexInfo[] infos) public static SymbolicRegexInfo Or(params SymbolicRegexInfo[] infos)
...@@ -121,20 +102,14 @@ public static SymbolicRegexInfo And(params SymbolicRegexInfo[] infos) ...@@ -121,20 +102,14 @@ public static SymbolicRegexInfo And(params SymbolicRegexInfo[] infos)
return new SymbolicRegexInfo(i); return new SymbolicRegexInfo(i);
} }
public static SymbolicRegexInfo Concat(SymbolicRegexInfo left_info, SymbolicRegexInfo right_info) public static SymbolicRegexInfo Concat(SymbolicRegexInfo left_info, SymbolicRegexInfo right_info) =>
{ Create(
bool isNullable = left_info.IsNullable && right_info.IsNullable; isAlwaysNullable: left_info.IsNullable && right_info.IsNullable,
bool canBeNullable = left_info.CanBeNullable && right_info.CanBeNullable; canBeNullable: left_info.CanBeNullable && right_info.CanBeNullable,
bool isLazy = left_info.IsLazy && right_info.IsLazy; startsWithLineAnchor: left_info.StartsWithLineAnchor || (left_info.CanBeNullable && right_info.StartsWithLineAnchor),
startsWithSomeAnchor: left_info.StartsWithSomeAnchor || (left_info.CanBeNullable && right_info.StartsWithSomeAnchor),
bool startsWithLineAnchor = left_info.StartsWithLineAnchor || (left_info.CanBeNullable && right_info.StartsWithLineAnchor); containsSomeAnchor: left_info.ContainsSomeAnchor || right_info.ContainsSomeAnchor,
bool startsWithBoundaryAnchor = left_info.StartsWithBoundaryAnchor || (left_info.CanBeNullable && right_info.StartsWithBoundaryAnchor); isLazy: left_info.IsLazy && right_info.IsLazy);
bool containsSomeAnchor = left_info.ContainsSomeAnchor || right_info.ContainsSomeAnchor;
bool containsLineAnchor = left_info.ContainsLineAnchor || right_info.ContainsLineAnchor;
bool containsSomeCharacter = left_info.ContainsSomeCharacter || right_info.ContainsSomeCharacter;
return Create(isNullable, canBeNullable, startsWithLineAnchor, startsWithBoundaryAnchor, containsSomeAnchor, containsLineAnchor, containsSomeCharacter, isLazy);
}
public static SymbolicRegexInfo Loop(SymbolicRegexInfo body_info, int lowerBound, bool isLazy) public static SymbolicRegexInfo Loop(SymbolicRegexInfo body_info, int lowerBound, bool isLazy)
{ {
...@@ -171,10 +146,7 @@ public static SymbolicRegexInfo Loop(SymbolicRegexInfo body_info, int lowerBound ...@@ -171,10 +146,7 @@ public static SymbolicRegexInfo Loop(SymbolicRegexInfo body_info, int lowerBound
Create(isAlwaysNullable: !info.CanBeNullable, Create(isAlwaysNullable: !info.CanBeNullable,
canBeNullable: !info.IsNullable, canBeNullable: !info.IsNullable,
startsWithLineAnchor: info.StartsWithLineAnchor, startsWithLineAnchor: info.StartsWithLineAnchor,
startsWithBoundaryAnchor: info.StartsWithBoundaryAnchor,
containsSomeAnchor: info.ContainsSomeAnchor, containsSomeAnchor: info.ContainsSomeAnchor,
containsLineAnchor: info.ContainsLineAnchor,
containsSomeCharacter: info.ContainsSomeCharacter,
isLazy: info.IsLazy); isLazy: info.IsLazy);
public override bool Equals(object? obj) => obj is SymbolicRegexInfo i && Equals(i); public override bool Equals(object? obj) => obj is SymbolicRegexInfo i && Equals(i);
......
...@@ -382,7 +382,7 @@ public bool IsNothing ...@@ -382,7 +382,7 @@ public bool IsNothing
Create(builder, SymbolicRegexNodeKind.Singleton, null, null, -1, -1, builder._solver.Empty, null, SymbolicRegexInfo.Create()); Create(builder, SymbolicRegexNodeKind.Singleton, null, null, -1, -1, builder._solver.Empty, null, SymbolicRegexInfo.Create());
internal static SymbolicRegexNode<TSet> CreateTrue(SymbolicRegexBuilder<TSet> builder) => internal static SymbolicRegexNode<TSet> CreateTrue(SymbolicRegexBuilder<TSet> builder) =>
Create(builder, SymbolicRegexNodeKind.Singleton, null, null, -1, -1, builder._solver.Full, null, SymbolicRegexInfo.Create(containsSomeCharacter: true)); Create(builder, SymbolicRegexNodeKind.Singleton, null, null, -1, -1, builder._solver.Full, null, SymbolicRegexInfo.Create());
internal static SymbolicRegexNode<TSet> CreateFixedLengthMarker(SymbolicRegexBuilder<TSet> builder, int length) => internal static SymbolicRegexNode<TSet> CreateFixedLengthMarker(SymbolicRegexBuilder<TSet> builder, int length) =>
Create(builder, SymbolicRegexNodeKind.FixedLengthMarker, null, null, length, -1, default, null, SymbolicRegexInfo.Create(isAlwaysNullable: true)); Create(builder, SymbolicRegexNodeKind.FixedLengthMarker, null, null, length, -1, default, null, SymbolicRegexInfo.Create(isAlwaysNullable: true));
...@@ -399,19 +399,22 @@ internal static SymbolicRegexNode<TSet> CreateBeginEndAnchor(SymbolicRegexBuilde ...@@ -399,19 +399,22 @@ internal static SymbolicRegexNode<TSet> CreateBeginEndAnchor(SymbolicRegexBuilde
SymbolicRegexNodeKind.BeginningAnchor or SymbolicRegexNodeKind.EndAnchor or SymbolicRegexNodeKind.BeginningAnchor or SymbolicRegexNodeKind.EndAnchor or
SymbolicRegexNodeKind.EndAnchorZ or SymbolicRegexNodeKind.EndAnchorZReverse or SymbolicRegexNodeKind.EndAnchorZ or SymbolicRegexNodeKind.EndAnchorZReverse or
SymbolicRegexNodeKind.EOLAnchor or SymbolicRegexNodeKind.BOLAnchor); SymbolicRegexNodeKind.EOLAnchor or SymbolicRegexNodeKind.BOLAnchor);
return Create(builder, kind, null, null, -1, -1, default, null, SymbolicRegexInfo.Create(startsWithLineAnchor: true, canBeNullable: true)); return Create(builder, kind, null, null, -1, -1, default, null, SymbolicRegexInfo.Create(startsWithSomeAnchor: true, canBeNullable: true,
startsWithLineAnchor: kind is
SymbolicRegexNodeKind.EndAnchorZ or SymbolicRegexNodeKind.EndAnchorZReverse or
SymbolicRegexNodeKind.EOLAnchor or SymbolicRegexNodeKind.BOLAnchor));
} }
internal static SymbolicRegexNode<TSet> CreateBoundaryAnchor(SymbolicRegexBuilder<TSet> builder, SymbolicRegexNodeKind kind) internal static SymbolicRegexNode<TSet> CreateBoundaryAnchor(SymbolicRegexBuilder<TSet> builder, SymbolicRegexNodeKind kind)
{ {
Debug.Assert(kind is SymbolicRegexNodeKind.BoundaryAnchor or SymbolicRegexNodeKind.NonBoundaryAnchor); Debug.Assert(kind is SymbolicRegexNodeKind.BoundaryAnchor or SymbolicRegexNodeKind.NonBoundaryAnchor);
return Create(builder, kind, null, null, -1, -1, default, null, SymbolicRegexInfo.Create(startsWithBoundaryAnchor: true, canBeNullable: true)); return Create(builder, kind, null, null, -1, -1, default, null, SymbolicRegexInfo.Create(startsWithSomeAnchor: true, canBeNullable: true));
} }
#endregion #endregion
internal static SymbolicRegexNode<TSet> CreateSingleton(SymbolicRegexBuilder<TSet> builder, TSet set) => internal static SymbolicRegexNode<TSet> CreateSingleton(SymbolicRegexBuilder<TSet> builder, TSet set) =>
Create(builder, SymbolicRegexNodeKind.Singleton, null, null, -1, -1, set, null, SymbolicRegexInfo.Create(containsSomeCharacter: !set.Equals(builder._solver.Empty))); Create(builder, SymbolicRegexNodeKind.Singleton, null, null, -1, -1, set, null, SymbolicRegexInfo.Create());
internal static SymbolicRegexNode<TSet> CreateLoop(SymbolicRegexBuilder<TSet> builder, SymbolicRegexNode<TSet> body, int lower, int upper, bool isLazy) internal static SymbolicRegexNode<TSet> CreateLoop(SymbolicRegexBuilder<TSet> builder, SymbolicRegexNode<TSet> body, int lower, int upper, bool isLazy)
{ {
...@@ -589,40 +592,6 @@ internal static SymbolicRegexNode<TSet> OrderedOr(SymbolicRegexBuilder<TSet> bui ...@@ -589,40 +592,6 @@ internal static SymbolicRegexNode<TSet> OrderedOr(SymbolicRegexBuilder<TSet> bui
Debug.Assert(left._kind != SymbolicRegexNodeKind.OrderedOr); Debug.Assert(left._kind != SymbolicRegexNodeKind.OrderedOr);
Debug.Assert(deduplicated); Debug.Assert(deduplicated);
// Apply the counter subsumption/combining optimization if possible
(SymbolicRegexNode<TSet> loop, SymbolicRegexNode<TSet> rest) = left.FirstCounterInfo();
if (loop != builder._nothing)
{
Debug.Assert(loop._kind == SymbolicRegexNodeKind.Loop && loop._left is not null);
(SymbolicRegexNode<TSet> otherLoop, SymbolicRegexNode<TSet> otherRest) = right.FirstCounterInfo();
if (otherLoop != builder._nothing && rest == otherRest)
{
// Found two adjacent counters with the same continuation, check that the loops are equivalent apart from bounds
// and that the bounds form a contiguous interval. Two integer intervals [x1,x2] and [y1,y2] overlap when
// x1 <= y2 and y1 <= x2. The union of intervals that just touch is still contiguous, e.g. [2,5] and [6,10] make
// [2,10], so the lower bounds are decremented by 1 in the check.
Debug.Assert(otherLoop._kind == SymbolicRegexNodeKind.Loop && otherLoop._left is not null);
if (loop._left == otherLoop._left && loop.IsLazy == otherLoop.IsLazy &&
loop._lower - 1 <= otherLoop._upper && otherLoop._lower - 1 <= loop._upper)
{
// Loops are equivalent apart from bounds, and the union of the bounds is a contiguous interval
// Build a new counter for the union of the ranges
SymbolicRegexNode<TSet> newCounter = CreateConcat(builder, CreateLoop(builder, loop._left,
Math.Min(loop._lower, otherLoop._lower), Math.Max(loop._upper, otherLoop._upper), loop.IsLazy), rest);
if (right._kind == SymbolicRegexNodeKind.OrderedOr)
{
// The right counter came from an or, so include the rest of that or
Debug.Assert(right._right is not null);
return OrderedOr(builder, newCounter, right._right, deduplicated: true);
}
else
{
return newCounter;
}
}
}
}
// Counter optimization did not apply, just build the or // Counter optimization did not apply, just build the or
return Create(builder, SymbolicRegexNodeKind.OrderedOr, left, right, -1, -1, default, null, SymbolicRegexInfo.Or(left._info, right._info)); return Create(builder, SymbolicRegexNodeKind.OrderedOr, left, right, -1, -1, default, null, SymbolicRegexInfo.Or(left._info, right._info));
} }
...@@ -1052,6 +1021,8 @@ internal List<(SymbolicRegexNode<TSet>, DerivativeEffect[])> CreateNfaDerivative ...@@ -1052,6 +1021,8 @@ internal List<(SymbolicRegexNode<TSet>, DerivativeEffect[])> CreateNfaDerivative
private void AddTransitions(TSet elem, uint context, List<(SymbolicRegexNode<TSet>, DerivativeEffect[])> transitions, private void AddTransitions(TSet elem, uint context, List<(SymbolicRegexNode<TSet>, DerivativeEffect[])> transitions,
List<SymbolicRegexNode<TSet>> continuation, Stack<DerivativeEffect>? effects, bool simulateBacktracking) List<SymbolicRegexNode<TSet>> continuation, Stack<DerivativeEffect>? effects, bool simulateBacktracking)
{ {
Debug.Assert(!_builder._solver.IsEmpty(elem), "False element or minterm should not make it into derivative construction.");
// Helper function for concatenating a head node and a list of continuation nodes. The continuation nodes // Helper function for concatenating a head node and a list of continuation nodes. The continuation nodes
// are added in reverse order and the function below uses the list as a stack, so the nodes added to the // are added in reverse order and the function below uses the list as a stack, so the nodes added to the
// stack first end up at the tail of the concatenation. // stack first end up at the tail of the concatenation.
......
...@@ -754,6 +754,9 @@ static IEnumerable<(string Pattern, string Input, RegexOptions Options, int Begi ...@@ -754,6 +754,9 @@ static IEnumerable<(string Pattern, string Input, RegexOptions Options, int Begi
yield return (@".*?\dFo{2}", "This1Foo should 2fOo match", RegexOptions.IgnoreCase, 0, 26, true, "This1Foo"); yield return (@".*?\dFo{2}", "This1Foo should 2fOo match", RegexOptions.IgnoreCase, 0, 26, true, "This1Foo");
yield return (@".*?\dfoo", "1fooThis1FOO should 1foo match", RegexOptions.IgnoreCase, 4, 9, true, "This1FOO"); yield return (@".*?\dfoo", "1fooThis1FOO should 1foo match", RegexOptions.IgnoreCase, 4, 9, true, "This1FOO");
// Earliest match, not match with earliest end
yield return (@".{5}Foo|Bar", "FooBarFoo", RegexOptions.None, 1, 8, true, "ooBarFoo");
if (!RegexHelpers.IsNonBacktracking(engine)) if (!RegexHelpers.IsNonBacktracking(engine))
{ {
// RightToLeft // RightToLeft
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册