未验证 提交 520719c9 编写于 作者: S Stephen Toub 提交者: GitHub

Make more Regex loops atomic automatically (#31738)

* Change more Regex loops inside of atomic nodes to be atomic

* Allow Regex loops that end other loops to be made atomic

For example, given an expression like "(abcd*)*e", the inner d* loop will now be converted to be atomic, i.e. "(abc(?>d*))*e".  We're careful not to convert an expression like "(abca*)*e", where the inner loop has no overlap with what comes immediately after the loop but does with the start of the loop.

I also fixed what appears to be a long-standing bug in how character classes are canonicalized.  An attempt was made to avoid running the canonicalization routine, but we almost always need to do it, and we were not doing it in some cases, resulting in character classes that were correct but not as optimized as they should have been, e.g. two abutting ranges when one would suffice, leading to poorer code gen.  It's part of this PR because I caught it when changing the reduction tests to validate the full code tree including strings rather than just the codes list.

* Address PR feedback

And add more comments, more code coverage, more runtime asserts, etc.

I also noticed we weren't reducing Setloopatomic and Setlazy into {Not}oneloopatomic and {Not}onelazy, so I fixed that.

And I changed ReduceAtomic to actually remove unnecessary intermediate atomic nodes.
上级 022775ae
......@@ -78,7 +78,7 @@ public static void Validate_InvalidMatchTimeoutInMilliseconds_ThrowsArgumentOutO
[Fact]
public static void Validate_MatchingTimesOut_ThrowsRegexMatchTimeoutException()
{
RegularExpressionAttribute attribute = new RegularExpressionAttribute("(a+)+$") { MatchTimeoutInMilliseconds = 1 };
RegularExpressionAttribute attribute = new RegularExpressionAttribute("(a[ab]+)+$") { MatchTimeoutInMilliseconds = 1 };
Assert.Throws<RegexMatchTimeoutException>(() => attribute.Validate("aaaaaaaaaaaaaaaaaaaaaaaaaaaa>", new ValidationContext(new object())));
}
......
......@@ -341,7 +341,7 @@ public int Scan(string text, int index, int beglimit, int endlimit)
#if DEBUG
/// <summary>Used when dumping for debugging.</summary>
[ExcludeFromCodeCoverage]
public override string ToString() => Pattern;
public override string ToString() => Dump(string.Empty);
[ExcludeFromCodeCoverage]
public string Dump(string indent)
......
......@@ -399,7 +399,6 @@ internal sealed class RegexCharClass
private List<SingleRange>? _rangelist;
private StringBuilder? _categories;
private RegexCharClass? _subtractor;
private bool _canonical = true;
private bool _negate;
#if DEBUG
......@@ -450,15 +449,6 @@ public void AddCharClass(RegexCharClass cc)
int ccRangeCount = cc._rangelist?.Count ?? 0;
if (!cc._canonical || // if the new char class to add isn't canonical, we're not either.
(_canonical &&
ccRangeCount > 0 &&
_rangelist != null && _rangelist.Count > 0 &&
cc._rangelist![0].First <= _rangelist[^1].Last))
{
_canonical = false;
}
if (ccRangeCount != 0)
{
EnsureRangeList().AddRange(cc._rangelist!);
......@@ -488,11 +478,6 @@ private void AddSet(ReadOnlySpan<char> set)
List<SingleRange> rangeList = EnsureRangeList();
if (_canonical && rangeList.Count > 0 && set[0] <= rangeList[^1].Last)
{
_canonical = false;
}
int i;
for (i = 0; i < set.Length - 1; i += 2)
{
......@@ -514,14 +499,8 @@ public void AddSubtraction(RegexCharClass sub)
/// <summary>
/// Adds a single range of characters to the class.
/// </summary>
public void AddRange(char first, char last)
{
public void AddRange(char first, char last) =>
EnsureRangeList().Add(new SingleRange(first, last));
if (_canonical && first <= last)
{
_canonical = false;
}
}
public void AddCategoryFromName(string categoryName, bool invert, bool caseInsensitive, string pattern, int currentPos)
{
......@@ -563,8 +542,6 @@ public void AddCategoryFromName(string categoryName, bool invert, bool caseInsen
/// </summary>
public void AddLowercase(CultureInfo culture)
{
_canonical = false;
List<SingleRange>? rangeList = _rangelist;
if (rangeList != null)
{
......@@ -1341,10 +1318,7 @@ public string ToStringClass()
private void ToStringClass(ref ValueStringBuilder vsb)
{
if (!_canonical)
{
Canonicalize();
}
Canonicalize();
int initialLength = vsb.Length;
int categoriesLength = _categories?.Length ?? 0;
......@@ -1390,8 +1364,6 @@ private void ToStringClass(ref ValueStringBuilder vsb)
/// </summary>
private void Canonicalize()
{
_canonical = true;
List<SingleRange>? rangelist = _rangelist;
if (rangelist != null)
{
......
......@@ -15,7 +15,6 @@
// Strings and sets are indices into a string table.
using System.Collections;
using System.Collections.Generic;
using System.Diagnostics;
using System.Diagnostics.CodeAnalysis;
......@@ -395,7 +394,10 @@ public string OpcodeDescription(int offset)
}
[ExcludeFromCodeCoverage]
public void Dump()
public void Dump() => Debug.WriteLine(ToString());
[ExcludeFromCodeCoverage]
public override string ToString()
{
var sb = new StringBuilder();
......@@ -426,7 +428,7 @@ public void Dump()
}
sb.AppendLine();
Debug.WriteLine(sb.ToString());
return sb.ToString();
}
#endif
}
......
......@@ -37,6 +37,9 @@ internal RegexTree(RegexNode root, Hashtable caps, int[] capNumList, int capTop,
[ExcludeFromCodeCoverage]
public void Dump() => Root.Dump();
/// <summary>Returns the string form of the tree's root node by delegating to <c>Root.ToString()</c>.</summary>
[ExcludeFromCodeCoverage]
public override string ToString() => Root.ToString();
[ExcludeFromCodeCoverage]
public bool Debug => (Options & RegexOptions.Debug) != 0;
#endif
......
......@@ -730,6 +730,8 @@ public static IEnumerable<object[]> Groups_Basic_TestData()
yield return new object[] { null, @"(?:a{2}?){3}?", "aaaaaaaaa", RegexOptions.None, new string[] { "aaaaaa" } };
yield return new object[] { null, @"(?:(?:[ab]c[de]f){3}){2}", "acdfbcdfacefbcefbcefbcdfacdef", RegexOptions.None, new string[] { "acdfbcdfacefbcefbcefbcdf" } };
yield return new object[] { null, @"(?:(?:[ab]c[de]f){3}hello){2}", "aaaaaacdfbcdfacefhellobcefbcefbcdfhellooooo", RegexOptions.None, new string[] { "acdfbcdfacefhellobcefbcefbcdfhello" } };
// Nested atomic
yield return new object[] { null, @"(?>abc[def]gh(i*))", "123abceghiii456", RegexOptions.None, new string[] { "abceghiii", "iii" } };
// Anchoring loops beginning with .* / .+
yield return new object[] { null, @".*", "", RegexOptions.None, new string[] { "" } };
......
......@@ -997,6 +997,17 @@ public void IsMatch_Invalid()
Assert.Throws<ArgumentOutOfRangeException>(() => r.IsMatch("input", 6));
}
[SkipOnTargetFramework(TargetFrameworkMonikers.NetFramework)] // take too long due to backtracking
[Theory]
[InlineData(@"(\w*)+\.", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", false)]
[InlineData(@"(a+)+b", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", false)]
[InlineData(@"(x+x+)+y", "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx", false)]
// Each supplied pattern nests one quantified loop inside another, and each input is a long run
// of a single character that lacks the character the pattern requires at the end, so the
// expected result is false.
// NOTE(review): the method name attributes the quick completion to automatic atomicity; no
// timing is asserted here, so a regression would presumably show up as a very slow test run
// rather than a failed assertion.
public void IsMatch_SucceedQuicklyDueToAutoAtomicity(string regex, string input, bool expected)
{
// Check with no options set.
Assert.Equal(expected, Regex.IsMatch(input, regex, RegexOptions.None));
// Check again with RegexOptions.Compiled.
Assert.Equal(expected, Regex.IsMatch(input, regex, RegexOptions.Compiled));
}
[Fact]
public void Synchronized()
{
......
......@@ -42,15 +42,20 @@ static RegexReductionTests()
Assert.NotNull(s_regexCodeTreeMinRequiredLength);
}
private static int[] GetRegexCodes(Regex r)
private static string GetRegexCodes(Regex r)
{
object code = s_regexCode.GetValue(r);
Assert.NotNull(code);
string result = code.ToString();
// In release builds, the above ToString won't be informative.
// Also include the numerical codes, which are not as comprehensive
// but which exist in release builds as well.
int[] codes = s_regexCodeCodes.GetValue(code) as int[];
Assert.NotNull(codes);
result += Environment.NewLine + string.Join(", ", codes);
return codes;
return result;
}
private static int GetMinRequiredLength(Regex r)
......@@ -267,8 +272,19 @@ private static int GetMinRequiredLength(Regex r)
[InlineData("[0-9][0-9]{1,3}?", "[0-9]{2,4}?")]
// Set and set
[InlineData("[ace][ace]", "[ace]{2}")]
// Set and one
[InlineData("[a]", "a")]
[InlineData("[a]*", "a*")]
[InlineData("(?>[a]*)", "(?>a*)")]
[InlineData("[a]*?", "a*?")]
// Set and notone
[InlineData("[^\n]", ".")]
[InlineData("[^\n]*", ".*")]
[InlineData("(?>[^\n]*)", "(?>.*)")]
[InlineData("[^\n]*?", ".*?")]
// Large loop patterns
[InlineData("a*a*a*a*a*a*a*b*b*?a+a*", "a*b*b*?a+")]
[InlineData("a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "a{0,30}aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa")]
// Group elimination
[InlineData("(?:(?:(?:(?:(?:(?:a*))))))", "a*")]
// Nested loops
......@@ -276,6 +292,8 @@ private static int GetMinRequiredLength(Regex r)
[InlineData("(?:a*)+", "a*")]
[InlineData("(?:a+){4}", "a{4,}")]
[InlineData("(?:a{1,2}){4}", "a{4,8}")]
// Nested atomic
[InlineData("(?>(?>(?>(?>abc*))))", "(?>ab(?>c*))")]
// Alternation reduction
[InlineData("a|b", "[ab]")]
[InlineData("a|b|c|d|e|g|h|z", "[a-eghz]")]
......@@ -319,13 +337,23 @@ private static int GetMinRequiredLength(Regex r)
[InlineData("(?:a[ce]*|b*)c", "(?:a[ce]*|(?>b*))c")]
[InlineData("apple|(?:orange|pear)|grape", "apple|orange|pear|grape")]
[InlineData("(?>(?>(?>(?:abc)*)))", "(?:abc)*")]
[InlineData("(w*)+", "((?>w*))+")]
[InlineData("(w*)+\\.", "((?>w*))+\\.")]
[InlineData("(a[bcd]e*)*fg", "(a[bcd](?>e*))*fg")]
[InlineData("(\\w[bcd]\\s*)*fg", "(\\w[bcd](?>\\s*))*fg")]
public void PatternsReduceIdentically(string pattern1, string pattern2)
{
AssertExtensions.Equal(GetRegexCodes(new Regex(pattern1)), GetRegexCodes(new Regex(pattern2)));
Assert.NotEqual<int>(GetRegexCodes(new Regex(pattern1, RegexOptions.RightToLeft)), GetRegexCodes(new Regex(pattern2)));
string result1 = GetRegexCodes(new Regex(pattern1));
string result2 = GetRegexCodes(new Regex(pattern2));
if (result1 != result2)
{
throw new Xunit.Sdk.EqualException(result2, result1);
}
Assert.NotEqual(GetRegexCodes(new Regex(pattern1, RegexOptions.RightToLeft)), GetRegexCodes(new Regex(pattern2)));
if (!pattern1.Contains("?i:") && !pattern2.Contains("?i:"))
{
Assert.NotEqual<int>(GetRegexCodes(new Regex(pattern1, RegexOptions.IgnoreCase)), GetRegexCodes(new Regex(pattern2)));
Assert.NotEqual(GetRegexCodes(new Regex(pattern1, RegexOptions.IgnoreCase)), GetRegexCodes(new Regex(pattern2)));
}
}
......@@ -376,11 +404,13 @@ public void PatternsReduceIdentically(string pattern1, string pattern2)
// Not applying auto-atomicity
[InlineData("a*b*", "(?>a*)b*")]
[InlineData("[^\n]*\n*", "(?>[^\n]*)\n")]
[InlineData("(a[bcd]a*)*fg", "(a[bcd](?>a*))*fg")]
[InlineData("(\\w[bcd]\\d*)*fg", "(\\w[bcd](?>\\d*))*fg")]
public void PatternsReduceDifferently(string pattern1, string pattern2)
{
var r1 = new Regex(pattern1);
var r2 = new Regex(pattern2);
Assert.NotEqual<int>(GetRegexCodes(r1), GetRegexCodes(r2));
Assert.NotEqual(GetRegexCodes(r1), GetRegexCodes(r2));
}
[Theory]
......@@ -409,6 +439,11 @@ public void PatternsReduceDifferently(string pattern1, string pattern2)
[InlineData(@"a*a*a*a*a*a*a*b*", 0)]
[InlineData(@"((a{1,2}){4}){3,7}", 12)]
[InlineData(@"\b\w{4}\b", 4)]
// we stop computing after a certain depth; if that logic changes in the future, these tests can be updated
[InlineData(@"((((((((((((((((((((((((((((((ab|cd+)|ef+)|gh+)|ij+)|kl+)|mn+)|op+)|qr+)|st+)|uv+)|wx+)|yz+)|01+)|23+)|45+)|67+)|89+)|AB+)|CD+)|EF+)|GH+)|IJ+)|KL+)|MN+)|OP+)|QR+)|ST+)|UV+)|WX+)|YZ)", 0)]
[InlineData(@"(YZ+|(WX+|(UV+|(ST+|(QR+|(OP+|(MN+|(KL+|(IJ+|(GH+|(EF+|(CD+|(AB+|(89+|(67+|(45+|(23+|(01+|(yz+|(wx+|(uv+|(st+|(qr+|(op+|(mn+|(kl+|(ij+|(gh+|(ef+|(de+|(a|bc+)))))))))))))))))))))))))))))))", 0)]
[InlineData(@"a(a(a(a(a(a(a(a(a(a(a(a(a(a(a(a(a(a(a(a(a(a(a(a(a(a(a(a(a(a(ab|cd+)|ef+)|gh+)|ij+)|kl+)|mn+)|op+)|qr+)|st+)|uv+)|wx+)|yz+)|01+)|23+)|45+)|67+)|89+)|AB+)|CD+)|EF+)|GH+)|IJ+)|KL+)|MN+)|OP+)|QR+)|ST+)|UV+)|WX+)|YZ+)", 3)]
[InlineData(@"(((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((a)))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))", 0)]
public void MinRequiredLengthIsCorrect(string pattern, int expectedLength)
{
var r = new Regex(pattern);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册