Merge branch 'regexParsing4' into regexFeatures

622ef53a · Cyrus Najmabadi · bb253d2e · b58c0c7a · 622ef53a · 622ef53a
4 changed files
--- a/src/Workspaces/Core/Portable/EmbeddedLanguages/RegularExpressions/RegexKind.cs
+++ b/src/Workspaces/Core/Portable/EmbeddedLanguages/RegularExpressions/RegexKind.cs
@@ -4,7 +4,7 @@ namespace Microsoft.CodeAnalysis.EmbeddedLanguages.RegularExpressions
 {
    internal enum RegexKind
    {
-        None,
+        None = 0,
        EndOfFile,
        Sequence,
        CompilationUnit,

--- a/src/Workspaces/Core/Portable/EmbeddedLanguages/RegularExpressions/RegexLexer.cs
+++ b/src/Workspaces/Core/Portable/EmbeddedLanguages/RegularExpressions/RegexLexer.cs
@@ -401,7 +401,7 @@ public RegexToken ScanHexCharacters(int count)
            {
                result = result.AddDiagnosticIfNone(new EmbeddedDiagnostic(
                    WorkspacesResources.Insufficient_hexadecimal_digits,
-                    TextSpan.FromBounds(Text[beforeSlash].Span.Start, Text[Position - 1].Span.End)));
+                    GetTextSpan(beforeSlash, Position)));
            }
            return result;

--- a/src/Workspaces/Core/Portable/EmbeddedLanguages/RegularExpressions/RegexNodes.cs
+++ b/src/Workspaces/Core/Portable/EmbeddedLanguages/RegularExpressions/RegexNodes.cs
@@ -501,7 +501,8 @@ public override void Accept(IRegexNodeVisitor visitor)
    }
    /// <summary>
-    /// ```a{...}```
+    /// Base type of all regex numeric quantifier nodes.  i.e.  
+    /// ```a{5}```,  ```a{5,}``` and ```a{5,10}```
    /// </summary>
    internal abstract class RegexNumericQuantifierNode : RegexQuantifierNode
    {

--- a/src/Workspaces/Core/Portable/EmbeddedLanguages/RegularExpressions/RegexParser.cs
+++ b/src/Workspaces/Core/Portable/EmbeddedLanguages/RegularExpressions/RegexParser.cs
@@ -43,13 +43,18 @@ namespace Microsoft.CodeAnalysis.EmbeddedLanguages.RegularExpressions
    /// same error.  Note: there is only one time we do this in this parser (see the deviation
    /// documented in <see cref="ParsePossibleEcmascriptBackreferenceEscape"/>).
    ///
-    /// Note1: the above invariants make life difficult at times.  This happens due to the fact that
+    /// Note1: "report the same error" means that we will attempt to report the error using the same
+    /// text the .net regex parser uses for its error messages.  That way the user is not
+    /// confused by seeing one message in the IDE and a different one at runtime for the same
+    /// issue.
+    ///
+    /// Note2: the above invariants make life difficult at times.  This happens due to the fact that
    /// the .net parser is multi-pass.  Meaning it does a first scan (which may report errors), then
    /// does the full parse.  This means that it might report an error in a later location during
    /// the initial scan than it would during the parse.  We replicate that behavior to follow the
    /// second invariant.
    ///
-    /// Note2: It would be nice if we could check these invariants at runtime, so we could control
+    /// Note3: It would be nice if we could check these invariants at runtime, so we could control
    /// our behavior by the behavior of the real .net regex engine.  For example, if the .net regex
    /// engine did not report any issues, we could suppress any diagnostics we generated and we
    /// could log an NFW to record which pattern we deviated on so we could fix the issue for a
@@ -62,6 +67,13 @@ namespace Microsoft.CodeAnalysis.EmbeddedLanguages.RegularExpressions
    ///
    /// This parser was based off the corefx RegexParser based at:
    /// https://github.com/dotnet/corefx/blob/f759243d724f462da0bcef54e86588f8a55352c6/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs#L1
+    ///
+    /// Note4: The .Net parser itself changes over time (for example to fix behavior that even it
+    /// thinks is buggy).  When this happens, we have to make a choice as to which behavior to
+    /// follow. In general, the overall principle is that we should follow the more lenient
+    /// behavior.  If we end up taking the more strict interpretation we risk giving people an error
+    /// during design time that they would not get at runtime.  It's far worse to have that than to
+    /// not report an error, even though one might happen later.
    /// </remarks>
    internal partial struct RegexParser
    {
@@ -243,17 +255,13 @@ private RegexSequenceNode ParseSequence(bool consumeCloseParen)
        {
            var list = ArrayBuilder<RegexExpressionNode>.GetInstance();
-            if (ShouldConsumeSequenceElement(consumeCloseParen))
+            while (ShouldConsumeSequenceElement(consumeCloseParen))
-            {
-                do
            {
                var last = list.Count == 0 ? null : list.Last();
                list.Add(ParsePrimaryExpressionAndQuantifiers(last));
                TryMergeLastTwoNodes(list);
            }
-                while (ShouldConsumeSequenceElement(consumeCloseParen));
-            }
            return new RegexSequenceNode(list.ToImmutableAndFree());
        }
@@ -726,8 +734,8 @@ private RegexConditionalGroupingNode ParseConditionalGrouping(RegexToken openPar
            }
            else
            {
-                // If its a capture name, it's ok if it that capture doesn't exist.  In that
+                // If it's a capture name, it's ok if that capture doesn't exist.  In that case we
-                // case we will just treat this as an conditional expression.
+                // will just treat this as a conditional expression.
                if (!HasCapture((string)capture.Value))
                {
                    _lexer.Position = afterInnerOpenParen;
@@ -1284,14 +1292,14 @@ private bool TryGetRangeComponentValueWorker(RegexNode component, out char ch)
                    ch = ((RegexSimpleEscapeNode)component).TypeToken.VirtualChars[0];
                    switch (ch)
                    {
-                        case 'a': ch = '\u0007'; break;
+                        case 'a': ch = '\u0007'; break; // bell
-                        case 'b': ch = '\b'; break;
+                        case 'b': ch = '\b'; break;     // backspace
-                        case 'e': ch = '\u001B'; break;
+                        case 'e': ch = '\u001B'; break; // escape
-                        case 'f': ch = '\f'; break;
+                        case 'f': ch = '\f'; break;     // form feed
-                        case 'n': ch = '\n'; break;
+                        case 'n': ch = '\n'; break;     // new line
-                        case 'r': ch = '\r'; break;
+                        case 'r': ch = '\r'; break;     // carriage return
-                        case 't': ch = '\t'; break;
+                        case 't': ch = '\t'; break;     // tab
-                        case 'v': ch = '\u000B'; break;
+                        case 'v': ch = '\u000B'; break; // vertical tab
                    }
                    return true;
@@ -1889,6 +1897,21 @@ private RegexControlEscapeNode ParseControlEscape(RegexToken backslashToken, boo
            unchecked
            {
+                // From: https://github.com/dotnet/corefx/blob/80e220fc7009de0f0611ee6b52d4d5ffd25eb6c7/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs#L1450
+                // Note: Roslyn accepts a control escape that the current .Net parser does not.
+                // Specifically: \c[
+                //
+                // It is a bug that the .Net parser does not support this construct.  The bug was
+                // reported at: https://github.com/dotnet/corefx/issues/26501 and was fixed for
+                // CoreFx with https://github.com/dotnet/corefx/commit/80e220fc7009de0f0611ee6b52d4d5ffd25eb6c7
+                //
+                // Because it was a bug, we follow the correct behavior.  That means we will not
+                // report a diagnostic for a Regex that someone might run on a previous version of
+                // .Net that ends up throwing at runtime.  That's acceptable.  Our goal is to match
+                // the latest .Net 'correct' behavior.  Not intermediary points with bugs that have
+                // since been fixed.
                // \ca interpreted as \cA
                if (ch >= 'a' && ch <= 'z')
                {