From 2fb306e0b0788cc1e8d06e74afa3445ea59897ab Mon Sep 17 00:00:00 2001 From: Cyrus Najmabadi Date: Sun, 31 Dec 2017 10:55:54 -0800 Subject: [PATCH] Initial work on a regex classifier. --- ...rpValidateRegexStringDiagnosticAnalyzer.cs | 9 +- ...ctValidateRegexStringDiagnosticAnalyzer.cs | 233 +++--------------- .../RegexPatternTokenClassifier.cs | 60 +++++ .../CSharpSemanticFactsService.cs | 3 + .../RegexPatternDetector.cs | 80 +++--- .../VisualBasicSemanticFactsService.vb | 4 + 6 files changed, 149 insertions(+), 240 deletions(-) create mode 100644 src/Workspaces/CSharp/Portable/Classification/SyntaxClassification/RegexPatternTokenClassifier.cs rename src/{Features/Core/Portable/ValidateRegexString => Workspaces/Core/Portable/RegularExpressions}/RegexPatternDetector.cs (73%) diff --git a/src/Features/CSharp/Portable/ValidateRegexString/CSharpValidateRegexStringDiagnosticAnalyzer.cs b/src/Features/CSharp/Portable/ValidateRegexString/CSharpValidateRegexStringDiagnosticAnalyzer.cs index 8dad7036c44..18e0b8fd05b 100644 --- a/src/Features/CSharp/Portable/ValidateRegexString/CSharpValidateRegexStringDiagnosticAnalyzer.cs +++ b/src/Features/CSharp/Portable/ValidateRegexString/CSharpValidateRegexStringDiagnosticAnalyzer.cs @@ -1,9 +1,6 @@ // Copyright (c) Microsoft. All Rights Reserved. Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information. -using System.Threading; -using Microsoft.CodeAnalysis.CSharp.Extensions; using Microsoft.CodeAnalysis.CSharp.RegularExpressions; -using Microsoft.CodeAnalysis.CSharp.Syntax; using Microsoft.CodeAnalysis.Diagnostics; using Microsoft.CodeAnalysis.LanguageServices; using Microsoft.CodeAnalysis.RegularExpressions; @@ -19,12 +16,12 @@ public CSharpValidateRegexStringDiagnosticAnalyzer() { } - protected override IParameterSymbol DetermineParameter(SemanticModel semanticModel, SyntaxNode argumentNode, CancellationToken cancellationToken) - => ((ArgumentSyntax)argumentNode).DetermineParameter(semanticModel, allowParams: false, cancellationToken); - protected override ISyntaxFactsService GetSyntaxFactsService() => CSharpSyntaxFactsService.Instance; + protected override ISemanticFactsService GetSemanticFactsService() + => CSharpSemanticFactsService.Instance; + protected override IVirtualCharService GetVirtualCharService() => CSharpVirtualCharService.Instance; } diff --git a/src/Features/Core/Portable/ValidateRegexString/AbstractValidateRegexStringDiagnosticAnalyzer.cs b/src/Features/Core/Portable/ValidateRegexString/AbstractValidateRegexStringDiagnosticAnalyzer.cs index 49ad10f2178..b5951b503f1 100644 --- a/src/Features/Core/Portable/ValidateRegexString/AbstractValidateRegexStringDiagnosticAnalyzer.cs +++ b/src/Features/Core/Portable/ValidateRegexString/AbstractValidateRegexStringDiagnosticAnalyzer.cs @@ -1,18 +1,11 @@ // Copyright (c) Microsoft. All Rights Reserved. Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information. -using System; -using System.Collections.Generic; -using System.Collections.Immutable; -using System.Diagnostics; -using System.Linq; using System.Text.RegularExpressions; using System.Threading; using Microsoft.CodeAnalysis.CodeStyle; using Microsoft.CodeAnalysis.Diagnostics; using Microsoft.CodeAnalysis.LanguageServices; using Microsoft.CodeAnalysis.RegularExpressions; -using Microsoft.CodeAnalysis.Shared.Extensions; -using Roslyn.Utilities; namespace Microsoft.CodeAnalysis.ValidateRegexString { @@ -20,7 +13,6 @@ internal abstract class AbstractValidateRegexStringDiagnosticAnalyzer false; protected abstract ISyntaxFactsService GetSyntaxFactsService(); + protected abstract ISemanticFactsService GetSemanticFactsService(); protected abstract IVirtualCharService GetVirtualCharService(); - protected abstract IParameterSymbol DetermineParameter(SemanticModel semanticModel, SyntaxNode argumentNode, CancellationToken cancellationToken); - protected override void InitializeWorker(AnalysisContext context) => context.RegisterSemanticModelAction(AnalyzeSemanticModel); @@ -62,221 +53,63 @@ private void AnalyzeSemanticModel(SemanticModelAnalysisContext context) return; } - var regexType = semanticModel.Compilation.GetTypeByMetadataName(typeof(Regex).FullName); - if (regexType == null) + var detector = RegexPatternDetector.TryCreate( + semanticModel, GetSyntaxFactsService(), GetSemanticFactsService()); + if (detector == null) { return; } - var syntaxFacts = GetSyntaxFactsService(); - var methodNamesOfInterest = GetMethodNamesOfInterest(regexType, syntaxFacts); - var root = syntaxTree.GetRoot(cancellationToken); - - var analyzer = new Analyzer(this, context, regexType, methodNamesOfInterest); - analyzer.Analyze(root); - } - - private HashSet GetMethodNamesOfInterest(INamedTypeSymbol regexType, ISyntaxFactsService syntaxFacts) - { - var result = syntaxFacts.IsCaseSensitive - ? new HashSet() - : new HashSet(StringComparer.OrdinalIgnoreCase); - - var methods = from method in regexType.GetMembers().OfType() - where method.DeclaredAccessibility == Accessibility.Public - where method.IsStatic - where method.Parameters.Any(p => p.Name == _patternName) - select method.Name; - - result.AddRange(methods); - - return result; + Analyze(context, detector, root, cancellationToken); } - private struct Analyzer + private void Analyze( + SemanticModelAnalysisContext context, RegexPatternDetector detector, + SyntaxNode node, CancellationToken cancellationToken) { - private readonly AbstractValidateRegexStringDiagnosticAnalyzer _analyzer; - private readonly SemanticModelAnalysisContext _context; - private readonly SemanticModel _semanticModel; - private readonly ISyntaxFactsService _syntaxFacts; - private readonly INamedTypeSymbol _regexType; - private readonly HashSet _methodNamesOfInterest; - private readonly CancellationToken _cancellationToken; + cancellationToken.ThrowIfCancellationRequested(); - public Analyzer( - AbstractValidateRegexStringDiagnosticAnalyzer analyzer, - SemanticModelAnalysisContext context, INamedTypeSymbol regexType, - HashSet methodNamesOfInterest) + foreach (var child in node.ChildNodesAndTokens()) { - _analyzer = analyzer; - _context = context; - _semanticModel = context.SemanticModel; - _syntaxFacts = analyzer.GetSyntaxFactsService(); - _regexType = regexType; - _methodNamesOfInterest = methodNamesOfInterest; - _cancellationToken = context.CancellationToken; - } - - public void Analyze(SyntaxNode node) - { - _cancellationToken.ThrowIfCancellationRequested(); - - foreach (var child in node.ChildNodesAndTokens()) + if (child.IsNode) { - if (child.IsNode) - { - Analyze(child.AsNode()); - } - else - { - var token = child.AsToken(); - if (token.RawKind == _analyzer._stringLiteralKind) - { - AnalyzeStringLiteral(token); - } - } + Analyze(context, detector, child.AsNode(), cancellationToken); } - } - - private void AnalyzeStringLiteral(SyntaxToken stringLiteral) - { - var literalNode = stringLiteral.Parent; - var argumentNode = literalNode.Parent; - if (!_syntaxFacts.IsArgument(argumentNode)) - { - return; - } - - var argumentList = argumentNode.Parent; - var invocationOrCreation = argumentList.Parent; - if (_syntaxFacts.IsInvocationExpression(invocationOrCreation)) - { - var invokedExpression = _syntaxFacts.GetExpressionOfInvocationExpression(invocationOrCreation); - var name = GetNameOfInvokedExpression(invokedExpression); - if (!_methodNamesOfInterest.Contains(name)) - { - return; - } - - // Is a string argument to a method that looks like it could be a Regex method. - // Need to do deeper analysis - var method = _semanticModel.GetSymbolInfo(invocationOrCreation, _cancellationToken).GetAnySymbol(); - if (method?.ContainingType != _regexType) - { - return; - } - - AnalyzeStringLiteral(stringLiteral, argumentNode); - } - else if (_syntaxFacts.IsObjectCreationExpression(invocationOrCreation)) + else { - var typeNode = _syntaxFacts.GetObjectCreationType(invocationOrCreation); - var name = GetNameOfType(typeNode, _syntaxFacts); - if (name == null) + var token = child.AsToken(); + if (token.RawKind == _stringLiteralKind && + detector.IsRegexPattern(token, cancellationToken, out var options)) { - return; + AnalyzePattern(context, token, options); } - - if (_syntaxFacts.StringComparer.Compare(nameof(Regex), name) != 0) - { - return; - } - - // Argument to "new Regex". Need to do deeper analysis - AnalyzeStringLiteral(stringLiteral, argumentNode); - } - else - { - return; - } - } - - private void AnalyzeStringLiteral(SyntaxToken stringLiteral, SyntaxNode argumentNode) - { - var parameter = _analyzer.DetermineParameter(_semanticModel, argumentNode, _cancellationToken); - if (parameter?.Name != _patternName) - { - return; - } - - var options = GetRegexOptions(argumentNode); - - var service = _analyzer.GetVirtualCharService(); - if (service == null) - { - return; - } - - var virtualChars = service.TryConvertToVirtualChars(stringLiteral); - if (virtualChars.IsDefaultOrEmpty) - { - return; - } - - var tree = RegexParser.Parse(virtualChars, options); - foreach (var diag in tree.Diagnostics) - { - _context.ReportDiagnostic(Diagnostic.Create( - _analyzer.GetDescriptorWithSeverity(DiagnosticSeverity.Warning), - Location.Create(_semanticModel.SyntaxTree, diag.Span), - diag.Message)); } } + } - private RegexOptions GetRegexOptions(SyntaxNode argumentNode) + private void AnalyzePattern( + SemanticModelAnalysisContext context, SyntaxToken stringLiteral, RegexOptions options) + { + var service = this.GetVirtualCharService(); + if (service == null) { - var argumentList = argumentNode.Parent; - var arguments = _syntaxFacts.GetArgumentsOfArgumentList(argumentList); - foreach (var siblingArg in arguments) - { - if (siblingArg != argumentNode) - { - var expr = _syntaxFacts.GetExpressionOfArgument(siblingArg); - if (expr != null) - { - var exprType = _semanticModel.GetTypeInfo(expr, _cancellationToken); - if (exprType.Type?.Name == nameof(RegexOptions)) - { - var constVal = _semanticModel.GetConstantValue(expr, _cancellationToken); - if (constVal.HasValue) - { - return (RegexOptions)(int)constVal.Value; - } - } - } - } - } - - return RegexOptions.None; + return; } - private string GetNameOfType(SyntaxNode typeNode, ISyntaxFactsService syntaxFacts) + var virtualChars = service.TryConvertToVirtualChars(stringLiteral); + if (virtualChars.IsDefaultOrEmpty) { - if (syntaxFacts.IsQualifiedName(typeNode)) - { - return GetNameOfType(syntaxFacts.GetRightSideOfDot(typeNode), syntaxFacts); - } - else if (syntaxFacts.IsIdentifierName(typeNode)) - { - return syntaxFacts.GetIdentifierOfSimpleName(typeNode).ValueText; - } - - return null; + return; } - private string GetNameOfInvokedExpression(SyntaxNode invokedExpression) + var tree = RegexParser.Parse(virtualChars, options); + foreach (var diag in tree.Diagnostics) { - if (_syntaxFacts.IsSimpleMemberAccessExpression(invokedExpression)) - { - return _syntaxFacts.GetIdentifierOfSimpleName(_syntaxFacts.GetNameOfMemberAccessExpression(invokedExpression)).ValueText; - } - else if (_syntaxFacts.IsIdentifierName(invokedExpression)) - { - return _syntaxFacts.GetIdentifierOfSimpleName(invokedExpression).ValueText; - } - - return null; + context.ReportDiagnostic(Diagnostic.Create( + this.GetDescriptorWithSeverity(DiagnosticSeverity.Warning), + Location.Create(context.SemanticModel.SyntaxTree, diag.Span), + diag.Message)); } } } diff --git a/src/Workspaces/CSharp/Portable/Classification/SyntaxClassification/RegexPatternTokenClassifier.cs b/src/Workspaces/CSharp/Portable/Classification/SyntaxClassification/RegexPatternTokenClassifier.cs new file mode 100644 index 00000000000..68767e373a4 --- /dev/null +++ b/src/Workspaces/CSharp/Portable/Classification/SyntaxClassification/RegexPatternTokenClassifier.cs @@ -0,0 +1,60 @@ +// Copyright (c) Microsoft. All Rights Reserved. Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information. + +using System; +using System.Collections.Immutable; +using System.Diagnostics; +using System.Runtime.CompilerServices; +using System.Threading; +using Microsoft.CodeAnalysis.Classification; +using Microsoft.CodeAnalysis.CSharp.RegularExpressions; +using Microsoft.CodeAnalysis.PooledObjects; +using Microsoft.CodeAnalysis.RegularExpressions; + +namespace Microsoft.CodeAnalysis.CSharp.Classification.Classifiers +{ + internal class RegexPatternTokenClassifier : AbstractSyntaxClassifier + { + private static readonly ConditionalWeakTable _modelToDetector = + new ConditionalWeakTable(); + + public override ImmutableArray SyntaxTokenKinds { get; } = ImmutableArray.Create((int)SyntaxKind.StringLiteralToken); + + public override void AddClassifications(SyntaxToken token, SemanticModel semanticModel, ArrayBuilder result, CancellationToken cancellationToken) + { + Debug.Assert(token.Kind() == SyntaxKind.StringLiteralToken); + + // Do some quick syntactic checks before doing any complex work. + if (RegexPatternDetector.IsDefinitelyNotPattern(token, CSharpSyntaxFactsService.Instance)) + { + return; + } + + // Looks like it could be a regex pattern. Do more complex check. + // Cache the detector we create, so we don't have to continually do + // the same semantic work for every string literal token we visit. + var detector = _modelToDetector.GetValue( + semanticModel, m => RegexPatternDetector.TryCreate( + m, CSharpSyntaxFactsService.Instance, CSharpSemanticFactsService.Instance)); + + if (!detector.IsRegexPattern(token, cancellationToken, out var options)) + { + return; + } + + var virtualCharService = CSharpVirtualCharService.Instance; + var chars = virtualCharService.TryConvertToVirtualChars(token); + if (chars.IsDefaultOrEmpty) + { + return; + } + + var tree = RegexParser.Parse(chars, options); + AddClassifications(tree, result); + } + + private void AddClassifications(RegexTree tree, ArrayBuilder result) + { + throw new NotImplementedException(); + } + } +} diff --git a/src/Workspaces/CSharp/Portable/LanguageServices/CSharpSemanticFactsService.cs b/src/Workspaces/CSharp/Portable/LanguageServices/CSharpSemanticFactsService.cs index 274a3aff5ff..dd3c0bec1ed 100644 --- a/src/Workspaces/CSharp/Portable/LanguageServices/CSharpSemanticFactsService.cs +++ b/src/Workspaces/CSharp/Portable/LanguageServices/CSharpSemanticFactsService.cs @@ -296,5 +296,8 @@ public bool IsPartial(ITypeSymbol typeSymbol, CancellationToken cancellationToke return SpecializedCollections.SingletonEnumerable( semanticModel.GetDeclaredSymbol(memberDeclaration, cancellationToken)); } + + public IParameterSymbol FindParameterForArgument(SemanticModel semanticModel, SyntaxNode argumentNode, CancellationToken cancellationToken) + => ((ArgumentSyntax)argumentNode).DetermineParameter(semanticModel, allowParams: false, cancellationToken); } } diff --git a/src/Features/Core/Portable/ValidateRegexString/RegexPatternDetector.cs b/src/Workspaces/Core/Portable/RegularExpressions/RegexPatternDetector.cs similarity index 73% rename from src/Features/Core/Portable/ValidateRegexString/RegexPatternDetector.cs rename to src/Workspaces/Core/Portable/RegularExpressions/RegexPatternDetector.cs index 27109547486..c6cae834b3e 100644 --- a/src/Features/Core/Portable/ValidateRegexString/RegexPatternDetector.cs +++ b/src/Workspaces/Core/Portable/RegularExpressions/RegexPatternDetector.cs @@ -4,16 +4,17 @@ using System.Collections.Generic; using System.Diagnostics; using System.Linq; -using System.Text; using System.Text.RegularExpressions; using System.Threading; using Microsoft.CodeAnalysis.LanguageServices; -using Microsoft.CodeAnalysis.RegularExpressions; using Microsoft.CodeAnalysis.Shared.Extensions; using Roslyn.Utilities; -namespace Microsoft.CodeAnalysis.ValidateRegexString +namespace Microsoft.CodeAnalysis.RegularExpressions { + /// + /// Helper class to detect regex pattern tokens in a document efficiently. + /// internal class RegexPatternDetector { private const string _patternName = "pattern"; @@ -23,29 +24,25 @@ internal class RegexPatternDetector private readonly ISemanticFactsService _semanticFacts; private readonly INamedTypeSymbol _regexType; private readonly HashSet _methodNamesOfInterest; - private readonly CancellationToken _cancellationToken; public RegexPatternDetector( SemanticModel semanticModel, ISyntaxFactsService syntaxFacts, ISemanticFactsService semanticFacts, INamedTypeSymbol regexType, - HashSet methodNamesOfInterest, - CancellationToken cancellationToken) + HashSet methodNamesOfInterest) { _semanticModel = semanticModel; _syntaxFacts = syntaxFacts; _semanticFacts = semanticFacts; _regexType = regexType; _methodNamesOfInterest = methodNamesOfInterest; - _cancellationToken = cancellationToken; } public static RegexPatternDetector TryCreate( SemanticModel semanticModel, ISyntaxFactsService syntaxFacts, - ISemanticFactsService semanticFacts, - CancellationToken cancellationToken) + ISemanticFactsService semanticFacts) { var regexType = semanticModel.Compilation.GetTypeByMetadataName(typeof(Regex).FullName); if (regexType == null) @@ -56,7 +53,22 @@ internal class RegexPatternDetector var methodNamesOfInterest = GetMethodNamesOfInterest(regexType, syntaxFacts); return new RegexPatternDetector( semanticModel, syntaxFacts, semanticFacts, - regexType, methodNamesOfInterest, cancellationToken); + regexType, methodNamesOfInterest); + } + + public static bool IsDefinitelyNotPattern(SyntaxToken token, ISyntaxFactsService syntaxFacts) + { + // We only support string literals passed in arguments to something. + // In the future we could support any string literal, as long as it has + // some marker (like a comment on it) stating it's a regex. + if (!syntaxFacts.IsStringLiteral(token) || + !syntaxFacts.IsLiteralExpression(token.Parent) || + !syntaxFacts.IsArgument(token.Parent.Parent)) + { + return true; + } + + return false; } private static HashSet GetMethodNamesOfInterest(INamedTypeSymbol regexType, ISyntaxFactsService syntaxFacts) @@ -76,27 +88,18 @@ where method.Parameters.Any(p => p.Name == _patternName) return result; } - public bool IsRegexPattern(SyntaxToken token, out RegexOptions options) + public bool IsRegexPattern(SyntaxToken token, CancellationToken cancellationToken, out RegexOptions options) { options = default; - if (!_syntaxFacts.IsStringLiteral(token)) + if (IsDefinitelyNotPattern(token, _syntaxFacts)) { return false; } - return Analyze(token, out options); - } - - private bool Analyze(SyntaxToken stringLiteral, out RegexOptions options) - { - options = default; - + var stringLiteral = token; var literalNode = stringLiteral.Parent; var argumentNode = literalNode.Parent; - if (!_syntaxFacts.IsArgument(argumentNode)) - { - return false; - } + Debug.Assert(_syntaxFacts.IsArgument(argumentNode)); var argumentList = argumentNode.Parent; var invocationOrCreation = argumentList.Parent; @@ -108,10 +111,13 @@ private bool Analyze(SyntaxToken stringLiteral, out RegexOptions options) { // Is a string argument to a method that looks like it could be a Regex method. // Need to do deeper analysis - var method = _semanticModel.GetSymbolInfo(invocationOrCreation, _cancellationToken).GetAnySymbol(); - if (method?.ContainingType == _regexType) + var method = _semanticModel.GetSymbolInfo(invocationOrCreation, cancellationToken).GetAnySymbol(); + if (method.DeclaredAccessibility == Accessibility.Public && + method.IsStatic && + _regexType.Equals(method?.ContainingType)) { - return AnalyzeStringLiteral(stringLiteral, argumentNode, out options); + return AnalyzeStringLiteral( + stringLiteral, argumentNode, cancellationToken, out options); } } } @@ -123,8 +129,13 @@ private bool Analyze(SyntaxToken stringLiteral, out RegexOptions options) { if (_syntaxFacts.StringComparer.Compare(nameof(Regex), name) == 0) { - // Argument to "new Regex". Need to do deeper analysis - return AnalyzeStringLiteral(stringLiteral, argumentNode, out options); + var typeSymbol = _semanticModel.GetTypeInfo(typeNode, cancellationToken).Type; + if (_regexType.Equals(typeSymbol)) + { + // Argument to "new Regex". Need to do deeper analysis + return AnalyzeStringLiteral( + stringLiteral, argumentNode, cancellationToken, out options); + } } } } @@ -133,21 +144,22 @@ private bool Analyze(SyntaxToken stringLiteral, out RegexOptions options) } private bool AnalyzeStringLiteral( - SyntaxToken stringLiteral, SyntaxNode argumentNode, out RegexOptions options) + SyntaxToken stringLiteral, SyntaxNode argumentNode, + CancellationToken cancellationToken, out RegexOptions options) { options = default; - var parameter = _semanticFacts.FindParameterForArgument(_semanticModel, argumentNode, _cancellationToken); + var parameter = _semanticFacts.FindParameterForArgument(_semanticModel, argumentNode, cancellationToken); if (parameter?.Name != _patternName) { return false; } - options = GetRegexOptions(argumentNode); + options = GetRegexOptions(argumentNode, cancellationToken); return true; } - private RegexOptions GetRegexOptions(SyntaxNode argumentNode) + private RegexOptions GetRegexOptions(SyntaxNode argumentNode, CancellationToken cancellationToken) { var argumentList = argumentNode.Parent; var arguments = _syntaxFacts.GetArgumentsOfArgumentList(argumentList); @@ -158,10 +170,10 @@ private RegexOptions GetRegexOptions(SyntaxNode argumentNode) var expr = _syntaxFacts.GetExpressionOfArgument(siblingArg); if (expr != null) { - var exprType = _semanticModel.GetTypeInfo(expr, _cancellationToken); + var exprType = _semanticModel.GetTypeInfo(expr, cancellationToken); if (exprType.Type?.Name == nameof(RegexOptions)) { - var constVal = _semanticModel.GetConstantValue(expr, _cancellationToken); + var constVal = _semanticModel.GetConstantValue(expr, cancellationToken); if (constVal.HasValue) { return (RegexOptions)(int)constVal.Value; diff --git a/src/Workspaces/VisualBasic/Portable/LanguageServices/VisualBasicSemanticFactsService.vb b/src/Workspaces/VisualBasic/Portable/LanguageServices/VisualBasicSemanticFactsService.vb index d8eba0fef3f..a6d69f2d134 100644 --- a/src/Workspaces/VisualBasic/Portable/LanguageServices/VisualBasicSemanticFactsService.vb +++ b/src/Workspaces/VisualBasic/Portable/LanguageServices/VisualBasicSemanticFactsService.vb @@ -290,5 +290,9 @@ Namespace Microsoft.CodeAnalysis.VisualBasic Return {semanticModel.GetDeclaredSymbol(memberDeclaration, cancellationToken)} End Function + + Public Function FindParameterForArgument(semanticModel As SemanticModel, argumentNode As SyntaxNode, cancellationToken As CancellationToken) As IParameterSymbol Implements ISemanticFactsService.FindParameterForArgument + Return DirectCast(argumentNode, ArgumentSyntax).DetermineParameter(semanticModel, allowParamArray:=False, cancellationToken) + End Function End Class End Namespace -- GitLab