提交 b91400c4 编写于 作者: C CyrusNajmabadi

Merge pull request #7390 from CyrusNajmabadi/fuzzyMatchingIndex

Provide a faster implementation of Fuzzy Searching for 'Add Using'.
......@@ -80,9 +80,8 @@ public async Task TestGenericWithWrongArgs1()
[WpfFact, Trait(Traits.Feature, Traits.Features.CodeActionsAddUsing)]
public async Task TestGenericWithWrongArgs2()
{
await TestAsync(
@"class Class { [|List<int,string>|] Method() { Foo(); } }",
@"using System.Collections.Generic; class Class { SortedList<int,string> Method() { Foo(); } }");
await TestMissingAsync(
@"class Class { [|List<int,string>|] Method() { Foo(); } }");
}
[WpfFact, Trait(Traits.Feature, Traits.Features.CodeActionsAddUsing)]
......
......@@ -166,9 +166,8 @@ NewLines("Class Foo \n Function F() As [|List(Of Integer, String, Boolean)|] \n
<WpfFact, Trait(Traits.Feature, Traits.Features.CodeActionsAddImport)>
Public Async Function TestGenericWithWrongArgs2() As Task
Await TestAsync(
NewLines("Class Foo \n Function F() As [|List(Of Integer, String)|] \n End Function \n End Class"),
NewLines("Imports System.Collections.Generic \n Class Foo \n Function F() As SortedList(Of Integer, String) \n End Function \n End Class"))
Await TestMissingAsync(
NewLines("Class Foo \n Function F() As [|List(Of Integer, String)|] \n End Function \n End Class"))
End Function
<WpfFact, Trait(Traits.Feature, Traits.Features.CodeActionsAddImport)>
......
......@@ -34,7 +34,7 @@ public async Task<IEnumerable<SearchResult<ISymbol>>> FindDeclarationsAsync(stri
return SpecializedCollections.EmptyEnumerable<SearchResult<ISymbol>>();
}
var query = this.Exact ? new SearchQuery(name, ignoreCase: true) : new SearchQuery(GetInexactPredicate(name));
var query = this.Exact ? SearchQuery.Create(name, ignoreCase: true) : SearchQuery.CreateFuzzy(name);
var symbols = await FindDeclarationsAsync(name, filter, query).ConfigureAwait(false);
if (Exact)
......@@ -47,27 +47,17 @@ public async Task<IEnumerable<SearchResult<ISymbol>>> FindDeclarationsAsync(stri
// TODO(cyrusn): It's a shame we have to compute this twice. However, there's no
// great way to store the original value we compute because it happens deep in the
// compiler bowels when we call FindDeclarations.
return symbols.Select(s =>
using (var similarityChecker = new WordSimilarityChecker(name))
{
double matchCost;
var isCloseMatch = EditDistance.IsCloseMatch(name, s.Name, out matchCost);
Debug.Assert(isCloseMatch);
return SearchResult.Create(s.Name, nameNode, s, matchCost);
}).ToList();
}
private Func<string, bool> GetInexactPredicate(string name)
{
// Create the edit distance object outside of the lambda That way we only create it
// once and it can cache all the information it needs while it does the IsCloseMatch
// check against all the possible candidates.
var editDistance = new EditDistance(name);
return n =>
{
double matchCost;
return editDistance.IsCloseMatch(n, out matchCost);
};
return symbols.Select(s =>
{
double matchCost;
var areSimilar = similarityChecker.AreSimilar(s.Name, out matchCost);
Debug.Assert(areSimilar);
return SearchResult.Create(s.Name, nameNode, s, matchCost);
}).ToList();
}
}
}
......
......@@ -63,7 +63,7 @@ private async Task CreateSpellCheckCodeIssueAsync(CodeFixContext context, TSimpl
var onlyConsiderGenerics = IsGeneric(nameNode);
var results = new MultiDictionary<double, string>();
using (var editDistance = new EditDistance(nameText))
using (var similarityChecker = new WordSimilarityChecker(nameText))
{
foreach (var item in completionList.Items)
{
......@@ -74,7 +74,7 @@ private async Task CreateSpellCheckCodeIssueAsync(CodeFixContext context, TSimpl
var candidateText = item.FilterText;
double matchCost;
if (!editDistance.IsCloseMatch(candidateText, out matchCost))
if (!similarityChecker.AreSimilar(candidateText, out matchCost))
{
continue;
}
......
......@@ -11,37 +11,101 @@
namespace Microsoft.CodeAnalysis.FindSymbols
{
// Search query parameters.
internal struct SearchQuery
internal enum SearchKind
{
// The predicate for matching names. Never null.
public readonly Func<string, bool> Predicate;
/// <summary>
/// Use a case-sensitive comparison when searching for matching items.
/// </summary>
Exact,
/// <summary>
/// Use a case-insensitive comparison when searching for matching items.
/// </summary>
ExactIgnoreCase,
// The name being searched for may be null in some cases. But can be used for faster
// index based searching if it is provided.
/// <summary>
/// Use a fuzzy comparison when searching for matching items. Fuzzy matching allows for
/// a certain amount of misspellings, missing words, etc. See <see cref="SpellChecker"/> for
/// more details.
/// </summary>
Fuzzy,
/// <summary>
/// Search term is matched in a custom manner (i.e. with a user provided predicate).
/// </summary>
Custom
}
internal class SearchQuery
{
/// <summary>The name being searched for. Is null in the case of custom predicate searching, but
/// can be used for faster index-based searching when it is available.</summary>
public readonly string Name;
public readonly bool IgnoreCase;
public SearchQuery(string name, bool ignoreCase):
this(n => ignoreCase ? CaseInsensitiveComparison.Comparer.Equals(name, n) : StringComparer.Ordinal.Equals(name, n))
///<summary>The kind of search this is. Faster index-based searching can be used if the
/// SearchKind is not <see cref="SearchKind.Custom"/>.</summary>
public readonly SearchKind Kind;
///<summary>The predicate to fall back on if faster index searching is not possible.</summary>
private readonly Func<string, bool> _predicate;
private SearchQuery(string name, SearchKind kind)
{
if (name == null)
{
throw new ArgumentNullException(nameof(name));
}
this.Name = name;
this.IgnoreCase = ignoreCase;
Name = name;
Kind = kind;
switch (kind)
{
case SearchKind.Exact:
_predicate = s => StringComparer.Ordinal.Equals(name, s);
break;
case SearchKind.ExactIgnoreCase:
_predicate = s => CaseInsensitiveComparison.Comparer.Equals(name, s);
break;
case SearchKind.Fuzzy:
// Create a single WordSimilarityChecker and capture a delegate reference to
// its 'AreSimilar' method. That way we only create the WordSimilarityChecker
// once and it can cache all the information it needs while it does the AreSimilar
// check against all the possible candidates.
var editDistance = new WordSimilarityChecker(name);
_predicate = editDistance.AreSimilar;
break;
}
}
/// <summary>
/// Creates a query that matches candidate names with a caller-supplied predicate.
/// <see cref="Name"/> is left null for this kind of query.
/// </summary>
/// <param name="predicate">The predicate used to test each candidate name. Must not be null.</param>
/// <exception cref="ArgumentNullException"><paramref name="predicate"/> is null.</exception>
private SearchQuery(Func<string, bool> predicate)
{
    if (predicate == null)
    {
        throw new ArgumentNullException(nameof(predicate));
    }

    // Kind has to be assigned explicitly. Left unset it would hold the enum's default value,
    // SearchKind.Exact, and consumers that switch on Kind would treat this as an exact-name
    // query even though Name is null.
    Kind = SearchKind.Custom;
    _predicate = predicate;
}
/// <summary>
/// Creates a query that matches <paramref name="name"/> exactly, either case-sensitively
/// or case-insensitively depending on <paramref name="ignoreCase"/>.
/// </summary>
public static SearchQuery Create(string name, bool ignoreCase)
{
    var kind = ignoreCase ? SearchKind.ExactIgnoreCase : SearchKind.Exact;
    return new SearchQuery(name, kind);
}
/// <summary>
/// Creates a query of kind <see cref="SearchKind.Fuzzy"/> for <paramref name="name"/>.
/// </summary>
public static SearchQuery CreateFuzzy(string name) => new SearchQuery(name, SearchKind.Fuzzy);
/// <summary>
/// Creates a query whose matching is delegated entirely to <paramref name="predicate"/>.
/// </summary>
public static SearchQuery CreateCustom(Func<string, bool> predicate) => new SearchQuery(predicate);
/// <summary>
/// Returns the name-matching predicate stored by this query's constructor.
/// </summary>
public Func<string, bool> GetPredicate() => _predicate;
}
......@@ -62,7 +126,7 @@ public static Task<IEnumerable<ISymbol>> FindDeclarationsAsync(Project project,
return SpecializedTasks.EmptyEnumerable<ISymbol>();
}
return FindDeclarationsAsync(project, new SearchQuery(name, ignoreCase), includeDirectReferences: true, cancellationToken: cancellationToken);
return FindDeclarationsAsync(project, SearchQuery.Create(name, ignoreCase), includeDirectReferences: true, cancellationToken: cancellationToken);
}
internal static Task<IEnumerable<ISymbol>> FindDeclarationsAsync(
......@@ -87,7 +151,7 @@ public static Task<IEnumerable<ISymbol>> FindDeclarationsAsync(Project project,
return SpecializedTasks.EmptyEnumerable<ISymbol>();
}
return FindDeclarationsAsync(project, new SearchQuery(name, ignoreCase), filter, includeDirectReferences: true, cancellationToken: cancellationToken);
return FindDeclarationsAsync(project, SearchQuery.Create(name, ignoreCase), filter, includeDirectReferences: true, cancellationToken: cancellationToken);
}
internal static Task<IEnumerable<ISymbol>> FindDeclarationsAsync(
......@@ -132,7 +196,7 @@ public static Task<IEnumerable<ISymbol>> FindDeclarationsAsync(Project project,
else
{
await AddDeclarationsAsync(
project.Solution, assembly, GetMetadataReferenceFilePath(compilation.GetMetadataReference(assembly)),
project.Solution, assembly, GetMetadataReferenceFilePath(compilation.GetMetadataReference(assembly)),
query, criteria, list, cancellationToken).ConfigureAwait(false);
}
}
......@@ -169,9 +233,9 @@ private static IEnumerable<ISymbol> TranslateNamespaces(List<ISymbol> symbols, C
Project project, SearchQuery query, SymbolFilter filter, List<ISymbol> list, CancellationToken cancellationToken)
{
await AddDeclarationsAsync(
project, query, filter, list,
startingCompilation: null,
startingAssembly: null,
project, query, filter, list,
startingCompilation: null,
startingAssembly: null,
cancellationToken: cancellationToken).ConfigureAwait(false);
}
......@@ -187,7 +251,7 @@ private static IEnumerable<ISymbol> TranslateNamespaces(List<ISymbol> symbols, C
using (Logger.LogBlock(FunctionId.SymbolFinder_Project_AddDeclarationsAsync, cancellationToken))
using (var set = SharedPools.Default<HashSet<ISymbol>>().GetPooledObject())
{
if (!await project.ContainsSymbolsWithNameAsync(query.Predicate, filter, cancellationToken).ConfigureAwait(false))
if (!await project.ContainsSymbolsWithNameAsync(query.GetPredicate(), filter, cancellationToken).ConfigureAwait(false))
{
return;
}
......@@ -197,12 +261,12 @@ private static IEnumerable<ISymbol> TranslateNamespaces(List<ISymbol> symbols, C
{
// Return symbols from skeleton assembly in this case so that symbols have the same language as startingCompilation.
list.AddRange(
FilterByCriteria(compilation.GetSymbolsWithName(query.Predicate, filter, cancellationToken), filter)
FilterByCriteria(compilation.GetSymbolsWithName(query.GetPredicate(), filter, cancellationToken), filter)
.Select(s => s.GetSymbolKey().Resolve(startingCompilation, cancellationToken: cancellationToken).Symbol).WhereNotNull());
}
else
{
list.AddRange(FilterByCriteria(compilation.GetSymbolsWithName(query.Predicate, filter, cancellationToken), filter));
list.AddRange(FilterByCriteria(compilation.GetSymbolsWithName(query.GetPredicate(), filter, cancellationToken), filter));
}
}
}
......@@ -227,21 +291,28 @@ private static IEnumerable<ISymbol> TranslateNamespaces(List<ISymbol> symbols, C
{
var info = await SymbolTreeInfo.GetInfoForAssemblyAsync(solution, assembly, filePath, cancellationToken).ConfigureAwait(false);
// If the query has a specific string provided, then call into the SymbolTreeInfo
// helpers optimized for lookup based on an exact name.
if (query.Name != null)
{
if (info.HasSymbols(query.Name, query.IgnoreCase))
{
list.AddRange(FilterByCriteria(info.Find(assembly, query.Name, query.IgnoreCase, cancellationToken), filter));
}
}
else
{
list.AddRange(FilterByCriteria(Find(query, info, assembly, cancellationToken), filter));
}
}
private static IEnumerable<ISymbol> Find(SearchQuery query, SymbolTreeInfo info, IAssemblySymbol assembly, CancellationToken cancellationToken)
{
// If the query has a specific string provided, then call into the SymbolTreeInfo
// helpers optimized for lookup based on an exact name.
switch (query.Kind)
{
case SearchKind.Exact:
return info.Find(assembly, query.Name, ignoreCase: false, cancellationToken: cancellationToken);
case SearchKind.ExactIgnoreCase:
return info.Find(assembly, query.Name, ignoreCase: true, cancellationToken: cancellationToken);
case SearchKind.Fuzzy:
return info.FuzzyFind(assembly, query.Name, cancellationToken);
case SearchKind.Custom:
// Otherwise, we'll have to do a slow linear search over all possible symbols.
list.AddRange(FilterByCriteria(info.Find(assembly, query.Predicate, cancellationToken), filter));
}
return info.Find(assembly, query.GetPredicate(), cancellationToken);
}
throw new InvalidOperationException();
}
/// <summary>
......@@ -274,7 +345,7 @@ public static Task<IEnumerable<ISymbol>> FindSourceDeclarationsAsync(Solution so
using (Logger.LogBlock(FunctionId.SymbolFinder_Solution_Name_FindSourceDeclarationsAsync, cancellationToken))
{
return FindSourceDeclarationsAsyncImpl(solution, new SearchQuery(name, ignoreCase), filter, cancellationToken);
return FindSourceDeclarationsAsyncImpl(solution, SearchQuery.Create(name, ignoreCase), filter, cancellationToken);
}
}
......@@ -327,7 +398,7 @@ public static Task<IEnumerable<ISymbol>> FindSourceDeclarationsAsync(Project pro
using (Logger.LogBlock(FunctionId.SymbolFinder_Project_Name_FindSourceDeclarationsAsync, cancellationToken))
{
return FindSourceDeclarationsAsyncImpl(project, new SearchQuery(name, ignoreCase), filter, cancellationToken);
return FindSourceDeclarationsAsyncImpl(project, SearchQuery.Create(name, ignoreCase), filter, cancellationToken);
}
}
......@@ -352,7 +423,7 @@ public static Task<IEnumerable<ISymbol>> FindSourceDeclarationsAsync(Solution so
/// </summary>
public static Task<IEnumerable<ISymbol>> FindSourceDeclarationsAsync(Solution solution, Func<string, bool> predicate, SymbolFilter filter, CancellationToken cancellationToken = default(CancellationToken))
{
return FindSourceDeclarationsAsync(solution, new SearchQuery(predicate), filter, cancellationToken);
return FindSourceDeclarationsAsync(solution, SearchQuery.CreateCustom(predicate), filter, cancellationToken);
}
internal static async Task<IEnumerable<ISymbol>> FindSourceDeclarationsAsync(Solution solution, SearchQuery query, SymbolFilter filter, CancellationToken cancellationToken)
......@@ -384,7 +455,7 @@ internal static async Task<IEnumerable<ISymbol>> FindSourceDeclarationsAsync(Sol
/// <summary>
/// Find the symbols for declarations made in source with a matching name.
/// </summary>
public static Task<IEnumerable<ISymbol>> FindSourceDeclarationsAsync(Project project, Func<string,bool> predicate, CancellationToken cancellationToken = default(CancellationToken))
public static Task<IEnumerable<ISymbol>> FindSourceDeclarationsAsync(Project project, Func<string, bool> predicate, CancellationToken cancellationToken = default(CancellationToken))
{
return FindSourceDeclarationsAsync(project, predicate, SymbolFilter.All, cancellationToken);
}
......@@ -394,7 +465,7 @@ public static Task<IEnumerable<ISymbol>> FindSourceDeclarationsAsync(Project pro
/// </summary>
public static Task<IEnumerable<ISymbol>> FindSourceDeclarationsAsync(Project project, Func<string, bool> predicate, SymbolFilter filter, CancellationToken cancellationToken = default(CancellationToken))
{
return FindSourceDeclarationsAsync(project, new SearchQuery(predicate), filter, cancellationToken);
return FindSourceDeclarationsAsync(project, SearchQuery.CreateCustom(predicate), filter, cancellationToken);
}
internal static async Task<IEnumerable<ISymbol>> FindSourceDeclarationsAsync(Project project, SearchQuery query, SymbolFilter filter, CancellationToken cancellationToken)
......@@ -412,14 +483,14 @@ internal static async Task<IEnumerable<ISymbol>> FindSourceDeclarationsAsync(Pro
using (Logger.LogBlock(FunctionId.SymbolFinder_Project_Predicate_FindSourceDeclarationsAsync, cancellationToken))
{
var result = new List<ISymbol>();
if (!await project.ContainsSymbolsWithNameAsync(query.Predicate, filter, cancellationToken).ConfigureAwait(false))
if (!await project.ContainsSymbolsWithNameAsync(query.GetPredicate(), filter, cancellationToken).ConfigureAwait(false))
{
return result;
}
var compilation = await project.GetCompilationAsync(cancellationToken).ConfigureAwait(false);
result.AddRange(FilterByCriteria(compilation.GetSymbolsWithName(query.Predicate, filter, cancellationToken), filter));
result.AddRange(FilterByCriteria(compilation.GetSymbolsWithName(query.GetPredicate(), filter, cancellationToken), filter));
return result;
}
}
......@@ -473,4 +544,4 @@ private static bool IsOn(SymbolFilter filter, SymbolFilter flag)
return (filter & flag) == flag;
}
}
}
}
\ No newline at end of file
......@@ -7,6 +7,7 @@
using System.Threading;
using System.Threading.Tasks;
using Roslyn.Utilities;
using static Roslyn.Utilities.PortableShim;
namespace Microsoft.CodeAnalysis.FindSymbols
{
......@@ -16,12 +17,19 @@ internal partial class SymbolTreeInfo
/// <summary>
/// The list of nodes that represent symbols. The primary key into the sorting of this list is the name.
/// They are sorted case-insensitively with the <see cref="s_nodeSortComparer" />. Finding case-sensitive
/// They are sorted case-insensitively with the <see cref="s_totalComparer" />. Finding case-sensitive
/// matches can be found by binary searching for something that matches insensitively, and then searching
/// around that equivalence class for one that matches.
/// </summary>
private readonly IReadOnlyList<Node> _nodes;
/// <summary>
/// The spell checker we use for fuzzy match queries.
/// </summary>
private readonly SpellChecker _spellChecker;
private static readonly StringComparer s_caseInsensitiveComparer = CaseInsensitiveComparison.Comparer;
// We first sort in a case insensitive manner. But, within items that match insensitively,
// we then sort in a case sensitive manner. This helps for searching as we'll walk all
// the items of a specific casing at once. This way features can cache values for that
......@@ -30,47 +38,35 @@ internal partial class SymbolTreeInfo
// they're searching for. However, with this sort of comparison we now get
// "prop, prop, Prop, Prop". Features can take advantage of that by caching their previous
// result and reusing it when they see they're getting the same string again.
private static readonly Comparison<string> s_nodeSortComparer = (s1, s2) =>
private static readonly Comparison<string> s_totalComparer = (s1, s2) =>
{
var diff = CaseInsensitiveComparison.Comparer.Compare(s1, s2);
var diff = s_caseInsensitiveComparer.Compare(s1, s2);
return diff != 0
? diff
: StringComparer.Ordinal.Compare(s1, s2);
};
private static readonly StringComparer s_nodeEquals = CaseInsensitiveComparison.Comparer;
private SymbolTreeInfo(VersionStamp version, IReadOnlyList<Node> orderedNodes)
private SymbolTreeInfo(VersionStamp version, IReadOnlyList<Node> orderedNodes, SpellChecker spellChecker)
{
_version = version;
_nodes = orderedNodes;
_spellChecker = spellChecker;
}
public int Count
{
get { return _nodes.Count; }
}
public int Count => _nodes.Count;
public bool HasSymbols(string name, bool ignoreCase)
{
return FindNodes(name, GetComparer(ignoreCase)).Any();
}
public IEnumerable<ISymbol> Find(IAssemblySymbol assembly, Func<string, bool> predicate, CancellationToken cancellationToken)
/// <summary>
/// Finds symbols in this assembly that match the provided name in a fuzzy manner.
/// </summary>
public IEnumerable<ISymbol> FuzzyFind(IAssemblySymbol assembly, string name, CancellationToken cancellationToken)
{
for (int i = 0, n = _nodes.Count; i < n; i++)
{
cancellationToken.ThrowIfCancellationRequested();
var node = _nodes[i];
if (predicate(node.Name))
{
foreach (var symbol in Bind(i, assembly.GlobalNamespace, cancellationToken))
{
cancellationToken.ThrowIfCancellationRequested();
yield return symbol;
}
}
}
var similarNames = _spellChecker.FindSimilarWords(name);
return similarNames.SelectMany(n => Find(assembly, n, ignoreCase: true, cancellationToken: cancellationToken));
}
/// <summary>
......@@ -95,6 +91,26 @@ public IEnumerable<ISymbol> Find(IAssemblySymbol assembly, Func<string, bool> pr
}
}
/// <summary>
/// Walks every node in order and lazily yields the symbols bound from each node whose
/// name satisfies <paramref name="predicate"/>. This is a linear scan over all nodes.
/// </summary>
public IEnumerable<ISymbol> Find(IAssemblySymbol assembly, Func<string, bool> predicate, CancellationToken cancellationToken)
{
    // Snapshot the count once; the loop bound is not re-read while iterating.
    var nodeCount = _nodes.Count;
    for (var index = 0; index < nodeCount; index++)
    {
        cancellationToken.ThrowIfCancellationRequested();

        if (!predicate(_nodes[index].Name))
        {
            continue;
        }

        foreach (var boundSymbol in Bind(index, assembly.GlobalNamespace, cancellationToken))
        {
            // Check again before handing each symbol to the consumer.
            cancellationToken.ThrowIfCancellationRequested();
            yield return boundSymbol;
        }
    }
}
private static StringComparer GetComparer(bool ignoreCase)
{
return ignoreCase ? CaseInsensitiveComparison.Comparer : StringComparer.Ordinal;
......@@ -117,10 +133,9 @@ private IEnumerable<int> FindNodes(string name, StringComparer comparer)
}
int position = startingPosition;
while (position > 0 && s_nodeEquals.Equals(_nodes[position - 1].Name, name))
while (position > 0 && s_caseInsensitiveComparer.Equals(_nodes[position - 1].Name, name))
{
position--;
if (comparer.Equals(_nodes[position].Name, name))
{
yield return position;
......@@ -128,7 +143,7 @@ private IEnumerable<int> FindNodes(string name, StringComparer comparer)
}
position = startingPosition;
while (position + 1 < _nodes.Count && s_nodeEquals.Equals(_nodes[position + 1].Name, name))
while (position + 1 < _nodes.Count && s_caseInsensitiveComparer.Equals(_nodes[position + 1].Name, name))
{
position++;
if (comparer.Equals(_nodes[position].Name, name))
......@@ -140,7 +155,7 @@ private IEnumerable<int> FindNodes(string name, StringComparer comparer)
}
/// <summary>
/// Searches for a name in the ordered list that matches per the <see cref="s_nodeSortComparer" />.
/// Searches for a name in the ordered list that matches per the <see cref="s_caseInsensitiveComparer" />.
/// </summary>
private int BinarySearch(string name)
{
......@@ -151,7 +166,7 @@ private int BinarySearch(string name)
{
int mid = min + ((max - min) >> 1);
var comparison = s_nodeSortComparer(_nodes[mid].Name, name);
var comparison = s_caseInsensitiveComparer.Compare(_nodes[mid].Name, name);
if (comparison < 0)
{
min = mid + 1;
......@@ -221,7 +236,8 @@ internal static SymbolTreeInfo Create(VersionStamp version, IAssemblySymbol asse
var list = new List<Node>();
GenerateNodes(assembly.GlobalNamespace, list);
return new SymbolTreeInfo(version, SortNodes(list));
var spellChecker = new SpellChecker(list.Select(n => n.Name));
return new SymbolTreeInfo(version, SortNodes(list), spellChecker);
}
private static Node[] SortNodes(List<Node> nodes)
......@@ -263,7 +279,7 @@ private static Node[] SortNodes(List<Node> nodes)
private static int CompareNodes(Node x, Node y, IReadOnlyList<Node> nodeList)
{
var comp = s_nodeSortComparer(x.Name, y.Name);
var comp = s_totalComparer(x.Name, y.Name);
if (comp == 0)
{
if (x.ParentIndex != y.ParentIndex)
......
......@@ -13,7 +13,7 @@ namespace Microsoft.CodeAnalysis.FindSymbols
internal partial class SymbolTreeInfo : IObjectWritable
{
private const string PrefixMetadataSymbolTreeInfo = "<MetadataSymbolTreeInfoPersistence>_";
private const string SerializationFormat = "3";
private const string SerializationFormat = "9";
/// <summary>
/// this is for a metadata reference in a solution
......@@ -93,6 +93,8 @@ public void WriteTo(ObjectWriter writer)
writer.WriteString(node.Name);
writer.WriteInt32(node.ParentIndex);
}
_spellChecker.WriteTo(writer);
}
internal static SymbolTreeInfo ReadFrom(ObjectReader reader)
......@@ -110,7 +112,7 @@ internal static SymbolTreeInfo ReadFrom(ObjectReader reader)
var count = reader.ReadInt32();
if (count == 0)
{
return new SymbolTreeInfo(version, ImmutableArray<Node>.Empty);
return new SymbolTreeInfo(version, ImmutableArray<Node>.Empty, SpellChecker.Empty);
}
var nodes = new Node[count];
......@@ -122,7 +124,7 @@ internal static SymbolTreeInfo ReadFrom(ObjectReader reader)
nodes[i] = new Node(name, parentIndex);
}
return new SymbolTreeInfo(version, nodes);
return new SymbolTreeInfo(version, nodes, SpellChecker.ReadFrom(reader));
}
catch (Exception)
{
......
// Copyright (c) Microsoft. All Rights Reserved. Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.
using System;
using System.Diagnostics;
using Microsoft.CodeAnalysis.Text;
namespace Roslyn.Utilities
{
/// <summary>
/// A mutable window over a contiguous region of an array. The window is described by a
/// start offset and a length, both of which are validated against the underlying array
/// whenever they change. Indexing is relative to the start of the window.
/// </summary>
internal struct ArraySlice<T>
{
    private readonly T[] _array;
    private int _start;
    private int _length;

    /// <summary>The number of elements currently covered by this slice.</summary>
    public int Length => _length;

    /// <summary>Creates a slice covering the whole of <paramref name="array"/>.</summary>
    /// <exception cref="ArgumentNullException"><paramref name="array"/> is null.</exception>
    public ArraySlice(T[] array) : this(array, 0, array?.Length ?? 0)
    {
    }

    /// <summary>Creates a slice covering the region of <paramref name="array"/> described by <paramref name="span"/>.</summary>
    public ArraySlice(T[] array, TextSpan span) : this(array, span.Start, span.Length)
    {
    }

    /// <summary>
    /// Creates a slice covering <paramref name="length"/> elements of <paramref name="array"/>
    /// beginning at <paramref name="start"/>.
    /// </summary>
    /// <exception cref="ArgumentNullException"><paramref name="array"/> is null.</exception>
    /// <exception cref="ArgumentException">The requested region does not lie within the array.</exception>
    public ArraySlice(T[] array, int start, int length) : this()
    {
        if (array == null)
        {
            throw new ArgumentNullException(nameof(array));
        }

        _array = array;
        SetStartAndLength(start, length);
    }

    /// <summary>
    /// Gets the element at position <paramref name="i"/> relative to the start of the slice.
    /// The slice bounds are only verified in debug builds.
    /// </summary>
    public T this[int i]
    {
        get
        {
            Debug.Assert(i >= 0 && i < _length);
            return _array[i + _start];
        }
    }

    /// <summary>Validates and then stores a new start offset and length.</summary>
    private void SetStartAndLength(int start, int length)
    {
        // ArgumentException takes (message, paramName) in that order.
        if (start < 0)
        {
            throw new ArgumentException($"{start} < 0", nameof(start));
        }

        if (start > _array.Length)
        {
            throw new ArgumentException($"{start} > {_array.Length}", nameof(start));
        }

        CheckLength(start, length);

        _start = start;
        _length = length;
    }

    /// <summary>
    /// Verifies that <paramref name="length"/> elements starting at <paramref name="start"/> fit
    /// inside the array. Callers must already have ensured 0 &lt;= start &lt;= array length.
    /// </summary>
    private void CheckLength(int start, int length)
    {
        if (length < 0)
        {
            throw new ArgumentException($"{length} < 0", nameof(length));
        }

        // Compare against the remaining space rather than computing 'start + length',
        // which could overflow for very large lengths and wrongly pass the check.
        if (length > _array.Length - start)
        {
            throw new ArgumentException($"{start} + {length} > {_array.Length}", nameof(length));
        }
    }

    /// <summary>
    /// Advances the start of the slice by <paramref name="amount"/> elements and shrinks the
    /// length by the same amount, so the end of the slice stays where it was.
    /// </summary>
    public void MoveStartForward(int amount)
    {
        SetStartAndLength(_start + amount, _length - amount);
    }

    /// <summary>Changes the length of the slice, keeping its start offset.</summary>
    /// <exception cref="ArgumentException">The new length is negative or runs past the end of the array.</exception>
    public void SetLength(int length)
    {
        CheckLength(_start, length);
        _length = length;
    }
}
}
// Copyright (c) Microsoft. All Rights Reserved. Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.
using System;
using System.Collections.Generic;
using System.Collections.Immutable;
using System.Diagnostics;
using System.Linq;
using Microsoft.CodeAnalysis;
using Microsoft.CodeAnalysis.Text;
namespace Roslyn.Utilities
{
internal partial class BKTree
{
private class Builder
{
// The number of edges we pre-allocate space for for each node in _compactEdges.
//
// To make the comments simpler below, i'll use '4' as a synonym for CompactEdgeAllocationSize.
// '4' simply reads better and makes it clearer what's going on.
private const int CompactEdgeAllocationSize = 4;
// Instead of producing a char[] for each string we're building a node for, we instead
// have one long char[] with all the characters of each string concatenated. i.e.
// "foo" "bar" and "baz" becomes { f, o, o, b, a, r, b, a, z }. Then in _wordSpans
// we have the text spans for each of those words in this array. This gives us only
// two allocations instead of as many allocations as the number of strings we have.
//
// Once we are done building, we pass this to the BKTree and its nodes also state the
// span of this array that corresponds to the word they were created for. This works
// well as other dependent facilities (like EditDistance) can work on sub-arrays without
// any problems.
private readonly char[] _concatenatedLowerCaseWords;
private readonly TextSpan[] _wordSpans;
// Note: while building a BKTree we have to store children with parents, keyed by the
// edit distance between the two. Naive implementations might store a list or dictionary
// of children along with each node. However, this would be very inefficient and would
// put an enormous amount of memory pressure on the system.
//
// Empirical data for a nice large assembly like mscorlib gives us the following
// information:
//
// Unique-Words (ignoring case): 9662
//
// For each unique word we need a node in the BKTree. If we stored a list or dictionary
// with each node, that would be 10s of thousands of objects created that would then
// just have to be GCed. That's a lot of garbage pressure we'd like to avoid.
//
// Now if we look at all those nodes, we can see the following information about how many
// children each has.
//
// Edge counts:
// 0 5560
// 1 1884
// 2 887
// 3 527
// 4 322
// 5 200
// 6 114
// 7 69
// 8 47
// 9 20
// 10 8
// 11 10
// 12 7
// 13 4
// 15 1
// 16 1
// 54 1
//
//
// i.e. The number of nodes with edge-counts less than or equal to four is: 5560+1884+887+527+322=9180.
// This is 95% of the total number of nodes we are adding. Looking at many other dlls
// we found that this ratio stays true across the board. i.e. with all dlls, 95% of nodes
// have 4 or fewer edges.
//
// So, to optimize things, we pre-alloc a single array with space for 4 edges for each
// node we're going to add. Each node then gets that much space to store edge information.
// If it needs more than that space, then we have a fall-over dictionary that it can store
// information in.
//
// Once building is complete, the GC only needs to deallocate this single array and the
// spillover dictionaries.
//
// This approach produces 1/20th the amount of garbage while building the tree.
//
// Each node at index i has its edges in this array in the range [4*i, 4*i + 4);
private readonly Edge[] _compactEdges;
private readonly BuilderNode[] _builderNodes;
/// <summary>
/// Prepares the builder's storage for the given words. Empty strings are dropped and the
/// remaining words are de-duplicated case-insensitively. The lower-cased characters of every
/// remaining word are written back-to-back into a single buffer, with one span per word
/// recording where it sits in that buffer.
/// </summary>
/// <param name="values">The words to build nodes for.</param>
public Builder(IEnumerable<string> values)
{
    // TODO(cyrusn): Properly handle unicode normalization here.
    var distinctValues = values.Where(v => v.Length > 0).Distinct(CaseInsensitiveComparison.Comparer).ToArray();

    // Size the buffer from the de-duplicated words, since those are the only characters
    // written below. Summing the raw input would reserve space for every duplicate as well
    // and would enumerate 'values' a second time.
    var charCount = distinctValues.Sum(v => v.Length);

    _concatenatedLowerCaseWords = new char[charCount];
    _wordSpans = new TextSpan[distinctValues.Length];

    var characterIndex = 0;
    for (int i = 0; i < distinctValues.Length; i++)
    {
        var value = distinctValues[i];

        // Record where this word's characters live in the shared buffer.
        _wordSpans[i] = new TextSpan(characterIndex, value.Length);

        foreach (var ch in value)
        {
            _concatenatedLowerCaseWords[characterIndex] = CaseInsensitiveComparison.ToLower(ch);
            characterIndex++;
        }
    }

    // We will have one node for each string value that we are adding.
    _builderNodes = new BuilderNode[distinctValues.Length];
    _compactEdges = new Edge[distinctValues.Length * CompactEdgeAllocationSize];
}
/// <summary>
/// Inserts every recorded word into the tree in order, then flattens the builder's node and
/// edge bookkeeping into immutable arrays and hands them to a new <see cref="BKTree"/>.
/// </summary>
internal BKTree Create()
{
    var insertionIndex = 0;
    foreach (var wordSpan in _wordSpans)
    {
        Add(wordSpan, insertionIndex);
        insertionIndex++;
    }

    var nodeCount = _builderNodes.Length;

    // Every node except the root has exactly one edge pointing at it, so the graph has one
    // fewer edge than it has nodes (and none at all when there are no nodes).
    var edgeCount = Math.Max(0, nodeCount - 1);

    var nodeBuilder = ImmutableArray.CreateBuilder<Node>(nodeCount);
    var edgeBuilder = ImmutableArray.CreateBuilder<Edge>(edgeCount);
    BuildArrays(nodeBuilder, edgeBuilder);

    return new BKTree(_concatenatedLowerCaseWords, nodeBuilder.MoveToImmutable(), edgeBuilder.MoveToImmutable());
}
/// <summary>
/// Appends one <see cref="Node"/> per builder node to <paramref name="nodes"/> and all of
/// that node's edges, contiguously, to <paramref name="edges"/>. Each node records how many
/// edges it has and the index in <paramref name="edges"/> where they begin.
/// </summary>
private void BuildArrays(ImmutableArray<Node>.Builder nodes, ImmutableArray<Edge>.Builder edges)
{
    var firstEdgeIndex = 0;
    for (var nodeIndex = 0; nodeIndex < _builderNodes.Length; nodeIndex++)
    {
        var builderNode = _builderNodes[nodeIndex];
        var edgeCount = builderNode.EdgeCount;

        nodes.Add(new Node(builderNode.CharacterSpan, edgeCount, firstEdgeIndex));

        // The next node's edges start right after this node's edges.
        firstEdgeIndex += edgeCount;

        if (edgeCount == 0)
        {
            continue;
        }

        // Edges stored in this node's pre-allocated region of the compact array come first.
        var compactStart = nodeIndex * CompactEdgeAllocationSize;
        var compactCount = Math.Min(edgeCount, CompactEdgeAllocationSize);
        for (var offset = 0; offset < compactCount; offset++)
        {
            edges.Add(_compactEdges[compactStart + offset]);
        }

        // Any edges that did not fit in the compact region follow.
        var spilledEdges = builderNode.SpilloverEdges;
        if (spilledEdges == null)
        {
            continue;
        }

        Debug.Assert(spilledEdges.Count == (edgeCount - CompactEdgeAllocationSize));

        foreach (var spilledEdge in spilledEdges)
        {
            edges.Add(new Edge(spilledEdge.Key, spilledEdge.Value));
        }
    }

    Debug.Assert(firstEdgeIndex == edges.Capacity);
    Debug.Assert(firstEdgeIndex == edges.Count);
}
/// <summary>
/// Inserts the word at <paramref name="characterSpan"/> as node <paramref name="insertionIndex"/>.
/// The first word becomes the root. Every later word descends from the root, at each node
/// following the child edge whose edit distance equals the distance between that node's word
/// and the new word, until it reaches a node with no such edge and is attached there.
/// </summary>
/// <exception cref="InvalidOperationException">
/// The new word has edit distance zero to a word already in the tree.
/// </exception>
private void Add(TextSpan characterSpan, int insertionIndex)
{
    if (insertionIndex == 0)
    {
        // Nothing to compare against yet: this word is the root.
        _builderNodes[0] = new BuilderNode(characterSpan);
        return;
    }

    for (var parentIndex = 0; ;)
    {
        var parent = _builderNodes[parentIndex];

        // The exact distance is required (no threshold), because it is the key that decides
        // which edge to follow or create.
        var parentWord = new ArraySlice<char>(_concatenatedLowerCaseWords, parent.CharacterSpan);
        var newWord = new ArraySlice<char>(_concatenatedLowerCaseWords, characterSpan);
        var distance = EditDistance.GetEditDistance(parentWord, newWord);

        if (distance == 0)
        {
            // Words are de-duplicated before insertion, so two identical words should
            // never meet here.
            throw new InvalidOperationException();
        }

        int existingChildIndex;
        if (!TryGetChildIndex(parent, parentIndex, distance, out existingChildIndex))
        {
            // No edge with this distance yet: the new word becomes a child of this node.
            AddChildNode(characterSpan, insertionIndex, parent.EdgeCount, parentIndex, distance);
            return;
        }

        // An edge with this distance already exists; keep descending through that child.
        parentIndex = existingChildIndex;
    }
}
/// <summary>
/// Records a new edge (labelled <paramref name="editDistance"/>) from the node at
/// <paramref name="currentNodeIndex"/> to a new node stored at
/// <paramref name="insertionIndex"/>, and creates that new node.
/// </summary>
/// <param name="characterSpan">Span of the new node's word.</param>
/// <param name="insertionIndex">Slot in the builder-node array for the new node.</param>
/// <param name="currentNodeEdgeCount">How many edges the parent has before this call.</param>
/// <param name="currentNodeIndex">Index of the parent node.</param>
/// <param name="editDistance">Edit distance between the parent's word and the new word.</param>
private void AddChildNode(
    TextSpan characterSpan, int insertionIndex, int currentNodeEdgeCount, int currentNodeIndex, int editDistance)
{
    // The parent has no edge with this edit distance yet. Where the new edge goes depends
    // on how many edges the parent already has:
    //  - fewer than CompactEdgeAllocationSize: next free slot of the parent's window in
    //    the compact edge array;
    //  - exactly CompactEdgeAllocationSize: the window is full, so the spillover
    //    dictionary is created and the edge goes there;
    //  - more than that: the spillover dictionary already exists, so just add to it.
    var hasCompactRoom = currentNodeEdgeCount < CompactEdgeAllocationSize;
    if (hasCompactRoom)
    {
        var slot = currentNodeIndex * CompactEdgeAllocationSize + currentNodeEdgeCount;
        _compactEdges[slot] = new Edge(editDistance, insertionIndex);
    }
    else
    {
        var overflow = _builderNodes[currentNodeIndex].SpilloverEdges;
        if (currentNodeEdgeCount == CompactEdgeAllocationSize)
        {
            // First edge past the compact window: allocate the dictionary lazily.
            Debug.Assert(overflow == null);
            overflow = new Dictionary<int, int>();
            _builderNodes[currentNodeIndex].SpilloverEdges = overflow;
        }

        overflow.Add(editDistance, insertionIndex);
    }

    // BuilderNode is a struct, so the count is bumped in place through the array element.
    _builderNodes[currentNodeIndex].EdgeCount++;
    _builderNodes[insertionIndex] = new BuilderNode(characterSpan);
}
/// <summary>
/// Looks for an edge of <paramref name="currentNode"/> labelled with
/// <paramref name="editDistance"/>, checking the node's compact edge window first and
/// then its spillover dictionary (if it has one).
/// </summary>
/// <param name="currentNode">Copy of the node whose edges are searched.</param>
/// <param name="currentNodeIndex">Index of that node; locates its compact edge window.</param>
/// <param name="editDistance">Edge label to look for.</param>
/// <param name="childIndex">
/// On success, the index of the child node the edge points to. When the node has no
/// spillover dictionary and nothing matched, this is -1; when the spillover lookup
/// misses, it is whatever <see cref="Dictionary{TKey, TValue}.TryGetValue"/> assigns.
/// </param>
/// <returns>True if an edge with the given edit distance exists.</returns>
private bool TryGetChildIndex(BuilderNode currentNode, int currentNodeIndex, int editDistance, out int childIndex)
{
    // Linear scan over the used portion of this node's compact window.
    var firstSlot = currentNodeIndex * CompactEdgeAllocationSize;
    var compactCount = Math.Min(currentNode.EdgeCount, CompactEdgeAllocationSize);
    for (var offset = 0; offset < compactCount; offset++)
    {
        var candidate = _compactEdges[firstSlot + offset];
        if (candidate.EditDistance == editDistance)
        {
            childIndex = candidate.ChildNodeIndex;
            return true;
        }
    }

    var overflow = currentNode.SpilloverEdges;
    if (overflow == null)
    {
        childIndex = -1;
        return false;
    }

    // Not in the compact window; the remaining edges are keyed by edit distance here.
    Debug.Assert(overflow.Count == (currentNode.EdgeCount - CompactEdgeAllocationSize));
    return overflow.TryGetValue(editDistance, out childIndex);
}
/// <summary>
/// Mutable working representation of a tree node used while the tree is being built.
/// </summary>
private struct BuilderNode
{
    /// <summary>Span of this node's word within the concatenated character array.</summary>
    public readonly TextSpan CharacterSpan;

    /// <summary>Total number of edges added to this node so far (compact plus spillover).</summary>
    public int EdgeCount;

    /// <summary>
    /// Edges that did not fit in the node's compact window, keyed by edit distance with
    /// the child node index as the value. Null until the first such edge is added.
    /// </summary>
    public Dictionary<int, int> SpilloverEdges;

    public BuilderNode(TextSpan characterSpan)
        : this()
    {
        CharacterSpan = characterSpan;
    }
}
}
}
}
\ No newline at end of file
// Copyright (c) Microsoft. All Rights Reserved. Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.
namespace Roslyn.Utilities
{
internal partial class BKTree
{
/// <summary>
/// A labelled link from a parent node to one of its children.
/// </summary>
private struct Edge
{
    /// <summary>The edit distance between the child and the parent connected by this edge.</summary>
    public readonly int EditDistance;

    /// <summary>Where the child node can be found in <see cref="_nodes"/>.</summary>
    public readonly int ChildNodeIndex;

    public Edge(int editDistance, int childNodeIndex)
    {
        this.EditDistance = editDistance;
        this.ChildNodeIndex = childNodeIndex;
    }

    /// <summary>Writes the edge as two 32-bit integers: edit distance, then child index.</summary>
    internal void WriteTo(ObjectWriter writer)
    {
        writer.WriteInt32(this.EditDistance);
        writer.WriteInt32(this.ChildNodeIndex);
    }

    /// <summary>Reads an edge in the same field order that <see cref="WriteTo"/> emits.</summary>
    internal static Edge ReadFrom(ObjectReader reader)
    {
        var editDistance = reader.ReadInt32();
        var childNodeIndex = reader.ReadInt32();
        return new Edge(editDistance, childNodeIndex);
    }
}
}
}
\ No newline at end of file
// Copyright (c) Microsoft. All Rights Reserved. Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.
using Microsoft.CodeAnalysis.Text;
namespace Roslyn.Utilities
{
internal partial class BKTree
{
/// <summary>
/// An immutable tree node: the word it stands for plus the location of its edges.
/// </summary>
private struct Node
{
    /// <summary>
    /// The string this node corresponds to, expressed as a range of
    /// <see cref="_concatenatedLowerCaseWords"/>.
    /// </summary>
    public readonly TextSpan WordSpan;

    /// <summary>How many child edges this node has.</summary>
    public readonly int EdgeCount;

    /// <summary>
    /// Where the first edge can be found in <see cref="_edges"/>. The node's edges occupy
    /// _edges[FirstEdgeIndex, FirstEdgeIndex + EdgeCount).
    /// </summary>
    public readonly int FirstEdgeIndex;

    public Node(TextSpan wordSpan, int edgeCount, int firstEdgeIndex)
    {
        this.WordSpan = wordSpan;
        this.EdgeCount = edgeCount;
        this.FirstEdgeIndex = firstEdgeIndex;
    }

    /// <summary>
    /// Writes four 32-bit integers: span start, span length, edge count, first edge index.
    /// </summary>
    internal void WriteTo(ObjectWriter writer)
    {
        var span = this.WordSpan;
        writer.WriteInt32(span.Start);
        writer.WriteInt32(span.Length);
        writer.WriteInt32(this.EdgeCount);
        writer.WriteInt32(this.FirstEdgeIndex);
    }

    /// <summary>Reads a node in the same field order that <see cref="WriteTo"/> emits.</summary>
    internal static Node ReadFrom(ObjectReader reader)
    {
        var start = reader.ReadInt32();
        var length = reader.ReadInt32();
        var edgeCount = reader.ReadInt32();
        var firstEdgeIndex = reader.ReadInt32();
        return new Node(new TextSpan(start, length), edgeCount, firstEdgeIndex);
    }
}
}
}
// Copyright (c) Microsoft. All Rights Reserved. Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.
using System.Collections.Immutable;
namespace Roslyn.Utilities
{
internal partial class BKTree
{
/// <summary>
/// Serializes the tree as three length-prefixed sequences, in this order: the
/// concatenated characters, the nodes, and the edges.
/// </summary>
internal void WriteTo(ObjectWriter writer)
{
    var characters = _concatenatedLowerCaseWords;
    writer.WriteInt32(characters.Length);
    for (var i = 0; i < characters.Length; i++)
    {
        writer.WriteChar(characters[i]);
    }

    var nodes = _nodes;
    writer.WriteInt32(nodes.Length);
    for (var i = 0; i < nodes.Length; i++)
    {
        nodes[i].WriteTo(writer);
    }

    var edges = _edges;
    writer.WriteInt32(edges.Length);
    for (var i = 0; i < edges.Length; i++)
    {
        edges[i].WriteTo(writer);
    }
}
/// <summary>
/// Rebuilds a tree from the layout produced by <see cref="WriteTo"/>: a length-prefixed
/// character sequence, then length-prefixed nodes, then length-prefixed edges.
/// </summary>
internal static BKTree ReadFrom(ObjectReader reader)
{
    var characterCount = reader.ReadInt32();
    var characters = new char[characterCount];
    for (var index = 0; index < characters.Length; index++)
    {
        characters[index] = reader.ReadChar();
    }

    // Builders are created with exact capacity so MoveToImmutable can hand over the
    // backing storage once every element has been added.
    var nodeCount = reader.ReadInt32();
    var nodeBuilder = ImmutableArray.CreateBuilder<Node>(nodeCount);
    for (var index = 0; index < nodeCount; index++)
    {
        nodeBuilder.Add(Node.ReadFrom(reader));
    }

    var edgeCount = reader.ReadInt32();
    var edgeBuilder = ImmutableArray.CreateBuilder<Edge>(edgeCount);
    for (var index = 0; index < edgeCount; index++)
    {
        edgeBuilder.Add(Edge.ReadFrom(reader));
    }

    return new BKTree(characters, nodeBuilder.MoveToImmutable(), edgeBuilder.MoveToImmutable());
}
}
}
// Copyright (c) Microsoft. All Rights Reserved. Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.
using System.Collections.Generic;
using System.Collections.Immutable;
using System.Linq;
using System.Text;
using Microsoft.CodeAnalysis;
using Roslyn.Utilities;
using static System.Math;
namespace Roslyn.Utilities
{
/// <summary>
/// NOTE: Only use if you truly need a BK-tree. If you just want to compare words, use
/// the <see cref="SpellChecker"/> type instead.
///
/// An implementation of a Burkhard-Keller tree. Introduced in:
///
/// 'Some approaches to best-match file searching.'
/// Communications of the ACM CACM
/// Volume 16 Issue 4, April 1973
/// Pages 230-236
/// http://dl.acm.org/citation.cfm?doid=362003.362025
/// </summary>
internal partial class BKTree
{
public static readonly BKTree Empty = new BKTree(
SpecializedCollections.EmptyArray<char>(),
ImmutableArray<Node>.Empty,
ImmutableArray<Edge>.Empty);
// We have three completely flat arrays of structs. These arrays fully represent the
// BK tree. The structure is as follows:
//
// The root node is in _nodes[0].
//
// It lists the count of edges it has. These edges are in _edges in the range
// [0*, childCount). Each edge has the index of the child node it points to, and the
// edit distance between the parent and the child.
//
// * of course '0' is only for the root case.
//
// All nodes state where in _edges their child edges range starts, so the children
// for any node are in the range[node.FirstEdgeIndex, node.FirstEdgeIndex + node.EdgeCount).
//
// Each node also has an associated string. These strings are concatenated and stored
// in _concatenatedLowerCaseWords. Each node has a TextSpan that indicates which portion
// of the character array is their string. Note: i'd like to use an immutable array
// for the characters as well. However, we need to create slices, and they need to
// work on top of an ArraySlice (which needs a char[]). The edit distance code also
// wants to work on top of raw char[]s (both for speed, and so it can pool arrays
// to prevent lots of garbage). Because of that we just keep this as a char[].
private readonly char[] _concatenatedLowerCaseWords;
private readonly ImmutableArray<Node> _nodes;
private readonly ImmutableArray<Edge> _edges;
/// <summary>
/// Wraps already-built flat arrays; no copying or validation is performed here.
/// </summary>
/// <param name="concatenatedLowerCaseWords">All node words, concatenated.</param>
/// <param name="nodes">The tree's nodes; the root is at index 0.</param>
/// <param name="edges">The edges of all nodes, grouped per node.</param>
private BKTree(char[] concatenatedLowerCaseWords, ImmutableArray<Node> nodes, ImmutableArray<Edge> edges)
{
    this._concatenatedLowerCaseWords = concatenatedLowerCaseWords;
    this._nodes = nodes;
    this._edges = edges;
}
/// <summary>
/// Convenience overload: builds a tree from the given words by forwarding to the
/// <see cref="IEnumerable{T}"/> overload.
/// </summary>
public static BKTree Create(params string[] values)
{
    IEnumerable<string> sequence = values;
    return Create(sequence);
}
/// <summary>
/// Builds a tree from the given words by handing them to a <see cref="Builder"/>.
/// </summary>
public static BKTree Create(IEnumerable<string> values)
{
    var builder = new Builder(values);
    return builder.Create();
}
/// <summary>
/// Returns every word in the tree whose edit distance to <paramref name="value"/> is at
/// most the threshold. The query is lower-cased before comparison, and the returned
/// strings are the tree's stored (lower-case) forms.
/// </summary>
/// <param name="value">The word to search for. Must not be null.</param>
/// <param name="threshold">
/// Maximum edit distance for a match. When null, the value from
/// <see cref="WordSimilarityChecker.GetThreshold"/> for <paramref name="value"/> is used.
/// </param>
/// <returns>The matching words; an empty list when the tree has no nodes.</returns>
/// <exception cref="System.ArgumentNullException"><paramref name="value"/> is null.</exception>
public IList<string> Find(string value, int? threshold = null)
{
    // Fail with a descriptive exception rather than a NullReferenceException from
    // 'value.Length' below. Fully qualified because this file has no 'using System;'.
    if (value == null)
    {
        throw new System.ArgumentNullException(nameof(value));
    }

    if (_nodes.Length == 0)
    {
        return SpecializedCollections.EmptyList<string>();
    }

    // The pooled array may be larger than the query, so the real length is passed
    // alongside it to Lookup.
    var lowerCaseCharacters = ArrayPool<char>.GetArray(value.Length);
    try
    {
        for (var i = 0; i < value.Length; i++)
        {
            lowerCaseCharacters[i] = CaseInsensitiveComparison.ToLower(value[i]);
        }

        var effectiveThreshold = threshold ?? WordSimilarityChecker.GetThreshold(value);

        var result = new List<string>();
        Lookup(_nodes[0], lowerCaseCharacters, value.Length, effectiveThreshold, result);
        return result;
    }
    finally
    {
        // Always hand the buffer back, even if the lookup throws.
        ArrayPool<char>.ReleaseArray(lowerCaseCharacters);
    }
}
/// <summary>
/// Recursive search step: tests <paramref name="currentNode"/> against the query and
/// then descends into every child whose edge label lies within
/// <paramref name="threshold"/> of this node's distance to the query.
/// </summary>
/// <param name="currentNode">Node being examined.</param>
/// <param name="queryCharacters">Buffer holding the lower-cased query.</param>
/// <param name="queryLength">Number of valid characters in <paramref name="queryCharacters"/>.</param>
/// <param name="threshold">Maximum edit distance for a match.</param>
/// <param name="result">Accumulates the matching words.</param>
private void Lookup(Node currentNode, char[] queryCharacters, int queryLength, int threshold, List<string> result)
{
    // The exact distance is computed (no threshold is passed) because it is also what
    // decides which edges are worth following below.
    var wordSpan = currentNode.WordSpan;
    var distance = EditDistance.GetEditDistance(
        new ArraySlice<char>(_concatenatedLowerCaseWords, wordSpan),
        new ArraySlice<char>(queryCharacters, 0, queryLength));

    if (distance <= threshold)
    {
        // This node's word is close enough to the query.
        result.Add(new string(_concatenatedLowerCaseWords, wordSpan.Start, wordSpan.Length));
    }

    // Only edges labelled within [distance - threshold, distance + threshold] are followed.
    var lowest = distance - threshold;
    var highest = distance + threshold;

    var edgeLimit = currentNode.FirstEdgeIndex + currentNode.EdgeCount;
    for (var edgeIndex = currentNode.FirstEdgeIndex; edgeIndex < edgeLimit; edgeIndex++)
    {
        var label = _edges[edgeIndex].EditDistance;
        if (label < lowest || label > highest)
        {
            continue;
        }

        Lookup(_nodes[_edges[edgeIndex].ChildNodeIndex],
            queryCharacters, queryLength, threshold, result);
    }
}
#if false
// Used for diagnostic purposes. Compiled out by the surrounding '#if false'.
//
// Builds a text report containing the node count, a histogram of how many nodes have
// each child count, and a histogram of per-node edge "density". The report only ends up
// in the local 'result' (nothing is returned or written), so it is presumably meant to
// be inspected in a debugger.
internal void DumpStats()
{
var sb = new StringBuilder();
sb.AppendLine("Nodes length: " + _nodes.Length);
// Maps a child count to the number of nodes that have exactly that many children.
var childCountHistogram = new Dictionary<int, int>();
foreach (var node in _nodes)
{
var childCount = node.EdgeCount;
int existing;
// 'existing' is left at 0 when the key is absent, so this is a plain increment.
childCountHistogram.TryGetValue(childCount, out existing);
childCountHistogram[childCount] = existing + 1;
}
sb.AppendLine();
sb.AppendLine("Child counts:");
foreach (var kvp in childCountHistogram.OrderBy(kvp => kvp.Key))
{
sb.AppendLine(kvp.Key + "\t" + kvp.Value);
}
// An item is dense if, starting from 1, at least 80% of its array would be full.
// 'densities' has 11 buckets: bucket b counts nodes whose ratio of edge count to largest
// edge edit distance, in tenths and rounded down, equals b.
var densities = new int[11];
var empyCount = 0;
foreach (var node in _nodes)
{
if (node.EdgeCount == 0)
{
// Nodes without children are counted separately and excluded from the density buckets.
empyCount++;
continue;
}
// Find the largest edit distance among this node's edges.
var maxEditDistance = -1;
var startInclusive = node.FirstEdgeIndex;
var endExclusive = startInclusive + node.EdgeCount;
for (var i = startInclusive; i < endExclusive; i++)
{
maxEditDistance = Max(maxEditDistance, _edges[i].EditDistance);
}
var editDistanceCount = node.EdgeCount;
// Integer division: 10 * (edges present) / (largest edit distance).
var bucket = 10 * editDistanceCount / maxEditDistance;
densities[bucket]++;
}
var nonEmptyCount = _nodes.Length - empyCount;
sb.AppendLine();
sb.AppendLine("NoChildren: " + empyCount);
sb.AppendLine("AnyChildren: " + nonEmptyCount);
sb.AppendLine("Densities:");
for (var i = 0; i < densities.Length; i++)
{
sb.AppendLine("<=" + i + "0% = " + densities[i] + ", " + ((float)densities[i] / nonEmptyCount));
}
var result = sb.ToString();
}
#endif
}
}
\ No newline at end of file
// Copyright (c) Microsoft. All Rights Reserved. Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
namespace Roslyn.Utilities
{
/// <summary>
/// Finds words in a fixed corpus that are similar to a given word. Candidates are first
/// retrieved from a <see cref="BKTree"/> and then filtered with a
/// <see cref="WordSimilarityChecker"/>.
/// </summary>
internal class SpellChecker
{
    /// <summary>A spell checker backed by the empty tree.</summary>
    public static readonly SpellChecker Empty = new SpellChecker(BKTree.Empty);

    private readonly BKTree _bkTree;

    /// <summary>Creates a spell checker over an existing tree.</summary>
    public SpellChecker(BKTree bKTree)
    {
        this._bkTree = bKTree;
    }

    /// <summary>Creates a spell checker over a tree built from <paramref name="corpus"/>.</summary>
    public SpellChecker(IEnumerable<string> corpus)
        : this(BKTree.Create(corpus))
    {
    }

    /// <summary>
    /// Returns the corpus words that the tree considers within the default threshold of
    /// <paramref name="value"/> and that <see cref="WordSimilarityChecker"/> also accepts.
    /// </summary>
    public IList<string> FindSimilarWords(string value)
    {
        var candidates = _bkTree.Find(value, threshold: null);
        using (var checker = new WordSimilarityChecker(value))
        {
            // Filter eagerly so the checker is no longer needed once we leave the 'using'.
            var similar = new List<string>();
            foreach (var candidate in candidates)
            {
                if (checker.AreSimilar(candidate))
                {
                    similar.Add(candidate);
                }
            }

            return similar.ToArray();
        }
    }

    /// <summary>Serializes the underlying tree.</summary>
    internal void WriteTo(ObjectWriter writer)
    {
        _bkTree.WriteTo(writer);
    }

    /// <summary>Creates a spell checker from a tree previously written with <see cref="WriteTo"/>.</summary>
    internal static SpellChecker ReadFrom(ObjectReader reader)
    {
        var tree = BKTree.ReadFrom(reader);
        return new SpellChecker(tree);
    }
}
/// <summary>
/// Decides whether candidate words are likely misspellings of a fixed source word, and
/// ranks them. The edit-distance state for the source word is created once in the
/// constructor and released by <see cref="Dispose"/>.
/// </summary>
internal class WordSimilarityChecker : IDisposable
{
    /// <summary>Outcome of the most recent <see cref="AreSimilar(string, out double)"/> computation.</summary>
    private struct CacheResult
    {
        public readonly string CandidateText;
        public readonly bool AreSimilar;
        public readonly double SimilarityWeight;

        public CacheResult(string candidate, bool areSimilar, double similarityWeight)
        {
            CandidateText = candidate;
            AreSimilar = areSimilar;
            SimilarityWeight = similarityWeight;
        }
    }

    // Cache the result of the last call to AreSimilar. We'll often be called with the same
    // value multiple times in a row, so we can avoid expensive computation by returning the
    // same value immediately.
    private CacheResult _lastAreSimilarResult;

    // The word every candidate is compared against. Only assigned in the constructor.
    private readonly string _source;

    // Set to null once disposed.
    private EditDistance _editDistance;

    // Maximum edit distance at which a candidate still counts as similar.
    private readonly int _threshold;

    /// <summary>Creates a checker that compares candidates against <paramref name="text"/>.</summary>
    /// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
    public WordSimilarityChecker(string text)
    {
        if (text == null)
        {
            throw new ArgumentNullException(nameof(text));
        }

        _source = text;
        _threshold = GetThreshold(_source);
        _editDistance = new EditDistance(text);
    }

    /// <summary>
    /// Releases the edit-distance state. Safe to call more than once: calls after the
    /// first do nothing.
    /// </summary>
    public void Dispose()
    {
        // The field is nulled after the first call, so a second Dispose must not
        // dereference it.
        _editDistance?.Dispose();
        _editDistance = null;
    }

    /// <summary>
    /// Returns true if 'originalText' and 'candidateText' are likely a misspelling of each other.
    /// </summary>
    public static bool AreSimilar(string originalText, string candidateText)
    {
        double unused;
        return AreSimilar(originalText, candidateText, out unused);
    }

    /// <summary>
    /// Returns true if 'originalText' and 'candidateText' are likely a misspelling of each other.
    /// Returns false otherwise.  If it is a likely misspelling a similarityWeight is provided
    /// to help rank the match.  Lower costs mean it was a better match.
    /// </summary>
    public static bool AreSimilar(string originalText, string candidateText, out double similarityWeight)
    {
        using (var checker = new WordSimilarityChecker(originalText))
        {
            return checker.AreSimilar(candidateText, out similarityWeight);
        }
    }

    /// <summary>
    /// The edit-distance threshold used for <paramref name="value"/>: 1 for words of up to
    /// four characters, 2 for longer words.
    /// </summary>
    internal static int GetThreshold(string value)
    {
        return value.Length <= 4 ? 1 : 2;
    }

    /// <summary>Returns true if <paramref name="candidateText"/> is similar to the source word.</summary>
    public bool AreSimilar(string candidateText)
    {
        double similarityWeight;
        return AreSimilar(candidateText, out similarityWeight);
    }

    /// <summary>
    /// Returns true if <paramref name="candidateText"/> is similar to the source word, and
    /// provides a weight for ranking (lower is better). The weight is
    /// <see cref="double.MaxValue"/> when the source word is shorter than three characters.
    /// </summary>
    public bool AreSimilar(string candidateText, out double similarityWeight)
    {
        if (_source.Length < 3)
        {
            // If we're comparing strings that are too short, we'll find
            // far too many spurious hits.  Don't even bother in this case.
            similarityWeight = double.MaxValue;
            return false;
        }

        // Same candidate as last time: reuse the stored answer.
        if (_lastAreSimilarResult.CandidateText == candidateText)
        {
            similarityWeight = _lastAreSimilarResult.SimilarityWeight;
            return _lastAreSimilarResult.AreSimilar;
        }

        var result = AreSimilarWorker(candidateText, out similarityWeight);
        _lastAreSimilarResult = new CacheResult(candidateText, result, similarityWeight);
        return result;
    }

    /// <summary>Uncached similarity computation behind <see cref="AreSimilar(string, out double)"/>.</summary>
    private bool AreSimilarWorker(string candidateText, out double similarityWeight)
    {
        similarityWeight = double.MaxValue;

        // If the two strings differ by more characters than the cost threshold, then there's
        // no point in even computing the edit distance as it would necessarily take at least
        // that many additions/deletions.
        if (Math.Abs(_source.Length - candidateText.Length) <= _threshold)
        {
            similarityWeight = _editDistance.GetEditDistance(candidateText, _threshold);
        }

        if (similarityWeight > _threshold)
        {
            // it had a high cost.  However, the string the user typed was contained
            // in the string we're currently looking at.  That's enough to consider it
            // although we place it just at the threshold (i.e. it's worse than all
            // other matches).
            if (candidateText.IndexOf(_source, StringComparison.OrdinalIgnoreCase) >= 0)
            {
                similarityWeight = _threshold;
            }
            else
            {
                return false;
            }
        }

        Debug.Assert(similarityWeight <= _threshold);

        similarityWeight += Penalty(candidateText, this._source);
        return true;
    }

    /// <summary>
    /// Tie-breaker in the range [0, 1) that grows with the length difference between the
    /// two strings; 0 when they have the same length.
    /// </summary>
    private static double Penalty(string candidateText, string originalText)
    {
        int lengthDifference = Math.Abs(originalText.Length - candidateText.Length);
        if (lengthDifference != 0)
        {
            // For all items of the same edit cost, we penalize those that are
            // much longer than the original text versus those that are only
            // a little longer.
            //
            // Note: even with this penalty, all matches of cost 'X' will all still
            // cost less than matches of cost 'X + 1'.  i.e. the penalty is in the
            // range [0, 1) and only serves to order matches of the same cost.
            //
            // Here's the relation of the first few values of length diff and penalty:
            // LengthDiff   -> Penalty
            // 1            -> .5
            // 2            -> .66
            // 3            -> .75
            // 4            -> .8
            // And so on and so forth.
            double penalty = 1.0 - (1.0 / (lengthDifference + 1));
            return penalty;
        }

        return 0;
    }
}
}
......@@ -118,9 +118,9 @@
<Compile Include="..\..\..\Compilers\Core\Portable\InternalUtilities\ReferenceEqualityComparer.cs">
<Link>InternalUtilities\ReferenceEqualityComparer.cs</Link>
</Compile>
<Compile Include="..\..\..\Compilers\Core\Portable\InternalUtilities\SemaphoreSlimExtensions.cs">
<Link>InternalUtilities\SemaphoreSlimExtensions.cs</Link>
</Compile>
<Compile Include="..\..\..\Compilers\Core\Portable\InternalUtilities\SemaphoreSlimExtensions.cs">
<Link>InternalUtilities\SemaphoreSlimExtensions.cs</Link>
</Compile>
<Compile Include="..\..\..\Compilers\Core\Portable\InternalUtilities\SpecializedCollections.cs">
<Link>InternalUtilities\SpecializedCollections.cs</Link>
</Compile>
......@@ -383,6 +383,8 @@
<Compile Include="ExtensionManager\IErrorReportingService.cs" />
<Compile Include="FindSymbols\DeclaredSymbolInfo.cs" />
<Compile Include="FindSymbols\FindReferences\Finders\ILanguageServiceReferenceFinder.cs" />
<Compile Include="Utilities\ArraySlice.cs" />
<Compile Include="Utilities\BKTree.cs" />
<Compile Include="FindSymbols\SyntaxTree\AbstractSyntaxTreeInfo.cs" />
<Compile Include="FindSymbols\SyntaxTree\SyntaxTreeDeclarationInfo.cs" />
<Compile Include="Formatting\FormattingOptionsProvider.cs" />
......@@ -415,8 +417,12 @@
<Compile Include="Shared\Utilities\XmlFragmentParser.cs" />
<Compile Include="Simplification\SimplificationOptionProvider.cs" />
<Compile Include="Simplification\SimplifyTypeNameCodeAction.cs" />
<Compile Include="Utilities\BKTree.Edge.cs" />
<Compile Include="Utilities\BKTree.Node.cs" />
<Compile Include="Utilities\BKTree.Serialization.cs" />
<Compile Include="Utilities\ForegroundThreadDataKind.cs" />
<Compile Include="Utilities\IReadOnlyDictionaryExtensions.cs" />
<Compile Include="Utilities\SpellChecker.cs" />
<Compile Include="Utilities\ValuesSources\CachedWeakValueSource.cs" />
<Compile Include="Utilities\WeakEventHandler.cs" />
<Compile Include="Versions\Extensions.cs" />
......@@ -948,6 +954,7 @@
<SubType>
</SubType>
</None>
<Compile Include="Utilities\BKTree.Builder.cs" />
<PublicAPI Include="PublicAPI.Shipped.txt" />
<PublicAPI Include="PublicAPI.Unshipped.txt" />
</ItemGroup>
......
......@@ -81,7 +81,9 @@
<Compile Include="CodeCleanup\NormalizeModifiersOrOperatorsTests.cs" />
<Compile Include="CodeCleanup\ReduceTokenTests.cs" />
<Compile Include="CodeCleanup\RemoveUnnecessaryLineContinuationTests.cs" />
<Compile Include="UtilityTest\BKTreeTests.cs" />
<Compile Include="UtilityTest\FilePathUtilitiesTests.cs" />
<Compile Include="UtilityTest\SpellCheckerTests.cs" />
<Compile Include="WorkspaceTests\CommandLineProjectTests.cs" />
<Compile Include="WorkspaceTests\AdhocWorkspaceTests.cs" />
<Compile Include="Differencing\MatchTests.cs" />
......@@ -289,4 +291,4 @@
<Import Project="..\..\..\build\Targets\VSL.Imports.targets" />
<Import Project="..\..\..\build\Targets\Roslyn.Toolsets.Xunit.targets" />
</ImportGroup>
</Project>
</Project>
\ No newline at end of file
using System.Collections.Generic;
using Roslyn.Utilities;
using Xunit;
namespace Microsoft.CodeAnalysis.UnitTests.UtilityTest
{
/// <summary>
/// Tests for <see cref="BKTree"/>: targeted lookups plus a set of invariants (exact match,
/// deletion, insertion and transposition of a single character) checked over several
/// word lists.
/// </summary>
public class BKTreeTests
{
    [Fact]
    public void SimpleTests()
    {
        string[] testValues = { "cook", "book", "books", "cake", "what", "water", "Cape", "Boon", "Cook", "Cart" };
        var tree = BKTree.Create(testValues);

        var results1 = tree.Find("wat", threshold: 1);
        Assert.Single(results1, "what");

        var results2 = tree.Find("wat", threshold: 2);
        Assert.True(results2.SetEquals(Expected("cart", "what", "water")));

        var results3 = tree.Find("caqe", threshold: 1);
        Assert.True(results3.SetEquals(Expected("cake", "cape")));
    }

    [Fact]
    public void PermutationTests()
    {
        string[] testValues = { "cook", "book", "books", "cake", "what", "water", "Cape", "Boon", "Cook", "Cart" };
        TestTreeInvariants(testValues);
    }

    /// <summary>
    /// Builds a tree from <paramref name="testValues"/> and checks that each value can be
    /// found exactly, and still found after a single deletion, insertion or transposition.
    /// The tree stores lower-case words, so expectations are lower-cased.
    /// </summary>
    private void TestTreeInvariants(string[] testValues)
    {
        var tree = BKTree.Create(testValues);

        foreach (var value in testValues)
        {
            // With a threshold of 0, we should only find exactly the item we're searching for.
            var items = tree.Find(value, threshold: 0);
            Assert.Single(items, value.ToLowerInvariant());
        }

        foreach (var value in testValues)
        {
            // With a threshold of 1, we should always at least find the item we're looking for.
            // But we may also find additional items along with it.
            var items = tree.Find(value, threshold: 1);
            Assert.Contains(value.ToLowerInvariant(), items);

            // We better not be finding all items.
            Assert.NotEqual(testValues.Length, items.Count);
        }

        foreach (var value in testValues)
        {
            // If we delete each individual character in each search string, we should still
            // find the value in the tree.
            var expected = value.ToLowerInvariant();
            for (var i = 0; i < value.Length; i++)
            {
                var items = tree.Find(Delete(value, i), threshold: null);
                Assert.Contains(expected, items);

                // We better not be finding all items.
                Assert.NotEqual(testValues.Length, items.Count);
            }
        }

        foreach (var value in testValues)
        {
            // If we add a random character at any location in a string, we should still
            // be able to find it.
            var expected = value.ToLowerInvariant();
            for (var i = 0; i <= value.Length; i++)
            {
                var items = tree.Find(Insert(value, i, 'Z'), threshold: null);
                Assert.Contains(expected, items);

                // We better not be finding all items.
                Assert.NotEqual(testValues.Length, items.Count);
            }
        }

        foreach (var value in testValues)
        {
            // If we transpose any characters in a string, we should still
            // be able to find it.
            var expected = value.ToLowerInvariant();
            for (var i = 0; i < value.Length - 1; i++)
            {
                var items = tree.Find(Transpose(value, i), threshold: null);
                Assert.Contains(expected, items);
            }
        }
    }

    /// <summary>Swaps the characters at positions i and i + 1.</summary>
    private string Transpose(string value, int i)
    {
        return value.Substring(0, i) + value[i + 1] + value[i] + value.Substring(i + 2);
    }

    /// <summary>Inserts <paramref name="v"/> before position i.</summary>
    private string Insert(string value, int i, char v)
    {
        return value.Substring(0, i) + v + value.Substring(i);
    }

    /// <summary>Removes the character at position i.</summary>
    private string Delete(string value, int i)
    {
        return value.Substring(0, i) + value.Substring(i + 1);
    }

    [Fact]
    public void Test2()
    {
        string[] testValues = { "Leeds", "York", "Bristol", "Leicester", "Hull", "Durham" };
        var tree = BKTree.Create(testValues);

        var results = tree.Find("hill", threshold: null);
        Assert.True(results.SetEquals(Expected("hull")));

        results = tree.Find("liecester", threshold: null);
        Assert.True(results.SetEquals(Expected("leicester")));

        results = tree.Find("leicestre", threshold: null);
        Assert.True(results.SetEquals(Expected("leicester")));

        results = tree.Find("lecester", threshold: null);
        Assert.True(results.SetEquals(Expected("leicester")));
    }

    [Fact]
    public void TestSpillover()
    {
        // The root is given many children at each distance so that its edge count grows
        // well past a handful of edges.
        string[] testValues = {
            /*root:*/ "Four",
            /*d=1*/ "Fou", "For", "Fur", "Our", "FourA", "FouAr", "FoAur", "FAour", "AFour", "Tour",
            /*d=2*/ "Fo", "Fu", "Fr", "or", "ur", "ou", "FourAb", "FouAbr", "FoAbur", "FAbour", "AbFour", "oFour", "Fuor", "Foru", "ours",
            /*d=3*/ "F", "o", "u", "r", "Fob", "Fox", "bur", "urn", "hur", "foraa", "found"
        };
        TestTreeInvariants(testValues);
    }

    [Fact]
    public void Top1000()
    {
        TestTreeInvariants(EditDistanceTests.Top1000);
    }

    /// <summary>Readability helper: returns its arguments as a sequence.</summary>
    private IEnumerable<string> Expected(params string[] values)
    {
        return values;
    }
}
}
// Copyright (c) Microsoft. All Rights Reserved. Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.
using System;
using System.Linq;
using Roslyn.Utilities;
using Xunit;
......@@ -7,87 +9,300 @@ namespace Microsoft.CodeAnalysis.UnitTests
{
public class EditDistanceTests
{
private static int GetEditDistance(string s, string t)
private static void VerifyEditDistance(string s, string t, int expectedEditDistance)
{
return EditDistance.GetEditDistance(s, t, int.MaxValue);
// We want the full edit distance, without bailing out early because we crossed the
// threshold.
var editDistance1 = EditDistance.GetEditDistance(s, t);
Assert.Equal(expectedEditDistance, editDistance1);
// Edit distances are symmetric.
var editDistance2 = EditDistance.GetEditDistance(t, s);
Assert.Equal(editDistance1, editDistance2);
// If we set the edit distance as our threshold, we should still find the value.
var editDistance3 = EditDistance.GetEditDistance(s, t, editDistance1);
Assert.Equal(editDistance1, editDistance3);
if (editDistance1 > 0)
{
var editDistance4 = EditDistance.GetEditDistance(s, t, editDistance1 - 1);
Assert.Equal(editDistance4, EditDistance.BeyondThreshold);
}
}
[Fact]
public void EditDistance0()
{
Assert.Equal(GetEditDistance("", ""), 0);
Assert.Equal(GetEditDistance("a", "a"), 0);
VerifyEditDistance("", "", 0);
VerifyEditDistance("a", "a", 0);
}
[Fact]
public void EditDistance1()
{
Assert.Equal(GetEditDistance("", "a"), 1);
Assert.Equal(GetEditDistance("a", ""), 1);
Assert.Equal(GetEditDistance("a", "b"), 1);
Assert.Equal(GetEditDistance("ab", "a"), 1);
Assert.Equal(GetEditDistance("a", "ab"), 1);
Assert.Equal(GetEditDistance("aabb", "abab"), 1);
VerifyEditDistance("", "a", 1);
VerifyEditDistance("a", "", 1);
VerifyEditDistance("a", "b", 1);
VerifyEditDistance("ab", "a", 1);
VerifyEditDistance("a", "ab", 1);
VerifyEditDistance("aabb", "abab", 1);
}
[Fact]
public void EditDistance2()
{
Assert.Equal(GetEditDistance("", "aa"), 2);
Assert.Equal(GetEditDistance("aa", ""), 2);
Assert.Equal(GetEditDistance("aa", "bb"), 2);
Assert.Equal(GetEditDistance("aab", "a"), 2);
Assert.Equal(GetEditDistance("a", "aab"), 2);
Assert.Equal(GetEditDistance("aababb", "ababab"), 2);
VerifyEditDistance("", "aa", 2);
VerifyEditDistance("aa", "", 2);
VerifyEditDistance("aa", "bb", 2);
VerifyEditDistance("aab", "a", 2);
VerifyEditDistance("a", "aab", 2);
VerifyEditDistance("aababb", "ababab", 2);
}
[Fact]
public void EditDistance3()
{
Assert.Equal(GetEditDistance("", "aaa"), 3);
Assert.Equal(GetEditDistance("aaa", ""), 3);
Assert.Equal(GetEditDistance("aaa", "bbb"), 3);
Assert.Equal(GetEditDistance("aaab", "a"), 3);
Assert.Equal(GetEditDistance("a", "aaab"), 3);
Assert.Equal(GetEditDistance("aababbab", "abababaa"), 3);
VerifyEditDistance("", "aaa", 3);
VerifyEditDistance("aaa", "", 3);
VerifyEditDistance("aaa", "bbb", 3);
VerifyEditDistance("aaab", "a", 3);
VerifyEditDistance("a", "aaab", 3);
VerifyEditDistance("aababbab", "abababaa", 3);
}
[Fact]
public void EditDistance4()
{
Assert.Equal(GetEditDistance("XlmReade", "XmlReader"), 2);
VerifyEditDistance("XlmReade", "XmlReader", 2);
}
// Without [Fact] the test runner never discovers this method, so the check below would
// silently not run.
[Fact]
public void EditDistance5()
{
    // "Zeil" -> "trials": neither 'Z' nor 'e' occurs in the target (two substitutions),
    // and the target is two characters longer (two insertions).
    VerifyEditDistance("Zeil", "trials", 4);
}
// Pins the edit distance between "barking" and "corkliness" at 6.
[Fact]
public void EditDistance6()
{
VerifyEditDistance("barking", "corkliness", 6);
}
// Pins the edit distance between "kitten" and "sitting" at 3.
[Fact]
public void EditDistance7()
{
VerifyEditDistance("kitten", "sitting", 3);
}
[Fact]
public void MoreEditDistance()
public void EditDistance8()
{
Assert.Equal(GetEditDistance("barking", "corkliness"), 6);
VerifyEditDistance("sunday", "saturday", 3);
}
[Fact]
public void TestCloseMatch()
public void EditDistance9()
{
Assert.True(EditDistance.IsCloseMatch("variabledeclaratorsyntax", "variabledeclaratorsyntaxextensions"));
VerifyEditDistance("meilenstein", "levenshtein", 4);
}
Assert.True(EditDistance.IsCloseMatch("expressionsyntax", "expressionsyntaxextensions"));
Assert.True(EditDistance.IsCloseMatch("expressionsyntax", "expressionsyntaxgeneratorvisitor"));
[Fact]
public void EditDistance10()
{
VerifyEditDistance("rosettacode", "raisethysword", 8);
}
[Fact]
public void TestNotCloseMatch()
public void EditDistance11()
{
Assert.False(EditDistance.IsCloseMatch("propertyblocksyntax", "ipropertysymbol"));
Assert.False(EditDistance.IsCloseMatch("propertyblocksyntax", "ipropertysymbolextensions"));
Assert.False(EditDistance.IsCloseMatch("propertyblocksyntax", "typeblocksyntaxextensions"));
var editDistance = EditDistance.GetEditDistance("book", "moons", 1);
Assert.Equal(editDistance, EditDistance.BeyondThreshold);
VerifyEditDistance("book", "moons", 3);
}
// Four families of cases. In each family the source is a run of 'a's followed by 'b's,
// and the targets drop successively more leading 'a's while appending the same number of
// 'c's, so both strings always have the same length.
[Fact]
public void EditDistance12()
{
// Source "aaaab" (one trailing 'b').
VerifyEditDistance("aaaab", "aaabc", 2);
VerifyEditDistance("aaaab", "aabcc", 3);
VerifyEditDistance("aaaab", "abccc", 4);
VerifyEditDistance("aaaab", "bcccc", 5);
// Source "aaaabb" (two trailing 'b's).
VerifyEditDistance("aaaabb", "aaabbc", 2);
VerifyEditDistance("aaaabb", "aabbcc", 4);
VerifyEditDistance("aaaabb", "abbccc", 5);
VerifyEditDistance("aaaabb", "bbcccc", 6);
// Source "aaaabbb" (three trailing 'b's).
VerifyEditDistance("aaaabbb", "aaabbbc", 2);
VerifyEditDistance("aaaabbb", "aabbbcc", 4);
VerifyEditDistance("aaaabbb", "abbbccc", 6);
VerifyEditDistance("aaaabbb", "bbbcccc", 7);
// Source "aaaabbbb" (four trailing 'b's).
VerifyEditDistance("aaaabbbb", "aaabbbbc", 2);
VerifyEditDistance("aaaabbbb", "aabbbbcc", 4);
VerifyEditDistance("aaaabbbb", "abbbbccc", 6);
VerifyEditDistance("aaaabbbb", "bbbbcccc", 8);
}
// Lower-case English word list used as the corpus for the edit-distance tests in this file:
// Top1000Test compares every entry against every entry, and TestTriangleInequality uses the
// first 50 entries. Top1000Test asserts that a distance of 0 only occurs between an entry and
// itself, so entries are expected to be distinct.
// NOTE(review): the name suggests a "top 1000 most common words" list — the count and the
// origin of the list are not established by this file.
public static readonly string[] Top1000 = new string[]
{
"a","able","about","above","act","add","afraid","after","again","against","age","ago","agree","air","all",
"allow","also","always","am","among","an","and","anger","animal","answer","any","appear","apple","are",
"area","arm","arrange","arrive","art","as","ask","at","atom","baby","back","bad","ball","band","bank",
"bar","base","basic","bat","be","bear","beat","beauty","bed","been","before","began","begin","behind",
"believe","bell","best","better","between","big","bird","bit","black","block","blood","blow","blue","board",
"boat","body","bone","book","born","both","bottom","bought","box","boy","branch","bread","break","bright",
"bring","broad","broke","brother","brought","brown","build","burn","busy","but","buy","by","call","came",
"camp","can","capital","captain","car","card","care","carry","case","cat","catch","caught","cause","cell",
"cent","center","century","certain","chair","chance","change","character","charge","chart","check","chick",
"chief","child","children","choose","chord","circle","city","claim","class","clean","clear","climb","clock",
"close","clothe","cloud","coast","coat","cold","collect","colony","color","column","come","common","company",
"compare","complete","condition","connect","consider","consonant","contain","continent","continue","control",
"cook","cool","copy","corn","corner","correct","cost","cotton","could","count","country","course","cover",
"cow","crease","create","crop","cross","crowd","cry","current","cut","dad","dance","danger","dark","day",
"dead","deal","dear","death","decide","decimal","deep","degree","depend","describe","desert","design",
"determine","develop","dictionary","did","die","differ","difficult","direct","discuss","distant","divide",
"division","do","doctor","does","dog","dollar","done","dont","door","double","down","draw","dream","dress",
"drink","drive","drop","dry","duck","during","each","ear","early","earth","ease","east","eat","edge",
"effect","egg","eight","either","electric","element","else","end","enemy","energy","engine","enough",
"enter","equal","equate","especially","even","evening","event","ever","every","exact","example","except",
"excite","exercise","expect","experience","experiment","eye","face","fact","fair","fall","family","famous",
"far","farm","fast","fat","father","favor","fear","feed","feel","feet","fell","felt","few","field","fig",
"fight","figure","fill","final","find","fine","finger","finish","fire","first","fish","fit","five","flat",
"floor","flow","flower","fly","follow","food","foot","for","force","forest","form","forward","found",
"four","fraction","free","fresh","friend","from","front","fruit","full","fun","game","garden","gas","gather",
"gave","general","gentle","get","girl","give","glad","glass","go","gold","gone","good","got","govern",
"grand","grass","gray","great","green","grew","ground","group","grow","guess","guide","gun","had","hair",
"half","hand","happen","happy","hard","has","hat","have","he","head","hear","heard","heart","heat","heavy",
"held","help","her","here","high","hill","him","his","history","hit","hold","hole","home","hope","horse",
"hot","hour","house","how","huge","human","hundred","hunt","hurry","i","ice","idea","if","imagine","in",
"inch","include","indicate","industry","insect","instant","instrument","interest","invent","iron","is",
"island","it","job","join","joy","jump","just","keep","kept","key","kill","kind","king","knew","know",
"lady","lake","land","language","large","last","late","laugh","law","lay","lead","learn","least","leave",
"led","left","leg","length","less","let","letter","level","lie","life","lift","light","like","line","liquid",
"list","listen","little","live","locate","log","lone","long","look","lost","lot","loud","love","low",
"machine","made","magnet","main","major","make","man","many","map","mark","market","mass","master","match",
"material","matter","may","me","mean","meant","measure","meat","meet","melody","men","metal","method",
"middle","might","mile","milk","million","mind","mine","minute","miss","mix","modern","molecule","moment",
"money","month","moon","more","morning","most","mother","motion","mount","mountain","mouth","move","much",
"multiply","music","must","my","name","nation","natural","nature","near","necessary","neck","need","neighbor",
"never","new","next","night","nine","no","noise","noon","nor","north","nose","note","nothing","notice",
"noun","now","number","numeral","object","observe","occur","ocean","of","off","offer","office","often",
"oh","oil","old","on","once","one","only","open","operate","opposite","or","order","organ","original",
"other","our","out","over","own","oxygen","page","paint","pair","paper","paragraph","parent","part","particular",
"party","pass","past","path","pattern","pay","people","perhaps","period","person","phrase","pick","picture",
"piece","pitch","place","plain","plan","plane","planet","plant","play","please","plural","poem","point",
"poor","populate","port","pose","position","possible","post","pound","power","practice","prepare","present",
"press","pretty","print","probable","problem","process","produce","product","proper","property","protect",
"prove","provide","pull","push","put","quart","question","quick","quiet","quite","quotient","race","radio",
"rail","rain","raise","ran","range","rather","reach","read","ready","real","reason","receive","record",
"red","region","remember","repeat","reply","represent","require","rest","result","rich","ride","right",
"ring","rise","river","road","rock","roll","room","root","rope","rose","round","row","rub","rule","run",
"safe","said","sail","salt","same","sand","sat","save","saw","say","scale","school","science","score",
"sea","search","season","seat","second","section","see","seed","seem","segment","select","self","sell",
"send","sense","sent","sentence","separate","serve","set","settle","seven","several","shall","shape",
"share","sharp","she","sheet","shell","shine","ship","shoe","shop","shore","short","should","shoulder",
"shout","show","side","sight","sign","silent","silver","similar","simple","since","sing","single","sister",
"sit","six","size","skill","skin","sky","slave","sleep","slip","slow","small","smell","smile","snow",
"so","soft","soil","soldier","solution","solve","some","son","song","soon","sound","south","space","speak",
"special","speech","speed","spell","spend","spoke","spot","spread","spring","square","stand","star","start",
"state","station","stay","stead","steam","steel","step","stick","still","stone","stood","stop","store",
"story","straight","strange","stream","street","stretch","string","strong","student","study","subject",
"substance","subtract","success","such","sudden","suffix","sugar","suggest","suit","summer","sun","supply",
"support","sure","surface","surprise","swim","syllable","symbol","system","table","tail","take","talk",
"tall","teach","team","teeth","tell","temperature","ten","term","test","than","thank","that","the","their",
"them","then","there","these","they","thick","thin","thing","think","third","this","those","though","thought",
"thousand","three","through","throw","thus","tie","time","tiny","tire","to","together","told","tone",
"too","took","tool","top","total","touch","toward","town","track","trade","train","travel","tree","triangle",
"trip","trouble","truck","true","try","tube","turn","twenty","two","type","under","unit","until","up",
"us","use","usual","valley","value","vary","verb","very","view","village","visit","voice","vowel","wait",
"walk","wall","want","war","warm","was","wash","watch","water","wave","way","we","wear","weather","week",
"weight","well","went","were","west","what","wheel","when","where","whether","which","while","white",
"who","whole","whose","why","wide","wife","wild","will","win","wind","window","wing","winter","wire",
"wish","with","woman","women","wonder","wont","wood","word","work","world","would","write","written",
"wrong","wrote","yard","year","yellow","yes","yet","you","young","your",
};
// Compares every Top1000 entry against every entry (itself included) and checks that:
//   * an entry is at distance 0 from itself;
//   * a distance of 0 only occurs when both indices are the same;
//   * the distance is never negative;
//   * calling GetEditDistance again with the computed distance as its third argument
//     yields the same value.
[Fact]
public void Top1000Test()
{
    var wordCount = Top1000.Length;

    for (var sourceIndex = 0; sourceIndex < wordCount; sourceIndex++)
    {
        var sourceWord = Top1000[sourceIndex];

        for (var targetIndex = 0; targetIndex < wordCount; targetIndex++)
        {
            var targetWord = Top1000[targetIndex];
            var unboundedDistance = EditDistance.GetEditDistance(sourceWord, targetWord);

            // Identity in both directions: same index => 0, and 0 => same index.
            if (sourceIndex == targetIndex)
            {
                Assert.Equal(0, unboundedDistance);
            }

            if (unboundedDistance == 0)
            {
                Assert.Equal(sourceIndex, targetIndex);
            }

            Assert.True(unboundedDistance >= 0);

            // Feeding the exact distance back in as the third argument must not change the answer.
            var boundedDistance = EditDistance.GetEditDistance(sourceWord, targetWord, unboundedDistance);
            Assert.Equal(unboundedDistance, boundedDistance);
        }
    }
}
// Pins a case that tells a true metric apart from the Optimal String Alignment (OSA) algorithm.
//
// With a true metric, ED(CA, ABC) is 2 because CA -> AC -> ABC. That satisfies the triangle
// inequality:
//
//      ED(CA, AC) + ED(AC, ABC) >= ED(CA, ABC)     ... 1 + 1 >= 2
//
// Without a metric (for example with OSA), the result could be 3, "CA -> A -> AB -> ABC",
// which violates the triangle inequality:
//
//      OSA(CA, AC) + OSA(AC, ABC) >= OSA(CA, ABC)  ... 1 + 1 >= 3 is not true.
//
// Being a metric is important so that the edit distance can properly be used with BKTrees.
[Fact]
public void TestSpecificMetric()
{
    const string source = "CA";
    const string target = "ABC";
    const int expectedDistance = 2;

    VerifyEditDistance(source, target, expectedDistance);
}
// Checks the triangle inequality ED(s1, s3) <= ED(s1, s2) + ED(s2, s3) for every ordered
// triple of pairwise-distinct indices drawn from the first 50 entries of Top1000.
//
// NOTE(review): the EditDistance.IsCloseMatch assertions inside the innermost loop look like
// removed lines from a diff view interleaved with the test body. They do not depend on i, j
// or k, and the same word pairs are asserted through WordSimilarityChecker.AreSimilar in
// WordSimilarityCheckerTests.TestNotCloseMatch. Confirm against the real file.
[Fact]
public void TestTriangleInequality()
{
// Only the first 50 words are used; the loops below visit every ordered triple of them.
var top = Top1000.Take(50).ToArray();
for (var i = 0; i < top.Length; i++)
{
for (var j = 0; j < top.Length; j++)
{
// Skip pairs that reuse the same index.
if (j == i)
{
continue;
}
for (var k = 0; k < top.Length; k++)
{
// Skip triples in which k repeats either of the other two indices.
if (k == i || k == j)
{
continue;
}
Assert.False(EditDistance.IsCloseMatch("fielddeclarationsyntax", "declarationinfo"));
Assert.False(EditDistance.IsCloseMatch("fielddeclarationsyntax", "declarationcomputer"));
Assert.False(EditDistance.IsCloseMatch("fielddeclarationsyntax", "filelinepositionspan"));
var string1 = top[i];
var string2 = top[j];
var string3 = top[k];
Assert.False(EditDistance.IsCloseMatch("variabledeclaratorsyntax", "visualbasicdeclarationcomputer"));
Assert.False(EditDistance.IsCloseMatch("variabledeclaratorsyntax", "ilineseparatorservice"));
var editDistance12 = EditDistance.GetEditDistance(string1, string2);
var editDistance13 = EditDistance.GetEditDistance(string1, string3);
var editDistance23 = EditDistance.GetEditDistance(string2, string3);
Assert.False(EditDistance.IsCloseMatch("expressionsyntax", "awaitexpressioninfo"));
// Going directly from string1 to string3 must never cost more than going via string2.
Assert.True(editDistance13 <= editDistance12 + editDistance23);
}
}
}
}
}
}
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using Roslyn.Utilities;
using Xunit;
namespace Microsoft.CodeAnalysis.UnitTests.UtilityTest
{
// Tests for WordSimilarityChecker.AreSimilar, driven by pairs of lower-case identifier-like names.
public class WordSimilarityCheckerTests
{
    // Every candidate below is its source word with a suffix appended; each pair must be
    // reported as similar.
    [Fact]
    public void TestCloseMatch()
    {
        var cases = new[]
        {
            new
            {
                Source = "variabledeclaratorsyntax",
                Candidates = new[] { "variabledeclaratorsyntaxextensions" },
            },
            new
            {
                Source = "expressionsyntax",
                Candidates = new[] { "expressionsyntaxextensions", "expressionsyntaxgeneratorvisitor" },
            },
        };

        foreach (var testCase in cases)
        {
            foreach (var candidate in testCase.Candidates)
            {
                Assert.True(WordSimilarityChecker.AreSimilar(testCase.Source, candidate));
            }
        }
    }

    // Candidates that only share a fragment with their source word; each pair must be
    // reported as not similar.
    [Fact]
    public void TestNotCloseMatch()
    {
        var cases = new[]
        {
            new
            {
                Source = "propertyblocksyntax",
                Candidates = new[] { "ipropertysymbol", "ipropertysymbolextensions", "typeblocksyntaxextensions" },
            },
            new
            {
                Source = "fielddeclarationsyntax",
                Candidates = new[] { "declarationinfo", "declarationcomputer", "filelinepositionspan" },
            },
            new
            {
                Source = "variabledeclaratorsyntax",
                Candidates = new[] { "visualbasicdeclarationcomputer", "ilineseparatorservice" },
            },
            new
            {
                Source = "expressionsyntax",
                Candidates = new[] { "awaitexpressioninfo" },
            },
        };

        foreach (var testCase in cases)
        {
            foreach (var candidate in testCase.Candidates)
            {
                Assert.False(WordSimilarityChecker.AreSimilar(testCase.Source, candidate));
            }
        }
    }
}
}
\ No newline at end of file
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册