diff --git a/src/Workspaces/Core/Portable/Utilities/BKTree.Builder.cs b/src/Workspaces/Core/Portable/Utilities/BKTree.Builder.cs index 5e7ea65eabb94edd15aa35f675cbb82dedac6cb7..e765d4c9a41db7f4fb155add5cb4deb533b13fd4 100644 --- a/src/Workspaces/Core/Portable/Utilities/BKTree.Builder.cs +++ b/src/Workspaces/Core/Portable/Utilities/BKTree.Builder.cs @@ -1,13 +1,10 @@ // Copyright (c) Microsoft. All Rights Reserved. Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information. using System; -using System.Collections; using System.Collections.Generic; using System.Collections.Immutable; using System.Diagnostics; using System.Linq; -using System.Text; -using System.Threading.Tasks; using Microsoft.CodeAnalysis; using Microsoft.CodeAnalysis.Text; @@ -19,8 +16,18 @@ private class Builder { private const int CompactEdgeAllocationSize = 4; - private readonly char[] _allLowerCaseCharacters; - private readonly TextSpan[] _characterSpans; + // Instead of producing a char[] for each string we're building a node for, we instead + // have one long char[] with all the characters of each string concatenated. i.e. + // "foo" "bar" and "baz" becomes { f, o, o, b, a, r, b, a, z }. Then in _wordSpans + // we have the text spans for each of those words in this array. This gives us only + // two allocations instead of as many allocations as the number of strings we have. + // + // Once we are done building, we pass this to the BKTree and its nodes also state the + // span of this array that corresponds to the word they were created for. This works + // well as other dependent facilities (like EditDistance) can work on sub-arrays without + // any problems. + private readonly char[] _concatenatedLowerCaseWords; + private readonly TextSpan[] _wordSpans; // Note: while building a BKTree we have to store children with parents, keyed by the // edit distance between the two. 
Naive implementations might store a list or dictionary @@ -83,18 +90,18 @@ public Builder(IEnumerable values) var distinctValues = values.Where(v => v.Length > 0).Distinct(CaseInsensitiveComparison.Comparer).ToArray(); var charCount = Enumerable.Sum(values,(Func)(v => (int)v.Length)); - _allLowerCaseCharacters = new char[charCount]; - _characterSpans = new TextSpan[distinctValues.Length]; + _concatenatedLowerCaseWords = new char[charCount]; + _wordSpans = new TextSpan[distinctValues.Length]; var characterIndex = 0; for (int i = 0; i < distinctValues.Length; i++) { var value = distinctValues[i]; - _characterSpans[i] = new TextSpan(characterIndex, value.Length); + _wordSpans[i] = new TextSpan(characterIndex, value.Length); foreach (var ch in value) { - _allLowerCaseCharacters[characterIndex] = char.ToLower(ch); + _concatenatedLowerCaseWords[characterIndex] = char.ToLower(ch); characterIndex++; } } @@ -106,9 +113,9 @@ public Builder(IEnumerable values) internal BKTree Create() { - for (var i = 0; i < _characterSpans.Length; i++) + for (var i = 0; i < _wordSpans.Length; i++) { - Add(_characterSpans[i], insertionIndex: i); + Add(_wordSpans[i], insertionIndex: i); } var nodes = ImmutableArray.CreateBuilder(_builderNodes.Length); @@ -119,7 +126,7 @@ internal BKTree Create() BuildArrays(nodes, edges); - return new BKTree(_allLowerCaseCharacters, nodes.MoveToImmutable(), edges.MoveToImmutable()); + return new BKTree(_concatenatedLowerCaseWords, nodes.MoveToImmutable(), edges.MoveToImmutable()); } private void BuildArrays(ImmutableArray.Builder nodes, ImmutableArray.Builder edges) @@ -136,7 +143,7 @@ private void BuildArrays(ImmutableArray.Builder nodes, ImmutableArray(_allLowerCaseCharacters, currentNode.CharacterSpan), - new ArraySlice(_allLowerCaseCharacters, characterSpan)); + new ArraySlice(_concatenatedLowerCaseWords, currentNode.CharacterSpan), + new ArraySlice(_concatenatedLowerCaseWords, characterSpan)); if (editDistance == 0) { diff --git 
a/src/Workspaces/Core/Portable/Utilities/BKTree.Edge.cs b/src/Workspaces/Core/Portable/Utilities/BKTree.Edge.cs index 4a2445fd0c3cf3c3e27c964229602eb2a58e90e9..7d39fba40a9fb9e8cfdf91551978349f68290a37 100644 --- a/src/Workspaces/Core/Portable/Utilities/BKTree.Edge.cs +++ b/src/Workspaces/Core/Portable/Utilities/BKTree.Edge.cs @@ -1,11 +1,5 @@ // Copyright (c) Microsoft. All Rights Reserved. Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information. -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using System.Threading.Tasks; - namespace Roslyn.Utilities { internal partial class BKTree diff --git a/src/Workspaces/Core/Portable/Utilities/BKTree.Node.cs b/src/Workspaces/Core/Portable/Utilities/BKTree.Node.cs index b554fed311f1a7dce70dddc3b7f4ca32d41d3d22..e7492ac5c34eb3e66d0851621c96e0ce8a0d0732 100644 --- a/src/Workspaces/Core/Portable/Utilities/BKTree.Node.cs +++ b/src/Workspaces/Core/Portable/Utilities/BKTree.Node.cs @@ -9,8 +9,8 @@ internal partial class BKTree private struct Node { // The string this node corresponds to. Specifically, this span is the range of - // _allLowerCaseCharacters for that string. - public readonly TextSpan CharacterSpan; + // _concatenatedLowerCaseWords for that string. + public readonly TextSpan WordSpan; // How many children/edges this node has. 
public readonly int EdgeCount; @@ -19,17 +19,17 @@ private struct Node // _edges[FirstEdgeIndex, FirstEdgeIndex + EdgeCount) public readonly int FirstEdgeIndex; - public Node(TextSpan characterSpan, int edgeCount, int firstEdgeIndex) + public Node(TextSpan wordSpan, int edgeCount, int firstEdgeIndex) { - CharacterSpan = characterSpan; + WordSpan = wordSpan; EdgeCount = edgeCount; FirstEdgeIndex = firstEdgeIndex; } internal void WriteTo(ObjectWriter writer) { - writer.WriteInt32(CharacterSpan.Start); - writer.WriteInt32(CharacterSpan.Length); + writer.WriteInt32(WordSpan.Start); + writer.WriteInt32(WordSpan.Length); writer.WriteInt32(EdgeCount); writer.WriteInt32(FirstEdgeIndex); } diff --git a/src/Workspaces/Core/Portable/Utilities/BKTree.Serialization.cs b/src/Workspaces/Core/Portable/Utilities/BKTree.Serialization.cs index 4dd97029f77e68f596fc5675afc96306a3cd4dca..946bbc6238f9222c2439cbfb6edf0ab9295d3cde 100644 --- a/src/Workspaces/Core/Portable/Utilities/BKTree.Serialization.cs +++ b/src/Workspaces/Core/Portable/Utilities/BKTree.Serialization.cs @@ -8,8 +8,8 @@ internal partial class BKTree { internal void WriteTo(ObjectWriter writer) { - writer.WriteInt32(_allLowerCaseCharacters.Length); - foreach (var c in _allLowerCaseCharacters) + writer.WriteInt32(_concatenatedLowerCaseWords.Length); + foreach (var c in _concatenatedLowerCaseWords) { writer.WriteChar(c); } @@ -29,10 +29,10 @@ internal void WriteTo(ObjectWriter writer) internal static BKTree ReadFrom(ObjectReader reader) { - var allLowerCaseCharacters = new char[reader.ReadInt32()]; - for (var i = 0; i < allLowerCaseCharacters.Length; i++) + var concatenatedLowerCaseWords = new char[reader.ReadInt32()]; + for (var i = 0; i < concatenatedLowerCaseWords.Length; i++) { - allLowerCaseCharacters[i] = reader.ReadChar(); + concatenatedLowerCaseWords[i] = reader.ReadChar(); } var nodeCount = reader.ReadInt32(); @@ -49,7 +49,7 @@ internal static BKTree ReadFrom(ObjectReader reader) edges.Add(Edge.ReadFrom(reader)); } 
- return new BKTree(allLowerCaseCharacters, nodes.MoveToImmutable(), edges.MoveToImmutable()); + return new BKTree(concatenatedLowerCaseWords, nodes.MoveToImmutable(), edges.MoveToImmutable()); } } } diff --git a/src/Workspaces/Core/Portable/Utilities/BKTree.cs b/src/Workspaces/Core/Portable/Utilities/BKTree.cs index 959000c8e14c00955d8ba051167db1cc2cdecab8..2e8a738f6ea27d0d20fe41c24f74157d22aa5eeb 100644 --- a/src/Workspaces/Core/Portable/Utilities/BKTree.cs +++ b/src/Workspaces/Core/Portable/Utilities/BKTree.cs @@ -33,19 +33,19 @@ internal partial class BKTree // [node.FirstEdgeIndex, node.FirstEdgeIndex + node.EdgeCount) // // Each node also has an associated string. These strings are concatenated and stored - // in _allLowerCaseCharacters. Each node has a TextSpan that indicates which portion + // in _concatenatedLowerCaseWords. Each node has a TextSpan that indicates which portion // of the character array is their string. Note: i'd like to use an immutable array // for the characters as well. However, we need to create slices, and they need to // work on top of an ArraySlice (which needs a char[]). The edit distance code also // wants to work on top of raw char[]s (both for speed, and so it can pool arrays // to prevent lots of garbage). Because of that we just keep this as a char[]. 
- private readonly char[] _allLowerCaseCharacters; + private readonly char[] _concatenatedLowerCaseWords; private readonly ImmutableArray _nodes; private readonly ImmutableArray _edges; - private BKTree(char[] allLowerCaseCharacters, ImmutableArray nodes, ImmutableArray edges) + private BKTree(char[] concatenatedLowerCaseWords, ImmutableArray nodes, ImmutableArray edges) { - _allLowerCaseCharacters = allLowerCaseCharacters; + _concatenatedLowerCaseWords = concatenatedLowerCaseWords; _nodes = nodes; _edges = edges; } @@ -91,15 +91,15 @@ private void Lookup(Node currentNode, char[] queryCharacters, int queryLength, i // We always want to compute the real edit distance (ignoring any thresholds). This is // because we need that edit distance to appropriately determine which edges to walk // in the tree. - var characterSpan = currentNode.CharacterSpan; + var characterSpan = currentNode.WordSpan; var editDistance = EditDistance.GetEditDistance( - new ArraySlice(_allLowerCaseCharacters, characterSpan), + new ArraySlice(_concatenatedLowerCaseWords, characterSpan), new ArraySlice(queryCharacters, 0, queryLength)); if (editDistance <= threshold) { // Found a match. - result.Add(new string(_allLowerCaseCharacters, characterSpan.Start, characterSpan.Length)); + result.Add(new string(_concatenatedLowerCaseWords, characterSpan.Start, characterSpan.Length)); } var min = editDistance - threshold;