提交 eacf19dc 编写于 作者: C Cyrus Najmabadi

Use a flat contiguous array of characters for the BKTree instead of thousands of small char arrays.

上级 48589ea8
......@@ -13,7 +13,7 @@ namespace Microsoft.CodeAnalysis.FindSymbols
internal partial class SymbolTreeInfo : IObjectWritable
{
private const string PrefixMetadataSymbolTreeInfo = "<MetadataSymbolTreeInfoPersistence>_";
private const string SerializationFormat = "5";
private const string SerializationFormat = "6";
/// <summary>
/// this is for a metadata reference in a solution
......
......@@ -4,6 +4,7 @@
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using Microsoft.CodeAnalysis.Text;
namespace Roslyn.Utilities
{
......@@ -19,6 +20,10 @@ public ArraySlice(T[] array) : this(array, 0, array.Length)
{
}
/// <summary>
/// Creates a slice over the portion of <paramref name="array"/> described by
/// <paramref name="span"/>. Forwards to the (array, start, length) constructor using
/// <c>span.Start</c> and <c>span.Length</c>.
/// </summary>
public ArraySlice(T[] array, TextSpan span)
    : this(array, span.Start, span.Length)
{
}
public ArraySlice(T[] array, int start, int length) : this()
{
_array = array;
......
using System;
using System.Collections;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using Microsoft.CodeAnalysis;
using Microsoft.CodeAnalysis.Text;
namespace Roslyn.Utilities
{
......@@ -13,7 +16,8 @@ private class Builder
{
private const int CompactEdgeAllocationSize = 4;
private readonly char[][] _values;
private readonly char[] _allLowerCaseCharacters;
private readonly TextSpan[] _characterSpans;
// Note: while building a BKTree we have to store children with parents, keyed by the
// edit distance between the two. Naive implementations might store a list or dictionary
......@@ -73,21 +77,35 @@ private class Builder
/// <summary>
/// Prepares the builder for the given words. Empty strings are dropped, the rest are
/// de-duplicated with <c>CaseInsensitiveComparison.Comparer</c>, and the lower-cased
/// characters of every remaining word are packed into one contiguous array. Each word's
/// position in that array is recorded as a span.
/// </summary>
/// <param name="values">The words to add to the tree. Enumerated exactly once.</param>
public Builder(IEnumerable<string> values)
{
    var distinctValues = values.Where(v => v.Length > 0).Distinct(CaseInsensitiveComparison.Comparer).ToArray();

    // Size the flat array from the de-duplicated words, not from 'values'. Only the
    // distinct words are copied below, so counting duplicates would leave unused slots
    // at the end of the array.
    var charCount = distinctValues.Sum(v => v.Length);

    _allLowerCaseCharacters = new char[charCount];
    _characterSpans = new TextSpan[distinctValues.Length];

    var characterIndex = 0;
    for (int i = 0; i < distinctValues.Length; i++)
    {
        var value = distinctValues[i];
        _characterSpans[i] = new TextSpan(characterIndex, value.Length);

        foreach (var ch in value)
        {
            // NOTE(review): char.ToLower uses the current culture. Confirm that lookups
            // lower-case their query the same way.
            _allLowerCaseCharacters[characterIndex] = char.ToLower(ch);
            characterIndex++;
        }
    }

    // Every slot of the flat array must have been filled by exactly one word.
    Debug.Assert(characterIndex == charCount);

    // We will have one node for each string value that we are adding.
    _builderNodes = new BuilderNode[distinctValues.Length];
    _compactEdges = new Edge[distinctValues.Length * CompactEdgeAllocationSize];
}
internal BKTree Create()
{
for (var i = 0; i < _values.Length; i++)
for (var i = 0; i < _characterSpans.Length; i++)
{
Add(_values[i], insertionIndex: i);
Add(_characterSpans[i], insertionIndex: i);
}
var nodes = new Node[_builderNodes.Length];
......@@ -98,7 +116,7 @@ internal BKTree Create()
BuildArrays(nodes, edges);
return new BKTree(nodes, edges);
return new BKTree(_allLowerCaseCharacters, nodes, edges);
}
private void BuildArrays(Node[] nodes, Edge[] edges)
......@@ -109,8 +127,7 @@ private void BuildArrays(Node[] nodes, Edge[] edges)
var builderNode = _builderNodes[i];
var edgeCount = builderNode.EdgeCount;
nodes[i] = new Node(
builderNode.LowerCaseCharacters, edgeCount, currentEdgeIndex);
nodes[i] = new Node(builderNode.CharacterSpan, edgeCount, currentEdgeIndex);
if (edgeCount > 0)
{
......@@ -140,11 +157,11 @@ private void BuildArrays(Node[] nodes, Edge[] edges)
Debug.Assert(currentEdgeIndex == edges.Length);
}
private void Add(char[] lowerCaseCharacters, int insertionIndex)
private void Add(TextSpan characterSpan, int insertionIndex)
{
if (insertionIndex == 0)
{
_builderNodes[insertionIndex] = new BuilderNode(lowerCaseCharacters);
_builderNodes[insertionIndex] = new BuilderNode(characterSpan);
return;
}
......@@ -156,7 +173,9 @@ private void Add(char[] lowerCaseCharacters, int insertionIndex)
// Determine the edit distance between these two words. Note: we do not use
// a threshold here as we need the actual edit distance so we can actually
// determine what edge to make or walk.
var editDistance = EditDistance.GetEditDistance(currentNode.LowerCaseCharacters, lowerCaseCharacters);
var editDistance = EditDistance.GetEditDistance(
new ArraySlice<char>(_allLowerCaseCharacters, currentNode.CharacterSpan),
new ArraySlice<char>(_allLowerCaseCharacters, characterSpan));
if (editDistance == 0)
{
......@@ -174,13 +193,13 @@ private void Add(char[] lowerCaseCharacters, int insertionIndex)
}
// found the node we want to add the child node to.
AddChildNode(lowerCaseCharacters, insertionIndex, currentNode.EdgeCount, currentNodeIndex, editDistance);
AddChildNode(characterSpan, insertionIndex, currentNode.EdgeCount, currentNodeIndex, editDistance);
return;
}
}
private void AddChildNode(
char[] lowerCaseCharacters, int insertionIndex, int currentNodeEdgeCount, int currentNodeIndex, int editDistance)
TextSpan characterSpan, int insertionIndex, int currentNodeEdgeCount, int currentNodeIndex, int editDistance)
{
// Node doesn't have an edge with this edit distance. Three cases to handle:
// 1) there are less than 4 edges. We simply place the edge into the correct
......@@ -215,7 +234,7 @@ private void Add(char[] lowerCaseCharacters, int insertionIndex)
}
_builderNodes[currentNodeIndex].EdgeCount++;
_builderNodes[insertionIndex] = new BuilderNode(lowerCaseCharacters);
_builderNodes[insertionIndex] = new BuilderNode(characterSpan);
return;
}
......@@ -247,13 +266,13 @@ private bool TryGetChildIndex(BuilderNode currentNode, int currentNodeIndex, int
/// <summary>
/// Per-word node used while the tree is being built.
/// </summary>
private struct BuilderNode
{
    // Location of this node's characters. The builder pairs this span with its
    // _allLowerCaseCharacters array when it computes edit distances.
    public readonly TextSpan CharacterSpan;

    // Incremented each time a child node is attached to this node.
    public int EdgeCount;

    public Dictionary<int, int> SpilloverEdges;

    public BuilderNode(TextSpan characterSpan) : this()
    {
        this.CharacterSpan = characterSpan;
    }
}
}
......
......@@ -3,6 +3,7 @@
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using Microsoft.CodeAnalysis.Text;
namespace Roslyn.Utilities
{
......@@ -12,7 +13,7 @@ private struct Node
{
// The string this node corresponds to, stored as the span of its characters within the
// tree's shared lower-case character array (_allLowerCaseCharacters).
public readonly char[] LowerCaseCharacters;
public readonly TextSpan CharacterSpan;
// How many children/edges this node has.
public readonly int EdgeCount;
......@@ -21,23 +22,24 @@ private struct Node
// _edges[FirstEdgeIndex, FirstEdgeIndex + EdgeCount)
public readonly int FirstEdgeIndex;
/// <summary>
/// Initializes a node from the span of its characters and the range of its child edges.
/// </summary>
/// <param name="characterSpan">Span locating this node's characters.</param>
/// <param name="edgeCount">How many children/edges this node has.</param>
/// <param name="firstEdgeIndex">Index of this node's first edge in the edge array.</param>
public Node(TextSpan characterSpan, int edgeCount, int firstEdgeIndex)
{
    CharacterSpan = characterSpan;
    EdgeCount = edgeCount;
    FirstEdgeIndex = firstEdgeIndex;
}
/// <summary>
/// Writes this node as four 32-bit integers: span start, span length, edge count and
/// first edge index.
/// </summary>
internal void WriteTo(ObjectWriter writer)
{
    // The span is written as (start, length); ReadFrom consumes the values in this same order.
    writer.WriteInt32(CharacterSpan.Start);
    writer.WriteInt32(CharacterSpan.Length);
    writer.WriteInt32(EdgeCount);
    writer.WriteInt32(FirstEdgeIndex);
}
/// <summary>
/// Reads a node previously written by <c>WriteTo</c>.
/// </summary>
internal static Node ReadFrom(ObjectReader reader)
{
    // Read into locals so the order visibly mirrors WriteTo:
    // span start, span length, edge count, first edge index.
    var start = reader.ReadInt32();
    var length = reader.ReadInt32();
    var edgeCount = reader.ReadInt32();
    var firstEdgeIndex = reader.ReadInt32();

    return new Node(new TextSpan(start, length), edgeCount, firstEdgeIndex);
}
}
}
......
......@@ -10,6 +10,8 @@ internal partial class BKTree
{
internal void WriteTo(ObjectWriter writer)
{
writer.WriteValue(_allLowerCaseCharacters);
writer.WriteInt32(this._nodes.Length);
foreach (var node in _nodes)
{
......@@ -25,6 +27,7 @@ internal void WriteTo(ObjectWriter writer)
internal static BKTree ReadFrom(ObjectReader reader)
{
var allLowerCaseCharacters = (char[])reader.ReadValue();
var nodes = new Node[reader.ReadInt32()];
for (var i = 0; i < nodes.Length; i++)
{
......@@ -37,7 +40,7 @@ internal static BKTree ReadFrom(ObjectReader reader)
edges[i] = Edge.ReadFrom(reader);
}
return new BKTree(nodes, edges);
return new BKTree(allLowerCaseCharacters, nodes, edges);
}
}
}
......@@ -13,6 +13,7 @@ namespace Roslyn.Utilities
internal partial class BKTree
{
// Shared instance constructed from empty character, node, and edge arrays.
public static readonly BKTree Empty = new BKTree(
SpecializedCollections.EmptyArray<char>(),
SpecializedCollections.EmptyArray<Node>(),
SpecializedCollections.EmptyArray<Edge>());
......@@ -28,12 +29,13 @@ internal partial class BKTree
// * of course '0' is only for the root case. All nodes state where in _edges
// their child edges range starts. So the children for any node are in _edges from
// [node.FirstEdgeIndex, node.FirstEdgeIndex + node.EdgeCount)
private readonly char[] _allLowerCaseCharacters;
private readonly Node[] _nodes;
private readonly Edge[] _edges;
/// <summary>
/// Wraps already-built arrays in a tree; the arrays are stored as given, not copied.
/// </summary>
/// <param name="allLowerCaseCharacters">Flat array holding the characters that node spans index into.</param>
/// <param name="nodes">The tree's nodes.</param>
/// <param name="edges">The tree's edges.</param>
private BKTree(char[] allLowerCaseCharacters, Node[] nodes, Edge[] edges)
{
    _allLowerCaseCharacters = allLowerCaseCharacters;
    _nodes = nodes;
    _edges = edges;
}
......@@ -79,13 +81,15 @@ private void Lookup(Node currentNode, char[] queryCharacters, int queryLength, i
// We always want to compute the real edit distance (ignoring any thresholds). This is
// because we need that edit distance to appropriately determine which edges to walk
// in the tree.
var characterSpan = currentNode.CharacterSpan;
var editDistance = EditDistance.GetEditDistance(
new ArraySlice<char>(currentNode.LowerCaseCharacters), new ArraySlice<char>(queryCharacters, 0, queryLength));
new ArraySlice<char>(_allLowerCaseCharacters, characterSpan),
new ArraySlice<char>(queryCharacters, 0, queryLength));
if (editDistance <= threshold)
{
// Found a match.
result.Add(new string(currentNode.LowerCaseCharacters));
result.Add(new string(_allLowerCaseCharacters, characterSpan.Start, characterSpan.Length));
}
var min = editDistance - threshold;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册