提交 7d27b42b 编写于 作者: C Cyrus Najmabadi

PR feedback.

上级 1e8607df
// Copyright (c) Microsoft. All Rights Reserved. Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.
using System;
using System.Collections;
using System.Collections.Generic;
using System.Collections.Immutable;
using System.Diagnostics;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using Microsoft.CodeAnalysis;
using Microsoft.CodeAnalysis.Text;
......@@ -19,8 +16,18 @@ private class Builder
{
private const int CompactEdgeAllocationSize = 4;
private readonly char[] _allLowerCaseCharacters;
private readonly TextSpan[] _characterSpans;
// Instead of producing a char[] for each string we're building a node for, we instead
// have one long char[] with all the characters of each string concatenated. i.e.
// "foo", "bar" and "baz" become { f, o, o, b, a, r, b, a, z }. Then in _wordSpans
// we have the text spans for each of those words in this array. This gives us only
// two allocations instead of as many allocations as the number of strings we have.
//
// Once we are done building, we pass this to the BKTree and its nodes also state the
// span of this array that corresponds to the word they were created for. This works
// well as other dependent facilities (like EditDistance) can work on sub-arrays without
// any problems.
private readonly char[] _concatenatedLowerCaseWords;
private readonly TextSpan[] _wordSpans;
// Note: while building a BKTree we have to store children with parents, keyed by the
// edit distance between the two. Naive implementations might store a list or dictionary
......@@ -83,18 +90,18 @@ public Builder(IEnumerable<string> values)
var distinctValues = values.Where(v => v.Length > 0).Distinct(CaseInsensitiveComparison.Comparer).ToArray();
var charCount = Enumerable.Sum<string>(values,(Func<string, int>)(v => (int)v.Length));
_allLowerCaseCharacters = new char[charCount];
_characterSpans = new TextSpan[distinctValues.Length];
_concatenatedLowerCaseWords = new char[charCount];
_wordSpans = new TextSpan[distinctValues.Length];
var characterIndex = 0;
for (int i = 0; i < distinctValues.Length; i++)
{
var value = distinctValues[i];
_characterSpans[i] = new TextSpan(characterIndex, value.Length);
_wordSpans[i] = new TextSpan(characterIndex, value.Length);
foreach (var ch in value)
{
_allLowerCaseCharacters[characterIndex] = char.ToLower(ch);
_concatenatedLowerCaseWords[characterIndex] = char.ToLower(ch);
characterIndex++;
}
}
......@@ -106,9 +113,9 @@ public Builder(IEnumerable<string> values)
internal BKTree Create()
{
for (var i = 0; i < _characterSpans.Length; i++)
for (var i = 0; i < _wordSpans.Length; i++)
{
Add(_characterSpans[i], insertionIndex: i);
Add(_wordSpans[i], insertionIndex: i);
}
var nodes = ImmutableArray.CreateBuilder<Node>(_builderNodes.Length);
......@@ -119,7 +126,7 @@ internal BKTree Create()
BuildArrays(nodes, edges);
return new BKTree(_allLowerCaseCharacters, nodes.MoveToImmutable(), edges.MoveToImmutable());
return new BKTree(_concatenatedLowerCaseWords, nodes.MoveToImmutable(), edges.MoveToImmutable());
}
private void BuildArrays(ImmutableArray<Node>.Builder nodes, ImmutableArray<Edge>.Builder edges)
......@@ -136,7 +143,7 @@ private void BuildArrays(ImmutableArray<Node>.Builder nodes, ImmutableArray<Edge
{
if (edgeCount <= CompactEdgeAllocationSize)
{
// When tehre are less than 4 elements, copy from teh _compact array.
// When there are 4 or fewer elements, copy from the _compact array.
var start = i * CompactEdgeAllocationSize;
var end = start + edgeCount;
for (var j = start; j < end; j++)
......@@ -182,8 +189,8 @@ private void Add(TextSpan characterSpan, int insertionIndex)
// a threshold here as we need the actual edit distance so we can actually
// determine what edge to make or walk.
var editDistance = EditDistance.GetEditDistance(
new ArraySlice<char>(_allLowerCaseCharacters, currentNode.CharacterSpan),
new ArraySlice<char>(_allLowerCaseCharacters, characterSpan));
new ArraySlice<char>(_concatenatedLowerCaseWords, currentNode.CharacterSpan),
new ArraySlice<char>(_concatenatedLowerCaseWords, characterSpan));
if (editDistance == 0)
{
......
// Copyright (c) Microsoft. All Rights Reserved. Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
namespace Roslyn.Utilities
{
internal partial class BKTree
......
......@@ -9,8 +9,8 @@ internal partial class BKTree
private struct Node
{
// The string this node corresponds to. Specifically, this span is the range of
// _allLowerCaseCharacters for that string.
public readonly TextSpan CharacterSpan;
// _concatenatedLowerCaseWords for that string.
public readonly TextSpan WordSpan;
// How many children/edges this node has.
public readonly int EdgeCount;
......@@ -19,17 +19,17 @@ private struct Node
// _edges[FirstEdgeIndex, FirstEdgeIndex + EdgeCount)
public readonly int FirstEdgeIndex;
public Node(TextSpan characterSpan, int edgeCount, int firstEdgeIndex)
public Node(TextSpan wordSpan, int edgeCount, int firstEdgeIndex)
{
CharacterSpan = characterSpan;
WordSpan = wordSpan;
EdgeCount = edgeCount;
FirstEdgeIndex = firstEdgeIndex;
}
// Serializes this node to the given writer as a flat sequence of 32-bit integers:
// the start and length of the node's text span, followed by EdgeCount and
// FirstEdgeIndex. The span is written as its two integer components rather than
// as a TextSpan value.
// NOTE(review): a matching reader presumably has to consume these values in the
// same order they are written here -- confirm against the corresponding read path.
internal void WriteTo(ObjectWriter writer)
{
writer.WriteInt32(CharacterSpan.Start);
writer.WriteInt32(CharacterSpan.Length);
writer.WriteInt32(WordSpan.Start);
writer.WriteInt32(WordSpan.Length);
writer.WriteInt32(EdgeCount);
writer.WriteInt32(FirstEdgeIndex);
}
......
......@@ -8,8 +8,8 @@ internal partial class BKTree
{
internal void WriteTo(ObjectWriter writer)
{
writer.WriteInt32(_allLowerCaseCharacters.Length);
foreach (var c in _allLowerCaseCharacters)
writer.WriteInt32(_concatenatedLowerCaseWords.Length);
foreach (var c in _concatenatedLowerCaseWords)
{
writer.WriteChar(c);
}
......@@ -29,10 +29,10 @@ internal void WriteTo(ObjectWriter writer)
internal static BKTree ReadFrom(ObjectReader reader)
{
var allLowerCaseCharacters = new char[reader.ReadInt32()];
for (var i = 0; i < allLowerCaseCharacters.Length; i++)
var concatenatedLowerCaseWords = new char[reader.ReadInt32()];
for (var i = 0; i < concatenatedLowerCaseWords.Length; i++)
{
allLowerCaseCharacters[i] = reader.ReadChar();
concatenatedLowerCaseWords[i] = reader.ReadChar();
}
var nodeCount = reader.ReadInt32();
......@@ -49,7 +49,7 @@ internal static BKTree ReadFrom(ObjectReader reader)
edges.Add(Edge.ReadFrom(reader));
}
return new BKTree(allLowerCaseCharacters, nodes.MoveToImmutable(), edges.MoveToImmutable());
return new BKTree(concatenatedLowerCaseWords, nodes.MoveToImmutable(), edges.MoveToImmutable());
}
}
}
......@@ -33,19 +33,19 @@ internal partial class BKTree
// [node.FirstEdgeIndex, node.FirstEdgeIndex + node.EdgeCount)
//
// Each node also has an associated string. These strings are concatenated and stored
// in _allLowerCaseCharacters. Each node has a TextSpan that indicates which portion
// in _concatenatedLowerCaseWords. Each node has a TextSpan that indicates which portion
// of the character array is their string. Note: I'd like to use an immutable array
// for the characters as well. However, we need to create slices, and they need to
// work on top of an ArraySlice (which needs a char[]). The edit distance code also
// wants to work on top of raw char[]s (both for speed, and so it can pool arrays
// to prevent lots of garbage). Because of that we just keep this as a char[].
private readonly char[] _allLowerCaseCharacters;
private readonly char[] _concatenatedLowerCaseWords;
private readonly ImmutableArray<Node> _nodes;
private readonly ImmutableArray<Edge> _edges;
private BKTree(char[] allLowerCaseCharacters, ImmutableArray<Node> nodes, ImmutableArray<Edge> edges)
private BKTree(char[] concatenatedLowerCaseWords, ImmutableArray<Node> nodes, ImmutableArray<Edge> edges)
{
_allLowerCaseCharacters = allLowerCaseCharacters;
_concatenatedLowerCaseWords = concatenatedLowerCaseWords;
_nodes = nodes;
_edges = edges;
}
......@@ -91,15 +91,15 @@ private void Lookup(Node currentNode, char[] queryCharacters, int queryLength, i
// We always want to compute the real edit distance (ignoring any thresholds). This is
// because we need that edit distance to appropriately determine which edges to walk
// in the tree.
var characterSpan = currentNode.CharacterSpan;
var characterSpan = currentNode.WordSpan;
var editDistance = EditDistance.GetEditDistance(
new ArraySlice<char>(_allLowerCaseCharacters, characterSpan),
new ArraySlice<char>(_concatenatedLowerCaseWords, characterSpan),
new ArraySlice<char>(queryCharacters, 0, queryLength));
if (editDistance <= threshold)
{
// Found a match.
result.Add(new string(_allLowerCaseCharacters, characterSpan.Start, characterSpan.Length));
result.Add(new string(_concatenatedLowerCaseWords, characterSpan.Start, characterSpan.Length));
}
var min = editDistance - threshold;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册