using System; using System.Collections.Generic; using System.Diagnostics; using System.Linq; using System.Text; using System.Threading.Tasks; using Roslyn.Utilities; using static System.Math; using static Roslyn.Utilities.PortableShim; namespace Roslyn.Utilities { /// /// NOTE: do not use this class directly. It is intended for use only by the spell checker. /// internal partial class BKTree { public static readonly BKTree Empty = new BKTree( SpecializedCollections.EmptyArray(), SpecializedCollections.EmptyArray(), SpecializedCollections.EmptyArray()); // We have three completely flat arrays of structs. These arrays fully represent the // BK tree. The structure is as follows: // // The root node is in _nodes[0]. // // It lists the count of edges it has. These edges are in _edges in the range // [0*, childCount). Each edge has the index of the child node it points to, and the // edit distance between the parent and the child. // // * of course '0' is only for the root case. All nodes state where in _edges // their child edges range starts. So the children for any node are in _edges from // [node.FirstEdgeIndex, node.FirstEdgeIndex + node.EdgeCount) // // Each node also has an associated string. These strings are concatenated and stored // in _allLowerCaseCharacters. Each node has a TextSpan that indicates which portion // of the character array is their string. private readonly char[] _allLowerCaseCharacters; private readonly Node[] _nodes; private readonly Edge[] _edges; private BKTree(char[] allLowerCaseCharacters, Node[] nodes, Edge[] edges) { _allLowerCaseCharacters = allLowerCaseCharacters; _nodes = nodes; _edges = edges; } public static BKTree Create(params string[] values) { return Create((IEnumerable)values); } public static BKTree Create(IEnumerable values) { return new Builder(values).Create(); } public IList Find(string value, int? threshold = null) { if (_nodes.Length == 0) { return SpecializedCollections.EmptyList(); } var lowerCaseCharacters = ArrayPool.GetArray(value.Length); try { for (var i = 0; i < value.Length; i++) { lowerCaseCharacters[i] = char.ToLower(value[i]); } threshold = threshold ?? WordSimilarityChecker.GetThreshold(value); var result = new List(); Lookup(_nodes[0], lowerCaseCharacters, value.Length, threshold.Value, result); return result; } finally { ArrayPool.ReleaseArray(lowerCaseCharacters); } } private void Lookup(Node currentNode, char[] queryCharacters, int queryLength, int threshold, List result) { // We always want to compute the real edit distance (ignoring any thresholds). This is // because we need that edit distance to appropriately determine which edges to walk // in the tree. var characterSpan = currentNode.CharacterSpan; var editDistance = EditDistance.GetEditDistance( new ArraySlice(_allLowerCaseCharacters, characterSpan), new ArraySlice(queryCharacters, 0, queryLength)); if (editDistance <= threshold) { // Found a match. result.Add(new string(_allLowerCaseCharacters, characterSpan.Start, characterSpan.Length)); } var min = editDistance - threshold; var max = editDistance + threshold; var firstEdgeIndex = currentNode.FirstEdgeIndex; var lastEdgeIndex = firstEdgeIndex + currentNode.EdgeCount; for (var i = firstEdgeIndex; i < lastEdgeIndex; i++) { var childEditDistance = _edges[i].EditDistance; if (min <= childEditDistance && childEditDistance <= max) { Lookup(this._nodes[_edges[i].ChildNodeIndex], queryCharacters, queryLength, threshold, result); } } } internal void DumpStats() { var sb = new StringBuilder(); sb.AppendLine("Nodes length: " + _nodes.Length); var childCountHistogram = new Dictionary(); foreach (var node in _nodes) { var childCount = node.EdgeCount; int existing; childCountHistogram.TryGetValue(childCount, out existing); childCountHistogram[childCount] = existing + 1; } sb.AppendLine(); sb.AppendLine("Child counts:"); foreach (var kvp in childCountHistogram.OrderBy(kvp => kvp.Key)) { sb.AppendLine(kvp.Key + "\t" + kvp.Value); } // An item is dense if, starting from 1, at least 80% of it's array would be full. var densities = new int[11]; var empyCount = 0; foreach (var node in _nodes) { if (node.EdgeCount == 0) { empyCount++; continue; } var maxEditDistance = -1; var firstChild = node.FirstEdgeIndex; var lastChild = firstChild + node.EdgeCount; for (var i = firstChild; i < lastChild; i++) { maxEditDistance = Max(maxEditDistance, _edges[i].EditDistance); } var editDistanceCount = node.EdgeCount; var bucket = 10 * editDistanceCount / maxEditDistance; densities[bucket]++; } var nonEmptyCount = _nodes.Length - empyCount; sb.AppendLine(); sb.AppendLine("NoChildren: " + empyCount); sb.AppendLine("AnyChildren: " + nonEmptyCount); sb.AppendLine("Densities:"); for (var i = 0; i < densities.Length; i++) { sb.AppendLine("<=" + i + "0% = " + densities[i] + ", " + ((float)densities[i] / nonEmptyCount)); } var result = sb.ToString(); } } }