BKTree.cs 6.6 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using Roslyn.Utilities;
using static System.Math;
using static Roslyn.Utilities.PortableShim;

namespace Roslyn.Utilities
{
13 14 15
    /// <summary>
    /// NOTE: do not use this class directly.  It is intended for use only by the spell checker.
    /// </summary>
16
    internal partial class BKTree
17
    {
18
        public static readonly BKTree Empty = new BKTree(
19
            SpecializedCollections.EmptyArray<char>(),
20 21 22
            SpecializedCollections.EmptyArray<Node>(),
            SpecializedCollections.EmptyArray<Edge>());

C
Cyrus Najmabadi 已提交
23 24
        // We have three completely flat arrays of structs.  These arrays fully represent the 
        // BK tree.  The structure is as follows:
25
        //
C
Cyrus Najmabadi 已提交
26
        // The root node is in _nodes[0].
27
        //
C
Cyrus Najmabadi 已提交
28 29 30
        // It lists the count of edges it has.  These edges are in _edges in the range 
        // [0*, childCount).  Each edge has the index of the child node it points to, and the
        // edit distance between the parent and the child.
31
        //
C
Cyrus Najmabadi 已提交
32
        // * of course '0' is only for the root case.  All nodes state where in _edges
C
Cyrus Najmabadi 已提交
33
        // their child edges range starts.  So the children for any node are in _edges from
C
Cyrus Najmabadi 已提交
34
        // [node.FirstEdgeIndex, node.FirstEdgeIndex + node.EdgeCount)
C
Cyrus Najmabadi 已提交
35 36 37 38
        //
        // Each node also has an associated string.  These strings are concatenated and stored
        // in _allLowerCaseCharacters.  Each node has a TextSpan that indicates which portion
        // of the character array is their string.
39
        private readonly char[] _allLowerCaseCharacters;
C
Cyrus Najmabadi 已提交
40 41
        private readonly Node[] _nodes;
        private readonly Edge[] _edges;
42

43
        private BKTree(char[] allLowerCaseCharacters, Node[] nodes, Edge[] edges)
44
        {
45
            _allLowerCaseCharacters = allLowerCaseCharacters;
C
Cyrus Najmabadi 已提交
46 47
            _nodes = nodes;
            _edges = edges;
48 49
        }

50 51 52 53 54 55 56
        public static BKTree Create(params string[] values)
        {
            return Create((IEnumerable<string>)values);
        }

        public static BKTree Create(IEnumerable<string> values)
        {
57
            return new Builder(values).Create();
58 59
        }

60
        public IList<string> Find(string value, int? threshold = null)
61
        {
C
Cyrus Najmabadi 已提交
62
            if (_nodes.Length == 0)
63
            {
C
Cyrus Najmabadi 已提交
64
                return SpecializedCollections.EmptyList<string>();
65 66 67 68 69 70 71 72 73 74
            }

            var lowerCaseCharacters = ArrayPool<char>.GetArray(value.Length);
            try
            {
                for (var i = 0; i < value.Length; i++)
                {
                    lowerCaseCharacters[i] = char.ToLower(value[i]);
                }

75
                threshold = threshold ?? WordSimilarityChecker.GetThreshold(value);
76
                var result = new List<string>();
C
Cyrus Najmabadi 已提交
77
                Lookup(_nodes[0], lowerCaseCharacters, value.Length, threshold.Value, result);
78 79 80 81 82 83 84 85 86 87
                return result;
            }
            finally
            {
                ArrayPool<char>.ReleaseArray(lowerCaseCharacters);
            }
        }

        private void Lookup(Node currentNode, char[] queryCharacters, int queryLength, int threshold, List<string> result)
        {
88 89 90
            // We always want to compute the real edit distance (ignoring any thresholds).  This is
            // because we need that edit distance to appropriately determine which edges to walk 
            // in the tree.
91
            var characterSpan = currentNode.CharacterSpan;
92
            var editDistance = EditDistance.GetEditDistance(
93
                new ArraySlice<char>(_allLowerCaseCharacters, characterSpan),
94
                new ArraySlice<char>(queryCharacters, 0, queryLength));
95 96 97 98

            if (editDistance <= threshold)
            {
                // Found a match.
99
                result.Add(new string(_allLowerCaseCharacters, characterSpan.Start, characterSpan.Length));
100 101 102 103 104
            }

            var min = editDistance - threshold;
            var max = editDistance + threshold;

C
Cyrus Najmabadi 已提交
105 106 107
            var firstEdgeIndex = currentNode.FirstEdgeIndex;
            var lastEdgeIndex = firstEdgeIndex + currentNode.EdgeCount;
            for (var i = firstEdgeIndex; i < lastEdgeIndex; i++)
108
            {
C
Cyrus Najmabadi 已提交
109
                var childEditDistance = _edges[i].EditDistance;
110
                if (min <= childEditDistance && childEditDistance <= max)
111
                {
C
Cyrus Najmabadi 已提交
112
                    Lookup(this._nodes[_edges[i].ChildNodeIndex],
113
                        queryCharacters, queryLength, threshold, result);
114 115 116 117 118 119 120
                }
            }
        }

        internal void DumpStats()
        {
            var sb = new StringBuilder();
C
Cyrus Najmabadi 已提交
121
            sb.AppendLine("Nodes length: " + _nodes.Length);
122 123
            var childCountHistogram = new Dictionary<int, int>();

C
Cyrus Najmabadi 已提交
124
            foreach (var node in _nodes)
125
            {
C
Cyrus Najmabadi 已提交
126
                var childCount = node.EdgeCount;
127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
                int existing;
                childCountHistogram.TryGetValue(childCount, out existing);

                childCountHistogram[childCount] = existing + 1;
            }

            sb.AppendLine();
            sb.AppendLine("Child counts:");
            foreach (var kvp in childCountHistogram.OrderBy(kvp => kvp.Key))
            {
                sb.AppendLine(kvp.Key + "\t" + kvp.Value);
            }

            // An item is dense if, starting from 1, at least 80% of it's array would be full.
            var densities = new int[11];
            var empyCount = 0;

C
Cyrus Najmabadi 已提交
144
            foreach (var node in _nodes)
145
            {
C
Cyrus Najmabadi 已提交
146
                if (node.EdgeCount == 0)
147 148 149 150 151
                {
                    empyCount++;
                    continue;
                }

152
                var maxEditDistance = -1;
C
Cyrus Najmabadi 已提交
153 154
                var firstChild = node.FirstEdgeIndex;
                var lastChild = firstChild + node.EdgeCount;
155 156
                for (var i = firstChild; i < lastChild; i++)
                {
C
Cyrus Najmabadi 已提交
157
                    maxEditDistance = Max(maxEditDistance, _edges[i].EditDistance);
158 159
                }

C
Cyrus Najmabadi 已提交
160
                var editDistanceCount = node.EdgeCount;
161 162 163 164 165

                var bucket = 10 * editDistanceCount / maxEditDistance;
                densities[bucket]++;
            }

C
Cyrus Najmabadi 已提交
166
            var nonEmptyCount = _nodes.Length - empyCount;
167 168 169 170 171 172 173 174 175 176 177 178 179
            sb.AppendLine();
            sb.AppendLine("NoChildren: " + empyCount);
            sb.AppendLine("AnyChildren: " + nonEmptyCount);
            sb.AppendLine("Densities:");
            for (var i = 0; i < densities.Length; i++)
            {
                sb.AppendLine("<=" + i + "0% = " + densities[i] + ", " + ((float)densities[i] / nonEmptyCount));
            }

            var result = sb.ToString();
        }
    }
}