提交 4566e648 编写于 作者: A Avi Avni 提交者: Phillip Carter

remove allocations from jaro (#6050)

* remove allocations from jaro

* improve performance

* fix build

* remove string concat allocations from FilterPredictions

* fix build

* move to struct tuple and remove the concat completely

* undo
上级 9b55eccd
......@@ -43,6 +43,7 @@ let FilterPredictions (idText:string) (suggestionF:ErrorLogger.Suggestions) =
name |> Seq.forall (fun c -> c <> ' ')
if allSuggestions.Contains idText then [] else // some other parsing error occurred
let dotIdText = "." + idText
allSuggestions
|> Seq.choose (fun suggestion ->
// Because beginning a name with _ is used both to indicate an unused
......@@ -53,7 +54,7 @@ let FilterPredictions (idText:string) (suggestionF:ErrorLogger.Suggestions) =
let suggestion:string = demangle suggestion
let suggestedText = suggestion.ToUpperInvariant()
let similarity = EditDistance.JaroWinklerDistance uppercaseText suggestedText
if similarity >= highConfidenceThreshold || suggestion.EndsWithOrdinal("." + idText) then
if similarity >= highConfidenceThreshold || suggestion.EndsWithOrdinal(dotIdText) then
Some(similarity, suggestion)
elif similarity < minThresholdForSuggestions && suggestedText.Length > minStringLengthForThreshold then
None
......
......@@ -23,33 +23,45 @@ let jaro (s1: string) (s2: string) =
let matchRadius =
let minLen = Math.Min(s1.Length, s2.Length)
minLen / 2 + minLen % 2
// An inner function which recursively finds the number
// of matched characters within the radius.
let commonChars (chars1: string) (chars2: string) =
let result = ResizeArray(chars1.Length)
for i = 0 to chars1.Length - 1 do
let c = chars1.[i]
if existsInWin c chars2 i matchRadius then
result.Add c
result
// The sets of common characters and their lengths as floats
let c1 = commonChars s1 s2
let c2 = commonChars s2 s1
let c1length = float c1.Count
let c2length = float c2.Count
let rec nextChar (s1:string) (s2:string) i c =
if i < s1.Length then
let c = s1.[i]
if not (existsInWin c s2 i matchRadius) then
nextChar s1 s2 (i + 1) c
else
struct (i, c)
else
struct (i, c)
// The sets of common characters and their lengths as floats
// The number of transpositions within the sets of common characters.
let transpositions =
let mutable mismatches = 0.0
for i = 0 to (Math.Min(c1.Count, c2.Count)) - 1 do
if c1.[i] <> c2.[i] then
mismatches <- mismatches + 1.0
// If one common string is longer than the other
// each additional char counts as half a transposition
(mismatches + abs (c1length - c2length)) / 2.0
let struct (transpositions, c1length, c2length) =
let rec loop i j mismatches c1length c2length =
if i < s1.Length && j < s2.Length then
let struct (ti, ci) = nextChar s1 s2 i ' '
let struct (tj, cj) = nextChar s2 s1 j ' '
if ci <> cj then
loop (ti + 1) (tj + 1) (mismatches + 1) (c1length + 1) (c2length + 1)
else
loop (ti + 1) (tj + 1) mismatches (c1length + 1) (c2length + 1)
else struct (i, j, mismatches, c1length, c2length)
let struct (i, j, mismatches, c1length, c2length) = loop 0 0 0 0 0
let rec loop (s1:string) (s2:string) i length =
if i < s1.Length - 1 then
let c = s1.[i]
if existsInWin c s2 i matchRadius then
loop s1 s2 (i + 1) (length + 1)
else
loop s1 s2 (i + 1) length
else
length
let c1length = loop s1 s2 i c1length |> float
let c2length = loop s2 s1 j c2length |> float
struct ((float mismatches + abs (c1length - c2length)) / 2.0, c1length, c2length)
let tLength = Math.Max(c1length, c2length)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册