提交 9127c8b2 编写于 作者: D Danila Kutenin

inverting ngramSearch to be more intuitive

上级 040f4279
...@@ -271,11 +271,17 @@ struct NgramDistanceImpl ...@@ -271,11 +271,17 @@ struct NgramDistanceImpl
{ {
size_t first_size = dispatchSearcher(calculateHaystackStatsAndMetric<false>, data.data(), data_size, common_stats, distance, nullptr); size_t first_size = dispatchSearcher(calculateHaystackStatsAndMetric<false>, data.data(), data_size, common_stats, distance, nullptr);
/// For !Symmetric version we should not use first_size. /// For !Symmetric version we should not use first_size.
res = distance * 1.f / std::max(Symmetric * first_size + second_size, size_t(1)); if constexpr (Symmetric)
res = distance * 1.f / std::max(first_size + second_size, size_t(1));
else
res = 1.f - distance * 1.f / std::max(second_size, size_t(1));
} }
else else
{ {
if constexpr (Symmetric)
res = 1.f; res = 1.f;
else
res = 0.f;
} }
} }
...@@ -333,13 +339,19 @@ struct NgramDistanceImpl ...@@ -333,13 +339,19 @@ struct NgramDistanceImpl
/// For !Symmetric version we should not use haystack_stats_size. /// For !Symmetric version we should not use haystack_stats_size.
res[i] = distance * 1.f / std::max(Symmetric * haystack_stats_size + needle_stats_size, size_t(1)); if constexpr (Symmetric)
res[i] = distance * 1.f / std::max(haystack_stats_size + needle_stats_size, size_t(1));
else
res[i] = 1.f - distance * 1.f / std::max(needle_stats_size, size_t(1));
} }
else else
{ {
/// Strings are too big, we are assuming they are not the same. This is done because of limiting number /// Strings are too big, we are assuming they are not the same. This is done because of limiting number
/// of bigrams added and not allocating too much memory. /// of bigrams added and not allocating too much memory.
if constexpr (Symmetric)
res[i] = 1.f; res[i] = 1.f;
else
res[i] = 0.f;
} }
prev_needle_offset = needle_offsets[i]; prev_needle_offset = needle_offsets[i];
...@@ -399,11 +411,11 @@ struct NgramDistanceImpl ...@@ -399,11 +411,11 @@ struct NgramDistanceImpl
for (size_t j = 0; j < needle_stats_size; ++j) for (size_t j = 0; j < needle_stats_size; ++j)
--common_stats[needle_ngram_storage[j]]; --common_stats[needle_ngram_storage[j]];
res[i] = distance * 1.f / std::max(needle_stats_size, size_t(1)); res[i] = 1.f - distance * 1.f / std::max(needle_stats_size, size_t(1));
} }
else else
{ {
res[i] = 1.f; res[i] = 0.f;
} }
prev_offset = needle_offsets[i]; prev_offset = needle_offsets[i];
...@@ -446,12 +458,18 @@ struct NgramDistanceImpl ...@@ -446,12 +458,18 @@ struct NgramDistanceImpl
distance, distance,
ngram_storage.get()); ngram_storage.get());
/// For !Symmetric version we should not use haystack_stats_size. /// For !Symmetric version we should not use haystack_stats_size.
res[i] = distance * 1.f / std::max(Symmetric * haystack_stats_size + needle_stats_size, size_t(1)); if constexpr (Symmetric)
res[i] = distance * 1.f / std::max(haystack_stats_size + needle_stats_size, size_t(1));
else
res[i] = 1.f - distance * 1.f / std::max(needle_stats_size, size_t(1));
} }
else else
{ {
/// if the strings are too big, we say they are completely not the same /// if the strings are too big, we say they are completely not the same
if constexpr (Symmetric)
res[i] = 1.f; res[i] = 1.f;
else
res[i] = 0.f;
} }
distance = needle_stats_size; distance = needle_stats_size;
prev_offset = offsets[i]; prev_offset = offsets[i];
......
...@@ -108,7 +108,7 @@ For case-insensitive search or/and in UTF-8 format use functions `ngramDistanceC ...@@ -108,7 +108,7 @@ For case-insensitive search or/and in UTF-8 format use functions `ngramDistanceC
## ngramSearch(haystack, needle) ## ngramSearch(haystack, needle)
Same as `ngramDistance` but calculates the non-symmetric difference between `needle` and `haystack` -- the number of n-grams from needle minus the common number of n-grams normalized by the number of `needle` n-grams. Can be useful for fuzzy string search. Same as `ngramDistance` but calculates one minus the non-symmetric difference between `needle` and `haystack`, where the difference is the number of n-grams from `needle` minus the number of common n-grams, normalized by the number of `needle` n-grams; in other words, the share of `needle` n-grams that also occur in `haystack`. The closer to one, the more likely `needle` is in the `haystack`. Can be useful for fuzzy string search.
For case-insensitive search or/and in UTF-8 format use functions `ngramSearchCaseInsensitive, ngramSearchUTF8, ngramSearchCaseInsensitiveUTF8`. For case-insensitive search or/and in UTF-8 format use functions `ngramSearchCaseInsensitive, ngramSearchUTF8, ngramSearchCaseInsensitiveUTF8`.
......
...@@ -97,7 +97,7 @@ ...@@ -97,7 +97,7 @@
## ngramSearch(haystack, needle) ## ngramSearch(haystack, needle)
То же, что и `ngramDistance`, но вычисляет несимметричную разность между `needle` и `haystack` -- количество n-грамм из `needle` минус количество общих n-грамм, нормированное на количество n-грамм из `needle`. Может быть использовано для приближенного поиска. То же, что и `ngramDistance`, но вычисляет единицу минус несимметричную разность между `needle` и `haystack`, где разность -- это количество n-грамм из `needle` минус количество общих n-грамм, нормированное на количество n-грамм из `needle`; иначе говоря, долю n-грамм из `needle`, которые встречаются и в `haystack`. Чем ближе результат к единице, тем вероятнее, что `needle` внутри `haystack`. Может быть использовано для приближенного поиска.
Для поиска без учета регистра и/или в формате UTF-8 используйте функции `ngramSearchCaseInsensitive, ngramSearchUTF8, ngramSearchCaseInsensitiveUTF8`. Для поиска без учета регистра и/или в формате UTF-8 используйте функции `ngramSearchCaseInsensitive, ngramSearchUTF8, ngramSearchCaseInsensitiveUTF8`.
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册