提交 9127c8b2 编写于 作者: D Danila Kutenin

inverting ngramSearch to be more intuitive

上级 040f4279
......@@ -271,11 +271,17 @@ struct NgramDistanceImpl
{
size_t first_size = dispatchSearcher(calculateHaystackStatsAndMetric<false>, data.data(), data_size, common_stats, distance, nullptr);
/// For !Symmetric version we should not use first_size.
res = distance * 1.f / std::max(Symmetric * first_size + second_size, size_t(1));
if constexpr (Symmetric)
res = distance * 1.f / std::max(first_size + second_size, size_t(1));
else
res = 1.f - distance * 1.f / std::max(second_size, size_t(1));
}
else
{
res = 1.f;
if constexpr (Symmetric)
res = 1.f;
else
res = 0.f;
}
}
......@@ -333,13 +339,19 @@ struct NgramDistanceImpl
/// For !Symmetric version we should not use haystack_stats_size.
res[i] = distance * 1.f / std::max(Symmetric * haystack_stats_size + needle_stats_size, size_t(1));
if constexpr (Symmetric)
res[i] = distance * 1.f / std::max(haystack_stats_size + needle_stats_size, size_t(1));
else
res[i] = 1.f - distance * 1.f / std::max(needle_stats_size, size_t(1));
}
else
{
/// Strings are too big, we are assuming they are not the same. This is done because of limiting number
/// of bigrams added and not allocating too much memory.
res[i] = 1.f;
if constexpr (Symmetric)
res[i] = 1.f;
else
res[i] = 0.f;
}
prev_needle_offset = needle_offsets[i];
......@@ -399,11 +411,11 @@ struct NgramDistanceImpl
for (size_t j = 0; j < needle_stats_size; ++j)
--common_stats[needle_ngram_storage[j]];
res[i] = distance * 1.f / std::max(needle_stats_size, size_t(1));
res[i] = 1.f - distance * 1.f / std::max(needle_stats_size, size_t(1));
}
else
{
res[i] = 1.f;
res[i] = 0.f;
}
prev_offset = needle_offsets[i];
......@@ -446,12 +458,18 @@ struct NgramDistanceImpl
distance,
ngram_storage.get());
/// For !Symmetric version we should not use haystack_stats_size.
res[i] = distance * 1.f / std::max(Symmetric * haystack_stats_size + needle_stats_size, size_t(1));
if constexpr (Symmetric)
res[i] = distance * 1.f / std::max(haystack_stats_size + needle_stats_size, size_t(1));
else
res[i] = 1.f - distance * 1.f / std::max(needle_stats_size, size_t(1));
}
else
{
/// if the strings are too big, we say they are completely not the same
res[i] = 1.f;
if constexpr (Symmetric)
res[i] = 1.f;
else
res[i] = 0.f;
}
distance = needle_stats_size;
prev_offset = offsets[i];
......
......@@ -108,7 +108,7 @@ For case-insensitive search or/and in UTF-8 format use functions `ngramDistanceC
## ngramSearch(haystack, needle)
Same as `ngramDistance` but calculates the non-symmetric difference between `needle` and `haystack` -- the number of n-grams from needle minus the common number of n-grams normalized by the number of `needle` n-grams. Can be useful for fuzzy string search.
Same as `ngramDistance` but calculates the non-symmetric difference between `needle` and `haystack` -- the number of n-grams from `needle` minus the common number of n-grams, normalized by the number of `needle` n-grams -- and returns one minus that value. The closer the result is to one, the more likely `needle` is in the `haystack`. Can be useful for fuzzy string search.
For case-insensitive search and/or UTF-8 input, use the functions `ngramSearchCaseInsensitive`, `ngramSearchUTF8`, `ngramSearchCaseInsensitiveUTF8`.
......
......@@ -97,7 +97,7 @@
## ngramSearch(haystack, needle)
То же, что и `ngramDistance`, но вычисляет несимметричную разность между `needle` и `haystack` -- количество n-грамм из `needle` минус количество общих n-грамм, нормированное на количество n-грамм из `needle`. Может быть использовано для приближенного поиска.
То же, что и `ngramDistance`, но вычисляет несимметричную разность между `needle` и `haystack` -- количество n-грамм из `needle` минус количество общих n-грамм, нормированное на количество n-грамм из `needle`, -- и возвращает единицу минус это значение. Чем ближе результат к единице, тем вероятнее, что `needle` внутри `haystack`. Может быть использовано для приближенного поиска.
Для поиска без учета регистра и/или в формате UTF-8 используйте функции `ngramSearchCaseInsensitive, ngramSearchUTF8, ngramSearchCaseInsensitiveUTF8`.
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册