inverting ngramSearch to be more intuitive

9127c8b2 · Danila Kutenin · 040f4279 · 9127c8b2 · 9127c8b2 · 9127c8b2
5 changed files
--- a/dbms/src/Functions/FunctionsStringSimilarity.cpp
+++ b/dbms/src/Functions/FunctionsStringSimilarity.cpp
@@ -271,11 +271,17 @@ struct NgramDistanceImpl
        {
            size_t first_size = dispatchSearcher(calculateHaystackStatsAndMetric<false>, data.data(), data_size, common_stats, distance, nullptr);
            /// For !Symmetric version we should not use first_size.
-            res = distance * 1.f / std::max(Symmetric * first_size + second_size, size_t(1));
+            if constexpr (Symmetric)
+                res = distance * 1.f / std::max(first_size + second_size, size_t(1));
+            else
+                res = 1.f - distance * 1.f / std::max(second_size, size_t(1));
        }
        else
        {
+            if constexpr (Symmetric)
                res = 1.f;
+            else
+                res = 0.f;
        }
    }
@@ -333,13 +339,19 @@ struct NgramDistanceImpl
                /// For !Symmetric version we should not use haystack_stats_size.
-                res[i] = distance * 1.f / std::max(Symmetric * haystack_stats_size + needle_stats_size, size_t(1));
+                if constexpr (Symmetric)
+                    res[i] = distance * 1.f / std::max(haystack_stats_size + needle_stats_size, size_t(1));
+                else
+                    res[i] = 1.f - distance * 1.f / std::max(needle_stats_size, size_t(1));
            }
            else
            {
                /// Strings are too big, we are assuming they are not the same. This is done because of limiting number
                /// of bigrams added and not allocating too much memory.
+                if constexpr (Symmetric)
                    res[i] = 1.f;
+                else
+                    res[i] = 0.f;
            }
            prev_needle_offset = needle_offsets[i];
@@ -399,11 +411,11 @@ struct NgramDistanceImpl
                    for (size_t j = 0; j < needle_stats_size; ++j)
                        --common_stats[needle_ngram_storage[j]];
-                    res[i] = distance * 1.f / std::max(needle_stats_size, size_t(1));
+                    res[i] = 1.f - distance * 1.f / std::max(needle_stats_size, size_t(1));
                }
                else
                {
-                    res[i] = 1.f;
+                    res[i] = 0.f;
                }
                prev_offset = needle_offsets[i];
@@ -446,12 +458,18 @@ struct NgramDistanceImpl
                    distance,
                    ngram_storage.get());
                /// For !Symmetric version we should not use haystack_stats_size.
-                res[i] = distance * 1.f / std::max(Symmetric * haystack_stats_size + needle_stats_size, size_t(1));
+                if constexpr (Symmetric)
+                    res[i] = distance * 1.f / std::max(haystack_stats_size + needle_stats_size, size_t(1));
+                else
+                    res[i] = 1.f - distance * 1.f / std::max(needle_stats_size, size_t(1));
            }
            else
            {
                /// if the strings are too big, we say they are completely not the same
+                if constexpr (Symmetric)
                    res[i] = 1.f;
+                else
+                    res[i] = 0.f;
            }
            distance = needle_stats_size;
            prev_offset = offsets[i];

--- a/dbms/tests/queries/0_stateless/00951_ngram_entry.reference
+++ b/dbms/tests/queries/0_stateless/00951_ngram_entry.reference
--- a/dbms/tests/queries/0_stateless/00951_ngram_entry.sql
+++ b/dbms/tests/queries/0_stateless/00951_ngram_entry.sql
--- a/docs/en/query_language/functions/string_search_functions.md
+++ b/docs/en/query_language/functions/string_search_functions.md
@@ -108,7 +108,7 @@ For case-insensitive search or/and in UTF-8 format use functions `ngramDistanceC
 ## ngramSearch(haystack, needle)
-Same as `ngramDistance` but calculates the non-symmetric difference between `needle` and `haystack` -- the number of n-grams from needle minus the common number of n-grams normalized by the number of `needle` n-grams. Can be useful for fuzzy string search.
+Same as `ngramDistance` but based on the non-symmetric difference between `needle` and `haystack` -- the number of n-grams from `needle` minus the number of common n-grams, normalized by the number of `needle` n-grams. The function returns one minus this value, so the closer the result is to one, the more likely `needle` is in the `haystack`. Can be useful for fuzzy string search.
 For case-insensitive search or/and in UTF-8 format use functions `ngramSearchCaseInsensitive, ngramSearchUTF8, ngramSearchCaseInsensitiveUTF8`.

--- a/docs/ru/query_language/functions/string_search_functions.md
+++ b/docs/ru/query_language/functions/string_search_functions.md
@@ -97,7 +97,7 @@
 ## ngramSearch(haystack, needle)
-То же, что и `ngramDistance`, но вычисляет несимметричную разность между `needle` и `haystack` -- количество n-грамм из `needle` минус количество общих n-грамм, нормированное на количество n-грамм из `needle`. Может быть использовано для приближенного поиска.
+То же, что и `ngramDistance`, но основано на несимметричной разности между `needle` и `haystack` -- количество n-грамм из `needle` минус количество общих n-грамм, нормированное на количество n-грамм из `needle`. Функция возвращает единицу минус это значение, поэтому чем ближе результат к единице, тем вероятнее, что `needle` внутри `haystack`. Может быть использовано для приближенного поиска.
 Для поиска без учета регистра и/или в формате UTF-8 используйте функции `ngramSearchCaseInsensitive, ngramSearchUTF8, ngramSearchCaseInsensitiveUTF8`.