From 4fd85b41367659e2350db9e9c1b936dd1dc3c33e Mon Sep 17 00:00:00 2001 From: Danila Kutenin Date: Wed, 12 Jun 2019 05:35:25 +0300 Subject: [PATCH] Clearer interfaces of Searchers --- dbms/src/Common/StringSearcher.h | 24 ++- dbms/src/Common/Volnitsky.h | 161 +++++-------------- dbms/src/Functions/FunctionsStringSearch.cpp | 63 +++++++- 3 files changed, 111 insertions(+), 137 deletions(-) diff --git a/dbms/src/Common/StringSearcher.h b/dbms/src/Common/StringSearcher.h index 9e1f241343..f722ebc6c5 100644 --- a/dbms/src/Common/StringSearcher.h +++ b/dbms/src/Common/StringSearcher.h @@ -329,8 +329,7 @@ class StringSearcher : private StringSearcherBase private: /// string to be searched for const UInt8 * const needle; - const size_t needle_size; - const UInt8 * const needle_end = needle + needle_size; + const UInt8 * const needle_end; /// lower and uppercase variants of the first character in `needle` UInt8 l{}; UInt8 u{}; @@ -345,7 +344,7 @@ private: public: StringSearcher(const char * const needle_, const size_t needle_size) - : needle{reinterpret_cast(needle_)}, needle_size{needle_size} + : needle{reinterpret_cast(needle_)}, needle_end{needle + needle_size} { if (0 == needle_size) return; @@ -430,7 +429,7 @@ public: const UInt8 * search(const UInt8 * haystack, const UInt8 * const haystack_end) const { - if (0 == needle_size) + if (needle == needle_end) return haystack; while (haystack < haystack_end) @@ -528,8 +527,7 @@ class StringSearcher : private StringSearcherBase private: /// string to be searched for const UInt8 * const needle; - const size_t needle_size; - const UInt8 * const needle_end = needle + needle_size; + const UInt8 * const needle_end; /// first character in `needle` UInt8 first{}; @@ -543,7 +541,7 @@ private: public: StringSearcher(const char * const needle_, const size_t needle_size) - : needle{reinterpret_cast(needle_)}, needle_size{needle_size} + : needle{reinterpret_cast(needle_)}, needle_end{needle + needle_size} { if (0 == needle_size) return; 
@@ -616,7 +614,7 @@ public: const UInt8 * search(const UInt8 * haystack, const UInt8 * const haystack_end) const { - if (0 == needle_size) + if (needle == needle_end) return haystack; while (haystack < haystack_end) @@ -715,10 +713,9 @@ using UTF8CaseInsensitiveStringSearcher = StringSearcher; struct LibCASCIICaseSensitiveStringSearcher { const char * const needle; - const size_t needle_size; - LibCASCIICaseSensitiveStringSearcher(const char * const needle, const size_t needle_size) - : needle(needle), needle_size(needle_size) {} + LibCASCIICaseSensitiveStringSearcher(const char * const needle, const size_t /* needle_size */) + : needle(needle) {} const UInt8 * search(const UInt8 * haystack, const UInt8 * const haystack_end) const { @@ -737,10 +734,9 @@ struct LibCASCIICaseSensitiveStringSearcher struct LibCASCIICaseInsensitiveStringSearcher { const char * const needle; - const size_t needle_size; - LibCASCIICaseInsensitiveStringSearcher(const char * const needle, const size_t needle_size) - : needle(needle), needle_size(needle_size) {} + LibCASCIICaseInsensitiveStringSearcher(const char * const needle, const size_t /* needle_size */) + : needle(needle) {} const UInt8 * search(const UInt8 * haystack, const UInt8 * const haystack_end) const { diff --git a/dbms/src/Common/Volnitsky.h b/dbms/src/Common/Volnitsky.h index bce37e655c..646ad57aa1 100644 --- a/dbms/src/Common/Volnitsky.h +++ b/dbms/src/Common/Volnitsky.h @@ -4,7 +4,6 @@ #include #include #include -#include #include #include #include @@ -345,6 +344,7 @@ public: auto callback = [this](const VolnitskyTraits::Ngram ngram, const int offset) { return this->putNGramBase(ngram, offset); }; /// ssize_t is used here because unsigned can't be used with condition like `i >= 0`, unsigned always >= 0 + /// And also adding from the end guarantees that we will find the first occurrence because we will look up bigger offsets first. 
for (auto i = static_cast(needle_size - sizeof(VolnitskyTraits::Ngram)); i >= 0; --i) VolnitskyTraits::putNGram(this->needle + i, i + 1, this->needle, callback); } @@ -436,94 +436,6 @@ public: fallback_searchers.reserve(needles.size()); } - template - void searchAllPositions( - const ColumnString::Chars & haystack_data, - const ColumnString::Offsets & haystack_offsets, - const AnsCallback & ans_callback, - ResultType & ans) - { - const size_t haystack_string_size = haystack_offsets.size(); - const size_t needles_size = needles.size(); - - /// something can be uninitialized after - std::fill(ans.begin(), ans.end(), 0); - - while (!reset()) - { - size_t fallback_size = fallback_needles.size(); - size_t prev_offset = 0; - for (size_t j = 0, from = 0; j < haystack_string_size; ++j, from += needles_size) - { - const auto * haystack = &haystack_data[prev_offset]; - const auto * haystack_end = haystack + haystack_offsets[j] - prev_offset - 1; - for (size_t i = 0; i < fallback_size; ++i) - { - const UInt8 * ptr = fallback_searchers[fallback_needles[i]].search(haystack, haystack_end); - if (ptr != haystack_end) - ans[from + fallback_needles[i]] = ans_callback(haystack, ptr); - } - - /// check if we have one non empty volnitsky searcher - if (step != std::numeric_limits::max()) - { - const auto * pos = haystack + step - sizeof(VolnitskyTraits::Ngram); - for (; pos <= haystack_end - sizeof(VolnitskyTraits::Ngram); pos += step) - { - for (size_t cell_num = VolnitskyTraits::toNGram(pos) % VolnitskyTraits::hash_size; hash[cell_num].off; - cell_num = (cell_num + 1) % VolnitskyTraits::hash_size) - { - if (pos >= haystack + hash[cell_num].off - 1) - { - const auto * res = pos - (hash[cell_num].off - 1); - const size_t ind = hash[cell_num].id; - if (ans[from + ind] == 0 && res + needles[ind].size <= haystack_end) - { - if (fallback_searchers[ind].compare(res)) - { - ans[from + ind] = ans_callback(haystack, res); - } - } - } - } - } - } - prev_offset = haystack_offsets[j]; - } - } - 
} - - template - void search(const ColumnString::Chars & haystack_data, const ColumnString::Offsets & haystack_offsets, ResultType & ans) - { - auto callback = [this](const UInt8 * haystack, const UInt8 * haystack_end) -> bool - { - return this->searchOne(haystack, haystack_end); - }; - searchInternal(haystack_data, haystack_offsets, callback, ans); - } - - template - void searchIndex(const ColumnString::Chars & haystack_data, const ColumnString::Offsets & haystack_offsets, ResultType & ans) - { - auto callback = [this](const UInt8 * haystack, const UInt8 * haystack_end) -> size_t - { - return this->searchOneIndex(haystack, haystack_end); - }; - searchInternal(haystack_data, haystack_offsets, callback, ans); - } - - template - void searchFirstPosition(const ColumnString::Chars & haystack_data, const ColumnString::Offsets & haystack_offsets, const CountCharsCallback & count_chars_callback, ResultType & ans) - { - auto callback = [this, &count_chars_callback](const UInt8 * haystack, const UInt8 * haystack_end) -> UInt64 - { - return this->searchOneFirstPosition(haystack, haystack_end, count_chars_callback); - }; - searchInternal(haystack_data, haystack_offsets, callback, ans); - } - -private: /** * This function is needed to initialize hash table * Returns `true` if there is nothing to initialize @@ -532,15 +444,15 @@ private: * We actually destroy the hash table and initialize it with uninitialized needles * and search through the haystack again. 
* The actual usage of this function is like this: - * while (!reset()) + * while (hasMoreToSearch()) * { * search inside the haystack with the known needles * } */ - bool reset() + bool hasMoreToSearch() { if (last == needles.size()) - return true; + return false; memset(hash, 0, sizeof(hash)); fallback_needles.clear(); @@ -585,28 +497,7 @@ private: } fallback_searchers.emplace_back(cur_needle_data, cur_needle_size); } - return false; - } - - template - inline void searchInternal( - const ColumnString::Chars & haystack_data, - const ColumnString::Offsets & haystack_offsets, - const OneSearcher & search_fallback, - ResultType & ans) - { - const size_t haystack_string_size = haystack_offsets.size(); - while (!reset()) - { - size_t prev_offset = 0; - for (size_t j = 0; j < haystack_string_size; ++j) - { - const auto * haystack = &haystack_data[prev_offset]; - const auto * haystack_end = haystack + haystack_offsets[j] - prev_offset - 1; - ans[j] = search_fallback(haystack, haystack_end); - prev_offset = haystack_offsets[j]; - } - } + return true; } inline bool searchOne(const UInt8 * haystack, const UInt8 * haystack_end) const @@ -638,7 +529,7 @@ private: return false; } - inline size_t searchOneIndex(const UInt8 * haystack, const UInt8 * haystack_end) const + inline size_t searchOneFirstIndex(const UInt8 * haystack, const UInt8 * haystack_end) const { const size_t fallback_size = fallback_needles.size(); @@ -676,7 +567,7 @@ private: } template - inline UInt64 searchOneFirstPosition(const UInt8 * haystack, const UInt8 * haystack_end, const CountCharsCallback & callback) const + inline UInt64 searchOneFirstPosition(const UInt8 * haystack, const UInt8 * haystack_end, const CountCharsCallback & count_chars) const { const size_t fallback_size = fallback_needles.size(); @@ -684,7 +575,7 @@ private: for (size_t i = 0; i < fallback_size; ++i) if (auto pos = fallback_searchers[fallback_needles[i]].search(haystack, haystack_end); pos != haystack_end) - ans = std::min(ans, 
callback(haystack, pos)); + ans = std::min(ans, pos - haystack); /// check if we have one non empty volnitsky searcher if (step != std::numeric_limits::max()) @@ -700,14 +591,46 @@ private: const auto res = pos - (hash[cell_num].off - 1); const size_t ind = hash[cell_num].id; if (res + needles[ind].size <= haystack_end && fallback_searchers[ind].compare(res)) - ans = std::min(ans, callback(haystack, res)); + ans = std::min(ans, res - haystack); } } } } if (ans == std::numeric_limits::max()) return 0; - return ans; + return count_chars(haystack, haystack + ans); + } + + template + inline void searchOneAll(const UInt8 * haystack, const UInt8 * haystack_end, AnsType * ans, const CountCharsCallback & count_chars) const + { + const size_t fallback_size = fallback_needles.size(); + for (size_t i = 0; i < fallback_size; ++i) + { + const UInt8 * ptr = fallback_searchers[fallback_needles[i]].search(haystack, haystack_end); + if (ptr != haystack_end) + ans[fallback_needles[i]] = count_chars(haystack, ptr); + } + + /// check if we have one non empty volnitsky searcher + if (step != std::numeric_limits::max()) + { + const auto * pos = haystack + step - sizeof(VolnitskyTraits::Ngram); + for (; pos <= haystack_end - sizeof(VolnitskyTraits::Ngram); pos += step) + { + for (size_t cell_num = VolnitskyTraits::toNGram(pos) % VolnitskyTraits::hash_size; hash[cell_num].off; + cell_num = (cell_num + 1) % VolnitskyTraits::hash_size) + { + if (pos >= haystack + hash[cell_num].off - 1) + { + const auto * res = pos - (hash[cell_num].off - 1); + const size_t ind = hash[cell_num].id; + if (ans[ind] == 0 && res + needles[ind].size <= haystack_end && fallback_searchers[ind].compare(res)) + ans[ind] = count_chars(haystack, res); + } + } + } + } } void putNGramBase(const VolnitskyTraits::Ngram ngram, const int offset, const size_t num) diff --git a/dbms/src/Functions/FunctionsStringSearch.cpp b/dbms/src/Functions/FunctionsStringSearch.cpp index 24ec314d98..269fa60b32 100644 --- 
a/dbms/src/Functions/FunctionsStringSearch.cpp +++ b/dbms/src/Functions/FunctionsStringSearch.cpp @@ -307,7 +307,26 @@ struct MultiSearchAllPositionsImpl { return 1 + Impl::countChars(reinterpret_cast(start), reinterpret_cast(end)); }; - Impl::createMultiSearcherInBigHaystack(needles).searchAllPositions(haystack_data, haystack_offsets, res_callback, res); + + auto searcher = Impl::createMultiSearcherInBigHaystack(needles); + + const size_t haystack_string_size = haystack_offsets.size(); + const size_t needles_size = needles.size(); + + /// Something can be uninitialized after the search itself + std::fill(res.begin(), res.end(), 0); + + while (searcher.hasMoreToSearch()) + { + size_t prev_offset = 0; + for (size_t j = 0, from = 0; j < haystack_string_size; ++j, from += needles_size) + { + const auto * haystack = &haystack_data[prev_offset]; + const auto * haystack_end = haystack + haystack_offsets[j] - prev_offset - 1; + searcher.searchOneAll(haystack, haystack_end, res.data() + from, res_callback); + prev_offset = haystack_offsets[j]; + } + } } }; @@ -323,7 +342,19 @@ struct MultiSearchImpl const std::vector & needles, PaddedPODArray & res) { - Impl::createMultiSearcherInBigHaystack(needles).search(haystack_data, haystack_offsets, res); + auto searcher = Impl::createMultiSearcherInBigHaystack(needles); + const size_t haystack_string_size = haystack_offsets.size(); + while (searcher.hasMoreToSearch()) + { + size_t prev_offset = 0; + for (size_t j = 0; j < haystack_string_size; ++j) + { + const auto * haystack = &haystack_data[prev_offset]; + const auto * haystack_end = haystack + haystack_offsets[j] - prev_offset - 1; + res[j] = searcher.searchOne(haystack, haystack_end); + prev_offset = haystack_offsets[j]; + } + } } }; @@ -343,7 +374,19 @@ struct MultiSearchFirstPositionImpl { return 1 + Impl::countChars(reinterpret_cast(start), reinterpret_cast(end)); }; - Impl::createMultiSearcherInBigHaystack(needles).searchFirstPosition(haystack_data, haystack_offsets, 
res_callback, res); + auto searcher = Impl::createMultiSearcherInBigHaystack(needles); + const size_t haystack_string_size = haystack_offsets.size(); + while (searcher.hasMoreToSearch()) + { + size_t prev_offset = 0; + for (size_t j = 0; j < haystack_string_size; ++j) + { + const auto * haystack = &haystack_data[prev_offset]; + const auto * haystack_end = haystack + haystack_offsets[j] - prev_offset - 1; + res[j] = searcher.searchOneFirstPosition(haystack, haystack_end, res_callback); + prev_offset = haystack_offsets[j]; + } + } } }; @@ -359,7 +402,19 @@ struct MultiSearchFirstIndexImpl const std::vector & needles, PaddedPODArray & res) { - Impl::createMultiSearcherInBigHaystack(needles).searchIndex(haystack_data, haystack_offsets, res); + auto searcher = Impl::createMultiSearcherInBigHaystack(needles); + const size_t haystack_string_size = haystack_offsets.size(); + while (searcher.hasMoreToSearch()) + { + size_t prev_offset = 0; + for (size_t j = 0; j < haystack_string_size; ++j) + { + const auto * haystack = &haystack_data[prev_offset]; + const auto * haystack_end = haystack + haystack_offsets[j] - prev_offset - 1; + res[j] = searcher.searchOneFirstIndex(haystack, haystack_end); + prev_offset = haystack_offsets[j]; + } + } } }; -- GitLab