提交 4fd85b41 编写于 作者: D Danila Kutenin

Clearer interfaces of Searchers

上级 bb5239f1
......@@ -329,8 +329,7 @@ class StringSearcher<false, true> : private StringSearcherBase
private:
/// string to be searched for
const UInt8 * const needle;
const size_t needle_size;
const UInt8 * const needle_end = needle + needle_size;
const UInt8 * const needle_end;
/// lower and uppercase variants of the first character in `needle`
UInt8 l{};
UInt8 u{};
......@@ -345,7 +344,7 @@ private:
public:
StringSearcher(const char * const needle_, const size_t needle_size)
: needle{reinterpret_cast<const UInt8 *>(needle_)}, needle_size{needle_size}
: needle{reinterpret_cast<const UInt8 *>(needle_)}, needle_end{needle + needle_size}
{
if (0 == needle_size)
return;
......@@ -430,7 +429,7 @@ public:
const UInt8 * search(const UInt8 * haystack, const UInt8 * const haystack_end) const
{
if (0 == needle_size)
if (needle == needle_end)
return haystack;
while (haystack < haystack_end)
......@@ -528,8 +527,7 @@ class StringSearcher<true, ASCII> : private StringSearcherBase
private:
/// string to be searched for
const UInt8 * const needle;
const size_t needle_size;
const UInt8 * const needle_end = needle + needle_size;
const UInt8 * const needle_end;
/// first character in `needle`
UInt8 first{};
......@@ -543,7 +541,7 @@ private:
public:
StringSearcher(const char * const needle_, const size_t needle_size)
: needle{reinterpret_cast<const UInt8 *>(needle_)}, needle_size{needle_size}
: needle{reinterpret_cast<const UInt8 *>(needle_)}, needle_end{needle + needle_size}
{
if (0 == needle_size)
return;
......@@ -616,7 +614,7 @@ public:
const UInt8 * search(const UInt8 * haystack, const UInt8 * const haystack_end) const
{
if (0 == needle_size)
if (needle == needle_end)
return haystack;
while (haystack < haystack_end)
......@@ -715,10 +713,9 @@ using UTF8CaseInsensitiveStringSearcher = StringSearcher<false, false>;
struct LibCASCIICaseSensitiveStringSearcher
{
const char * const needle;
const size_t needle_size;
LibCASCIICaseSensitiveStringSearcher(const char * const needle, const size_t needle_size)
: needle(needle), needle_size(needle_size) {}
LibCASCIICaseSensitiveStringSearcher(const char * const needle, const size_t /* needle_size */)
: needle(needle) {}
const UInt8 * search(const UInt8 * haystack, const UInt8 * const haystack_end) const
{
......@@ -737,10 +734,9 @@ struct LibCASCIICaseSensitiveStringSearcher
struct LibCASCIICaseInsensitiveStringSearcher
{
const char * const needle;
const size_t needle_size;
LibCASCIICaseInsensitiveStringSearcher(const char * const needle, const size_t needle_size)
: needle(needle), needle_size(needle_size) {}
LibCASCIICaseInsensitiveStringSearcher(const char * const needle, const size_t /* needle_size */)
: needle(needle) {}
const UInt8 * search(const UInt8 * haystack, const UInt8 * const haystack_end) const
{
......
......@@ -4,7 +4,6 @@
#include <vector>
#include <stdint.h>
#include <string.h>
#include <Columns/ColumnString.h>
#include <Core/Types.h>
#include <Poco/UTF8Encoding.h>
#include <Poco/Unicode.h>
......@@ -345,6 +344,7 @@ public:
auto callback = [this](const VolnitskyTraits::Ngram ngram, const int offset) { return this->putNGramBase(ngram, offset); };
/// ssize_t is used here because an unsigned index cannot be used with a condition like `i >= 0`: an unsigned value is always >= 0.
/// Also, adding n-grams starting from the end guarantees that we find the first occurrence, because bigger offsets are looked up first.
for (auto i = static_cast<ssize_t>(needle_size - sizeof(VolnitskyTraits::Ngram)); i >= 0; --i)
VolnitskyTraits::putNGram<CaseSensitive, ASCII>(this->needle + i, i + 1, this->needle, callback);
}
......@@ -436,94 +436,6 @@ public:
fallback_searchers.reserve(needles.size());
}
/// Searches every haystack string for every needle and writes the results into `ans`.
/// `ans` is used as a flat table: the entry for haystack number `j` and needle number `i`
/// is `ans[j * needles.size() + i]`. Entries of needles that are not found stay 0.
/// `ans_callback(haystack, ptr)` converts a match pointer into the value that gets stored.
template <typename ResultType, typename AnsCallback>
void searchAllPositions(
const ColumnString::Chars & haystack_data,
const ColumnString::Offsets & haystack_offsets,
const AnsCallback & ans_callback,
ResultType & ans)
{
const size_t haystack_string_size = haystack_offsets.size();
const size_t needles_size = needles.size();
/// Zero-fill first: the loops below only write entries for needles that are found,
/// and the hash-table branch uses 0 to mean "nothing stored for this needle yet".
std::fill(ans.begin(), ans.end(), 0);
/// Each successful reset() prepares the hash table and the fallback list for the next
/// portion of needles; all haystacks are then scanned again for that portion.
while (!reset())
{
/// Read after reset(), which clears and refills `fallback_needles`.
size_t fallback_size = fallback_needles.size();
size_t prev_offset = 0;
/// `from` is the start of the current haystack's row inside `ans`.
for (size_t j = 0, from = 0; j < haystack_string_size; ++j, from += needles_size)
{
const auto * haystack = &haystack_data[prev_offset];
/// The last byte before the next offset is excluded
/// (presumably the string's terminating zero -- confirm against ColumnString).
const auto * haystack_end = haystack + haystack_offsets[j] - prev_offset - 1;
/// Needles listed in `fallback_needles` are searched one by one with their own searcher;
/// a result equal to `haystack_end` is treated as "not found".
for (size_t i = 0; i < fallback_size; ++i)
{
const UInt8 * ptr = fallback_searchers[fallback_needles[i]].search(haystack, haystack_end);
if (ptr != haystack_end)
ans[from + fallback_needles[i]] = ans_callback(haystack, ptr);
}
/// check if we have one non empty volnitsky searcher
/// (`step` equal to the maximum size_t means there is none)
if (step != std::numeric_limits<size_t>::max())
{
const auto * pos = haystack + step - sizeof(VolnitskyTraits::Ngram);
for (; pos <= haystack_end - sizeof(VolnitskyTraits::Ngram); pos += step)
{
/// Probe consecutive hash cells (wrapping around) until an empty cell, i.e. `off == 0`.
for (size_t cell_num = VolnitskyTraits::toNGram(pos) % VolnitskyTraits::hash_size; hash[cell_num].off;
cell_num = (cell_num + 1) % VolnitskyTraits::hash_size)
{
/// Skip cells whose candidate would start before the beginning of the haystack.
if (pos >= haystack + hash[cell_num].off - 1)
{
const auto * res = pos - (hash[cell_num].off - 1);
const size_t ind = hash[cell_num].id;
/// Only needles without a stored result are checked, and only if the needle fits before `haystack_end`.
if (ans[from + ind] == 0 && res + needles[ind].size <= haystack_end)
{
/// The n-gram hit is only a candidate; compare() decides whether it is stored.
if (fallback_searchers[ind].compare(res))
{
ans[from + ind] = ans_callback(haystack, res);
}
}
}
}
}
}
prev_offset = haystack_offsets[j];
}
}
}
/// Applies `searchOne` to every haystack string; `searchInternal` walks the haystacks
/// and stores each boolean result in `ans`.
template <typename ResultType>
void search(const ColumnString::Chars & haystack_data, const ColumnString::Offsets & haystack_offsets, ResultType & ans)
{
    searchInternal(
        haystack_data,
        haystack_offsets,
        [this](const UInt8 * begin, const UInt8 * end) -> bool { return this->searchOne(begin, end); },
        ans);
}
/// Applies `searchOneIndex` to every haystack string; `searchInternal` walks the haystacks
/// and stores each returned index in `ans`.
template <typename ResultType>
void searchIndex(const ColumnString::Chars & haystack_data, const ColumnString::Offsets & haystack_offsets, ResultType & ans)
{
    searchInternal(
        haystack_data,
        haystack_offsets,
        [this](const UInt8 * begin, const UInt8 * end) -> size_t { return this->searchOneIndex(begin, end); },
        ans);
}
/// Applies `searchOneFirstPosition` to every haystack string, forwarding `count_chars_callback`
/// to it; `searchInternal` walks the haystacks and stores each returned value in `ans`.
template <typename ResultType, typename CountCharsCallback>
void searchFirstPosition(const ColumnString::Chars & haystack_data, const ColumnString::Offsets & haystack_offsets, const CountCharsCallback & count_chars_callback, ResultType & ans)
{
    searchInternal(
        haystack_data,
        haystack_offsets,
        [this, &count_chars_callback](const UInt8 * begin, const UInt8 * end) -> UInt64
        {
            return this->searchOneFirstPosition(begin, end, count_chars_callback);
        },
        ans);
}
private:
/**
* This function is needed to initialize hash table
* Returns `true` if there is nothing to initialize
......@@ -532,15 +444,15 @@ private:
* We actually destroy the hash table and initialize it with uninitialized needles
* and search through the haystack again.
* The actual usage of this function is like this:
* while (!reset())
* while (hasMoreToSearch())
* {
* search inside the haystack with the known needles
* }
*/
bool reset()
bool hasMoreToSearch()
{
if (last == needles.size())
return true;
return false;
memset(hash, 0, sizeof(hash));
fallback_needles.clear();
......@@ -585,28 +497,7 @@ private:
}
fallback_searchers.emplace_back(cur_needle_data, cur_needle_size);
}
return false;
}
template <typename OneSearcher, typename ResultType>
inline void searchInternal(
const ColumnString::Chars & haystack_data,
const ColumnString::Offsets & haystack_offsets,
const OneSearcher & search_fallback,
ResultType & ans)
{
const size_t haystack_string_size = haystack_offsets.size();
while (!reset())
{
size_t prev_offset = 0;
for (size_t j = 0; j < haystack_string_size; ++j)
{
const auto * haystack = &haystack_data[prev_offset];
const auto * haystack_end = haystack + haystack_offsets[j] - prev_offset - 1;
ans[j] = search_fallback(haystack, haystack_end);
prev_offset = haystack_offsets[j];
}
}
return true;
}
inline bool searchOne(const UInt8 * haystack, const UInt8 * haystack_end) const
......@@ -638,7 +529,7 @@ private:
return false;
}
inline size_t searchOneIndex(const UInt8 * haystack, const UInt8 * haystack_end) const
inline size_t searchOneFirstIndex(const UInt8 * haystack, const UInt8 * haystack_end) const
{
const size_t fallback_size = fallback_needles.size();
......@@ -676,7 +567,7 @@ private:
}
template <typename CountCharsCallback>
inline UInt64 searchOneFirstPosition(const UInt8 * haystack, const UInt8 * haystack_end, const CountCharsCallback & callback) const
inline UInt64 searchOneFirstPosition(const UInt8 * haystack, const UInt8 * haystack_end, const CountCharsCallback & count_chars) const
{
const size_t fallback_size = fallback_needles.size();
......@@ -684,7 +575,7 @@ private:
for (size_t i = 0; i < fallback_size; ++i)
if (auto pos = fallback_searchers[fallback_needles[i]].search(haystack, haystack_end); pos != haystack_end)
ans = std::min(ans, callback(haystack, pos));
ans = std::min<UInt64>(ans, pos - haystack);
/// check if we have one non empty volnitsky searcher
if (step != std::numeric_limits<size_t>::max())
......@@ -700,14 +591,46 @@ private:
const auto res = pos - (hash[cell_num].off - 1);
const size_t ind = hash[cell_num].id;
if (res + needles[ind].size <= haystack_end && fallback_searchers[ind].compare(res))
ans = std::min(ans, callback(haystack, res));
ans = std::min<UInt64>(ans, res - haystack);
}
}
}
}
if (ans == std::numeric_limits<UInt64>::max())
return 0;
return ans;
return count_chars(haystack, haystack + ans);
}
template <typename CountCharsCallback, typename AnsType>
inline void searchOneAll(const UInt8 * haystack, const UInt8 * haystack_end, AnsType * ans, const CountCharsCallback & count_chars) const
{
const size_t fallback_size = fallback_needles.size();
for (size_t i = 0; i < fallback_size; ++i)
{
const UInt8 * ptr = fallback_searchers[fallback_needles[i]].search(haystack, haystack_end);
if (ptr != haystack_end)
ans[fallback_needles[i]] = count_chars(haystack, ptr);
}
/// check if we have one non empty volnitsky searcher
if (step != std::numeric_limits<size_t>::max())
{
const auto * pos = haystack + step - sizeof(VolnitskyTraits::Ngram);
for (; pos <= haystack_end - sizeof(VolnitskyTraits::Ngram); pos += step)
{
for (size_t cell_num = VolnitskyTraits::toNGram(pos) % VolnitskyTraits::hash_size; hash[cell_num].off;
cell_num = (cell_num + 1) % VolnitskyTraits::hash_size)
{
if (pos >= haystack + hash[cell_num].off - 1)
{
const auto * res = pos - (hash[cell_num].off - 1);
const size_t ind = hash[cell_num].id;
if (ans[ind] == 0 && res + needles[ind].size <= haystack_end && fallback_searchers[ind].compare(res))
ans[ind] = count_chars(haystack, res);
}
}
}
}
}
void putNGramBase(const VolnitskyTraits::Ngram ngram, const int offset, const size_t num)
......
......@@ -307,7 +307,26 @@ struct MultiSearchAllPositionsImpl
{
return 1 + Impl::countChars(reinterpret_cast<const char *>(start), reinterpret_cast<const char *>(end));
};
Impl::createMultiSearcherInBigHaystack(needles).searchAllPositions(haystack_data, haystack_offsets, res_callback, res);
auto searcher = Impl::createMultiSearcherInBigHaystack(needles);
const size_t haystack_string_size = haystack_offsets.size();
const size_t needles_size = needles.size();
/// Zero-fill first: the search only writes entries for needles it finds and uses 0 to mean "not found yet".
std::fill(res.begin(), res.end(), 0);
while (searcher.hasMoreToSearch())
{
size_t prev_offset = 0;
for (size_t j = 0, from = 0; j < haystack_string_size; ++j, from += needles_size)
{
const auto * haystack = &haystack_data[prev_offset];
const auto * haystack_end = haystack + haystack_offsets[j] - prev_offset - 1;
searcher.searchOneAll(haystack, haystack_end, res.data() + from, res_callback);
prev_offset = haystack_offsets[j];
}
}
}
};
......@@ -323,7 +342,19 @@ struct MultiSearchImpl
const std::vector<StringRef> & needles,
PaddedPODArray<UInt8> & res)
{
Impl::createMultiSearcherInBigHaystack(needles).search(haystack_data, haystack_offsets, res);
auto searcher = Impl::createMultiSearcherInBigHaystack(needles);
const size_t haystack_string_size = haystack_offsets.size();
while (searcher.hasMoreToSearch())
{
size_t prev_offset = 0;
for (size_t j = 0; j < haystack_string_size; ++j)
{
const auto * haystack = &haystack_data[prev_offset];
const auto * haystack_end = haystack + haystack_offsets[j] - prev_offset - 1;
res[j] = searcher.searchOne(haystack, haystack_end);
prev_offset = haystack_offsets[j];
}
}
}
};
......@@ -343,7 +374,19 @@ struct MultiSearchFirstPositionImpl
{
return 1 + Impl::countChars(reinterpret_cast<const char *>(start), reinterpret_cast<const char *>(end));
};
Impl::createMultiSearcherInBigHaystack(needles).searchFirstPosition(haystack_data, haystack_offsets, res_callback, res);
auto searcher = Impl::createMultiSearcherInBigHaystack(needles);
const size_t haystack_string_size = haystack_offsets.size();
while (searcher.hasMoreToSearch())
{
size_t prev_offset = 0;
for (size_t j = 0; j < haystack_string_size; ++j)
{
const auto * haystack = &haystack_data[prev_offset];
const auto * haystack_end = haystack + haystack_offsets[j] - prev_offset - 1;
res[j] = searcher.searchOneFirstPosition(haystack, haystack_end, res_callback);
prev_offset = haystack_offsets[j];
}
}
}
};
......@@ -359,7 +402,19 @@ struct MultiSearchFirstIndexImpl
const std::vector<StringRef> & needles,
PaddedPODArray<UInt64> & res)
{
Impl::createMultiSearcherInBigHaystack(needles).searchIndex(haystack_data, haystack_offsets, res);
auto searcher = Impl::createMultiSearcherInBigHaystack(needles);
const size_t haystack_string_size = haystack_offsets.size();
while (searcher.hasMoreToSearch())
{
size_t prev_offset = 0;
for (size_t j = 0; j < haystack_string_size; ++j)
{
const auto * haystack = &haystack_data[prev_offset];
const auto * haystack_end = haystack + haystack_offsets[j] - prev_offset - 1;
res[j] = searcher.searchOneFirstIndex(haystack, haystack_end);
prev_offset = haystack_offsets[j];
}
}
}
};
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册