未验证 提交 06e79a76 编写于 作者: A alexey-milovidov 提交者: GitHub

Merge pull request #4780 from danlark1/master

Hyperscan string search regular expression matching was added
......@@ -76,3 +76,6 @@
[submodule "contrib/brotli"]
path = contrib/brotli
url = https://github.com/google/brotli.git
[submodule "contrib/hyperscan"]
path = contrib/hyperscan
url = https://github.com/ClickHouse-Extras/hyperscan.git
......@@ -318,6 +318,7 @@ include (cmake/find_pdqsort.cmake)
include (cmake/find_hdfs3.cmake) # uses protobuf
include (cmake/find_consistent-hashing.cmake)
include (cmake/find_base64.cmake)
include (cmake/find_hyperscan.cmake)
find_contrib_lib(cityhash)
find_contrib_lib(farmhash)
find_contrib_lib(metrohash)
......
# Configure the bundled hyperscan library (contrib/hyperscan).
# It is enabled only when HAVE_SSSE3 is set; otherwise USE_HYPERSCAN stays unset.
if (HAVE_SSSE3)
    set (HYPERSCAN_INCLUDE_DIR ${ClickHouse_SOURCE_DIR}/contrib/hyperscan/src)
    set (HYPERSCAN_LIBRARY hs)
    set (USE_HYPERSCAN 1)
    set (USE_INTERNAL_HYPERSCAN_LIBRARY 1)
    # Keep the whole status text in one quoted argument: unquoted arguments are concatenated without separators.
    message (STATUS "Using hyperscan: ${HYPERSCAN_INCLUDE_DIR} : ${HYPERSCAN_LIBRARY}")
endif()
......@@ -304,3 +304,7 @@ endif ()
if (USE_BASE64)
add_subdirectory (base64-cmake)
endif()
if (USE_HYPERSCAN)
add_subdirectory (hyperscan)
endif()
Subproject commit 6a96e8b59f76148eb8ad54a9d15259f8ce84c606
Subproject commit 32abf16beb7bb8b243a4d100ccdd6acb271738c4
Subproject commit 05dab0efee80be405aad5f74721b692b6889b75e
......@@ -437,10 +437,10 @@ public:
}
template <typename ResultType, typename AnsCallback>
void searchAll(
void searchAllPositions(
const ColumnString::Chars & haystack_data,
const ColumnString::Offsets & haystack_offsets,
const AnsCallback & ansCallback,
const AnsCallback & ans_callback,
ResultType & ans)
{
const size_t haystack_string_size = haystack_offsets.size();
......@@ -461,7 +461,7 @@ public:
{
const UInt8 * ptr = fallback_searchers[fallback_needles[i]].search(haystack, haystack_end);
if (ptr != haystack_end)
ans[from + fallback_needles[i]] = ansCallback(haystack, ptr);
ans[from + fallback_needles[i]] = ans_callback(haystack, ptr);
}
/// check if we have one non empty volnitsky searcher
......@@ -481,7 +481,7 @@ public:
{
if (fallback_searchers[ind].compare(res))
{
ans[from + ind] = ansCallback(haystack, res);
ans[from + ind] = ans_callback(haystack, res);
}
}
}
......@@ -513,6 +513,16 @@ public:
searchInternal(haystack_data, haystack_offsets, callback, ans);
}
/// Runs searchOneFirstPosition on every haystack row (via searchInternal) and stores the per-row result in ans.
/// count_chars_callback(haystack_begin, match_begin) converts a match pointer into the value that is compared and stored.
template <typename ResultType, typename CountCharsCallback>
void searchFirstPosition(
    const ColumnString::Chars & haystack_data,
    const ColumnString::Offsets & haystack_offsets,
    const CountCharsCallback & count_chars_callback,
    ResultType & ans)
{
    const auto search_row = [this, &count_chars_callback](const UInt8 * row_begin, const UInt8 * row_end) -> size_t
    {
        return this->searchOneFirstPosition(row_begin, row_end, count_chars_callback);
    };

    searchInternal(haystack_data, haystack_offsets, search_row, ans);
}
private:
/**
* This function is needed to initialize hash table
......@@ -582,7 +592,7 @@ private:
inline void searchInternal(
const ColumnString::Chars & haystack_data,
const ColumnString::Offsets & haystack_offsets,
const OneSearcher & searchFallback,
const OneSearcher & search_fallback,
ResultType & ans)
{
const size_t haystack_string_size = haystack_offsets.size();
......@@ -593,7 +603,7 @@ private:
{
const auto * haystack = &haystack_data[prev_offset];
const auto * haystack_end = haystack + haystack_offsets[j] - prev_offset - 1;
ans[j] = searchFallback(haystack, haystack_end);
ans[j] = search_fallback(haystack, haystack_end);
prev_offset = haystack_offsets[j];
}
}
......@@ -665,6 +675,41 @@ private:
return ans + 1;
}
template <typename CountCharsCallback>
inline size_t searchOneFirstPosition(const UInt8 * haystack, const UInt8 * haystack_end, const CountCharsCallback & callback) const
{
const size_t fallback_size = fallback_needles.size();
size_t ans = std::numeric_limits<size_t>::max();
for (size_t i = 0; i < fallback_size; ++i)
if (auto pos = fallback_searchers[fallback_needles[i]].search(haystack, haystack_end); pos != haystack_end)
ans = std::min(ans, callback(haystack, pos));
/// check if we have one non empty volnitsky searcher
if (step != std::numeric_limits<size_t>::max())
{
const auto * pos = haystack + step - sizeof(VolnitskyTraits::Ngram);
for (; pos <= haystack_end - sizeof(VolnitskyTraits::Ngram); pos += step)
{
for (size_t cell_num = VolnitskyTraits::toNGram(pos) % VolnitskyTraits::hash_size; hash[cell_num].off;
cell_num = (cell_num + 1) % VolnitskyTraits::hash_size)
{
if (pos >= haystack + hash[cell_num].off - 1)
{
const auto res = pos - (hash[cell_num].off - 1);
const size_t ind = hash[cell_num].id;
if (res + needles[ind].size <= haystack_end && fallback_searchers[ind].compare(res))
ans = std::min(ans, callback(haystack, res));
}
}
}
}
if (ans == std::numeric_limits<size_t>::max())
return 0;
return ans;
}
void putNGramBase(const VolnitskyTraits::Ngram ngram, const int offset, const size_t num)
{
size_t cell_num = ngram % VolnitskyTraits::hash_size;
......
......@@ -64,3 +64,8 @@ if (USE_XXHASH)
target_link_libraries(clickhouse_functions PRIVATE ${XXHASH_LIBRARY})
target_include_directories(clickhouse_functions SYSTEM PRIVATE ${XXHASH_INCLUDE_DIR})
endif()
if (USE_HYPERSCAN)
target_link_libraries (clickhouse_functions PRIVATE ${HYPERSCAN_LIBRARY})
target_include_directories (clickhouse_functions SYSTEM PRIVATE ${HYPERSCAN_INCLUDE_DIR})
endif ()
......@@ -15,6 +15,10 @@
#include <algorithm>
#include <memory>
#ifdef __SSSE3__
# include <hs.h>
#endif
#if USE_RE2_ST
# include <re2_st/re2.h> // Y_IGNORE
#else
......@@ -312,7 +316,7 @@ struct PositionImpl
};
template <typename Impl>
struct MultiPositionImpl
struct MultiSearchAllPositionsImpl
{
using ResultType = UInt64;
......@@ -322,17 +326,31 @@ struct MultiPositionImpl
const std::vector<StringRef> & needles,
PaddedPODArray<UInt64> & res)
{
auto resCallback = [](const UInt8 * start, const UInt8 * end) -> UInt64
auto res_callback = [](const UInt8 * start, const UInt8 * end) -> UInt64
{
return 1 + Impl::countChars(reinterpret_cast<const char *>(start), reinterpret_cast<const char *>(end));
};
Impl::createMultiSearcherInBigHaystack(needles).searchAll(haystack_data, haystack_offsets, resCallback, res);
Impl::createMultiSearcherInBigHaystack(needles).searchAllPositions(haystack_data, haystack_offsets, res_callback, res);
}
};
/// Multi-needle search producing one UInt8 per haystack row.
/// The searcher is built by Impl::createMultiSearcherInBigHaystack and its search() fills the result.
template <typename Impl>
struct MultiSearchImpl
{
    using ResultType = UInt8;

    static void vector_constant(
        const ColumnString::Chars & haystack_data,
        const ColumnString::Offsets & haystack_offsets,
        const std::vector<StringRef> & needles,
        PaddedPODArray<UInt8> & res)
    {
        auto searcher = Impl::createMultiSearcherInBigHaystack(needles);
        searcher.search(haystack_data, haystack_offsets, res);
    }
};
template <typename Impl>
struct MultiSearchFirstPositionImpl
{
using ResultType = UInt64;
......@@ -342,12 +360,16 @@ struct MultiSearchImpl
const std::vector<StringRef> & needles,
PaddedPODArray<UInt64> & res)
{
Impl::createMultiSearcherInBigHaystack(needles).search(haystack_data, haystack_offsets, res);
auto res_callback = [](const UInt8 * start, const UInt8 * end) -> UInt64
{
return 1 + Impl::countChars(reinterpret_cast<const char *>(start), reinterpret_cast<const char *>(end));
};
Impl::createMultiSearcherInBigHaystack(needles).searchFirstPosition(haystack_data, haystack_offsets, res_callback, res);
}
};
template <typename Impl>
struct FirstMatchImpl
struct MultiSearchFirstIndexImpl
{
using ResultType = UInt64;
......@@ -524,8 +546,8 @@ struct MatchImpl
res[i] = !revert;
else
{
const char * str_data = reinterpret_cast<const char *>(&data[i != 0 ? offsets[i - 1] : 0]);
size_t str_size = (i != 0 ? offsets[i] - offsets[i - 1] : offsets[0]) - 1;
const char * str_data = reinterpret_cast<const char *>(&data[offsets[i - 1]]);
size_t str_size = offsets[i] - offsets[i - 1] - 1;
/** Even in the case of `required_substring_is_prefix` use UNANCHORED check for regexp,
* so that it can match when `required_substring` occurs into the string several times,
......@@ -581,6 +603,78 @@ struct MatchImpl
};
/** Matches every haystack row against a set of regular expressions.
  *
  * Exactly one of FindAny / FindAnyIndex must be true:
  *  - FindAny:      res[i] is 1 if at least one pattern matches row i, otherwise 0;
  *  - FindAnyIndex: res[i] is the 1-based index of some matching pattern, otherwise 0.
  *
  * When compiled with SSSE3 the patterns are matched with hyperscan; otherwise every pattern
  * is checked separately with MatchImpl.
  */
template <typename Type, bool FindAny, bool FindAnyIndex>
struct MultiMatchAnyImpl
{
    static_assert(static_cast<int>(FindAny) + static_cast<int>(FindAnyIndex) == 1);
    using ResultType = Type;

    /// haystack_offsets[i] is the end offset of row i inside haystack_data (one byte past its terminator).
    /// res must already hold one element per row.
    /// Throws Exception(CANNOT_ALLOCATE_MEMORY) if hyperscan cannot allocate scratch space or fails to scan.
    static void vector_constant(
        const ColumnString::Chars & haystack_data,
        const ColumnString::Offsets & haystack_offsets,
        const std::vector<StringRef> & needles,
        PaddedPODArray<Type> & res)
    {
        /// Each flag is used only in one of the preprocessor branches below.
        (void)FindAny;
        (void)FindAnyIndex;
#ifdef __SSSE3__
        using ScratchPtr = std::unique_ptr<hs_scratch_t, DB::MultiRegexps::HyperscanDeleter<decltype(&hs_free_scratch), &hs_free_scratch>>;

        const auto & hyperscan_regex = MultiRegexps::get<FindAnyIndex>(needles);

        hs_scratch_t * scratch = nullptr;
        hs_error_t err = hs_alloc_scratch(hyperscan_regex->get(), &scratch);
        if (err != HS_SUCCESS)
            throw Exception("Could not allocate scratch space for hyperscan.", ErrorCodes::CANNOT_ALLOCATE_MEMORY);
        ScratchPtr smart_scratch(scratch);

        /// `context` points to the result cell of the row being scanned.
        auto on_match = []([[maybe_unused]] unsigned int id,
                           unsigned long long /* from */,
                           unsigned long long /* to */,
                           unsigned int /* flags */,
                           void * context) -> int
        {
            if constexpr (FindAnyIndex)
            {
                *reinterpret_cast<Type *>(context) = id;
                /// Keep scanning: the last reported id is the one that stays in the result.
                return 0;
            }
            else
            {
                *reinterpret_cast<Type *>(context) = 1;
                /// One match already answers the question; a non-zero return value stops the scan.
                return 1;
            }
        };

        const size_t haystack_offsets_size = haystack_offsets.size();
        size_t offset = 0;
        for (size_t i = 0; i < haystack_offsets_size; ++i)
        {
            res[i] = 0;
            /// NOTE(review): hs_scan takes the length as `unsigned int`; rows of 4 GiB or more would be truncated here.
            err = hs_scan(
                hyperscan_regex->get(),
                reinterpret_cast<const char *>(haystack_data.data()) + offset,
                haystack_offsets[i] - offset - 1,
                0,
                smart_scratch.get(),
                on_match,
                &res[i]);
            /// HS_SCAN_TERMINATED only means that on_match asked to stop; anything else is a real failure
            /// that must not be silently reported as "no match".
            /// NOTE(review): a dedicated error code would describe a scan failure better than CANNOT_ALLOCATE_MEMORY.
            if (err != HS_SUCCESS && err != HS_SCAN_TERMINATED)
                throw Exception("Failed to scan with hyperscan.", ErrorCodes::CANNOT_ALLOCATE_MEMORY);
            offset = haystack_offsets[i];
        }
#else
        /// Fallback for builds without SSSE3 (hyperscan is not used): check the patterns one by one.
        PaddedPODArray<UInt8> accum(res.size());
        memset(res.data(), 0, res.size() * sizeof(res.front()));
        memset(accum.data(), 0, accum.size());
        for (size_t j = 0; j < needles.size(); ++j)
        {
            MatchImpl<false, false>::vector_constant(haystack_data, haystack_offsets, needles[j].toString(), accum);
            for (size_t i = 0; i < res.size(); ++i)
            {
                if constexpr (FindAny)
                    res[i] |= accum[i];
                else if (accum[i])
                    res[i] = j + 1; /// A later matching pattern overwrites an earlier one.
            }
        }
#endif // __SSSE3__
    }
};
struct ExtractImpl
{
static void vector(
......@@ -1090,53 +1184,69 @@ struct NamePositionCaseInsensitiveUTF8
{
static constexpr auto name = "positionCaseInsensitiveUTF8";
};
struct NameMultiPosition
struct NameMultiSearchAllPositions
{
static constexpr auto name = "multiSearchAllPositions";
};
struct NameMultiSearchAllPositionsUTF8
{
static constexpr auto name = "multiPosition";
static constexpr auto name = "multiSearchAllPositionsUTF8";
};
struct NameMultiPositionUTF8
struct NameMultiSearchAllPositionsCaseInsensitive
{
static constexpr auto name = "multiPositionUTF8";
static constexpr auto name = "multiSearchAllPositionsCaseInsensitive";
};
struct NameMultiPositionCaseInsensitive
struct NameMultiSearchAllPositionsCaseInsensitiveUTF8
{
static constexpr auto name = "multiPositionCaseInsensitive";
static constexpr auto name = "multiSearchAllPositionsCaseInsensitiveUTF8";
};
struct NameMultiPositionCaseInsensitiveUTF8
struct NameMultiSearchAny
{
static constexpr auto name = "multiPositionCaseInsensitiveUTF8";
static constexpr auto name = "multiSearchAny";
};
struct NameMultiSearch
struct NameMultiSearchAnyUTF8
{
static constexpr auto name = "multiSearch";
static constexpr auto name = "multiSearchAnyUTF8";
};
struct NameMultiSearchUTF8
struct NameMultiSearchAnyCaseInsensitive
{
static constexpr auto name = "multiSearchUTF8";
static constexpr auto name = "multiSearchAnyCaseInsensitive";
};
struct NameMultiSearchCaseInsensitive
struct NameMultiSearchAnyCaseInsensitiveUTF8
{
static constexpr auto name = "multiSearchCaseInsensitive";
static constexpr auto name = "multiSearchAnyCaseInsensitiveUTF8";
};
struct NameMultiSearchCaseInsensitiveUTF8
struct NameMultiSearchFirstIndex
{
static constexpr auto name = "multiSearchCaseInsensitiveUTF8";
static constexpr auto name = "multiSearchFirstIndex";
};
struct NameFirstMatch
struct NameMultiSearchFirstIndexUTF8
{
static constexpr auto name = "firstMatch";
static constexpr auto name = "multiSearchFirstIndexUTF8";
};
struct NameFirstMatchUTF8
struct NameMultiSearchFirstIndexCaseInsensitive
{
static constexpr auto name = "firstMatchUTF8";
static constexpr auto name = "multiSearchFirstIndexCaseInsensitive";
};
struct NameFirstMatchCaseInsensitive
struct NameMultiSearchFirstIndexCaseInsensitiveUTF8
{
static constexpr auto name = "firstMatchCaseInsensitive";
static constexpr auto name = "multiSearchFirstIndexCaseInsensitiveUTF8";
};
struct NameFirstMatchCaseInsensitiveUTF8
struct NameMultiSearchFirstPosition
{
static constexpr auto name = "firstMatchCaseInsensitiveUTF8";
static constexpr auto name = "multiSearchFirstPosition";
};
/// Name tag carrying the function name "multiSearchFirstPositionUTF8".
struct NameMultiSearchFirstPositionUTF8
{
    static constexpr const char * name = "multiSearchFirstPositionUTF8";
};
/// Name tag carrying the function name "multiSearchFirstPositionCaseInsensitive".
struct NameMultiSearchFirstPositionCaseInsensitive
{
    static constexpr const char * name = "multiSearchFirstPositionCaseInsensitive";
};
/// Name tag carrying the function name "multiSearchFirstPositionCaseInsensitiveUTF8".
struct NameMultiSearchFirstPositionCaseInsensitiveUTF8
{
    static constexpr const char * name = "multiSearchFirstPositionCaseInsensitiveUTF8";
};
struct NameMatch
{
......@@ -1150,6 +1260,14 @@ struct NameNotLike
{
static constexpr auto name = "notLike";
};
/// Name tag carrying the function name "multiMatchAny".
struct NameMultiMatchAny
{
    static constexpr const char * name = "multiMatchAny";
};
/// Name tag carrying the function name "multiMatchAnyIndex".
struct NameMultiMatchAnyIndex
{
    static constexpr const char * name = "multiMatchAnyIndex";
};
struct NameExtract
{
static constexpr auto name = "extract";
......@@ -1177,28 +1295,37 @@ using FunctionPositionCaseInsensitive = FunctionsStringSearch<PositionImpl<Posit
using FunctionPositionCaseInsensitiveUTF8
= FunctionsStringSearch<PositionImpl<PositionCaseInsensitiveUTF8>, NamePositionCaseInsensitiveUTF8>;
using FunctionMultiPosition = FunctionsMultiStringPosition<MultiPositionImpl<PositionCaseSensitiveASCII>, NameMultiPosition>;
using FunctionMultiPositionUTF8 = FunctionsMultiStringPosition<MultiPositionImpl<PositionCaseSensitiveUTF8>, NameMultiPositionUTF8>;
using FunctionMultiPositionCaseInsensitive
= FunctionsMultiStringPosition<MultiPositionImpl<PositionCaseInsensitiveASCII>, NameMultiPositionCaseInsensitive>;
using FunctionMultiPositionCaseInsensitiveUTF8
= FunctionsMultiStringPosition<MultiPositionImpl<PositionCaseInsensitiveUTF8>, NameMultiPositionCaseInsensitiveUTF8>;
using FunctionMultiSearchAllPositions = FunctionsMultiStringPosition<MultiSearchAllPositionsImpl<PositionCaseSensitiveASCII>, NameMultiSearchAllPositions>;
using FunctionMultiSearchAllPositionsUTF8 = FunctionsMultiStringPosition<MultiSearchAllPositionsImpl<PositionCaseSensitiveUTF8>, NameMultiSearchAllPositionsUTF8>;
using FunctionMultiSearchAllPositionsCaseInsensitive
= FunctionsMultiStringPosition<MultiSearchAllPositionsImpl<PositionCaseInsensitiveASCII>, NameMultiSearchAllPositionsCaseInsensitive>;
using FunctionMultiSearchAllPositionsCaseInsensitiveUTF8
= FunctionsMultiStringPosition<MultiSearchAllPositionsImpl<PositionCaseInsensitiveUTF8>, NameMultiSearchAllPositionsCaseInsensitiveUTF8>;
using FunctionMultiSearch = FunctionsMultiStringSearch<MultiSearchImpl<PositionCaseSensitiveASCII>, NameMultiSearch>;
using FunctionMultiSearchUTF8 = FunctionsMultiStringSearch<MultiSearchImpl<PositionCaseSensitiveUTF8>, NameMultiSearchUTF8>;
using FunctionMultiSearch = FunctionsMultiStringSearch<MultiSearchImpl<PositionCaseSensitiveASCII>, NameMultiSearchAny>;
using FunctionMultiSearchUTF8 = FunctionsMultiStringSearch<MultiSearchImpl<PositionCaseSensitiveUTF8>, NameMultiSearchAnyUTF8>;
using FunctionMultiSearchCaseInsensitive
= FunctionsMultiStringSearch<MultiSearchImpl<PositionCaseInsensitiveASCII>, NameMultiSearchCaseInsensitive>;
= FunctionsMultiStringSearch<MultiSearchImpl<PositionCaseInsensitiveASCII>, NameMultiSearchAnyCaseInsensitive>;
using FunctionMultiSearchCaseInsensitiveUTF8
= FunctionsMultiStringSearch<MultiSearchImpl<PositionCaseInsensitiveUTF8>, NameMultiSearchCaseInsensitiveUTF8>;
using FunctionFirstMatch = FunctionsMultiStringSearch<FirstMatchImpl<PositionCaseSensitiveASCII>, NameFirstMatch>;
using FunctionFirstMatchUTF8 = FunctionsMultiStringSearch<FirstMatchImpl<PositionCaseSensitiveUTF8>, NameFirstMatchUTF8>;
using FunctionFirstMatchCaseInsensitive
= FunctionsMultiStringSearch<FirstMatchImpl<PositionCaseInsensitiveASCII>, NameFirstMatchCaseInsensitive>;
using FunctionFirstMatchCaseInsensitiveUTF8
= FunctionsMultiStringSearch<FirstMatchImpl<PositionCaseInsensitiveUTF8>, NameFirstMatchCaseInsensitiveUTF8>;
= FunctionsMultiStringSearch<MultiSearchImpl<PositionCaseInsensitiveUTF8>, NameMultiSearchAnyCaseInsensitiveUTF8>;
using FunctionMultiSearchFirstIndex = FunctionsMultiStringSearch<MultiSearchFirstIndexImpl<PositionCaseSensitiveASCII>, NameMultiSearchFirstIndex>;
using FunctionMultiSearchFirstIndexUTF8 = FunctionsMultiStringSearch<MultiSearchFirstIndexImpl<PositionCaseSensitiveUTF8>, NameMultiSearchFirstIndexUTF8>;
using FunctionMultiSearchFirstIndexCaseInsensitive
= FunctionsMultiStringSearch<MultiSearchFirstIndexImpl<PositionCaseInsensitiveASCII>, NameMultiSearchFirstIndexCaseInsensitive>;
using FunctionMultiSearchFirstIndexCaseInsensitiveUTF8
= FunctionsMultiStringSearch<MultiSearchFirstIndexImpl<PositionCaseInsensitiveUTF8>, NameMultiSearchFirstIndexCaseInsensitiveUTF8>;
using FunctionMultiSearchFirstPosition = FunctionsMultiStringSearch<MultiSearchFirstPositionImpl<PositionCaseSensitiveASCII>, NameMultiSearchFirstPosition>;
using FunctionMultiSearchFirstPositionUTF8 = FunctionsMultiStringSearch<MultiSearchFirstPositionImpl<PositionCaseSensitiveUTF8>, NameMultiSearchFirstPositionUTF8>;
using FunctionMultiSearchFirstPositionCaseInsensitive
= FunctionsMultiStringSearch<MultiSearchFirstPositionImpl<PositionCaseInsensitiveASCII>, NameMultiSearchFirstPositionCaseInsensitive>;
using FunctionMultiSearchFirstPositionCaseInsensitiveUTF8
= FunctionsMultiStringSearch<MultiSearchFirstPositionImpl<PositionCaseInsensitiveUTF8>, NameMultiSearchFirstPositionCaseInsensitiveUTF8>;
using FunctionMatch = FunctionsStringSearch<MatchImpl<false>, NameMatch>;
using FunctionMultiMatchAny = FunctionsMultiStringSearch<MultiMatchAnyImpl<UInt8, true, false>, NameMultiMatchAny, std::numeric_limits<UInt32>::max()>;
using FunctionMultiMatchAnyIndex = FunctionsMultiStringSearch<MultiMatchAnyImpl<UInt64, false, true>, NameMultiMatchAnyIndex, std::numeric_limits<UInt32>::max()>;
using FunctionLike = FunctionsStringSearch<MatchImpl<true>, NameLike>;
using FunctionNotLike = FunctionsStringSearch<MatchImpl<true, true>, NameNotLike>;
using FunctionExtract = FunctionsStringSearchToString<ExtractImpl, NameExtract>;
......@@ -1220,26 +1347,34 @@ void registerFunctionsStringSearch(FunctionFactory & factory)
factory.registerFunction<FunctionPositionCaseInsensitive>();
factory.registerFunction<FunctionPositionCaseInsensitiveUTF8>();
factory.registerFunction<FunctionMultiPosition>();
factory.registerFunction<FunctionMultiPositionUTF8>();
factory.registerFunction<FunctionMultiPositionCaseInsensitive>();
factory.registerFunction<FunctionMultiPositionCaseInsensitiveUTF8>();
factory.registerFunction<FunctionMultiSearchAllPositions>();
factory.registerFunction<FunctionMultiSearchAllPositionsUTF8>();
factory.registerFunction<FunctionMultiSearchAllPositionsCaseInsensitive>();
factory.registerFunction<FunctionMultiSearchAllPositionsCaseInsensitiveUTF8>();
factory.registerFunction<FunctionMultiSearch>();
factory.registerFunction<FunctionMultiSearchUTF8>();
factory.registerFunction<FunctionMultiSearchCaseInsensitive>();
factory.registerFunction<FunctionMultiSearchCaseInsensitiveUTF8>();
factory.registerFunction<FunctionFirstMatch>();
factory.registerFunction<FunctionFirstMatchUTF8>();
factory.registerFunction<FunctionFirstMatchCaseInsensitive>();
factory.registerFunction<FunctionFirstMatchCaseInsensitiveUTF8>();
factory.registerFunction<FunctionMultiSearchFirstIndex>();
factory.registerFunction<FunctionMultiSearchFirstIndexUTF8>();
factory.registerFunction<FunctionMultiSearchFirstIndexCaseInsensitive>();
factory.registerFunction<FunctionMultiSearchFirstIndexCaseInsensitiveUTF8>();
factory.registerFunction<FunctionMultiSearchFirstPosition>();
factory.registerFunction<FunctionMultiSearchFirstPositionUTF8>();
factory.registerFunction<FunctionMultiSearchFirstPositionCaseInsensitive>();
factory.registerFunction<FunctionMultiSearchFirstPositionCaseInsensitiveUTF8>();
factory.registerFunction<FunctionMatch>();
factory.registerFunction<FunctionLike>();
factory.registerFunction<FunctionNotLike>();
factory.registerFunction<FunctionExtract>();
factory.registerFunction<FunctionMultiMatchAny>();
factory.registerFunction<FunctionMultiMatchAnyIndex>();
factory.registerAlias("locate", NamePosition::name, FunctionFactory::CaseInsensitive);
factory.registerAlias("replace", NameReplaceAll::name, FunctionFactory::CaseInsensitive);
}
......
......@@ -26,6 +26,8 @@ namespace DB
* notLike(haystack, pattern)
*
* match(haystack, pattern) - search by regular expression re2; Returns 0 or 1.
* multiMatchAny(haystack, [pattern_1, pattern_2, ..., pattern_n]) -- search by regular expressions pattern_i (hyperscan when compiled with SSSE3, re2 otherwise); returns 1 if any pattern_i matches, 0 otherwise.
* multiMatchAnyIndex(haystack, [pattern_1, pattern_2, ..., pattern_n]) -- search by the same kind of regular expressions; returns the 1-based index of any matching pattern, or zero if none matches.
*
* Applies regexp re2 and pulls:
* - the first subpattern, if the regexp has a subpattern;
......@@ -39,20 +41,25 @@ namespace DB
* replaceRegexpOne(haystack, pattern, replacement) - replaces the pattern with the specified regexp, only the first occurrence.
* replaceRegexpAll(haystack, pattern, replacement) - replaces the pattern with the specified type, all occurrences.
*
* multiPosition(haystack, [pattern_1, pattern_2, ..., pattern_n]) -- find first occurrences (positions) of all the const patterns inside haystack
* multiPositionUTF8(haystack, [pattern_1, pattern_2, ..., pattern_n])
* multiPositionCaseInsensitive(haystack, [pattern_1, pattern_2, ..., pattern_n])
* multiPositionCaseInsensitiveUTF8(haystack, [pattern_1, pattern_2, ..., pattern_n])
* multiSearchAllPositions(haystack, [pattern_1, pattern_2, ..., pattern_n]) -- find first occurrences (positions) of all the const patterns inside haystack
* multiSearchAllPositionsUTF8(haystack, [pattern_1, pattern_2, ..., pattern_n])
* multiSearchAllPositionsCaseInsensitive(haystack, [pattern_1, pattern_2, ..., pattern_n])
* multiSearchAllPositionsCaseInsensitiveUTF8(haystack, [pattern_1, pattern_2, ..., pattern_n])
* multiSearchFirstPosition(haystack, [pattern_1, pattern_2, ..., pattern_n]) -- returns the first position of the haystack matched by strings or zero if nothing was found
* multiSearchFirstPositionUTF8(haystack, [pattern_1, pattern_2, ..., pattern_n])
* multiSearchFirstPositionCaseInsensitive(haystack, [pattern_1, pattern_2, ..., pattern_n])
* multiSearchFirstPositionCaseInsensitiveUTF8(haystack, [pattern_1, pattern_2, ..., pattern_n])
*
* multiSearch(haystack, [pattern_1, pattern_2, ..., pattern_n]) -- find any of the const patterns inside haystack and return 0 or 1
* multiSearchUTF8(haystack, [pattern_1, pattern_2, ..., pattern_n])
* multiSearchCaseInsensitive(haystack, [pattern_1, pattern_2, ..., pattern_n])
* multiSearchCaseInsensitiveUTF8(haystack, [pattern_1, pattern_2, ..., pattern_n])
* firstMatch(haystack, [pattern_1, pattern_2, ..., pattern_n]) -- returns the first index of the matched string or zero if nothing was found
* firstMatchUTF8(haystack, [pattern_1, pattern_2, ..., pattern_n])
* firstMatchCaseInsensitive(haystack, [pattern_1, pattern_2, ..., pattern_n])
* firstMatchCaseInsensitiveUTF8(haystack, [pattern_1, pattern_2, ..., pattern_n])
* multiSearchAny(haystack, [pattern_1, pattern_2, ..., pattern_n]) -- find any of the const patterns inside haystack and return 0 or 1
* multiSearchAnyUTF8(haystack, [pattern_1, pattern_2, ..., pattern_n])
* multiSearchAnyCaseInsensitive(haystack, [pattern_1, pattern_2, ..., pattern_n])
* multiSearchAnyCaseInsensitiveUTF8(haystack, [pattern_1, pattern_2, ..., pattern_n])
* multiSearchFirstIndex(haystack, [pattern_1, pattern_2, ..., pattern_n]) -- returns the first index of the matched string or zero if nothing was found
* multiSearchFirstIndexUTF8(haystack, [pattern_1, pattern_2, ..., pattern_n])
* multiSearchFirstIndexCaseInsensitive(haystack, [pattern_1, pattern_2, ..., pattern_n])
* multiSearchFirstIndexCaseInsensitiveUTF8(haystack, [pattern_1, pattern_2, ..., pattern_n])
*/
namespace ErrorCodes
......@@ -269,9 +276,13 @@ public:
}
};
template <typename Impl, typename Name>
/// The argument limit arises from the Volnitsky searcher -- it is performance crucial to store the pattern number in only one byte.
/// But some other searchers use this class too, for example, multiMatchAny -- hyperscan does not have such a restriction.
template <typename Impl, typename Name, size_t LimitArgs = std::numeric_limits<UInt8>::max()>
class FunctionsMultiStringSearch : public IFunction
{
static_assert(LimitArgs > 0);
public:
static constexpr auto name = Name::name;
static FunctionPtr create(const Context &) { return std::make_shared<FunctionsMultiStringSearch>(); }
......@@ -282,10 +293,10 @@ public:
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
{
if (arguments.size() + 1 >= std::numeric_limits<UInt8>::max())
if (arguments.size() + 1 >= LimitArgs)
throw Exception(
"Number of arguments for function " + getName() + " doesn't match: passed " + std::to_string(arguments.size())
+ ", should be at most 255.",
+ ", should be at most " + std::to_string(LimitArgs) + ".",
ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
if (!isString(arguments[0]))
......@@ -333,6 +344,7 @@ public:
vec_res.resize(column_haystack_size);
/// TODO support constant_constant version
if (col_haystack_vector)
Impl::vector_constant(col_haystack_vector->getChars(), col_haystack_vector->getOffsets(), refs, vec_res);
else
......
#pragma once
#include <Common/OptimizedRegularExpression.h>
#include <Functions/likePatternToRegexp.h>
#include <Common/ObjectPool.h>
#include <Common/OptimizedRegularExpression.h>
#include <Common/ProfileEvents.h>
#include <Functions/likePatternToRegexp.h>
#include <common/StringRef.h>
#include <memory>
#include <string>
#include <vector>
#ifdef __SSSE3__
# include <hs.h>
#endif
namespace ProfileEvents
{
extern const Event RegexpCreated;
extern const Event RegexpCreated;
}
namespace DB
{
namespace ErrorCodes
{
extern const int CANNOT_ALLOCATE_MEMORY;
extern const int LOGICAL_ERROR;
}
namespace Regexps
{
......@@ -21,16 +34,22 @@ namespace Regexps
using Pool = ObjectPoolMap<Regexp, String>;
template <bool like>
inline Regexp createRegexp(const std::string & pattern, int flags) { return {pattern, flags}; }
inline Regexp createRegexp(const std::string & pattern, int flags)
{
return {pattern, flags};
}
template <>
inline Regexp createRegexp<true>(const std::string & pattern, int flags) { return {likePatternToRegexp(pattern), flags}; }
inline Regexp createRegexp<true>(const std::string & pattern, int flags)
{
return {likePatternToRegexp(pattern), flags};
}
template <bool like, bool no_capture>
inline Pool::Pointer get(const std::string & pattern)
{
/// C++11 has thread-safe function-local statics on most modern compilers.
static Pool known_regexps; /// Different variables for different pattern parameters.
static Pool known_regexps; /// Different variables for different pattern parameters.
return known_regexps.get(pattern, [&pattern]
{
......@@ -44,4 +63,82 @@ namespace Regexps
}
}
#ifdef __SSSE3__
namespace MultiRegexps
{
template <typename Deleter, Deleter deleter>
struct HyperscanDeleter
{
template <typename T>
void operator()(T * ptr) const
{
deleter(ptr);
}
};
/// Owning handle for a compiled hyperscan database; it is released with hs_free_database.
using Regexps = std::unique_ptr<hs_database_t, HyperscanDeleter<decltype(&hs_free_database), &hs_free_database>>;
/// Pool of compiled databases keyed by the list of pattern strings they were compiled from.
using Pool = ObjectPoolMap<Regexps, std::vector<String>>;
/** Returns a pooled hyperscan database compiled (block mode) from `patterns`.
  *
  * With FindAnyIndex every pattern gets the id `position + 1`, so that 0 can stand for "no match";
  * otherwise no ids are passed to hyperscan.
  * Each instantiation keeps its own pool, keyed by the list of pattern strings.
  *
  * Throws Exception(LOGICAL_ERROR) if hyperscan fails to compile the patterns.
  * NOTE(review): the patterns appear to come from the query, so a "bad arguments" kind of code may fit better - confirm.
  */
template <bool FindAnyIndex>
inline Pool::Pointer get(const std::vector<StringRef> & patterns)
{
    /// C++11 has thread-safe function-local statics on most modern compilers.
    static Pool known_regexps; /// Different variables for different pattern parameters.

    /// The pool key owns copies of the patterns.
    std::vector<String> str_patterns;
    str_patterns.reserve(patterns.size());
    for (const StringRef & ref : patterns)
        str_patterns.push_back(ref.toString());

    return known_regexps.get(str_patterns, [&str_patterns]
    {
        std::vector<const char *> ptrns;
        std::vector<unsigned int> flags;
        ptrns.reserve(str_patterns.size());
        flags.reserve(str_patterns.size());
        for (const String & pattern : str_patterns)
        {
            /// hs_compile_multi takes NUL-terminated expressions.
            ptrns.push_back(pattern.c_str());
            flags.push_back(HS_FLAG_DOTALL | HS_FLAG_ALLOWEMPTY | HS_FLAG_SINGLEMATCH);
        }

        hs_database_t * db = nullptr;
        hs_compile_error_t * compile_error = nullptr;

        /// Stays null unless pattern indices are requested.
        std::unique_ptr<unsigned int[]> ids;
        if constexpr (FindAnyIndex)
        {
            ids = std::make_unique<unsigned int[]>(ptrns.size());
            for (size_t i = 0; i < ptrns.size(); ++i)
                ids[i] = i + 1;
        }

        hs_error_t err
            = hs_compile_multi(ptrns.data(), flags.data(), ids.get(), ptrns.size(), HS_MODE_BLOCK, nullptr, &db, &compile_error);

        if (err != HS_SUCCESS)
        {
            /// Take ownership first, so the error object is freed whichever exception is thrown below.
            std::unique_ptr<
                hs_compile_error_t,
                HyperscanDeleter<decltype(&hs_free_compile_error), &hs_free_compile_error>> error(compile_error);

            if (!error)
                throw Exception("Hyperscan failed to compile the patterns and reported no details", ErrorCodes::LOGICAL_ERROR);

            /// A negative expression index means the error is not tied to a particular pattern.
            if (error->expression < 0)
                throw Exception(String(error->message), ErrorCodes::LOGICAL_ERROR);
            else
                throw Exception(
                    "Pattern '" + str_patterns[error->expression] + "' failed with error '" + String(error->message) + "'",
                    ErrorCodes::LOGICAL_ERROR);
        }

        /// Own the database immediately, so it is freed even if allocating the pooled object throws.
        Regexps compiled{db};

        ProfileEvents::increment(ProfileEvents::RegexpCreated);

        /// The pool expects a raw pointer to a heap-allocated object.
        auto pooled = std::make_unique<Regexps>();
        pooled->swap(compiled);
        return pooled.release();
    });
}
}
#endif // __SSSE3__
}
......@@ -23,22 +23,36 @@
</stop_conditions>
<query><![CDATA[select count(position(URL, 'yandex')), count(position(URL, 'google')) FROM hits_100m_single]]></query>
<query><![CDATA[select count(multiPosition(URL, ['yandex', 'google'])) FROM hits_100m_single]]></query>
<query><![CDATA[select count(multiSearchAllPositions(URL, ['yandex', 'google'])) FROM hits_100m_single]]></query>
<query><![CDATA[select count(match(URL, 'yandex|google')) FROM hits_100m_single]]></query>
<query><![CDATA[select count(multiMatchAny(URL, ['yandex', 'google'])) FROM hits_100m_single]]></query>
<query><![CDATA[select sum(match(URL, 'yandex')), sum(match(URL, 'google')), sum(match(URL, 'yahoo')), sum(match(URL, 'pikabu')) FROM hits_100m_single]]></query>
<query><![CDATA[select sum(multiSearch(URL, ['yandex', 'google', 'yahoo', 'pikabu'])) from hits_100m_single]]></query>
<query><![CDATA[select sum(multiSearchAny(URL, ['yandex', 'google', 'yahoo', 'pikabu'])) from hits_100m_single]]></query>
<query><![CDATA[select sum(multiMatchAny(URL, ['yandex', 'google', 'yahoo', 'pikabu'])) from hits_100m_single]]></query>
<query><![CDATA[select sum(match(URL, 'yandex|google|yahoo|pikabu')) FROM hits_100m_single]]></query>
<query><![CDATA[select sum(match(URL, 'yandex')), sum(match(URL, 'google')), sum(match(URL, 'http')) FROM hits_100m_single]]></query>
<query><![CDATA[select sum(multiSearch(URL, ['yandex', 'google', 'http'])) from hits_100m_single]]></query>
<query><![CDATA[select sum(multiSearchAny(URL, ['yandex', 'google', 'http'])) from hits_100m_single]]></query>
<query><![CDATA[select sum(multiMatchAny(URL, ['yandex', 'google', 'http'])) from hits_100m_single]]></query>
<query><![CDATA[select sum(match(URL, 'yandex|google|http')) FROM hits_100m_single]]></query>
<query><![CDATA[select sum(match(URL, 'yandex')), sum(match(URL, 'google')), sum(match(URL, 'facebook')), sum(match(URL, 'wikipedia')), sum(match(URL, 'reddit')) FROM hits_100m_single]]></query>
<query><![CDATA[select sum(multiSearch(URL, ['yandex', 'google', 'facebook', 'wikipedia', 'reddit'])) from hits_100m_single]]></query>
<query><![CDATA[select sum(multiSearchAny(URL, ['yandex', 'google', 'facebook', 'wikipedia', 'reddit'])) from hits_100m_single]]></query>
<query><![CDATA[select sum(multiMatchAny(URL, ['yandex', 'google', 'facebook', 'wikipedia', 'reddit'])) from hits_100m_single]]></query>
<query><![CDATA[select sum(match(URL, 'yandex|google|facebook|wikipedia|reddit')) FROM hits_100m_single]]></query>
<query><![CDATA[select sum(firstMatch(URL, ['yandex', 'google', 'http', 'facebook', 'google'])) from hits_100m_single]]></query>
<query><![CDATA[select sum(multiSearchFirstIndex(URL, ['yandex', 'google', 'http', 'facebook', 'google'])) from hits_100m_single]]></query>
<query><![CDATA[SELECT count() FROM hits_100m_single WHERE multiMatchAny(URL, ['about/address', 'for_woman', '^https?://lm-company.ruy/$', 'ultimateguitar.com'])]]></query>
<query><![CDATA[SELECT count() FROM hits_100m_single WHERE match(URL, 'about/address|for_woman|^https?://lm-company.ruy/$|ultimateguitar.com')]]></query>
<query><![CDATA[SELECT count() FROM hits_100m_single WHERE match(URL, 'chelyabinsk.74.ru|doctor.74.ru|transport.74.ru|m.74.ru|//74.ru/|chel.74.ru|afisha.74.ru|diplom.74.ru|chelfin.ru|//chel.ru|chelyabinsk.ru|cheldoctor.ru|//mychel.ru|cheldiplom.ru|74.ru/video|market|poll|mail|conference|consult|contest|tags|feedback|pages|text')]]></query>
<query><![CDATA[SELECT count() FROM hits_100m_single WHERE multiMatchAny(URL, ['chelyabinsk.74.ru', 'doctor.74.ru', 'transport.74.ru', 'm.74.ru', '//74.ru/', 'chel.74.ru', 'afisha.74.ru', 'diplom.74.ru', 'chelfin.ru', '//chel.ru', 'chelyabinsk.ru', 'cheldoctor.ru', '//mychel.ru', 'cheldiplom.ru', '74.ru/video', 'market', 'poll', 'mail', 'conference', 'consult', 'contest', 'tags', 'feedback', 'pages', 'text'])]]></query>
<query><![CDATA[SELECT count() FROM hits_100m_single WHERE multiMatchAny(URL, ['chelyabinsk\\.74\\.ru', 'doctor\\.74\\.ru', 'transport\\.74\\.ru', 'm\\.74\\.ru', '//74\\.ru/', 'chel\\.74\\.ru', 'afisha\\.74\\.ru', 'diplom\\.74\\.ru', 'chelfin\\.ru', '//chel\\.ru', 'chelyabinsk\\.ru', 'cheldoctor\\.ru', '//mychel\\.ru', 'cheldiplom\\.ru', '74\\.ru/video', 'market', 'poll', 'mail', 'conference', 'consult', 'contest', 'tags', 'feedback', 'pages', 'text'])]]></query>
<query><![CDATA[SELECT count() FROM hits_100m_single WHERE multiSearchAny(URL, ['chelyabinsk.74.ru', 'doctor.74.ru', 'transport.74.ru', 'm.74.ru', '//74.ru/', 'chel.74.ru', 'afisha.74.ru', 'diplom.74.ru', 'chelfin.ru', '//chel.ru', 'chelyabinsk.ru', 'cheldoctor.ru', '//mychel.ru', 'cheldiplom.ru', '74.ru/video', 'market', 'poll', 'mail', 'conference', 'consult', 'contest', 'tags', 'feedback', 'pages', 'text'])]]></query>
<main_metric>
<min_time/>
......
......@@ -23192,3 +23192,243 @@
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
......@@ -16,7 +16,7 @@ $CLICKHOUSE_CLIENT -q "select getColumnStructure('abc');" 2>&1 | grep "Maybe you
# Each line runs a query with a misspelled function name, merges stderr into stdout and
# greps for the expected "Maybe you meant: [...]" hint; grep's own output is discarded.
# NOTE(review): the multiPosition* lines and the multiSearchAllPositions* lines look like the
# before/after sides of a rename shown together by the diff view - confirm which set is current.
$CLICKHOUSE_CLIENT -q "select gutColumnStructure('abc');" 2>&1 | grep "Maybe you meant: \['dumpColumnStructure'\]" &>/dev/null;
$CLICKHOUSE_CLIENT -q "select gupColumnStructure('abc');" 2>&1 | grep "Maybe you meant: \['dumpColumnStructure'\]" &>/dev/null;
$CLICKHOUSE_CLIENT -q "select provideColumnStructure('abc');" 2>&1 | grep "Maybe you meant: \['dumpColumnStructure'\]" &>/dev/null;
$CLICKHOUSE_CLIENT -q "select multiposicionutf7('abc');" 2>&1 | grep "Maybe you meant: \['multiPositionUTF8','multiPosition'\]" &>/dev/null;
$CLICKHOUSE_CLIENT -q "select multiposicionutf7casesensitive('abc');" 2>&1 | grep "Maybe you meant: \['multiPositionCaseInsensitive'\]" &>/dev/null;
$CLICKHOUSE_CLIENT -q "select multiposicionutf7sensitive('abc');" 2>&1 | grep "Maybe you meant: \['multiPositionCaseInsensitive'\]" &>/dev/null;
$CLICKHOUSE_CLIENT -q "select multiPosicionSensitiveUTF8('abc');" 2>&1 | grep "Maybe you meant: \['multiPositionCaseInsensitiveUTF8'\]" &>/dev/null;
$CLICKHOUSE_CLIENT -q "select multisearchallposicionutf7('abc');" 2>&1 | grep "Maybe you meant: \['multiSearchAllPositionsUTF8','multiSearchAllPositions'\]" &>/dev/null;
# The trailing '.' in the next three grep patterns is an unescaped regex dot, i.e. it requires
# one more character (of any kind) after the closing bracket.
$CLICKHOUSE_CLIENT -q "select multisearchallposicionutf7casesensitive('abc');" 2>&1 | grep "Maybe you meant: \['multiSearchAllPositionsCaseInsensitive','multiSearchAllPositionsCaseInsensitiveUTF8'\]." &>/dev/null;
$CLICKHOUSE_CLIENT -q "select multiSearchAllposicionutf7sensitive('abc');" 2>&1 | grep "Maybe you meant: \['multiSearchAllPositionsCaseInsensitive','multiSearchAnyCaseInsensitive'\]." &>/dev/null;
$CLICKHOUSE_CLIENT -q "select multiSearchAllPosicionSensitiveUTF8('abc');" 2>&1 | grep "Maybe you meant: \['multiSearchAnyCaseInsensitiveUTF8','multiSearchAllPositionsCaseInsensitiveUTF8'\]." &>/dev/null;
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
select 0 = multiMatchAny(materialize('mpnsguhwsitzvuleiwebwjfitmsg'), ['wbirxqoabpblrnvvmjizj', 'cfcxhuvrexyzyjsh', 'oldhtubemyuqlqbwvwwkwin', 'bumoozxdkjglzu', 'intxlfohlxmajjomw', 'dxkeghohv', 'arsvmwwkjeopnlwnan', 'ouugllgowpqtaxslcopkytbfhifaxbgt', 'hkedmjlbcrzvryaopjqdjjc', 'tbqkljywstuahzh', 'o', 'wowoclosyfcuwotmvjygzuzhrery', 'vpefjiffkhlggntcu', 'ytdixvasrorhripzfhjdmlhqksmctyycwp']) from system.numbers limit 10;
select 0 = multiMatchAny(materialize('qjjzqexjpgkglgxpzrbqbnskq'), ['vaiatcjacmlffdzsejpdareqzy', 'xspcfzdufkmecud', 'bcvtbuqtctq', 'nkcopwbfytgemkqcfnnno', 'dylxnzuyhq', 'tno', 'scukuhufly', 'cdyquzuqlptv', 'ohluyfeksyxepezdhqmtfmgkvzsyph', 'ualzwtahvqvtijwp', 'jg', 'gwbawqlngzcknzgtmlj', 'qimvjcgbkkp', 'eaedbcgyrdvv', 'qcwrncjoewwedyyewcdkh', 'uqcvhngoqngmitjfxpznqomertqnqcveoqk', 'ydrgjiankgygpm', 'axepgap']) from system.numbers limit 10;
select 0 = multiMatchAny(materialize('fdkmtqmxnegwvnjhghjq'), ['vynkybvdmhgeezybbdqfrukibisj', 'knazzamgjjpavwhvdkwigykh', 'peumnifrmdhhmrqqnemw', 'lmsnyvqoisinlaqobxojlwfbi', 'oqwfzs', 'dymudxxeodwjpgbibnkvr', 'vomtfsnizkplgzktqyoiw', 'yoyfuhlpgrzds', 'cefao', 'gi', 'srpgxfjwl', 'etsjusdeiwbfe', 'ikvtzdopxo', 'ljfkavrau', 'soqdhxtenfrkmeic', 'ktprjwfcelzbup', 'pcvuoddqwsaurcqdtjfnczekwni', 'agkqkqxkfbkfgyqliahsljim']) from system.numbers limit 10;
select 1 = multiMatchAny(materialize('khljxzxlpcrxpkrfybbfk'), ['', 'lpc', 'rxpkrfybb', 'crxp', '', 'pkr', 'jxzxlpcrxpkrf', '', 'xzxlpcr', 'xpk', 'fyb', 'xzxlpcrxpkrfybbfk', 'k', 'lpcrxp', 'ljxzxlpcr', 'r', 'pkr', 'fk']) from system.numbers limit 10;
select 0 = multiMatchAny(materialize('rbrizgjbigvzfnpgmpkqxoqxvdj'), ['ee', 'cohqnb', 'msol', 'yhlujcvhklnhuomy', 'ietn', 'vgmnlkcsybtokrepzrm', 'wspiryefojxysgrzsxyrluykxfnnbzdstcel', 'mxisnsivndbefqxwznimwgazuulupbaihavg', 'vpzdjvqqeizascxmzdhuq', 'pgvncohlxcqjhfkm', 'mbaypcnfapltsegquurahlsruqvipfhrhq', 'ioxjbcyyqujfveujfhnfdfokfcrlsincjbdt', 'cnvlujyowompdrqjwjx', 'wobwed', 'kdfhaoxiuifotmptcmdbk', 'leoamsnorcvtlmokdomkzuo', 'jjw', 'ogugysetxuqmvggneosbsfbonszepsatq']) from system.numbers limit 10;
select 0 = multiMatchAny(materialize('uymwxzyjbfegbhgswiqhinf'), ['lizxzbzlwljkr', 'ukxygktlpzuyijcqeqktxenlaqi', 'onperabgbdiafsxwbvpjtyt', 'xfqgoqvhqph', 'aflmcwabtwgmajmmqelxwkaolyyhmdlc', 'yfz', 'meffuiaicvwed', 'hhzvgmifzamgftkifaeowayjrnnzw', 'nwewybtajv', 'ectiye', 'epjeiljegmqqjncubj', 'zsjgftqjrn', 'pssng', 'raqoarfhdoeujulvqmdo']) from system.numbers limit 10;
select 0 = multiMatchAny(materialize('omgghgnzjmecpzqmtcvw'), ['fjhlzbszodmzavzg', 'gfofrnwrxprkfiokv', 'jmjiiqpgznlmyrxwewzqzbe', 'pkyrsqkltlmxr', 'crqgkgqkkyujcyoc', 'endagbcxwqhueczuasykmajfsvtcmh', 'xytmxtrnkdysuwltqomehddp', 'etmdxyyfotfyifwvbykghijvwv', 'mwqtgrncyhkfhjdg', 'iuvymofrqpp', 'pgllsdanlhzqhkstwsmzzftp', 'disjylcceufxtjdvhy']) from system.numbers limit 10;
select 1 = multiMatchAny(materialize('mznihnmshftvnmmhnrulizzpslq'), ['nrul', 'mshftvnmmhnr', 'z', 'mhnrulizzps', 'hftvnmmhnrul', 'ihnmshftvnmmhnrulizzp', 'izz', '', 'uli', 'nihnmshftvnmmhnru', 'hnrulizzp', 'nrulizz']) from system.numbers limit 10;
select 1 = multiMatchAny(materialize('ruqmqrsxrbftvruvahonradau'), ['uqmqrsxrbft', 'ftv', 'tvruvahonrad', 'mqrsxrbftvruvahon', 'rbftvruvah', 'qrsxrbftvru', 'o', 'ahonradau', 'a', 'ft', '', 'u', 'rsxrbftvruvahonradau', 'ruvahon', 'bftvruvahonradau', 'qrsxrbftvru', 't', 'vahonrada', 'vruvahonradau', 'onra']) from system.numbers limit 10;
select 1 = multiMatchAny(materialize('gpsevxtcoeexrltyzduyidmtzxf'), ['exrltyzduyid', 'vxtcoeexrltyz', 'xr', 'ltyzduyidmt', 'yzduy', 'exr', 'coeexrltyzduy', 'coeexrltyzduy', 'rlty', 'rltyzduyidm', 'exrltyz', 'xtcoeexrlty', 'vxtcoeexrltyzduyidm', '', 'coeexrl', 'sevxtcoeexrltyzdu', 'dmt', '']) from system.numbers limit 10;
select 0 = multiMatchAny(materialize('dyhycfhzyewaikgursyxfkuv'), ['sktnofpugrmyxmbizzrivmhn', 'fhlgadpoqcvktbfzncxbllvwutdawmw', 'eewzjpcgzrqmltbgmhafwlwqb', 'tpogbkyj', 'rtllntxjgkzs', 'mirbvsqexscnzglogigbujgdwjvcv', 'iktwpgjsakemewmahgqza', 'xgfvzkvqgiuoihjjnxwwpznxhz', 'nxaumpaknreklbwynvxdsmatjekdlxvklh', 'zadzwqhgfxqllihuudozxeixyokhny', 'tdqpgfpzexlkslodps', 'slztannufxaabqfcjyfquafgfhfb', 'xvjldhfuwurvkb', 'aecv', 'uycfsughpikqsbcmwvqygdyexkcykhbnau', 'jr']) from system.numbers limit 10;
select 1 = multiMatchAny(materialize('vbcsettndwuntnruiyclvvwoo'), ['dwuntnru', '', 'ttndwuntnruiyclvv', 'ntnr', 'nruiyclvvw', 'wo', '', 'bcsettndwuntnruiycl', 'yc', 'untnruiyclvvw', 'csettndwuntnr', 'ntnruiyclvvwo']) from system.numbers limit 10;
select 0 = multiMatchAny(materialize('pqqnugshlczcuxhpjxjbcnro'), ['dpeedqy', 'rtsc', 'jdgla', 'qkgudqjiyzvlvsj', 'xmfxawhijgxxtydbd', 'ebgzazqthb', 'wyrjhvhwzhmpybnylirrn', 'iviqbyuclayqketooztwegtkgwnsezfl', 'bhvidy', 'hijctxxweboq', 't', 'osnzfbziidteiaifgaanm']) from system.numbers limit 10;
select 1 = multiMatchAny(materialize('loqchlxspwuvvccucskuytr'), ['', 'k', 'qchlxspwu', 'u', 'hlxspwuvv', 'wuvvccucsku', 'vcc', 'uyt', 'uvv', 'spwu', 'ytr', 'wuvvccucs', 'xspwuv', 'lxspwuvvccuc', 'spwuvvccu', 'oqchlxspwuvvccucskuy']) from system.numbers limit 10;
select 1 = multiMatchAny(materialize('pjjyzupzwllshlnatiujmwvaofr'), ['lnatiujmwvao', '', 'zupzwllsh', 'nati', 'wllshl', 'hlnatiujmwv', 'mwvao', 'shlnat', 'ati', 'wllshlnatiujmwvao', 'wllshlnatiujmwvaofr', 'nat']) from system.numbers limit 10;
select 1 = multiMatchAny(materialize('iketunkleyaqaxdlocci'), ['nkleyaqaxd', 'etunkleyaq', 'yaqaxdlocci', 'tunkleyaq', 'eyaqaxdlocc', 'leyaq', 'nkleyaqaxdl', 'tunkleya', 'kleyaqa', 'etunkleya', 'leyaqa', 'dlo', 'yaqa', 'leyaqaxd', 'etunkleyaq', '']) from system.numbers limit 10;
select 0 = multiMatchAny(materialize('drqianqtangmgbdwruvblkqd'), ['wusajejyucamkyl', 'wsgibljugzrpkniliy', 'lhwqqiuafwffyersqjgjvvvfurx', 'jfokpzzxfdonelorqu', 'ccwkpcgac', 'jmyulqpndkmzbfztobwtm', 'rwrgfkccgxht', 'ggldjecrgbngkonphtcxrkcviujihidjx', 'spwweavbiokizv', 'lv', 'krb', 'vstnhvkbwlqbconaxgbfobqky', 'pvxwdc', 'thrl', 'ahsblffdveamceonqwrbeyxzccmux', 'yozji', 'oejtaxwmeovtqtz', 'zsnzznvqpxdvdxhznxrjn', 'hse', 'kcmkrccxmljzizracxwmpoaggywhdfpxkq']) from system.numbers limit 10;
select 0 = multiMatchAny(materialize('yasnpckniistxcejowfijjsvkdajz'), ['slkpxhtsmrtvtm', 'crsbq', 'rdeshtxbfrlfwpsqojassxmvlfbzefldavmgme', 'ipetilcbpsfroefkjirquciwtxhrimbmwnlyv', 'knjpwkmdwbvdbapuyqbtsw', 'horueidziztxovqhsicnklmharuxhtgrsr', 'ofohrgpz', 'oneqnwyevbaqsonrcpmxcynflojmsnix', 'shg', 'nglqzczevgevwawdfperpeytuodjlf']) from system.numbers limit 10;
select 0 = multiMatchAny(materialize('ueptpscfgxhplwsueckkxs'), ['ohhygchclbpcdwmftperprn', 'dvpjdqmqckekndvcerqrpkxen', 'lohhvarnmyi', 'zppd', 'qmqxgfewitsunbuhffozcpjtc', 'hsjbioisycsrawktqssjovkmltxodjgv', 'dbzuunwbkrtosyvctdujqtvaawfnvuq', 'gupbvpqthqxae', 'abjdmijaaiasnccgxttmqdsz', 'uccyumqoyqe', 'kxxliepyzlc', 'wbqcqtbyyjbqcgdbpkmzugksmcxhvr', 'piedxm', 'uncpphzoif', 'exkdankwck', 'qeitzozdrqopsergzr', 'hesgrhaftgesnzflrrtjdobxhbepjoas', 'wfpexx']) from system.numbers limit 10;
select 1 = multiMatchAny(materialize('ldrzgttlqaphekkkdukgngl'), ['gttlqaphekkkdukgn', 'ekkkd', 'gttlqaphe', 'qaphek', 'h', 'kdu', 'he', 'phek', '', 'drzgttlqaphekkkd']) from system.numbers limit 10;
-- Hand-written cases. Every query compares the function result with the expected value,
-- so each produced row is the outcome of that comparison.

-- Duplicate patterns, a single-character pattern, and empty haystack / empty pattern combinations.
select 1 = multiMatchAny(materialize('ololo'), ['ololo', 'ololo', 'ololo']);
SELECT 1 = multiMatchAny(materialize('khljxzxlpcrxpkrfybbfk'), ['k']);
select 1 = multiMatchAny(materialize(''), ['']);
select 0 = multiMatchAny(materialize(''), ['some string']);
select 1 = multiMatchAny(materialize('abc'), ['']);
select 1 = multiMatchAny(materialize('abc'), ['']) from system.numbers limit 10;
-- Patterns sharing no characters with the haystack, in decreasing length.
select 0 = multiMatchAny(materialize('abc'), ['defgh']);
select 0 = multiMatchAny(materialize('abc'), ['defg']);
select 0 = multiMatchAny(materialize('abc'), ['def']);
select 0 = multiMatchAny(materialize('abc'), ['de']);
select 0 = multiMatchAny(materialize('abc'), ['d']);
-- Literal patterns that are substrings of the haystack match; patterns running past its end do not.
select 1 = multiMatchAny(materialize('abc'), ['b']) from system.numbers limit 10;
select 1 = multiMatchAny(materialize('abc'), ['bc']) from system.numbers limit 10;
select 0 = multiMatchAny(materialize('abc'), ['bcde']) from system.numbers limit 10;
select 0 = multiMatchAny(materialize('abc'), ['bcdef']) from system.numbers limit 10;
select 0 = multiMatchAny(materialize('abc'), ['bcdefg']) from system.numbers limit 10;
select 0 = multiMatchAny(materialize('abc'), ['bcdefgh']) from system.numbers limit 10;
select 0 = multiMatchAny(materialize('abc'), ['abcdefg']) from system.numbers limit 10;
select 0 = multiMatchAny(materialize('abc'), ['abcdef']) from system.numbers limit 10;
select 0 = multiMatchAny(materialize('abc'), ['abcde']) from system.numbers limit 10;
select 0 = multiMatchAny(materialize('abc'), ['abcd']) from system.numbers limit 10;
select 1 = multiMatchAny(materialize('abc'), ['abc']) from system.numbers limit 10;
select 1 = multiMatchAny(materialize('abc'), ['ab']) from system.numbers limit 10;
select 1 = multiMatchAny(materialize('abc'), ['a']) from system.numbers limit 10;
select 1 = multiMatchAny(materialize('abcd'), ['c']) from system.numbers limit 10;
select 1 = multiMatchAny(materialize('abcd'), ['cd']) from system.numbers limit 10;
select 0 = multiMatchAny(materialize('abcd'), ['cde']) from system.numbers limit 10;
select 0 = multiMatchAny(materialize('abcd'), ['cdef']) from system.numbers limit 10;
select 0 = multiMatchAny(materialize('abcd'), ['cdefg']) from system.numbers limit 10;
select 0 = multiMatchAny(materialize('abcd'), ['cdefgh']) from system.numbers limit 10;
select 0 = multiMatchAny(materialize('abc'), ['defgh']) from system.numbers limit 10;
select 0 = multiMatchAny(materialize('abc'), ['defg']) from system.numbers limit 10;
select 0 = multiMatchAny(materialize('abc'), ['def']) from system.numbers limit 10;
select 0 = multiMatchAny(materialize('abc'), ['de']) from system.numbers limit 10;
select 0 = multiMatchAny(materialize('abc'), ['d']) from system.numbers limit 10;
-- Regular-expression metacharacters.
select 1 = multiMatchAny(materialize('abc'), ['...']) from system.numbers limit 10;
-- 'a?' is optional, so finding 'bc' anywhere in the haystack is enough.
select 1 = multiMatchAny(materialize('a\nbc'), ['a?bc']) from system.numbers limit 10;
-- '.' is expected to match the newline, consistent with compiling patterns with HS_FLAG_DOTALL.
select 1 = multiMatchAny(materialize('a\nbc'), ['a.bc']) from system.numbers limit 10;
-- Same two checks with a zero byte inside the haystack.
select 1 = multiMatchAny(materialize('a\0bc'), ['a?bc']) from system.numbers limit 10;
select 1 = multiMatchAny(materialize('a\0bc'), ['a.bc']) from system.numbers limit 10;
-- 'a' followed by five characters fits into the six-character haystack; six does not.
select 1 = multiMatchAny(materialize('abcdef'), ['a.....']) from system.numbers limit 10;
select 0 = multiMatchAny(materialize('abcdef'), ['a......']) from system.numbers limit 10;
select 1 = multiMatchAny(materialize('abcdef'), ['a......', 'a.....']) from system.numbers limit 10;
-- None of these patterns is expected to match 'aaaa'.
select 0 = multiMatchAny(materialize('aaaa'), ['.*aa.*aaa.*', 'aaaaaa{2}', '\(aa\){3}']) from system.numbers limit 10;
-- NOTE(review): expected to match although the haystack has no 'd'; presumably the pattern is
-- passed to the regexp compiler as a C string and is cut at the zero byte, leaving just 'a'.
select 1 = multiMatchAny(materialize('abc'), ['a\0d']) from system.numbers limit 10;
-- multiMatchAnyIndex: the expected value is the 1-based position of the matching pattern.
select 1 = multiMatchAnyIndex(materialize('gogleuedeyandexgoogle'), ['google', 'yandex1']) from system.numbers limit 10;;
select 2 = multiMatchAnyIndex(materialize('gogleuedeyandexgoogle'), ['google1', 'yandex']) from system.numbers limit 10;;
-- Both patterns match here, so the test only requires a non-zero index.
select 0 != multiMatchAnyIndex(materialize('gogleuedeyandexgoogle'), ['.*goo.*', '.*yan.*']) from system.numbers limit 10;;
-- Only the fifth pattern ('no mo??') is expected to match.
select 5 = multiMatchAnyIndex(materialize('vladizlvav dabe don\'t heart me no more'), ['what', 'is', 'love', 'baby', 'no mo??', 'dont', 'h.rt me']) from system.numbers limit 10;;
-- Unlike the queries above, this one outputs the raw function result rather than a comparison.
SELECT multiMatchAny(materialize('/odezhda-dlya-bega/'), ['/odezhda-dlya-bega/', 'kurtki-i-vetrovki-dlya-bega', 'futbolki-i-mayki-dlya-bega']);
......@@ -10,6 +10,7 @@
| double-conversion | [BSD 3-Clause License](https://github.com/google/double-conversion/blob/cf2f0f3d547dc73b4612028a155b80536902ba02/LICENSE) |
| FastMemcpy | [MIT](https://github.com/yandex/ClickHouse/blob/master/libs/libmemcpy/impl/LICENSE) |
| googletest | [BSD 3-Clause License](https://github.com/google/googletest/blob/master/LICENSE) |
| hyperscan | [BSD 3-Clause License](https://github.com/intel/hyperscan/blob/master/LICENSE) |
| libbtrie | [BSD 2-Clause License](https://github.com/yandex/ClickHouse/blob/master/contrib/libbtrie/LICENSE) |
| libcxxabi | [BSD + MIT](https://github.com/yandex/ClickHouse/blob/master/libs/libglibc-compatibility/libcxxabi/LICENSE.TXT) |
| libdivide | [Zlib License](https://github.com/yandex/ClickHouse/blob/master/contrib/libdivide/LICENSE.txt) |
......
......@@ -15,23 +15,29 @@ The same as `position`, but the position is returned in Unicode code points. Wor
For a case-insensitive search, use the function `positionCaseInsensitiveUTF8`.
## multiPosition(haystack, [needle_1, needle_2, ..., needle_n])
## multiSearchAllPositions(haystack, [needle_1, needle_2, ..., needle_n])
The same as `position`, but returns `Array` of the `position`s for all `needle_i`.
For a case-insensitive search or/and in UTF-8 format use functions `multiPositionCaseInsensitive, multiPositionUTF8, multiPositionCaseInsensitiveUTF8`.
For case-insensitive search and/or UTF-8 input, use the functions `multiSearchAllPositionsCaseInsensitive, multiSearchAllPositionsUTF8, multiSearchAllPositionsCaseInsensitiveUTF8`.
## firstMatch(haystack, [needle_1, needle_2, ..., needle_n])
## multiSearchFirstPosition(haystack, [needle_1, needle_2, ..., needle_n])
Returns the index `i` (starting from 1) of the first found `needle_i` in the string `haystack` and 0 otherwise.
The same as `position`, but returns the leftmost offset in the string `haystack` at which any of the needles is found.
For a case-insensitive search or/and in UTF-8 format use functions `firstMatchCaseInsensitive, firstMatchUTF8, firstMatchCaseInsensitiveUTF8`.
For case-insensitive search and/or UTF-8 input, use the functions `multiSearchFirstPositionCaseInsensitive, multiSearchFirstPositionUTF8, multiSearchFirstPositionCaseInsensitiveUTF8`.
## multiSearch(haystack, [needle_1, needle_2, ..., needle_n])
## multiSearchFirstIndex(haystack, [needle_1, needle_2, ..., needle_n])
Returns the index `i` (starting from 1) of the leftmost found `needle_i` in the string `haystack` and 0 otherwise.
For case-insensitive search and/or UTF-8 input, use the functions `multiSearchFirstIndexCaseInsensitive, multiSearchFirstIndexUTF8, multiSearchFirstIndexCaseInsensitiveUTF8`.
## multiSearchAny(haystack, [needle_1, needle_2, ..., needle_n])
Returns 1, if at least one string `needle_i` matches the string `haystack` and 0 otherwise.
For a case-insensitive search or/and in UTF-8 format use functions `multiSearchCaseInsensitive, multiSearchUTF8, multiSearchCaseInsensitiveUTF8`.
For case-insensitive search and/or UTF-8 input, use the functions `multiSearchAnyCaseInsensitive, multiSearchAnyUTF8, multiSearchAnyCaseInsensitiveUTF8`.
## match(haystack, pattern)
......@@ -44,6 +50,16 @@ Note that the backslash symbol (`\`) is used for escaping in the regular express
The regular expression works with the string as if it is a set of bytes. The regular expression can't contain null bytes.
For patterns to search for substrings in a string, it is better to use LIKE or 'position', since they work much faster.
## multiMatchAny(haystack, [pattern_1, pattern_2, ..., pattern_n])
The same as `match`, but returns 0 if none of the regular expressions match and 1 if any of the patterns matches. It uses the [hyperscan](https://github.com/intel/hyperscan) library. To search for plain substrings in a string, it is better to use `multiSearchAny`, since it works much faster.
Note: this function is in experimental mode because of some [issues](https://github.com/intel/hyperscan/issues/141).
## multiMatchAnyIndex(haystack, [pattern_1, pattern_2, ..., pattern_n])
The same as `multiMatchAny`, but returns the index (starting from 1) of any one of the patterns that match the haystack.
## extract(haystack, pattern)
Extracts a fragment of a string using a regular expression. If 'haystack' doesn't match the 'pattern' regex, an empty string is returned. If the regex doesn't contain subpatterns, it takes the fragment that matches the entire regex. Otherwise, it takes the fragment that matches the first subpattern.
......
# Используемые сторонние библиотеки
# Используемые сторонние библиотеки
| Библиотека | Лицензия |
| ------- | ------- |
......@@ -10,6 +10,7 @@
| double-conversion | [BSD 3-Clause License](https://github.com/google/double-conversion/blob/cf2f0f3d547dc73b4612028a155b80536902ba02/LICENSE) |
| FastMemcpy | [MIT](https://github.com/yandex/ClickHouse/blob/master/libs/libmemcpy/impl/LICENSE) |
| googletest | [BSD 3-Clause License](https://github.com/google/googletest/blob/master/LICENSE) |
| hyperscan | [BSD 3-Clause License](https://github.com/intel/hyperscan/blob/master/LICENSE) |
| libbtrie | [BSD 2-Clause License](https://github.com/yandex/ClickHouse/blob/master/contrib/libbtrie/LICENSE) |
| libcxxabi | [BSD + MIT](https://github.com/yandex/ClickHouse/blob/master/libs/libglibc-compatibility/libcxxabi/LICENSE.TXT) |
| libdivide | [Zlib License](https://github.com/yandex/ClickHouse/blob/master/contrib/libdivide/LICENSE.txt) |
......
......@@ -13,20 +13,26 @@
Для поиска без учета регистра используйте функцию `positionCaseInsensitiveUTF8`.
## multiPosition(haystack, [needle_1, needle_2, ..., needle_n])
## multiSearchAllPositions(haystack, [needle_1, needle_2, ..., needle_n])
Так же, как и `position`, только возвращает `Array` первых вхождений.
Для поиска без учета регистра и/или в кодировке UTF-8 используйте функции `multiPositionCaseInsensitive, multiPositionUTF8, multiPositionCaseInsensitiveUTF8`.
Для поиска без учета регистра и/или в кодировке UTF-8 используйте функции `multiSearchAllPositionsCaseInsensitive, multiSearchAllPositionsUTF8, multiSearchAllPositionsCaseInsensitiveUTF8`.
## firstMatch(haystack, [needle_1, needle_2, ..., needle_n])
## multiSearchFirstPosition(haystack, [needle_1, needle_2, ..., needle_n])
Так же, как и `position`, только возвращает оффсет первого вхождения любого из needles.
Для поиска без учета регистра и/или в кодировке UTF-8 используйте функции `multiSearchFirstPositionCaseInsensitive, multiSearchFirstPositionUTF8, multiSearchFirstPositionCaseInsensitiveUTF8`.
## multiSearchFirstIndex(haystack, [needle_1, needle_2, ..., needle_n])
Возвращает индекс `i` (нумерация с единицы) первой найденной строки `needle_i` в строке `haystack` и 0 иначе.
Для поиска без учета регистра и/или в кодировке UTF-8 используйте функции `firstMatchCaseInsensitive, firstMatchUTF8, firstMatchCaseInsensitiveUTF8`.
Для поиска без учета регистра и/или в кодировке UTF-8 используйте функции `multiSearchFirstIndexCaseInsensitive, multiSearchFirstIndexUTF8, multiSearchFirstIndexCaseInsensitiveUTF8`.
## multiSearch(haystack, [needle_1, needle_2, ..., needle_n])
## multiSearchAny(haystack, [needle_1, needle_2, ..., needle_n])
Возвращает 1, если хотя бы одна подстрока `needle_i` нашлась в строке `haystack` и 0 иначе.
Для поиска без учета регистра и/или в кодировке UTF-8 используйте функции `multiSearchCaseInsensitive, multiSearchUTF8, multiSearchCaseInsensitiveUTF8`.
Для поиска без учета регистра и/или в кодировке UTF-8 используйте функции `multiSearchAnyCaseInsensitive, multiSearchAnyUTF8, multiSearchAnyCaseInsensitiveUTF8`.
## match(haystack, pattern)
Проверка строки на соответствие регулярному выражению pattern. Регулярное выражение **re2**. Синтаксис регулярных выражений **re2** является более ограниченным по сравнению с регулярными выражениями **Perl** ([подробнее](https://github.com/google/re2/wiki/Syntax)).
......@@ -37,6 +43,16 @@
Регулярное выражение работает со строкой как с набором байт. Регулярное выражение не может содержать нулевые байты.
Для шаблонов на поиск подстроки в строке, лучше используйте LIKE или position, так как они работают существенно быстрее.
## multiMatchAny(haystack, [pattern_1, pattern_2, ..., pattern_n])
То же, что и `match`, но возвращает ноль, если ни одно регулярное выражение не подошло и один, если хотя бы одно. Используется алгоритм [hyperscan](https://github.com/intel/hyperscan) для соответствия регулярных выражений. Для шаблонов на поиск многих подстрок в строке, лучше используйте `multiSearchAny`, так как она работает существенно быстрее.
Примечание: эта функция находится пока в экспериментальном режиме из-за некоторых [проблем](https://github.com/intel/hyperscan/issues/141).
## multiMatchAnyIndex(haystack, [pattern_1, pattern_2, ..., pattern_n])
То же, что `multiMatchAny`, только возвращает любой индекс подходящего регулярного выражения.
## extract(haystack, pattern)
Извлечение фрагмента строки по регулярному выражению. Если haystack не соответствует регулярному выражению pattern, то возвращается пустая строка. Если регулярное выражение не содержит subpattern-ов, то вынимается фрагмент, который подпадает под всё регулярное выражение. Иначе вынимается фрагмент, который подпадает под первый subpattern.
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册