未验证 提交 324dcf00 编写于 作者: A alexey-milovidov 提交者: GitHub

Merge pull request #5191 from yandex/regexp_extraction_fix

Regexp extraction fix for small prefixes
#include <Common/Exception.h>
#include <Common/OptimizedRegularExpression.h>
#define MIN_LENGTH_FOR_STRSTR 3
#define MAX_SUBPATTERNS 5
......@@ -214,23 +213,38 @@ void OptimizedRegularExpressionImpl<thread_safe>::analyze(
/** We choose the non-alternative substring of the maximum length, among the prefixes,
* or a non-alternative substring of maximum length.
*/
/// Tuning for typical usage domain
auto tuning_strings_condition = [](const std::string & str)
{
return str != "://" && str != "http://" && str != "www" && str != "Windows ";
};
size_t max_length = 0;
Substrings::const_iterator candidate_it = trivial_substrings.begin();
for (Substrings::const_iterator it = trivial_substrings.begin(); it != trivial_substrings.end(); ++it)
{
if (((it->second == 0 && candidate_it->second != 0)
|| ((it->second == 0) == (candidate_it->second == 0) && it->first.size() > max_length))
/// Tuning for typical usage domain
&& (it->first.size() > strlen("://") || strncmp(it->first.data(), "://", strlen("://")))
&& (it->first.size() > strlen("http://") || strncmp(it->first.data(), "http", strlen("http")))
&& (it->first.size() > strlen("www.") || strncmp(it->first.data(), "www", strlen("www")))
&& (it->first.size() > strlen("Windows ") || strncmp(it->first.data(), "Windows ", strlen("Windows "))))
&& tuning_strings_condition(it->first))
{
max_length = it->first.size();
candidate_it = it;
}
}
/// If prefix is small, it won't be chosen
if (max_length < MIN_LENGTH_FOR_STRSTR)
{
for (Substrings::const_iterator it = trivial_substrings.begin(); it != trivial_substrings.end(); ++it)
{
if (it->first.size() > max_length && tuning_strings_condition(it->first))
{
max_length = it->first.size();
candidate_it = it;
}
}
}
if (max_length >= MIN_LENGTH_FOR_STRSTR)
{
required_substring = candidate_it->first;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册