diff --git a/dbms/src/Common/OptimizedRegularExpression.cpp b/dbms/src/Common/OptimizedRegularExpression.cpp index 0b5f5c42b14007b9c38afaa1db82f2ed3cb59a89..e8132fc1cd6324c863dad23fe49a55576bbc4e46 100644 --- a/dbms/src/Common/OptimizedRegularExpression.cpp +++ b/dbms/src/Common/OptimizedRegularExpression.cpp @@ -1,7 +1,6 @@ #include #include - #define MIN_LENGTH_FOR_STRSTR 3 #define MAX_SUBPATTERNS 5 @@ -214,23 +213,38 @@ void OptimizedRegularExpressionImpl::analyze( /** We choose the non-alternative substring of the maximum length, among the prefixes, * or a non-alternative substring of maximum length. */ + + /// Tuning for typical usage domain + auto tuning_strings_condition = [](const std::string & str) + { + return str != "://" && str != "http://" && str != "www" && str != "Windows "; + }; size_t max_length = 0; Substrings::const_iterator candidate_it = trivial_substrings.begin(); for (Substrings::const_iterator it = trivial_substrings.begin(); it != trivial_substrings.end(); ++it) { if (((it->second == 0 && candidate_it->second != 0) || ((it->second == 0) == (candidate_it->second == 0) && it->first.size() > max_length)) - /// Tuning for typical usage domain - && (it->first.size() > strlen("://") || strncmp(it->first.data(), "://", strlen("://"))) - && (it->first.size() > strlen("http://") || strncmp(it->first.data(), "http", strlen("http"))) - && (it->first.size() > strlen("www.") || strncmp(it->first.data(), "www", strlen("www"))) - && (it->first.size() > strlen("Windows ") || strncmp(it->first.data(), "Windows ", strlen("Windows ")))) + && tuning_strings_condition(it->first)) { max_length = it->first.size(); candidate_it = it; } } + /// If prefix is small, it won't be chosen + if (max_length == 0) + { + for (Substrings::const_iterator it = trivial_substrings.begin(); it != trivial_substrings.end(); ++it) + { + if (it->first.size() > max_length && tuning_strings_condition(it->first)) + { + max_length = it->first.size(); + candidate_it = it; + } + } + } + if (max_length >= MIN_LENGTH_FOR_STRSTR) { required_substring = candidate_it->first;