未验证 提交 3dea1c8c 编写于 作者: A Alexander Kuzmenkov 提交者: GitHub

Merge pull request #21946 from azat/tld-fixes

Fix cutToFirstSignificantSubdomainCustom()/firstSignificantSubdomainCustom() for 3+level domains
......@@ -90,7 +90,70 @@ struct ExtractFirstSignificantSubdomain
res_data += last_3_periods[1] + 1 - begin;
res_size = last_3_periods[0] - last_3_periods[1] - 1;
}
}
}
/// The difference with execute() is due to custom TLD list can have records of any level,
/// not only 2-nd level (like non-custom variant), so it requires more lookups.
template <class Lookup>
static void executeCustom(const Lookup & lookup, const Pos data, const size_t size, Pos & res_data, size_t & res_size, Pos * out_domain_end = nullptr)
{
res_data = data;
res_size = 0;
Pos tmp;
size_t domain_length;
ExtractDomain<without_www>::execute(data, size, tmp, domain_length);
if (domain_length == 0)
return;
if (out_domain_end)
*out_domain_end = tmp + domain_length;
/// cut useless dot
if (tmp[domain_length - 1] == '.')
--domain_length;
res_data = tmp;
res_size = domain_length;
auto begin = tmp;
auto end = begin + domain_length;
const char * last_2_periods[2]{};
const char * prev = begin - 1;
auto pos = find_first_symbols<'.'>(begin, end);
while (pos < end)
{
if (lookup(pos + 1, end - pos - 1))
{
res_data += prev + 1 - begin;
res_size = end - 1 - prev;
return;
}
last_2_periods[1] = last_2_periods[0];
last_2_periods[0] = pos;
prev = pos;
pos = find_first_symbols<'.'>(pos + 1, end);
}
/// if there is domain of the first level (i.e. no dots in the hostname) -> return nothing
if (!last_2_periods[0])
return;
/// if there is domain of the second level -> always return itself
if (!last_2_periods[1])
{
res_size = last_2_periods[0] - begin;
return;
}
/// if there is domain of the 3+ level, and zero records in TLD list ->
/// fallback to domain of the second level
res_data += last_2_periods[1] + 1 - begin;
res_size = last_2_periods[0] - last_2_periods[1] - 1;
}
};
}
......
......@@ -17,10 +17,10 @@ namespace ErrorCodes
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
}
struct FirstSignificantSubdomainCustomtLookup
struct FirstSignificantSubdomainCustomLookup
{
const TLDList & tld_list;
FirstSignificantSubdomainCustomtLookup(const std::string & tld_list_name)
FirstSignificantSubdomainCustomLookup(const std::string & tld_list_name)
: tld_list(TLDListsHolder::getInstance().getTldList(tld_list_name))
{
}
......@@ -63,7 +63,7 @@ public:
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & /*result_type*/, size_t /*input_rows_count*/) const override
{
const ColumnConst * column_tld_list_name = checkAndGetColumnConstStringOrFixedString(arguments[1].column.get());
FirstSignificantSubdomainCustomtLookup tld_lookup(column_tld_list_name->getValue<String>());
FirstSignificantSubdomainCustomLookup tld_lookup(column_tld_list_name->getValue<String>());
/// FIXME: convertToFullColumnIfConst() is suboptimal
auto column = arguments[0].column->convertToFullColumnIfConst();
......@@ -79,7 +79,7 @@ public:
ErrorCodes::ILLEGAL_COLUMN);
}
static void vector(FirstSignificantSubdomainCustomtLookup & tld_lookup,
static void vector(FirstSignificantSubdomainCustomLookup & tld_lookup,
const ColumnString::Chars & data, const ColumnString::Offsets & offsets,
ColumnString::Chars & res_data, ColumnString::Offsets & res_offsets)
{
......
......@@ -10,7 +10,7 @@ struct CutToFirstSignificantSubdomainCustom
{
static size_t getReserveLengthForElement() { return 15; }
static void execute(FirstSignificantSubdomainCustomtLookup & tld_lookup, const Pos data, const size_t size, Pos & res_data, size_t & res_size)
static void execute(FirstSignificantSubdomainCustomLookup & tld_lookup, const Pos data, const size_t size, Pos & res_data, size_t & res_size)
{
res_data = data;
res_size = 0;
......@@ -18,7 +18,7 @@ struct CutToFirstSignificantSubdomainCustom
Pos tmp_data;
size_t tmp_length;
Pos domain_end;
ExtractFirstSignificantSubdomain<without_www>::execute(tld_lookup, data, size, tmp_data, tmp_length, &domain_end);
ExtractFirstSignificantSubdomain<without_www>::executeCustom(tld_lookup, data, size, tmp_data, tmp_length, &domain_end);
if (tmp_length == 0)
return;
......
no-tld
-- no-tld
foo.there-is-no-such-domain
foo.there-is-no-such-domain
foo.there-is-no-such-domain
foo.there-is-no-such-domain
foo
generic
-- generic
kernel
kernel.biz.ss
difference
-- difference
biz.ss
kernel.biz.ss
-- 3+level
xx.blogspot.co.at
blogspot
xx.blogspot.co.at
blogspot
-- url
foobar.com
foobar.com
foobar.com
xx.blogspot.co.at
select 'no-tld';
select cutToFirstSignificantSubdomainCustom('there-is-no-such-domain', 'public_suffix_list');
select '-- no-tld';
-- even if there is no TLD, 2-nd level by default anyway
-- FIXME: make this behavior optional (so that TLD for host never changed, either empty or something real)
select cutToFirstSignificantSubdomain('there-is-no-such-domain');
select cutToFirstSignificantSubdomain('foo.there-is-no-such-domain');
select cutToFirstSignificantSubdomain('bar.foo.there-is-no-such-domain');
select cutToFirstSignificantSubdomainCustom('there-is-no-such-domain', 'public_suffix_list');
select cutToFirstSignificantSubdomainCustom('foo.there-is-no-such-domain', 'public_suffix_list');
select cutToFirstSignificantSubdomainCustom('bar.foo.there-is-no-such-domain', 'public_suffix_list');
select firstSignificantSubdomainCustom('bar.foo.there-is-no-such-domain', 'public_suffix_list');
select 'generic';
select firstSignificantSubdomainCustom('foo.kernel.biz.ss', 'public_suffix_list'); -- kernel.biz.ss
select '-- generic';
select firstSignificantSubdomainCustom('foo.kernel.biz.ss', 'public_suffix_list'); -- kernel
select cutToFirstSignificantSubdomainCustom('foo.kernel.biz.ss', 'public_suffix_list'); -- kernel.biz.ss
select 'difference';
select '-- difference';
-- biz.ss is not in the default TLD list, hence:
select cutToFirstSignificantSubdomain('foo.kernel.biz.ss'); -- biz.ss
select cutToFirstSignificantSubdomainCustom('foo.kernel.biz.ss', 'public_suffix_list'); -- kernel.biz.ss
select '-- 3+level';
select cutToFirstSignificantSubdomainCustom('xx.blogspot.co.at', 'public_suffix_list'); -- xx.blogspot.co.at
select firstSignificantSubdomainCustom('xx.blogspot.co.at', 'public_suffix_list'); -- blogspot
select cutToFirstSignificantSubdomainCustom('foo.bar.xx.blogspot.co.at', 'public_suffix_list'); -- xx.blogspot.co.at
select firstSignificantSubdomainCustom('foo.bar.xx.blogspot.co.at', 'public_suffix_list'); -- blogspot
select '-- url';
select cutToFirstSignificantSubdomainCustom('http://foobar.com', 'public_suffix_list');
select cutToFirstSignificantSubdomainCustom('http://foobar.com/foo', 'public_suffix_list');
select cutToFirstSignificantSubdomainCustom('http://bar.foobar.com/foo', 'public_suffix_list');
select cutToFirstSignificantSubdomainCustom('http://xx.blogspot.co.at', 'public_suffix_list');
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册