未验证 提交 87993de9 编写于 作者: A alexey-milovidov 提交者: GitHub

Merge pull request #5725 from yandex/PerformanceVision-ignore_scheme

Performance vision ignore scheme
......@@ -3,44 +3,117 @@
#include "protocol.h"
#include <common/find_symbols.h>
#include <cstring>
#include <Common/StringUtils/StringUtils.h>
namespace DB
{
namespace
{

/// Validates the host candidate [start_of_host, pos) and returns it.
/// An empty StringRef is returned when:
///  - no dot position was recorded (dot_pos is null);
///  - the candidate range is empty (start_of_host is not before pos);
///  - the recorded dot is the character immediately before pos;
///  - the character right after the recorded dot is one of ':', '/', '?', '#'.
inline StringRef checkAndReturnHost(const Pos & pos, const Pos & dot_pos, const Pos & start_of_host)
{
    if (!dot_pos)
        return StringRef{};

    if (pos <= start_of_host)
        return StringRef{};

    if (dot_pos + 1 == pos)
        return StringRef{};

    /// A delimiter directly after the dot means there is nothing after the dot.
    switch (dot_pos[1])
    {
        case ':':
        case '/':
        case '?':
        case '#':
            return StringRef{};
        default:
            break;
    }

    return StringRef(start_of_host, pos - start_of_host);
}

}
/// Extracts host from given url.
/// `data` points to the first byte of the URL and `size` is its length in bytes.
/// Returns a StringRef referring to bytes inside `data`, or an empty StringRef when no host is accepted.
/// NOTE(review): this listing appears to interleave pre-change and post-change lines of a diff
/// (for example `start_of_host` is declared twice and there are two consecutive `return`s at the end)
/// - confirm the control flow against the real file before relying on it as shown here.
inline StringRef getURLHost(const char * data, size_t size)
{
Pos pos = data;
Pos end = data + size;
if (end == (pos = find_first_symbols<'/'>(pos, end)))
return {};
if (pos != data)
/// A URL that begins with "//" carries no scheme.
if (*pos == '/' && *(pos + 1) == '/')
{
StringRef scheme = getURLScheme(data, size);
Pos scheme_end = data + scheme.size;
// A colon must follow the scheme.
if (pos - scheme_end != 1 || *scheme_end != ':')
return {};
pos += 2;
}
else
{
/// The scheme is searched for only within the first min(size, 16) bytes.
Pos scheme_end = data + std::min(size, 16UL);
for (++pos; pos < scheme_end; ++pos)
{
if (!isAlphaNumericASCII(*pos))
{
switch (*pos)
{
/// Besides ASCII alphanumerics, these characters are accepted while scanning the scheme.
case '.':
case '-':
case '+':
break;
case ' ': /// restricted symbols
case '\t':
case '<':
case '>':
case '%':
case '{':
case '}':
case '|':
case '\\':
case '^':
case '~':
case '[':
case ']':
case ';':
case '=':
case '&':
return StringRef{};
/// Any other character stops the scheme scan.
default:
goto exloop;
}
}
}
/// The scanned prefix is skipped only if "://" follows it inside the scan window;
/// otherwise parsing restarts from the beginning of the input.
exloop: if ((scheme_end - pos) > 2 && *pos == ':' && *(pos + 1) == '/' && *(pos + 2) == '/')
pos += 3;
else
pos = data;
}
if (end - pos < 2 || *(pos) != '/' || *(pos + 1) != '/')
return {};
pos += 2;
const char * start_of_host = pos;
/// Position of the most recently seen '.', or null if none has been seen yet.
Pos dot_pos = nullptr;
auto start_of_host = pos;
for (; pos < end; ++pos)
{
if (*pos == '@')
switch (*pos)
{
case '.':
dot_pos = pos;
break;
case ':': /// end symbols
case '/':
case '?':
case '#':
return checkAndReturnHost(pos, dot_pos, start_of_host);
/// The host candidate restarts right after '@'.
case '@': /// myemail@gmail.com
start_of_host = pos + 1;
else if (*pos == ':' || *pos == '/' || *pos == '?' || *pos == '#')
break;
case ' ': /// restricted symbols in whole URL
case '\t':
case '<':
case '>':
case '%':
case '{':
case '}':
case '|':
case '\\':
case '^':
case '~':
case '[':
case ']':
case ';':
case '=':
case '&':
return StringRef{};
}
}
return (pos == start_of_host) ? StringRef{} : StringRef(start_of_host, pos - start_of_host);
/// Reached the end of input without meeting an end symbol: validate what was collected.
return checkAndReturnHost(pos, dot_pos, start_of_host);
}
template <bool without_www>
......
canada congo net-domena
yandex yandex yandex яндекс yandex
canada hello hello hello hello hello canada canada
canada hello hello canada
......@@ -12,13 +12,17 @@ www.example.com
127.0.0.1
www.example.com
www.example.com
www.example.com
example.com
example.com
example.com
====DOMAIN====
com
ru
ru
com
com
com
====PATH====
П
......@@ -61,6 +65,8 @@ example.com
example.com
example.com
example.com
example.com
example.com
====CUT WWW====
http://example.com
http://example.com:1234
......
......@@ -13,6 +13,8 @@ SELECT domain('http://www.example.com?q=4') AS Host;
SELECT domain('http://127.0.0.1:443/') AS Host;
SELECT domain('//www.example.com') AS Host;
SELECT domain('//paul@www.example.com') AS Host;
SELECT domain('www.example.com') as Host;
SELECT domain('example.com') as Host;
SELECT domainWithoutWWW('//paul@www.example.com') AS Host;
SELECT domainWithoutWWW('http://paul@www.example.com:80/') AS Host;
......@@ -23,6 +25,8 @@ SELECT topLevelDomain('http://127.0.0.1:443/') AS Domain;
SELECT topLevelDomain('svn+ssh://example.ru?q=hello%20world') AS Domain;
SELECT topLevelDomain('svn+ssh://example.ru.?q=hello%20world') AS Domain;
SELECT topLevelDomain('//www.example.com') AS Domain;
SELECT topLevelDomain('www.example.com') as Domain;
SELECT topLevelDomain('example.com') as Domain;
SELECT '====PATH====';
SELECT decodeURLComponent('%D0%9F');
......@@ -69,6 +73,8 @@ SELECT cutToFirstSignificantSubdomain('http://www.example.com/a/b/c?a=b');
SELECT cutToFirstSignificantSubdomain('http://www.example.com/a/b/c?a=b#d=f');
SELECT cutToFirstSignificantSubdomain('http://paul@www.example.com/a/b/c?a=b#d=f');
SELECT cutToFirstSignificantSubdomain('//paul@www.example.com/a/b/c?a=b#d=f');
SELECT cutToFirstSignificantSubdomain('www.example.com');
SELECT cutToFirstSignificantSubdomain('example.com');
SELECT '====CUT WWW====';
SELECT cutWWW('http://www.example.com');
......
yandex.ru 25107 25107
21999 21999
public_search 16749 16749
89348 89348
yandex.ru 25105 25105
avito.ru 16523 16523
public 15429 15429
mail.yandex.ru 13663 13663
yandsearch 10039 10039
news 8827 8827
mail.yandex.ru 13659 13659
mail.ru 7643 7643
doc 7537 7537
auto.ru 7350 7350
hurpass.com 6395 6395
best.ru 5477 5477
tv.yandex.ru 5341 5341
korer.ru 4967 4967
mail.yandsearch 4246 4246
cars 4077 4077
publ 3970 3970
yandex 3845 3845
main=hurriyet.com 3806 3806
yandex.ua 3803 3803
mail.yandsearch 4237 4237
yandex.ua 3802 3802
korablitz.ru 3717 3717
uyelik.hurriyet.com 3584 3584
e.mail.ru 3508 3508
......@@ -28,46 +19,32 @@ coccoc.com 2707 2707
rutube.ru 2699 2699
rbc.ru 2644 2644
mamba.ru 2598 2598
video 2558 2558
mail.yandex 2447 2447
wot 2253 2253
mail.yandex 2441 2441
pikabu.ru 2130 2130
yandex.php 2057 2057
e.mail.yandex.ru 1971 1971
brandex.ru 1969 1969
bravoslava-230v 1942 1942
search 1933 1933
market.ru 1913 1913
mynet.ru 1881 1881
mail 1845 1845
mail.yandex.ua 1825 1825
mail.yandex.ua 1823 1823
rutube.com 1821 1821
images 1812 1812
news.rambler.com 1787 1787
hurpass.com.tr 1763 1763
ads.search 1742 1742
marina_2_sezon 1680 1680
cars.auto.ru 1628 1628
cian.ru 1620 1620
ivi.ru 1617 1617
av.by 1598 1598
world 1596 1596
news.yandex.ru 1495 1495
vk.com 1474 1474
pub 1469 1469
forum 1414 1414
wow-girls.ru 1399 1399
kinogo-dhpWXEdIcgoxWUZ6fgdTWw.. 1338 1338
uyelik.hurriyet.com.tr 1330 1330
aukro.ua 1314 1314
plugins 1244 1244
images.yandsearch 1235 1235
ondom.ru 1221 1221
korablitz.com 1189 1189
videovol-9-sezon 1187 1187
kerl.org 1155 1155
mail.yandex.php 1148 1148
file 1147 1147
love.mail.yandex.ru 1136 1136
yandex.kz 1124 1124
coccoc.com.tr 1113 1113
......@@ -77,24 +54,47 @@ sprashivai.ru 1072 1072
market.yandex.ru 1064 1064
spb-n.ru 1056 1056
sz.spaces.ru 1055 1055
xofx.net%2F63857&secret-oper=reply&id=0&extras] 1054 1054
marinance.ua 1050 1050
tube.ru 1044 1044
haber.com 1043 1043
image&img_url=http 1042 1042
sport 1040 1040
megogo.net 993 993
sozcu.com 991 991
yandex.by 938 938
image&uinfo 936 936
fast-golove.mail.ru_Mobile=0&at=35&text=производств 927 927
linka 901 901
gazeta.ru 892 892
yandex.ru;yandex.ru 892 892
kinogo-dhpWXEdIcgoxWUZ6fgdTXA.. 890 890
fotki.yandex.ru 875 875
fast-golove.mail.yandex.php 842 842
news=previews 839 839
faber 833 833
lenta.ru 820 820
publicdaroglundai_anketa.ru 813 813
mail.yandex.kz 810 810
censor.net 807 807
mail.yandex.by 804 804
nnn.ru 796 796
maxi.su 788 788
rambler.ru 755 755
hurpass.com.ua 729 729
g1.botva.lv 728 728
m.sport.airway 724 724
tvizle.com 723 723
fast-golove.mail.yandex.ru 712 712
spb.ru 693 693
eksisozluk.com 689 689
uyelik.hurriyet 666 666
rst.ua 650 650
deko.ru 647 647
my.mail.yandex.ru 647 647
astrov.pro 625 625
yandsearch.php 624 624
kinogo.net 617 617
fanati-avtomobile.jsp 611 611
tv.yandsearch 605 605
soft.ru 603 603
pluginplus.ru 601 601
images.yandex 595 595
1tv.rbc.ru 592 592
ria.ru 591 591
marina_prezideniz.hurriyet.com 578 578
youtube.ru 575 575
cars.autochno.ru 570 570
a2.stars.auto.yandsearch 566 566
love.mail.ru 560 560
mail.rambler.ru 553 553
582035 80248
ru 299420 71339
com 78253 34500
html 40288 19569
ua 33160 18847
tr 19570 13117
net 19003 12908
php 17817 12011
yandsearch 13598 10329
by 9349 7695
yandex 8946 7282
org 5897 5320
tv 5371 4660
kz 5175 4588
aspx 3084 2800
phtml 3012 2725
xml 2993 2726
tr&callback_url=http 2897 2681
su 2833 2587
shtml 2442 2218
hurriyet 2030 1907
search 1915 1904
tr&user 1556 1494
jpg 1531 1427
tr&users 1449 1373
tr&callback 1294 1244
jsp 1083 1048
net%2F63857&secret-oper=reply&id=0&extras] 1054 1054
htm 957 921
ru_Mobile=0&at=35&text=производств 927 927
lv 916 910
tr&user_page 916 885
exe 911 891
me 911 864
tr&user_page=http 900 868
do 864 838
tr&used 782 768
pro 778 772
ru 262914 69218
92101 89421
com 63298 30285
ua 29037 17475
html 25079 15039
tr 16770 11857
net 16387 11686
php 14374 10307
yandsearch 12024 9484
by 8192 6915
yandex 7211 6124
org 4890 4514
kz 4679 4211
tv 4400 3928
su 2602 2396
phtml 2409 2226
xml 2322 2182
aspx 1959 1848
search 1835 1827
hurriyet 1385 1345
shtml 995 966
lv 879 875
jsp 855 845
exe 814 798
pro 737 734
airway 724 724
biz 685 672
mail 677 660
info 593 575
tr&callback_url=https 534 526
tr%2Fgaleri 533 522
me 675 647
jpg 662 647
do 625 611
mail 593 581
biz 537 530
bstatistik_dlja-dlya-naches 521 521
sx 498 496
ru%2Fupload 497 492
news 492 487
hu 486 479
aspx&referer 473 459
pogoda 460 460
auto 438 429
az 434 425
net%2F63857&secret=506d9e3dfbd268e6b6630e58 432 432
info 461 453
pogoda 459 459
sx 450 449
news 448 444
sportlibrary 431 431
jpg,http 411 397
tr&callbusiness 410 407
fm 405 400
online 401 399
tr&callbusines 388 384
ru%2Fnews 387 382
hu 396 393
htm 393 385
fm 379 378
online 374 372
bstatistic 366 366
wbp 346 346
am 336 333
ru;yandsearch 330 328
tr&user_page=https 330 328
tr&callback_url 329 319
html&lang=ru&lr=110&category=dressages%2Fcs306755 328 328
pl 328 326
blog 327 326
jpg&pos 307 302
bstana 305 305
ru;yandex 287 284
im 283 278
diary 277 275
slando 276 274
eu 274 269
to 271 269
asp 253 250
html&lang 253 248
mynet 253 251
tj 242 241
sberbank 241 238
haber 234 227
jpg,https 232 232
cc 226 221
_2544 222 222
ws 221 219
mamba 220 220
auto 363 355
az 356 350
wbp 343 343
bstana 304 304
blog 268 268
diary 262 261
am 260 258
slando 254 252
im 238 235
eu 237 234
liveinteria 218 218
tr%2Fanasayfa 215 210
tr&user_pts=&states 213 213
yandsearchplus 212 211
jpg","photo 211 209
ru%2Fwww 211 211
com&callback_url=http 209 208
to 215 213
mamba 214 214
auto-supers 208 208
co 206 205
kg 206 205
ru%2Fuploads 206 205
sberbank 207 207
tj 205 205
bstatistik_dlja-dlya_avia 201 201
bstanii_otryasam 200 200
pl 200 198
wroad_5d 200 200
mynet 191 190
bstan 187 187
yandsearchplus 186 186
haber 184 179
jpg,https 184 184
turkasovki 183 183
co 177 177
video 177 177
gif","photos 175 175
mgshared_zone 172 172
wssp 172 172
jpg,http 170 168
swf 167 167
cc 166 164
ws 164 164
kg 157 156
mobili_s_probegom 154 153
cgi 153 152
yandsearcher 152 151
uz 150 150
nsf 149 149
adriver 147 144
slandsearch 143 142
korrez 140 140
bstatistik_dlja-dlja-putin 139 139
rambler 133 132
mvideo 132 132
asp 129 128
vc 127 127
md 121 121
jpg","photo 119 119
mp4 118 117
ee 116 115
loveplaceOfSearchplus 111 111
nl 111 111
bstatistika 107 107
br 102 102
sport 99 99
4508153 712428
auto.ru 576845 8935
yandex.ru 410788 111278
public 328528 23
313516 26015
public_search 311125 0
yandex.ru 410776 111278
korer.ru 277987 0
avito.ru 163820 15556
mail.yandex.ru 152469 1046
main=hurriyet.com 152096 259
wot 116912 6682
mail.yandex.ru 152447 1046
mail.ru 87949 22225
best.ru 58537 55
korablitz.ru 51844 0
hurpass.com 49671 1251
......@@ -2,8 +2,5 @@
0
0
0
http://игры на передачи пригорька россия&lr=213&rpt=simage&uinfo=ww-1905-wh-643-fw-112-rossiisoft.in.ua%2FKievav@yandex?appkey=506d9e3dfbd268e6b6630e58
http://игры на передачи пригорька россия&lr=213&rpt=simage&uinfo=ww-1905-wh-643-fw-112-rossiisoft.in.ua%2FKievav@yandex?appkey=506d9e3dfbd268e6b6630e58
http://ru slovari 15
https://ru spb.rabota 15
https://e yandex 12
https://povary_dlya-511-gemotedDynamo_accoshyutoy-s-kortosh@bk.ru/yandsearch?text=simages%2F8%2F10544998#posts%2Fkartofeleri
https://povary_dlya-511-gemotedDynamo_accoshyutoy-s-kortosh@bk.ru/yandsearch?text=simages%2F8%2F10544998#posts%2Fkartofeleri
......@@ -12,7 +12,7 @@ Returns the protocol. Examples: http, ftp, mailto, magnet...
### domain
Gets the domain.
Gets the domain. Cuts off a scheme of less than 16 bytes.
### domainWithoutWWW
......
......@@ -10,7 +10,7 @@
Возвращает протокол. Примеры: http, ftp, mailto, magnet...
### domain
Возвращает домен.
Возвращает домен. Отсекает схему размером не более 16 байт.
### domainWithoutWWW
Возвращает домен, удалив не более одного 'www.' с начала, если есть.
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册