Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
2dot5
ClickHouse
提交
bedd5e7c
C
ClickHouse
项目概览
2dot5
/
ClickHouse
通知
3
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
C
ClickHouse
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
bedd5e7c
编写于
4月 15, 2015
作者:
A
Andrey Mironov
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
dbms: add function URLHash plus tests [#METR-15826]
上级
beddce09
变更
4
隐藏空白更改
内联
并排
Showing
4 changed file
with
273 addition
and
0 deletion
+273
-0
dbms/include/DB/Functions/FunctionsHashing.h
dbms/include/DB/Functions/FunctionsHashing.h
+211
-0
dbms/src/Functions/FunctionsHashing.cpp
dbms/src/Functions/FunctionsHashing.cpp
+1
-0
dbms/tests/queries/0_stateless/00149_function_url_hash.reference
...sts/queries/0_stateless/00149_function_url_hash.reference
+44
-0
dbms/tests/queries/0_stateless/00149_function_url_hash.sql
dbms/tests/queries/0_stateless/00149_function_url_hash.sql
+17
-0
未找到文件。
dbms/include/DB/Functions/FunctionsHashing.h
浏览文件 @
bedd5e7c
...
...
@@ -20,6 +20,8 @@
#include <DB/Common/HashTable/Hash.h>
#include <DB/Functions/IFunction.h>
#include <statdaemons/ext/range.hpp>
#include <stats/IntHash.h>
...
...
@@ -603,6 +605,215 @@ public:
};
/** Hash of a URL that ignores one trailing '/', '?' or '#' character,
  * so that e.g. "http://ya.ru" and "http://ya.ru/" produce the same value.
  */
struct URLHashImpl
{
	/** @param data  pointer to the first byte of the URL
	  * @param size  number of bytes in the URL
	  * @return CityHash64 of the URL without its last byte if that byte is '/', '?' or '#',
	  *         otherwise CityHash64 of the whole URL.
	  */
	static UInt64 apply(const char * data, const std::size_t size)
	{
		std::size_t hashed_size = size;

		/// Exclude a single trailing separator from the hashed range.
		if (size > 0)
		{
			const char last = data[size - 1];
			if (last == '/' || last == '?' || last == '#')
				hashed_size = size - 1;
		}

		return CityHash64(data, hashed_size);
	}
};
/** Hash of the prefix of a URL that corresponds to a given level of its hierarchy.
  *
  * Level 0 is the scheme, "://" and the domain, up to and including the first '/', '?' or '#'.
  * Each following level extends the prefix by one more component, again up to and including
  * the separator that terminates it (or up to the end of the URL).
  */
struct URLHierarchyHashImpl
{
	/// True for the characters that terminate a hierarchy component.
	static bool isLevelSeparator(const char c)
	{
		return c == '/' || c == '?' || c == '#';
	}

	/** Length in bytes of the URL prefix that forms hierarchy level `level`.
	  *
	  * @param level  zero-based hierarchy level
	  * @param begin  pointer to the first byte of the URL
	  * @param end    pointer one past the last byte of the URL
	  * @return length of the prefix, or 0 if the URL has no such level.
	  *         For a URL without a "scheme://something" head the whole URL is level 0
	  *         and there are no other levels.
	  */
	static std::size_t findLevelLength(const UInt64 level, const char * begin, const char * const end)
	{
		auto pos = begin;

		/// Parse everything that precedes the path.
		/// The scheme is assumed to be already lowercased.
		/// The bounds are inclusive: the previous strict comparisons rejected 'a', 'z', '0' and '9',
		/// so schemes containing those characters were not recognised.
		/// NOTE(review): the tests compare this function against URLHierarchy; verify that
		/// URLHierarchy accepts the same scheme characters.
		while (pos < end && ((*pos >= 'a' && *pos <= 'z') || (*pos >= '0' && *pos <= '9')))
			++pos;

		/** The hierarchy is computed only for URLs that have a scheme followed by two slashes
		  * (http, file fit; mailto, magnet do not), with at least one more character after the slashes.
		  * For everything else the full URL is returned as the only element of the hierarchy.
		  */
		const bool has_scheme_and_authority =
			pos != begin &&
			end - pos > 3 &&
			pos[0] == ':' && pos[1] == '/' && pos[2] == '/';

		if (!has_scheme_and_authority)
			return 0 == level ? end - begin : 0;

		pos += 3;

		/// For simplicity the domain is everything after the scheme and the two slashes,
		/// up to the next '/', '?' or '#'.
		while (pos < end && !isLevelSeparator(*pos))
			++pos;

		/// The terminating separator belongs to the level.
		if (pos != end)
			++pos;

		if (0 == level)
			return pos - begin;

		UInt64 current_level = 0;

		while (current_level != level && pos < end)
		{
			/// Skip any run of separators at the start of the component.
			while (pos < end && isLevelSeparator(*pos))
				++pos;

			/// Nothing but separators was left: there is no further level.
			if (pos == end)
				break;

			/// Advance to the separator that ends this component.
			while (pos < end && !isLevelSeparator(*pos))
				++pos;

			if (pos != end)
				++pos;

			++current_level;
		}

		return current_level == level ? pos - begin : 0;
	}

	/** @param level  zero-based hierarchy level
	  * @param data   pointer to the first byte of the URL
	  * @param size   number of bytes in the URL
	  * @return URLHashImpl::apply of the prefix found by findLevelLength
	  *         (of an empty range if the URL has no such level).
	  */
	static UInt64 apply(const UInt64 level, const char * data, const std::size_t size)
	{
		return URLHashImpl::apply(data, findLevelLength(level, data, data + size));
	}
};
/** Function URLHash(url[, level]) returning UInt64.
  *
  * With one argument the URL is hashed by URLHashImpl.
  * With two arguments the second one must be a constant of an integer type;
  * the result is produced by URLHierarchyHashImpl for that level.
  */
class FunctionURLHash : public IFunction
{
public:
	static constexpr auto name = "URLHash";
	static IFunction * create(const Context &) { return new FunctionURLHash; }

	String getName() const override
	{
		return name;
	}

	/// Validates the argument types: (String) or (String, any of U/Int8..64). The result type is UInt64.
	DataTypePtr getReturnType(const DataTypes & arguments) const override
	{
		const auto num_args = arguments.size();

		if (!(num_args == 1 || num_args == 2))
			throw Exception{
				"Number of arguments for function " + getName() + " doesn't match: passed " +
				toString(num_args) + ", should be 1 or 2.",
				ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH
			};

		const auto url_type = arguments.front().get();

		if (nullptr == typeid_cast<const DataTypeString *>(url_type))
			throw Exception{
				"Illegal type " + url_type->getName() + " of argument of function " + getName(),
				ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT
			};

		if (num_args == 2)
		{
			const auto level_type = arguments.back().get();

			const bool is_integer =
				typeid_cast<const DataTypeUInt8 *>(level_type) ||
				typeid_cast<const DataTypeUInt16 *>(level_type) ||
				typeid_cast<const DataTypeUInt32 *>(level_type) ||
				typeid_cast<const DataTypeUInt64 *>(level_type) ||
				typeid_cast<const DataTypeInt8 *>(level_type) ||
				typeid_cast<const DataTypeInt16 *>(level_type) ||
				typeid_cast<const DataTypeInt32 *>(level_type) ||
				typeid_cast<const DataTypeInt64 *>(level_type);

			if (!is_integer)
				throw Exception{
					"Illegal type " + level_type->getName() + " of argument of function " + getName(),
					ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT
				};
		}

		return new DataTypeUInt64;
	}

	/// Dispatches on the number of arguments; any count other than 1 or 2 is reported as a logic error.
	void execute(Block & block, const ColumnNumbers & arguments, const size_t result)
	{
		switch (arguments.size())
		{
			case 1:
				executeSingleArg(block, arguments, result);
				break;
			case 2:
				executeTwoArgs(block, arguments, result);
				break;
			default:
				throw std::logic_error{"got into IFunction::execute with unexpected number of arguments"};
		}
	}

private:
	/// URLHash(url): writes the result column for a ColumnString or ColumnConstString argument.
	void executeSingleArg(Block & block, const ColumnNumbers & arguments, const std::size_t result) const
	{
		const auto url_column = block.getByPosition(arguments.front()).column.get();

		if (const auto string_col = typeid_cast<const ColumnString *>(url_column))
		{
			const auto rows = string_col->size();
			const auto hashes_col = new ColumnVector<UInt64>{rows};
			block.getByPosition(result).column = hashes_col;

			const auto & chars = string_col->getChars();
			const auto & offsets = string_col->getOffsets();
			auto & hashes = hashes_col->getData();

			/// offsets[row] is where the next row starts, so a row spans [row_begin, offsets[row]).
			/// NOTE(review): the "- 1" presumably drops a per-row terminating zero byte - confirm.
			std::size_t row_begin = 0;
			for (std::size_t row = 0; row < rows; ++row)
			{
				const std::size_t row_end = offsets[row];

				hashes[row] = URLHashImpl::apply(
					reinterpret_cast<const char *>(&chars[row_begin]),
					row_end - row_begin - 1);

				row_begin = row_end;
			}

			return;
		}

		if (const auto const_col = typeid_cast<const ColumnConstString *>(url_column))
		{
			const auto & url = const_col->getData();

			block.getByPosition(result).column = new ColumnConstUInt64{
				const_col->size(),
				URLHashImpl::apply(url.data(), url.size())
			};

			return;
		}

		throw Exception{
			"Illegal column " + block.getByPosition(arguments[0]).column->getName() +
			" of argument of function " + getName(),
			ErrorCodes::ILLEGAL_COLUMN
		};
	}

	/// URLHash(url, level): same as above, but hashes only the requested hierarchy level.
	/// The level column must be constant; its value is read with get64(0).
	void executeTwoArgs(Block & block, const ColumnNumbers & arguments, const std::size_t result) const
	{
		const auto level_column = block.getByPosition(arguments.back()).column.get();

		if (!level_column->isConst())
			throw Exception{
				"Second argument of function " + getName() + " must be an integral constant",
				ErrorCodes::ILLEGAL_COLUMN
			};

		const auto level = level_column->get64(0);

		const auto url_column = block.getByPosition(arguments.front()).column.get();

		if (const auto string_col = typeid_cast<const ColumnString *>(url_column))
		{
			const auto rows = string_col->size();
			const auto hashes_col = new ColumnVector<UInt64>{rows};
			block.getByPosition(result).column = hashes_col;

			const auto & chars = string_col->getChars();
			const auto & offsets = string_col->getOffsets();
			auto & hashes = hashes_col->getData();

			/// offsets[row] is where the next row starts, so a row spans [row_begin, offsets[row]).
			/// NOTE(review): the "- 1" presumably drops a per-row terminating zero byte - confirm.
			std::size_t row_begin = 0;
			for (std::size_t row = 0; row < rows; ++row)
			{
				const std::size_t row_end = offsets[row];

				hashes[row] = URLHierarchyHashImpl::apply(
					level,
					reinterpret_cast<const char *>(&chars[row_begin]),
					row_end - row_begin - 1);

				row_begin = row_end;
			}

			return;
		}

		if (const auto const_col = typeid_cast<const ColumnConstString *>(url_column))
		{
			const auto & url = const_col->getData();

			block.getByPosition(result).column = new ColumnConstUInt64{
				const_col->size(),
				URLHierarchyHashImpl::apply(level, url.data(), url.size())
			};

			return;
		}

		throw Exception{
			"Illegal column " + block.getByPosition(arguments[0]).column->getName() +
			" of argument of function " + getName(),
			ErrorCodes::ILLEGAL_COLUMN
		};
	}
};
/// Carries the SQL-visible name "halfMD5" as a compile-time constant.
struct NameHalfMD5
{
	static constexpr const char * name = "halfMD5";
};
/// Carries the SQL-visible name "sipHash64" as a compile-time constant.
struct NameSipHash64
{
	static constexpr const char * name = "sipHash64";
};
/// Carries the SQL-visible name "cityHash64" as a compile-time constant.
struct NameCityHash64
{
	static constexpr const char * name = "cityHash64";
};
...
...
dbms/src/Functions/FunctionsHashing.cpp
浏览文件 @
bedd5e7c
...
...
@@ -17,6 +17,7 @@ void registerFunctionsHashing(FunctionFactory & factory)
factory
.
registerFunction
<
FunctionCityHash64
>
();
factory
.
registerFunction
<
FunctionIntHash32
>
();
factory
.
registerFunction
<
FunctionIntHash64
>
();
factory
.
registerFunction
<
FunctionURLHash
>
();
}
}
dbms/tests/queries/0_stateless/00149_function_url_hash.reference
0 → 100644
浏览文件 @
bedd5e7c
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
dbms/tests/queries/0_stateless/00149_function_url_hash.sql
0 → 100644
浏览文件 @
bedd5e7c
-- Every query below is an equality check; each row it returns is expected to be 1.

-- One trailing '/', '?' or '#' must not change the one-argument hash.
select URLHash('' as url) = URLHash(appendTrailingCharIfAbsent(url, '/'));
select URLHash('http://ya.ru' as url) = URLHash(appendTrailingCharIfAbsent(url, '/'));
select URLHash('http://ya.ru' as url) = URLHash(appendTrailingCharIfAbsent(url, '?'));
select URLHash('http://ya.ru' as url) = URLHash(appendTrailingCharIfAbsent(url, '#'));

-- For an empty URL every level gives the same hash as the one-argument form.
select URLHash('' as url, 0) = URLHash(url);
select URLHash('' as url, 1) = URLHash(url);
select URLHash('' as url, 1000) = URLHash(url);

-- URLHash(url, level) must equal the hash of the URLHierarchy(url) element at index level + 1.
select URLHash('http://ya.ru/a' as url, 0 as level) = URLHash(URLHierarchy(url)[level + 1]);
select URLHash('http://ya.ru/a' as url, 1 as level) = URLHash(URLHierarchy(url)[level + 1]);

-- The same comparison for levels 0..4 over URLs of increasing depth (seven rows per query).
-- NOTE(review): for levels past the end of the hierarchy this presumably relies on
-- out-of-range array indexing yielding an empty string - confirm.
select URLHash(url, 0 as level) = URLHash(URLHierarchy(url)[level + 1]) array join ['', 'http://ya.ru', 'http://ya.ru/', 'http://ya.ru/a', 'http://ya.ru/a/', 'http://ya.ru/a/b', 'http://ya.ru/a/b?'] as url;
select URLHash(url, 1 as level) = URLHash(URLHierarchy(url)[level + 1]) array join ['', 'http://ya.ru', 'http://ya.ru/', 'http://ya.ru/a', 'http://ya.ru/a/', 'http://ya.ru/a/b', 'http://ya.ru/a/b?'] as url;
select URLHash(url, 2 as level) = URLHash(URLHierarchy(url)[level + 1]) array join ['', 'http://ya.ru', 'http://ya.ru/', 'http://ya.ru/a', 'http://ya.ru/a/', 'http://ya.ru/a/b', 'http://ya.ru/a/b?'] as url;
select URLHash(url, 3 as level) = URLHash(URLHierarchy(url)[level + 1]) array join ['', 'http://ya.ru', 'http://ya.ru/', 'http://ya.ru/a', 'http://ya.ru/a/', 'http://ya.ru/a/b', 'http://ya.ru/a/b?'] as url;
select URLHash(url, 4 as level) = URLHash(URLHierarchy(url)[level + 1]) array join ['', 'http://ya.ru', 'http://ya.ru/', 'http://ya.ru/a', 'http://ya.ru/a/', 'http://ya.ru/a/b', 'http://ya.ru/a/b?'] as url;
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录