Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
2dot5
ClickHouse
提交
4fd85b41
C
ClickHouse
项目概览
2dot5
/
ClickHouse
通知
3
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
C
ClickHouse
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
4fd85b41
编写于
6月 12, 2019
作者:
D
Danila Kutenin
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Clearer interfaces of Searchers
上级
bb5239f1
变更
3
隐藏空白更改
内联
并排
Showing
3 changed file
with
111 addition
and
137 deletion
+111
-137
dbms/src/Common/StringSearcher.h
dbms/src/Common/StringSearcher.h
+10
-14
dbms/src/Common/Volnitsky.h
dbms/src/Common/Volnitsky.h
+42
-119
dbms/src/Functions/FunctionsStringSearch.cpp
dbms/src/Functions/FunctionsStringSearch.cpp
+59
-4
未找到文件。
dbms/src/Common/StringSearcher.h
浏览文件 @
4fd85b41
...
...
@@ -329,8 +329,7 @@ class StringSearcher<false, true> : private StringSearcherBase
private:
/// string to be searched for
const
UInt8
*
const
needle
;
const
size_t
needle_size
;
const
UInt8
*
const
needle_end
=
needle
+
needle_size
;
const
UInt8
*
const
needle_end
;
/// lower and uppercase variants of the first character in `needle`
UInt8
l
{};
UInt8
u
{};
...
...
@@ -345,7 +344,7 @@ private:
public:
StringSearcher
(
const
char
*
const
needle_
,
const
size_t
needle_size
)
:
needle
{
reinterpret_cast
<
const
UInt8
*>
(
needle_
)},
needle_
size
{
needle_size
}
:
needle
{
reinterpret_cast
<
const
UInt8
*>
(
needle_
)},
needle_
end
{
needle
+
needle_size
}
{
if
(
0
==
needle_size
)
return
;
...
...
@@ -430,7 +429,7 @@ public:
const
UInt8
*
search
(
const
UInt8
*
haystack
,
const
UInt8
*
const
haystack_end
)
const
{
if
(
0
==
needle_size
)
if
(
needle
==
needle_end
)
return
haystack
;
while
(
haystack
<
haystack_end
)
...
...
@@ -528,8 +527,7 @@ class StringSearcher<true, ASCII> : private StringSearcherBase
private:
/// string to be searched for
const
UInt8
*
const
needle
;
const
size_t
needle_size
;
const
UInt8
*
const
needle_end
=
needle
+
needle_size
;
const
UInt8
*
const
needle_end
;
/// first character in `needle`
UInt8
first
{};
...
...
@@ -543,7 +541,7 @@ private:
public:
StringSearcher
(
const
char
*
const
needle_
,
const
size_t
needle_size
)
:
needle
{
reinterpret_cast
<
const
UInt8
*>
(
needle_
)},
needle_
size
{
needle_size
}
:
needle
{
reinterpret_cast
<
const
UInt8
*>
(
needle_
)},
needle_
end
{
needle
+
needle_size
}
{
if
(
0
==
needle_size
)
return
;
...
...
@@ -616,7 +614,7 @@ public:
const
UInt8
*
search
(
const
UInt8
*
haystack
,
const
UInt8
*
const
haystack_end
)
const
{
if
(
0
==
needle_size
)
if
(
needle
==
needle_end
)
return
haystack
;
while
(
haystack
<
haystack_end
)
...
...
@@ -715,10 +713,9 @@ using UTF8CaseInsensitiveStringSearcher = StringSearcher<false, false>;
struct
LibCASCIICaseSensitiveStringSearcher
{
const
char
*
const
needle
;
const
size_t
needle_size
;
LibCASCIICaseSensitiveStringSearcher
(
const
char
*
const
needle
,
const
size_t
needle_size
)
:
needle
(
needle
)
,
needle_size
(
needle_size
)
{}
LibCASCIICaseSensitiveStringSearcher
(
const
char
*
const
needle
,
const
size_t
/* needle_size */
)
:
needle
(
needle
)
{}
const
UInt8
*
search
(
const
UInt8
*
haystack
,
const
UInt8
*
const
haystack_end
)
const
{
...
...
@@ -737,10 +734,9 @@ struct LibCASCIICaseSensitiveStringSearcher
struct
LibCASCIICaseInsensitiveStringSearcher
{
const
char
*
const
needle
;
const
size_t
needle_size
;
LibCASCIICaseInsensitiveStringSearcher
(
const
char
*
const
needle
,
const
size_t
needle_size
)
:
needle
(
needle
)
,
needle_size
(
needle_size
)
{}
LibCASCIICaseInsensitiveStringSearcher
(
const
char
*
const
needle
,
const
size_t
/* needle_size */
)
:
needle
(
needle
)
{}
const
UInt8
*
search
(
const
UInt8
*
haystack
,
const
UInt8
*
const
haystack_end
)
const
{
...
...
dbms/src/Common/Volnitsky.h
浏览文件 @
4fd85b41
...
...
@@ -4,7 +4,6 @@
#include <vector>
#include <stdint.h>
#include <string.h>
#include <Columns/ColumnString.h>
#include <Core/Types.h>
#include <Poco/UTF8Encoding.h>
#include <Poco/Unicode.h>
...
...
@@ -345,6 +344,7 @@ public:
auto
callback
=
[
this
](
const
VolnitskyTraits
::
Ngram
ngram
,
const
int
offset
)
{
return
this
->
putNGramBase
(
ngram
,
offset
);
};
/// ssize_t is used here because an unsigned index cannot be used with a condition like `i >= 0`: an unsigned value is always >= 0.
/// Also, adding n-grams from the end guarantees that we will find the first occurrence, because we will look up bigger offsets first.
for
(
auto
i
=
static_cast
<
ssize_t
>
(
needle_size
-
sizeof
(
VolnitskyTraits
::
Ngram
));
i
>=
0
;
--
i
)
VolnitskyTraits
::
putNGram
<
CaseSensitive
,
ASCII
>
(
this
->
needle
+
i
,
i
+
1
,
this
->
needle
,
callback
);
}
...
...
@@ -436,94 +436,6 @@ public:
fallback_searchers
.
reserve
(
needles
.
size
());
}
/** Searches every needle in every haystack string and records one value per (string, needle) pair.
  *
  * `ans` is used as a flat table: the cell of string number `row` and needle number `k`
  * is `ans[row * needles.size() + k]`. All cells are zeroed first; a cell is then set to
  * `ans_callback(string_begin, match_begin)` when the needle is found in that string.
  * The whole set of strings is scanned once for every pass in which `reset()` returns false.
  */
template <typename ResultType, typename AnsCallback>
void searchAllPositions(
    const ColumnString::Chars & haystack_data,
    const ColumnString::Offsets & haystack_offsets,
    const AnsCallback & ans_callback,
    ResultType & ans)
{
    const size_t ngram_bytes = sizeof(VolnitskyTraits::Ngram);
    const size_t rows = haystack_offsets.size();
    const size_t needles_per_row = needles.size();

    /// Cells of needles that are not found are never written below, so start from all zeros.
    std::fill(ans.begin(), ans.end(), 0);

    while (!reset())
    {
        size_t fallback_count = fallback_needles.size();
        size_t row_begin = 0;

        for (size_t row = 0; row < rows; ++row)
        {
            const auto * text = &haystack_data[row_begin];
            /// The `- 1` drops the last byte of the stored string (presumably a terminating zero - TODO confirm).
            const auto * text_end = text + haystack_offsets[row] - row_begin - 1;
            const size_t row_base = row * needles_per_row;

            /// Needles listed in `fallback_needles` are searched directly with their own searcher.
            for (size_t k = 0; k < fallback_count; ++k)
            {
                const auto & needle_id = fallback_needles[k];
                const UInt8 * found = fallback_searchers[needle_id].search(text, text_end);
                if (found != text_end)
                    ans[row_base + needle_id] = ans_callback(text, found);
            }

            /// `step` equal to the maximum of size_t means the hash table part has nothing to look for.
            if (step != std::numeric_limits<size_t>::max())
            {
                for (const auto * cursor = text + step - ngram_bytes; cursor <= text_end - ngram_bytes; cursor += step)
                {
                    size_t cell_num = VolnitskyTraits::toNGram(cursor) % VolnitskyTraits::hash_size;

                    /// Walk the probe chain until an empty cell (off == 0) is met.
                    for (; hash[cell_num].off; cell_num = (cell_num + 1) % VolnitskyTraits::hash_size)
                    {
                        /// The candidate start would lie before the beginning of the string.
                        if (cursor < text + hash[cell_num].off - 1)
                            continue;

                        const auto * candidate = cursor - (hash[cell_num].off - 1);
                        const size_t needle_id = hash[cell_num].id;

                        /// A zero cell means "not filled yet"; an already filled cell is kept.
                        if (ans[row_base + needle_id] == 0
                            && candidate + needles[needle_id].size <= text_end
                            && fallback_searchers[needle_id].compare(candidate))
                        {
                            ans[row_base + needle_id] = ans_callback(text, candidate);
                        }
                    }
                }
            }

            row_begin = haystack_offsets[row];
        }
    }
}
/// Hands `searchInternal` a per-string functor that forwards to `searchOne`;
/// the results are written into `ans` by `searchInternal`.
template <typename ResultType>
void search(const ColumnString::Chars & haystack_data, const ColumnString::Offsets & haystack_offsets, ResultType & ans)
{
    searchInternal(
        haystack_data,
        haystack_offsets,
        [this](const UInt8 * begin, const UInt8 * end) -> bool { return this->searchOne(begin, end); },
        ans);
}
/// Hands `searchInternal` a per-string functor that forwards to `searchOneIndex`;
/// the results are written into `ans` by `searchInternal`.
template <typename ResultType>
void searchIndex(const ColumnString::Chars & haystack_data, const ColumnString::Offsets & haystack_offsets, ResultType & ans)
{
    searchInternal(
        haystack_data,
        haystack_offsets,
        [this](const UInt8 * begin, const UInt8 * end) -> size_t { return this->searchOneIndex(begin, end); },
        ans);
}
/// Hands `searchInternal` a per-string functor that forwards to `searchOneFirstPosition`
/// together with `count_chars_callback`; the results are written into `ans` by `searchInternal`.
template <typename ResultType, typename CountCharsCallback>
void searchFirstPosition(
    const ColumnString::Chars & haystack_data,
    const ColumnString::Offsets & haystack_offsets,
    const CountCharsCallback & count_chars_callback,
    ResultType & ans)
{
    searchInternal(
        haystack_data,
        haystack_offsets,
        [this, &count_chars_callback](const UInt8 * begin, const UInt8 * end) -> UInt64
        {
            return this->searchOneFirstPosition(begin, end, count_chars_callback);
        },
        ans);
}
private:
/**
* This function is needed to initialize hash table
* Returns `true` if there is nothing to initialize
...
...
@@ -532,15 +444,15 @@ private:
* We actually destroy the hash table and initialize it with uninitialized needles
* and search through the haystack again.
* The actual usage of this function is like this:
* while (
!reset
())
* while (
hasMoreToSearch
())
* {
* search inside the haystack with the known needles
* }
*/
bool
reset
()
bool
hasMoreToSearch
()
{
if
(
last
==
needles
.
size
())
return
tru
e
;
return
fals
e
;
memset
(
hash
,
0
,
sizeof
(
hash
));
fallback_needles
.
clear
();
...
...
@@ -585,28 +497,7 @@ private:
}
fallback_searchers
.
emplace_back
(
cur_needle_data
,
cur_needle_size
);
}
return
false
;
}
template
<
typename
OneSearcher
,
typename
ResultType
>
inline
void
searchInternal
(
const
ColumnString
::
Chars
&
haystack_data
,
const
ColumnString
::
Offsets
&
haystack_offsets
,
const
OneSearcher
&
search_fallback
,
ResultType
&
ans
)
{
const
size_t
haystack_string_size
=
haystack_offsets
.
size
();
while
(
!
reset
())
{
size_t
prev_offset
=
0
;
for
(
size_t
j
=
0
;
j
<
haystack_string_size
;
++
j
)
{
const
auto
*
haystack
=
&
haystack_data
[
prev_offset
];
const
auto
*
haystack_end
=
haystack
+
haystack_offsets
[
j
]
-
prev_offset
-
1
;
ans
[
j
]
=
search_fallback
(
haystack
,
haystack_end
);
prev_offset
=
haystack_offsets
[
j
];
}
}
return
true
;
}
inline
bool
searchOne
(
const
UInt8
*
haystack
,
const
UInt8
*
haystack_end
)
const
...
...
@@ -638,7 +529,7 @@ private:
return
false
;
}
inline
size_t
searchOneIndex
(
const
UInt8
*
haystack
,
const
UInt8
*
haystack_end
)
const
inline
size_t
searchOne
First
Index
(
const
UInt8
*
haystack
,
const
UInt8
*
haystack_end
)
const
{
const
size_t
fallback_size
=
fallback_needles
.
size
();
...
...
@@ -676,7 +567,7 @@ private:
}
template
<
typename
CountCharsCallback
>
inline
UInt64
searchOneFirstPosition
(
const
UInt8
*
haystack
,
const
UInt8
*
haystack_end
,
const
CountCharsCallback
&
c
allback
)
const
inline
UInt64
searchOneFirstPosition
(
const
UInt8
*
haystack
,
const
UInt8
*
haystack_end
,
const
CountCharsCallback
&
c
ount_chars
)
const
{
const
size_t
fallback_size
=
fallback_needles
.
size
();
...
...
@@ -684,7 +575,7 @@ private:
for
(
size_t
i
=
0
;
i
<
fallback_size
;
++
i
)
if
(
auto
pos
=
fallback_searchers
[
fallback_needles
[
i
]].
search
(
haystack
,
haystack_end
);
pos
!=
haystack_end
)
ans
=
std
::
min
(
ans
,
callback
(
haystack
,
pos
)
);
ans
=
std
::
min
<
UInt64
>
(
ans
,
pos
-
haystack
);
/// check if we have one non empty volnitsky searcher
if
(
step
!=
std
::
numeric_limits
<
size_t
>::
max
())
...
...
@@ -700,14 +591,46 @@ private:
const
auto
res
=
pos
-
(
hash
[
cell_num
].
off
-
1
);
const
size_t
ind
=
hash
[
cell_num
].
id
;
if
(
res
+
needles
[
ind
].
size
<=
haystack_end
&&
fallback_searchers
[
ind
].
compare
(
res
))
ans
=
std
::
min
(
ans
,
callback
(
haystack
,
res
)
);
ans
=
std
::
min
<
UInt64
>
(
ans
,
res
-
haystack
);
}
}
}
}
if
(
ans
==
std
::
numeric_limits
<
UInt64
>::
max
())
return
0
;
return
ans
;
return
count_chars
(
haystack
,
haystack
+
ans
);
}
template
<
typename
CountCharsCallback
,
typename
AnsType
>
inline
void
searchOneAll
(
const
UInt8
*
haystack
,
const
UInt8
*
haystack_end
,
AnsType
*
ans
,
const
CountCharsCallback
&
count_chars
)
const
{
const
size_t
fallback_size
=
fallback_needles
.
size
();
for
(
size_t
i
=
0
;
i
<
fallback_size
;
++
i
)
{
const
UInt8
*
ptr
=
fallback_searchers
[
fallback_needles
[
i
]].
search
(
haystack
,
haystack_end
);
if
(
ptr
!=
haystack_end
)
ans
[
fallback_needles
[
i
]]
=
count_chars
(
haystack
,
ptr
);
}
/// check if we have one non empty volnitsky searcher
if
(
step
!=
std
::
numeric_limits
<
size_t
>::
max
())
{
const
auto
*
pos
=
haystack
+
step
-
sizeof
(
VolnitskyTraits
::
Ngram
);
for
(;
pos
<=
haystack_end
-
sizeof
(
VolnitskyTraits
::
Ngram
);
pos
+=
step
)
{
for
(
size_t
cell_num
=
VolnitskyTraits
::
toNGram
(
pos
)
%
VolnitskyTraits
::
hash_size
;
hash
[
cell_num
].
off
;
cell_num
=
(
cell_num
+
1
)
%
VolnitskyTraits
::
hash_size
)
{
if
(
pos
>=
haystack
+
hash
[
cell_num
].
off
-
1
)
{
const
auto
*
res
=
pos
-
(
hash
[
cell_num
].
off
-
1
);
const
size_t
ind
=
hash
[
cell_num
].
id
;
if
(
ans
[
ind
]
==
0
&&
res
+
needles
[
ind
].
size
<=
haystack_end
&&
fallback_searchers
[
ind
].
compare
(
res
))
ans
[
ind
]
=
count_chars
(
haystack
,
res
);
}
}
}
}
}
void
putNGramBase
(
const
VolnitskyTraits
::
Ngram
ngram
,
const
int
offset
,
const
size_t
num
)
...
...
dbms/src/Functions/FunctionsStringSearch.cpp
浏览文件 @
4fd85b41
...
...
@@ -307,7 +307,26 @@ struct MultiSearchAllPositionsImpl
{
return
1
+
Impl
::
countChars
(
reinterpret_cast
<
const
char
*>
(
start
),
reinterpret_cast
<
const
char
*>
(
end
));
};
Impl
::
createMultiSearcherInBigHaystack
(
needles
).
searchAllPositions
(
haystack_data
,
haystack_offsets
,
res_callback
,
res
);
auto
searcher
=
Impl
::
createMultiSearcherInBigHaystack
(
needles
);
const
size_t
haystack_string_size
=
haystack_offsets
.
size
();
const
size_t
needles_size
=
needles
.
size
();
/// Zero the results up front: the search below only writes the cells of needles it actually finds.
std
::
fill
(
res
.
begin
(),
res
.
end
(),
0
);
while
(
searcher
.
hasMoreToSearch
())
{
size_t
prev_offset
=
0
;
for
(
size_t
j
=
0
,
from
=
0
;
j
<
haystack_string_size
;
++
j
,
from
+=
needles_size
)
{
const
auto
*
haystack
=
&
haystack_data
[
prev_offset
];
const
auto
*
haystack_end
=
haystack
+
haystack_offsets
[
j
]
-
prev_offset
-
1
;
searcher
.
searchOneAll
(
haystack
,
haystack_end
,
res
.
data
()
+
from
,
res_callback
);
prev_offset
=
haystack_offsets
[
j
];
}
}
}
};
...
...
@@ -323,7 +342,19 @@ struct MultiSearchImpl
const
std
::
vector
<
StringRef
>
&
needles
,
PaddedPODArray
<
UInt8
>
&
res
)
{
Impl
::
createMultiSearcherInBigHaystack
(
needles
).
search
(
haystack_data
,
haystack_offsets
,
res
);
auto
searcher
=
Impl
::
createMultiSearcherInBigHaystack
(
needles
);
const
size_t
haystack_string_size
=
haystack_offsets
.
size
();
while
(
searcher
.
hasMoreToSearch
())
{
size_t
prev_offset
=
0
;
for
(
size_t
j
=
0
;
j
<
haystack_string_size
;
++
j
)
{
const
auto
*
haystack
=
&
haystack_data
[
prev_offset
];
const
auto
*
haystack_end
=
haystack
+
haystack_offsets
[
j
]
-
prev_offset
-
1
;
res
[
j
]
=
searcher
.
searchOne
(
haystack
,
haystack_end
);
prev_offset
=
haystack_offsets
[
j
];
}
}
}
};
...
...
@@ -343,7 +374,19 @@ struct MultiSearchFirstPositionImpl
{
return
1
+
Impl
::
countChars
(
reinterpret_cast
<
const
char
*>
(
start
),
reinterpret_cast
<
const
char
*>
(
end
));
};
Impl
::
createMultiSearcherInBigHaystack
(
needles
).
searchFirstPosition
(
haystack_data
,
haystack_offsets
,
res_callback
,
res
);
auto
searcher
=
Impl
::
createMultiSearcherInBigHaystack
(
needles
);
const
size_t
haystack_string_size
=
haystack_offsets
.
size
();
while
(
searcher
.
hasMoreToSearch
())
{
size_t
prev_offset
=
0
;
for
(
size_t
j
=
0
;
j
<
haystack_string_size
;
++
j
)
{
const
auto
*
haystack
=
&
haystack_data
[
prev_offset
];
const
auto
*
haystack_end
=
haystack
+
haystack_offsets
[
j
]
-
prev_offset
-
1
;
res
[
j
]
=
searcher
.
searchOneFirstPosition
(
haystack
,
haystack_end
,
res_callback
);
prev_offset
=
haystack_offsets
[
j
];
}
}
}
};
...
...
@@ -359,7 +402,19 @@ struct MultiSearchFirstIndexImpl
const
std
::
vector
<
StringRef
>
&
needles
,
PaddedPODArray
<
UInt64
>
&
res
)
{
Impl
::
createMultiSearcherInBigHaystack
(
needles
).
searchIndex
(
haystack_data
,
haystack_offsets
,
res
);
auto
searcher
=
Impl
::
createMultiSearcherInBigHaystack
(
needles
);
const
size_t
haystack_string_size
=
haystack_offsets
.
size
();
while
(
searcher
.
hasMoreToSearch
())
{
size_t
prev_offset
=
0
;
for
(
size_t
j
=
0
;
j
<
haystack_string_size
;
++
j
)
{
const
auto
*
haystack
=
&
haystack_data
[
prev_offset
];
const
auto
*
haystack_end
=
haystack
+
haystack_offsets
[
j
]
-
prev_offset
-
1
;
res
[
j
]
=
searcher
.
searchOneFirstIndex
(
haystack
,
haystack_end
);
prev_offset
=
haystack_offsets
[
j
];
}
}
}
};
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录