Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
2dot5
ClickHouse
提交
6b39d248
C
ClickHouse
项目概览
2dot5
/
ClickHouse
通知
3
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
C
ClickHouse
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
前往新版Gitcode,体验更适合开发者的 AI 搜索 >>
未验证
提交
6b39d248
编写于
10月 04, 2020
作者:
A
alexey-milovidov
提交者:
GitHub
10月 04, 2020
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #15542 from Avogar/quantile_t_digest
Improve quantileTDigest performance
上级
53ce5e38
7acd85d2
变更
1
隐藏空白更改
内联
并排
Showing
1 changed file
with
115 addition
and
43 deletion
+115
-43
src/AggregateFunctions/QuantileTDigest.h
src/AggregateFunctions/QuantileTDigest.h
+115
-43
未找到文件。
src/AggregateFunctions/QuantileTDigest.h
浏览文件 @
6b39d248
...
...
@@ -36,7 +36,7 @@ namespace ErrorCodes
* uses asin, which slows down the algorithm a bit.
*/
template
<
typename
T
>
class
Quantile
TDigest
class
TDigest
{
using
Value
=
Float32
;
using
Count
=
Float32
;
...
...
@@ -86,20 +86,12 @@ class QuantileTDigest
/// The memory will be allocated to several elements at once, so that the state occupies 64 bytes.
static
constexpr
size_t
bytes_in_arena
=
128
-
sizeof
(
PODArray
<
Centroid
>
)
-
sizeof
(
Count
)
-
sizeof
(
UInt32
);
using
Summary
=
PODArrayWithStackMemory
<
Centroid
,
bytes_in_arena
>
;
using
Centroids
=
PODArrayWithStackMemory
<
Centroid
,
bytes_in_arena
>
;
Summary
summary
;
Centroids
centroids
;
Count
count
=
0
;
UInt32
unmerged
=
0
;
/** Linear interpolation at the point x on the line (x1, y1)..(x2, y2)
*/
static
Value
interpolate
(
Value
x
,
Value
x1
,
Value
y1
,
Value
x2
,
Value
y2
)
{
double
k
=
(
x
-
x1
)
/
(
x2
-
x1
);
return
y1
+
k
*
(
y2
-
y1
);
}
struct
RadixSortTraits
{
using
Element
=
Centroid
;
...
...
@@ -122,13 +114,14 @@ class QuantileTDigest
*/
void
addCentroid
(
const
Centroid
&
c
)
{
summary
.
push_back
(
c
);
centroids
.
push_back
(
c
);
count
+=
c
.
count
;
++
unmerged
;
if
(
unmerged
>=
params
.
max_unmerged
)
compress
();
}
public:
/** Performs compression of accumulated centroids
* When merging, the invariant is retained to the maximum size of each
* centroid that does not exceed `4 q (1 - q) \ delta N`.
...
...
@@ -137,16 +130,16 @@ class QuantileTDigest
{
if
(
unmerged
>
0
)
{
RadixSort
<
RadixSortTraits
>::
executeLSD
(
summary
.
data
(),
summary
.
size
());
RadixSort
<
RadixSortTraits
>::
executeLSD
(
centroids
.
data
(),
centroids
.
size
());
if
(
summary
.
size
()
>
3
)
if
(
centroids
.
size
()
>
3
)
{
/// A pair of consecutive bars of the histogram.
auto
l
=
summary
.
begin
();
auto
l
=
centroids
.
begin
();
auto
r
=
std
::
next
(
l
);
Count
sum
=
0
;
while
(
r
!=
summary
.
end
())
while
(
r
!=
centroids
.
end
())
{
// we use quantile which gives us the smallest error
...
...
@@ -188,14 +181,13 @@ class QuantileTDigest
}
/// At the end of the loop, all values to the right of l were "eaten".
summary
.
resize
(
l
-
summary
.
begin
()
+
1
);
centroids
.
resize
(
l
-
centroids
.
begin
()
+
1
);
}
unmerged
=
0
;
}
}
public:
/** Adds to the digest a change in `x` with a weight of `cnt` (default 1)
*/
void
add
(
T
x
,
UInt64
cnt
=
1
)
...
...
@@ -203,17 +195,17 @@ public:
addCentroid
(
Centroid
(
Value
(
x
),
Count
(
cnt
)));
}
void
merge
(
const
Quantile
TDigest
&
other
)
void
merge
(
const
TDigest
&
other
)
{
for
(
const
auto
&
c
:
other
.
summary
)
for
(
const
auto
&
c
:
other
.
centroids
)
addCentroid
(
c
);
}
void
serialize
(
WriteBuffer
&
buf
)
{
compress
();
writeVarUInt
(
summary
.
size
(),
buf
);
buf
.
write
(
reinterpret_cast
<
const
char
*>
(
summary
.
data
()),
summary
.
size
()
*
sizeof
(
summary
[
0
]));
writeVarUInt
(
centroids
.
size
(),
buf
);
buf
.
write
(
reinterpret_cast
<
const
char
*>
(
centroids
.
data
()),
centroids
.
size
()
*
sizeof
(
centroids
[
0
]));
}
void
deserialize
(
ReadBuffer
&
buf
)
...
...
@@ -222,36 +214,113 @@ public:
readVarUInt
(
size
,
buf
);
if
(
size
>
params
.
max_unmerged
)
throw
Exception
(
"Too large t-digest
summary
size"
,
ErrorCodes
::
TOO_LARGE_ARRAY_SIZE
);
throw
Exception
(
"Too large t-digest
centroids
size"
,
ErrorCodes
::
TOO_LARGE_ARRAY_SIZE
);
summary
.
resize
(
size
);
buf
.
read
(
reinterpret_cast
<
char
*>
(
summary
.
data
()),
size
*
sizeof
(
summary
[
0
]));
centroids
.
resize
(
size
);
buf
.
read
(
reinterpret_cast
<
char
*>
(
centroids
.
data
()),
size
*
sizeof
(
centroids
[
0
]));
count
=
0
;
for
(
const
auto
&
c
:
summary
)
for
(
const
auto
&
c
:
centroids
)
count
+=
c
.
count
;
}
Count
getCount
()
{
return
count
;
}
const
Centroids
&
getCentroids
()
const
{
return
centroids
;
}
void
reset
()
{
centroids
.
resize
(
0
);
count
=
0
;
unmerged
=
0
;
}
};
template
<
typename
T
>
class
QuantileTDigest
{
using
Value
=
Float32
;
using
Count
=
Float32
;
/** We store two t-digests. When an amount of elements in sub_tdigest become more than merge_threshold
* we merge sub_tdigest in main_tdigest and reset sub_tdigest. This method is needed to decrease an amount of
* centroids in t-digest (experiments show that after merge_threshold the size of t-digest significantly grows,
* but merging two big t-digest decreases it).
*/
TDigest
<
T
>
main_tdigest
;
TDigest
<
T
>
sub_tdigest
;
size_t
merge_threshold
=
1e7
;
/** Linear interpolation at the point x on the line (x1, y1)..(x2, y2)
*/
static
Value
interpolate
(
Value
x
,
Value
x1
,
Value
y1
,
Value
x2
,
Value
y2
)
{
double
k
=
(
x
-
x1
)
/
(
x2
-
x1
);
return
y1
+
k
*
(
y2
-
y1
);
}
void
mergeTDigests
()
{
main_tdigest
.
merge
(
sub_tdigest
);
sub_tdigest
.
reset
();
}
public:
void
add
(
T
x
,
UInt64
cnt
=
1
)
{
if
(
sub_tdigest
.
getCount
()
>=
merge_threshold
)
mergeTDigests
();
sub_tdigest
.
add
(
x
,
cnt
);
}
void
merge
(
const
QuantileTDigest
&
other
)
{
mergeTDigests
();
main_tdigest
.
merge
(
other
.
main_tdigest
);
main_tdigest
.
merge
(
other
.
sub_tdigest
);
}
void
serialize
(
WriteBuffer
&
buf
)
{
mergeTDigests
();
main_tdigest
.
serialize
(
buf
);
}
void
deserialize
(
ReadBuffer
&
buf
)
{
sub_tdigest
.
reset
();
main_tdigest
.
deserialize
(
buf
);
}
/** Calculates the quantile q [0, 1] based on the digest.
* For an empty digest returns NaN.
*/
template
<
typename
ResultType
>
ResultType
getImpl
(
Float64
level
)
{
if
(
summary
.
empty
())
mergeTDigests
();
auto
&
centroids
=
main_tdigest
.
getCentroids
();
if
(
centroids
.
empty
())
return
std
::
is_floating_point_v
<
ResultType
>
?
NAN
:
0
;
compress
();
main_tdigest
.
compress
();
if
(
summary
.
size
()
==
1
)
return
summary
.
front
().
mean
;
if
(
centroids
.
size
()
==
1
)
return
centroids
.
front
().
mean
;
Float64
x
=
level
*
count
;
Float64
x
=
level
*
main_tdigest
.
getCount
()
;
Float64
prev_x
=
0
;
Count
sum
=
0
;
Value
prev_mean
=
summary
.
front
().
mean
;
Value
prev_mean
=
centroids
.
front
().
mean
;
for
(
const
auto
&
c
:
summary
)
for
(
const
auto
&
c
:
centroids
)
{
Float64
current_x
=
sum
+
c
.
count
*
0.5
;
...
...
@@ -263,7 +332,7 @@ public:
prev_x
=
current_x
;
}
return
summary
.
back
().
mean
;
return
centroids
.
back
().
mean
;
}
/** Get multiple quantiles (`size` parts).
...
...
@@ -274,29 +343,32 @@ public:
template
<
typename
ResultType
>
void
getManyImpl
(
const
Float64
*
levels
,
const
size_t
*
levels_permutation
,
size_t
size
,
ResultType
*
result
)
{
if
(
summary
.
empty
())
mergeTDigests
();
auto
&
centroids
=
main_tdigest
.
getCentroids
();
if
(
centroids
.
empty
())
{
for
(
size_t
result_num
=
0
;
result_num
<
size
;
++
result_num
)
result
[
result_num
]
=
std
::
is_floating_point_v
<
ResultType
>
?
NAN
:
0
;
return
;
}
compress
();
main_tdigest
.
compress
();
if
(
summary
.
size
()
==
1
)
if
(
centroids
.
size
()
==
1
)
{
for
(
size_t
result_num
=
0
;
result_num
<
size
;
++
result_num
)
result
[
result_num
]
=
summary
.
front
().
mean
;
result
[
result_num
]
=
centroids
.
front
().
mean
;
return
;
}
Float64
x
=
levels
[
levels_permutation
[
0
]]
*
count
;
Float64
x
=
levels
[
levels_permutation
[
0
]]
*
main_tdigest
.
getCount
()
;
Float64
prev_x
=
0
;
Count
sum
=
0
;
Value
prev_mean
=
summary
.
front
().
mean
;
Value
prev_mean
=
centroids
.
front
().
mean
;
size_t
result_num
=
0
;
for
(
const
auto
&
c
:
summary
)
for
(
const
auto
&
c
:
centroids
)
{
Float64
current_x
=
sum
+
c
.
count
*
0.5
;
...
...
@@ -308,7 +380,7 @@ public:
if
(
result_num
>=
size
)
return
;
x
=
levels
[
levels_permutation
[
result_num
]]
*
count
;
x
=
levels
[
levels_permutation
[
result_num
]]
*
main_tdigest
.
getCount
()
;
}
sum
+=
c
.
count
;
...
...
@@ -316,7 +388,7 @@ public:
prev_x
=
current_x
;
}
auto
rest_of_results
=
summary
.
back
().
mean
;
auto
rest_of_results
=
centroids
.
back
().
mean
;
for
(;
result_num
<
size
;
++
result_num
)
result
[
levels_permutation
[
result_num
]]
=
rest_of_results
;
}
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录