Commit fd25fab6
Authored on Feb 28, 2019 by Nikolai Kochetov
Updated MergeSortingTransform.
Parent: 19fa17fd
Showing 2 changed files with 361 additions and 42 deletions.

dbms/src/Processors/Transforms/MergeSortingTransform.cpp  (+332, -26)
dbms/src/Processors/Transforms/MergeSortingTransform.h    (+29, -16)
dbms/src/Processors/Transforms/MergeSortingTransform.cpp
@@ -13,6 +13,8 @@
 #include <DataStreams/NativeBlockOutputStream.h>
 #include <queue>
+#include <Processors/ISource.h>
+#include <Processors/Transforms/MergingSortedTransform.h>

 namespace ProfileEvents
@@ -25,18 +27,92 @@ namespace ProfileEvents
 namespace DB
 {

+class SinkToNativeStream : public IAccumulatingTransform
+{
+public:
+    SinkToNativeStream(const Block & header, Logger * log_, std::string path_)
+        : IAccumulatingTransform(header, header), log(log_)
+        , path(std::move(path_)), file_buf(path), compressed_buf(file_buf)
+        , stream(std::make_shared<NativeBlockOutputStream>(compressed_buf, 0, header))
+    {
+        LOG_INFO(log, "Sorting and writing part of data into temporary file " + path);
+        ProfileEvents::increment(ProfileEvents::ExternalSortWritePart);
+        stream->writePrefix();
+    }
+
+    String getName() const override { return "SinkToNativeStream"; }
+
+    void consume(Chunk chunk) override
+    {
+        stream->write(getInputPort().getHeader().cloneWithColumns(chunk.detachColumns()));
+    }
+
+    Chunk generate() override
+    {
+        if (stream)
+        {
+            stream->writeSuffix();
+            LOG_INFO(log, "Done writing part of data into temporary file " + path);
+        }
+
+        stream.reset();
+        return Chunk();
+    }
+
+private:
+    Logger * log;
+    std::string path;
+    WriteBufferFromFile file_buf;
+    CompressedWriteBuffer compressed_buf;
+    BlockOutputStreamPtr stream;
+};
+
+class SourceFromNativeStream : public ISource
+{
+public:
+    SourceFromNativeStream(const Block & header, const std::string & path)
+        : ISource(header), file_in(path), compressed_in(file_in)
+        , block_in(std::make_shared<NativeBlockInputStream>(compressed_in, header, 0))
+    {
+        block_in->readPrefix();
+    }
+
+    String getName() const override { return "SourceFromNativeStream"; }
+
+    Chunk generate() override
+    {
+        if (!block_in)
+            return {};
+
+        auto block = block_in->read();
+        if (!block)
+        {
+            block_in.reset();
+            return {};
+        }
+
+        UInt64 num_rows = block.rows();
+        return Chunk(block.getColumns(), num_rows);
+    }
+
+private:
+    ReadBufferFromFile file_in;
+    CompressedReadBuffer compressed_in;
+    BlockInputStreamPtr block_in;
+};
+
 /** Part of implementation. Merging array of ready (already read from somewhere) chunks.
   * Returns result of merge as stream of chunks, not more than 'max_merged_block_size' rows in each.
   */
 class MergeSorter
 {
 public:
-    MergeSorter(Chunks & chunks_, SortDescription & description_, size_t max_merged_block_size_, UInt64 limit_ = 0);
+    MergeSorter(Chunks chunks_, SortDescription & description_, size_t max_merged_block_size_, UInt64 limit_);

     Chunk read();

 private:
-    Chunks & chunks;
+    Chunks chunks;
     SortDescription description;
     size_t max_merged_block_size;
     UInt64 limit;
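The doc comment above is the key to MergeSorter: a k-way merge over chunks that are already sorted, emitting no more than 'max_merged_block_size' rows per output chunk. As a minimal illustration of the same cursor-over-priority-queue technique in plain standard C++ (toy integer runs, hypothetical names, not ClickHouse's Chunk/SortCursor API):

    #include <iostream>
    #include <queue>
    #include <vector>

    /// One cursor per sorted run; the min-heap always exposes the cursor
    /// whose current element is smallest.
    struct Cursor
    {
        const std::vector<int> * run;
        size_t pos = 0;

        int value() const { return (*run)[pos]; }
        bool exhausted() const { return pos >= run->size(); }
    };

    struct CursorGreater
    {
        bool operator()(const Cursor & lhs, const Cursor & rhs) const { return lhs.value() > rhs.value(); }
    };

    /// Merge sorted runs, emitting output in batches of at most max_block_size
    /// elements, the analogue of 'max_merged_block_size' rows per Chunk.
    std::vector<std::vector<int>> mergeRuns(const std::vector<std::vector<int>> & runs, size_t max_block_size)
    {
        std::priority_queue<Cursor, std::vector<Cursor>, CursorGreater> queue;
        for (const auto & run : runs)
            if (!run.empty())
                queue.push(Cursor{&run, 0});

        std::vector<std::vector<int>> blocks;
        std::vector<int> block;
        while (!queue.empty())
        {
            Cursor current = queue.top();
            queue.pop();

            block.push_back(current.value());
            ++current.pos;
            if (!current.exhausted())
                queue.push(current);   /// reinsert the cursor at its advanced position

            if (block.size() == max_block_size)
            {
                blocks.push_back(std::move(block));
                block.clear();
            }
        }
        if (!block.empty())
            blocks.push_back(std::move(block));
        return blocks;
    }

    int main()
    {
        /// Three sorted runs merge into 1..9, cut into blocks of at most 4.
        for (const auto & block : mergeRuns({{1, 4, 7}, {2, 5, 8}, {3, 6, 9}}, 4))
        {
            for (int x : block)
                std::cout << x << ' ';
            std::cout << '\n';
        }
    }

MergeSorter itself follows the same shape: per the declaration below, its mergeImpl() drains a std::priority_queue of TSortCursor, and the heap ordering comes from the SortDescription rather than a plain operator>.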
@@ -57,8 +133,23 @@ private:
     Chunk mergeImpl(std::priority_queue<TSortCursor> & queue);
 };

-MergeSorter::MergeSorter(Chunks & chunks_, SortDescription & description_, size_t max_merged_block_size_, UInt64 limit_)
-    : chunks(chunks_), description(description_), max_merged_block_size(max_merged_block_size_), limit(limit_)
+class MergeSorterSource : public ISource
+{
+public:
+    MergeSorterSource(Block header, Chunks chunks, SortDescription & description, size_t max_merged_block_size, UInt64 limit)
+        : ISource(std::move(header)), merge_sorter(std::move(chunks), description, max_merged_block_size, limit) {}
+
+    String getName() const override { return "MergeSorterSource"; }
+
+protected:
+    Chunk generate() override { return merge_sorter.read(); }
+
+private:
+    MergeSorter merge_sorter;
+};
+
+MergeSorter::MergeSorter(Chunks chunks_, SortDescription & description_, size_t max_merged_block_size_, UInt64 limit_)
+    : chunks(std::move(chunks_)), description(description_), max_merged_block_size(max_merged_block_size_), limit(limit_)
 {
     Chunks nonempty_chunks;
     for (auto & chunk : chunks)
@@ -155,12 +246,12 @@ MergeSortingTransform::MergeSortingTransform(
     size_t max_merged_block_size_, UInt64 limit_,
     size_t max_bytes_before_remerge_,
     size_t max_bytes_before_external_sort_, const std::string & tmp_path_)
-    : IAccumulatingTransform(header, header)
+    : IProcessor({header}, {header})
     , description(description_), max_merged_block_size(max_merged_block_size_), limit(limit_)
     , max_bytes_before_remerge(max_bytes_before_remerge_)
     , max_bytes_before_external_sort(max_bytes_before_external_sort_), tmp_path(tmp_path_)
 {
-    auto & sample = getInputPort().getHeader();
+    auto & sample = inputs.front().getHeader();

     /// Replace column names to column position in sort_description.
     for (auto & column_description : description)
@@ -202,6 +293,191 @@ MergeSortingTransform::MergeSortingTransform(
     description.swap(description_without_constants);
 }

+IProcessor::Status MergeSortingTransform::prepare()
+{
+    if (stage == Stage::Serialize)
+    {
+        if (current_processor)
+            return Status::ExpandPipeline;
+
+        auto status = prepareSerialize();
+        if (status != Status::Finished)
+            return status;
+
+        stage = Stage::Consume;
+    }
+
+    if (stage == Stage::Consume)
+    {
+        auto status = prepareConsume();
+        if (status != Status::Finished)
+            return status;
+
+        stage = Stage::Generate;
+    }
+
+    /// stage == Stage::Generate
+
+    if (!generated_prefix)
+        return Status::Ready;
+
+    if (!processors.empty())
+        return Status::ExpandPipeline;
+
+    return prepareGenerate();
+}
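prepare() never blocks and never does heavy work: it only inspects the stage and the state of the ports, moves at most one chunk across a port, and tells the executor what it needs next (Ready to run work(), NeedData / PortFull to wait, ExpandPipeline to grow the graph). A rough sketch of this contract with a toy processor and a minimal single-threaded driver (simplified Status set, hypothetical names, not ClickHouse's actual IProcessor or PipelineExecutor):

    #include <algorithm>
    #include <deque>
    #include <iostream>
    #include <vector>

    /// Simplified mirror of IProcessor::Status, for illustration only.
    enum class Status { NeedData, Ready, Finished };

    /// A toy accumulate-then-emit processor in the spirit of MergeSortingTransform's
    /// Consume/Generate stages: prepare() only inspects "port" state and moves single
    /// values; all real work (accumulating, sorting, emitting) happens in work().
    struct ToySortingProcessor
    {
        enum class Stage { Consume, Generate };
        Stage stage = Stage::Consume;

        std::deque<int> * input = nullptr;  /// upstream "port"
        bool input_finished = false;        /// upstream will send nothing more
        std::vector<int> accumulated;       /// everything consumed so far
        int current = 0;                    /// value pulled by prepare(), handled by work()

        Status prepare()
        {
            if (stage == Stage::Consume)
            {
                if (!input->empty())
                {
                    current = input->front();  /// cheap port transfer, no heavy work
                    input->pop_front();
                    return Status::Ready;
                }
                if (!input_finished)
                    return Status::NeedData;   /// executor should wait for upstream
                stage = Stage::Generate;
            }
            /// stage == Stage::Generate
            return accumulated.empty() ? Status::Finished : Status::Ready;
        }

        void work()
        {
            if (stage == Stage::Consume)
            {
                accumulated.push_back(current);
            }
            else
            {
                std::sort(accumulated.begin(), accumulated.end());
                for (int x : accumulated)
                    std::cout << x << ' ';
                std::cout << '\n';
                accumulated.clear();  /// next prepare() reports Finished
            }
        }
    };

    int main()
    {
        std::deque<int> upstream{3, 1, 2};

        ToySortingProcessor processor;
        processor.input = &upstream;
        processor.input_finished = true;  /// all data is already available

        /// Minimal single-threaded executor loop.
        while (true)
        {
            Status status = processor.prepare();
            if (status == Status::Finished)
                break;
            if (status == Status::Ready)
                processor.work();
            /// On NeedData a real executor would suspend until upstream pushes.
        }
    }

The same split shows up in the real code below: prepareConsume(), prepareSerialize() and prepareGenerate() do only port bookkeeping, while work() dispatches to consume(), serialize() or generate() depending on the stage.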
+IProcessor::Status MergeSortingTransform::prepareConsume()
+{
+    auto & input = inputs.front();
+    auto & output = outputs.front();
+
+    /// Check can output.
+
+    if (output.isFinished())
+    {
+        input.close();
+        return Status::Finished;
+    }
+
+    if (!output.canPush())
+    {
+        input.setNotNeeded();
+        return Status::PortFull;
+    }
+
+    if (generated_chunk)
+        output.push(std::move(generated_chunk));
+
+    /// Check can input.
+
+    if (!current_chunk)
+    {
+        if (input.isFinished())
+            return Status::Finished;
+
+        input.setNeeded();
+
+        if (!input.hasData())
+            return Status::NeedData;
+
+        current_chunk = input.pull();
+    }
+
+    /// Now consume.
+    return Status::Ready;
+}
+IProcessor::Status MergeSortingTransform::prepareSerialize()
+{
+    auto & input = inputs.back();
+    auto & output = outputs.back();
+
+    if (output.isFinished())
+    {
+        input.close();
+        return Status::Finished;
+    }
+
+    if (!output.canPush())
+    {
+        input.setNotNeeded();
+        return Status::PortFull;
+    }
+
+    if (current_chunk)
+        output.push(std::move(current_chunk));
+
+    if (merge_sorter)
+        return Status::Ready;
+
+    output.finish();
+
+    if (input.isFinished())
+        return Status::Finished;
+
+    input.setNeeded();
+
+    if (!input.hasData())
+        return Status::NeedData;
+
+    input.pull();
+    input.close();
+    return Status::Finished;
+}
+IProcessor::Status MergeSortingTransform::prepareGenerate()
+{
+    auto & output = outputs.front();
+
+    if (output.isFinished())
+        return Status::Finished;
+
+    if (!output.canPush())
+        return Status::PortFull;
+
+    if (merge_sorter)
+    {
+        if (!current_chunk)
+            return Status::Ready;
+
+        output.push(std::move(current_chunk));
+        return Status::PortFull;
+    }
+    else
+    {
+        auto & input = inputs.back();
+
+        if (input.isFinished())
+        {
+            output.finish();
+            return Status::Finished;
+        }
+
+        input.setNeeded();
+
+        if (!input.hasData())
+            return Status::NeedData;
+
+        output.push(input.pull());
+        return Status::PortFull;
+    }
+}
+void MergeSortingTransform::work()
+{
+    if (stage == Stage::Consume)
+        consume(std::move(current_chunk));
+
+    if (stage == Stage::Serialize)
+        serialize();
+
+    if (stage == Stage::Generate)
+        generate();
+}
+Processors MergeSortingTransform::expandPipeline()
+{
+    if (!processors.empty())
+    {
+        /// Before generate.
+        inputs.emplace_back(header_without_constants, this);
+        connect(current_processor->getOutputs().front(), getInputs().back());
+        current_processor.reset();
+        inputs.back().setNeeded();
+        return std::move(processors);
+    }
+    else
+    {
+        /// Before serialize.
+        inputs.emplace_back(header_without_constants, this);
+        outputs.emplace_back(header_without_constants, this);
+        connect(current_processor->getOutputs().front(), getInputs().back());
+        connect(getOutputs().back(), current_processor->getInputs().front());
+        inputs.back().setNeeded();
+        return {std::move(current_processor)};
+    }
+}
 void MergeSortingTransform::consume(Chunk chunk)
 {
     /** Algorithm:
@@ -215,7 +491,7 @@ void MergeSortingTransform::consume(Chunk chunk)
     /// Return the chunk as is.
     if (description.empty())
     {
-        setReadyChunk(std::move(chunk));
+        generated_chunk = std::move(chunk);
        return;
     }
@@ -246,36 +522,66 @@ void MergeSortingTransform::consume(Chunk chunk)
         Poco::File(tmp_path).createDirectories();
         temporary_files.emplace_back(std::make_unique<Poco::TemporaryFile>(tmp_path));
         const std::string & path = temporary_files.back()->path();
-        WriteBufferFromFile file_buf(path);
-        CompressedWriteBuffer compressed_buf(file_buf);
-        NativeBlockOutputStream block_out(compressed_buf, 0, header_without_constants);
-        MergeSorter merge_sorter(chunks, description, max_merged_block_size, limit);
-
-        LOG_INFO(log, "Sorting and writing part of data into temporary file " + path);
-        ProfileEvents::increment(ProfileEvents::ExternalSortWritePart);
-
-        /// NOTE. Possibly limit disk usage.
-        /// NOTE. This should be another one processor.
-
-        block_out.writePrefix();
-        while (auto next = merge_sorter.read())
-        {
-            auto block = header_without_constants.cloneWithColumns(next.detachColumns());
-            block_out.write(block);
-        }
-        block_out.writeSuffix();
-
-        LOG_INFO(log, "Done writing part of data into temporary file " + path);
-
-        chunks.clear();
+        merge_sorter = std::make_unique<MergeSorter>(std::move(chunks), description, max_merged_block_size, limit);
+
+        /// TODO: Rewrite this code when processors could be able to create another processors.
+        current_processor = std::make_shared<SinkToNativeStream>(header_without_constants, log, path);
+
+        stage = Stage::Serialize;
         sum_bytes_in_blocks = 0;
         sum_rows_in_blocks = 0;
     }
 }

+void MergeSortingTransform::serialize()
+{
+    current_chunk = merge_sorter->read();
+    if (!current_chunk)
+        merge_sorter.reset();
+}
+
+void MergeSortingTransform::generate()
+{
+    if (!generated_prefix)
+    {
+        if (temporary_files.empty())
+            merge_sorter = std::make_unique<MergeSorter>(std::move(chunks), description, max_merged_block_size, limit);
+        else
+        {
+            ProfileEvents::increment(ProfileEvents::ExternalSortMerge);
+            LOG_INFO(log, "There are " << temporary_files.size() << " temporary sorted parts to merge.");
+
+            /// Create sorted streams to merge.
+            for (const auto & file : temporary_files)
+                processors.emplace_back(std::make_unique<SourceFromNativeStream>(header_without_constants, file->path()));
+
+            if (!chunks.empty())
+                processors.emplace_back(std::make_shared<MergeSorterSource>(
+                        header_without_constants, std::move(chunks), description, max_merged_block_size, limit));
+
+            current_processor = std::make_shared<MergingSortedTransform>(
+                    header_without_constants, processors.size(), description, max_merged_block_size, limit);
+
+            auto next_input = current_processor->getInputs().begin();
+            for (auto & processor : processors)
+                connect(processor->getOutputs().front(), *(next_input++));
+
+            processors.push_back(current_processor);
+        }
+
+        generated_prefix = true;
+    }
+
+    if (merge_sorter)
+    {
+        generated_chunk = merge_sorter->read();
+        if (!generated_chunk)
+            merge_sorter.reset();
+    }
+}
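serialize() and generate() together complete a classic external merge sort: while consuming, sorted runs are spilled to temporary files through SinkToNativeStream; when generating, one SourceFromNativeStream per spilled run (plus a MergeSorterSource for the chunks still in memory) is wired into a single MergingSortedTransform. A self-contained sketch of the same two-phase idea using plain text files and integers (standard C++ only, hypothetical file naming, not the ClickHouse implementation):

    #include <algorithm>
    #include <cstdio>
    #include <fstream>
    #include <iostream>
    #include <queue>
    #include <string>
    #include <utility>
    #include <vector>

    /// Spill one sorted run of integers to a file, one value per line.
    std::string spillRun(std::vector<int> run, int index)
    {
        std::sort(run.begin(), run.end());
        std::string path = "run_" + std::to_string(index) + ".tmp";
        std::ofstream out(path);
        for (int x : run)
            out << x << '\n';
        return path;
    }

    int main()
    {
        /// "Consume" phase: accumulate until a memory budget is hit, then spill.
        std::vector<std::vector<int>> batches{{5, 1, 9}, {4, 8, 2}, {7, 3, 6}};
        std::vector<std::string> run_paths;
        for (size_t i = 0; i < batches.size(); ++i)
            run_paths.push_back(spillRun(batches[i], static_cast<int>(i)));

        /// "Generate" phase: open one reader per run and k-way merge them.
        std::vector<std::ifstream> readers;
        for (const auto & path : run_paths)
            readers.emplace_back(path);

        using HeapItem = std::pair<int, size_t>;  /// (value, reader index)
        std::priority_queue<HeapItem, std::vector<HeapItem>, std::greater<>> heap;
        for (size_t i = 0; i < readers.size(); ++i)
        {
            int value;
            if (readers[i] >> value)
                heap.emplace(value, i);
        }

        while (!heap.empty())
        {
            auto [value, i] = heap.top();
            heap.pop();
            std::cout << value << ' ';

            int next;
            if (readers[i] >> next)   /// refill from the run the value came from
                heap.emplace(next, i);
        }
        std::cout << '\n';

        for (const auto & path : run_paths)
            std::remove(path.c_str());
    }

Each run is read back strictly sequentially, so memory stays bounded by one buffered value per run plus the heap; that bounded footprint is the point of spilling in the first place.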
 void MergeSortingTransform::remerge()
 {
     LOG_DEBUG(log, "Re-merging intermediate ORDER BY data (" << chunks.size()
         << " blocks with " << sum_rows_in_blocks << " rows) to save memory consumption");

     /// NOTE Maybe concat all blocks and partial sort will be faster than merge?
     MergeSorter merge_sorter(chunks, description, max_merged_block_size, limit);
dbms/src/Processors/Transforms/MergeSortingTransform.h
@@ -14,7 +14,9 @@
 namespace DB
 {

-class MergeSortingTransform : public IAccumulatingTransform
+class MergeSorter;
+
+class MergeSortingTransform : public IProcessor
 {
 public:
     /// limit - if not 0, allowed to return just first 'limit' rows in sorted order.
@@ -27,8 +29,10 @@ public:
     String getName() const override { return "MergeSortingTransform"; }

 protected:
-    void consume(Chunk chunk) override;
-    Chunk generate() override;
+    Status prepare() override;
+    void work() override;
+
+    Processors expandPipeline() override;

 private:
     SortDescription description;
/// Everything below is for external sorting.
std
::
vector
<
std
::
unique_ptr
<
Poco
::
TemporaryFile
>>
temporary_files
;
/// For reading data from temporary file.
struct
TemporaryFileStream
{
ReadBufferFromFile
file_in
;
CompressedReadBuffer
compressed_in
;
BlockInputStreamPtr
block_in
;
/// Merge all accumulated blocks to keep no more than limit rows.
void
remerge
();
TemporaryFileStream
(
const
std
::
string
&
path
,
const
Block
&
header
)
:
file_in
(
path
),
compressed_in
(
file_in
),
block_in
(
std
::
make_shared
<
NativeBlockInputStream
>
(
compressed_in
,
header
,
0
))
{}
void
removeConstColumns
(
Chunk
&
chunk
);
enum
class
Stage
{
Consume
=
0
,
Generate
,
Serialize
,
};
std
::
vector
<
std
::
unique_ptr
<
TemporaryFileStream
>>
temporary_inputs
;
Stage
stage
=
Stage
::
Consume
;
bool
generated_prefix
=
false
;
Chunk
current_chunk
;
Chunk
generated_chunk
;
BlockInputStreams
inputs_to_merge
;
std
::
unique_ptr
<
MergeSorter
>
merge_sorter
;
ProcessorPtr
current_processor
;
Processors
processors
;
/// Merge all accumulated blocks to keep no more than limit rows.
void
remerge
();
Status
prepareConsume
();
Status
prepareSerialize
();
Status
prepareGenerate
();
void
removeConstColumns
(
Chunk
&
chunk
);
void
consume
(
Chunk
chunk
);
void
serialize
();
void
generate
();
};
}