提交 c77e8e98 编写于 作者: M Michael Kolupaev

clickhouse: improved integer sample size processing [#6201].

上级 60bf0939
...@@ -823,6 +823,18 @@ BlockInputStreams StorageMergeTree::read( ...@@ -823,6 +823,18 @@ BlockInputStreams StorageMergeTree::read(
PKCondition key_condition(query, context, sort_descr); PKCondition key_condition(query, context, sort_descr);
PKCondition date_condition(query, context, SortDescription(1, SortColumnDescription(date_column_name, 1))); PKCondition date_condition(query, context, SortDescription(1, SortColumnDescription(date_column_name, 1)));
typedef std::vector<DataPartPtr> PartsList;
PartsList parts;
/// Выберем куски, в которых могут быть данные, удовлетворяющие key_condition.
{
Poco::ScopedLock<Poco::FastMutex> lock(data_parts_mutex);
for (DataParts::iterator it = data_parts.begin(); it != data_parts.end(); ++it)
if (date_condition.mayBeTrueInRange(Row(1, static_cast<UInt64>((*it)->left_date)),Row(1, static_cast<UInt64>((*it)->right_date))))
parts.push_back(*it);
}
/// Семплирование. /// Семплирование.
Names column_names_to_read = column_names_to_return; Names column_names_to_read = column_names_to_return;
UInt64 sampling_column_value_limit = 0; UInt64 sampling_column_value_limit = 0;
...@@ -833,26 +845,32 @@ BlockInputStreams StorageMergeTree::read( ...@@ -833,26 +845,32 @@ BlockInputStreams StorageMergeTree::read(
ASTSelectQuery & select = *dynamic_cast<ASTSelectQuery*>(&*query); ASTSelectQuery & select = *dynamic_cast<ASTSelectQuery*>(&*query);
if (select.sample_size) if (select.sample_size)
{ {
double size = boost::apply_visitor(FieldVisitorConvertToNumber<double>(), dynamic_cast<ASTLiteral&>(*select.sample_size).value); double size = boost::apply_visitor(FieldVisitorConvertToNumber<double>(),
dynamic_cast<ASTLiteral&>(*select.sample_size).value);
if (size < 0) if (size < 0)
throw Exception("Negative sample size", ErrorCodes::ARGUMENT_OUT_OF_BOUND); throw Exception("Negative sample size", ErrorCodes::ARGUMENT_OUT_OF_BOUND);
if (size > 1) if (size > 1)
{ {
size_t requested_count = boost::apply_visitor(FieldVisitorConvertToNumber<UInt64>(), dynamic_cast<ASTLiteral&>(*select.sample_size).value); size_t requested_count = boost::apply_visitor(FieldVisitorConvertToNumber<UInt64>(), dynamic_cast<ASTLiteral&>(*select.sample_size).value);
/// Узнаем количество строк в таблице. /// Узнаем, сколько строк мы бы прочли без семплирования.
LOG_DEBUG(log, "Preliminary index scan with condition: " << key_condition.toString());
size_t total_count = 0; size_t total_count = 0;
for (size_t i = 0; i < parts.size(); ++i)
{ {
Poco::ScopedLock<Poco::FastMutex> lock(data_parts_mutex); DataPartPtr & part = parts[i];
MarkRanges ranges = MergeTreeBlockInputStream::markRangesFromPkRange(full_path + part->name + '/',
for (DataParts::iterator it = data_parts.begin(); it != data_parts.end(); ++it) part->size,
total_count += (*it)->size; *this,
key_condition);
for (size_t j = 0; j < ranges.size(); ++j)
total_count += ranges[j].end - ranges[j].begin;
} }
total_count *= index_granularity; total_count *= index_granularity;
size = std::min(1., static_cast<double>(requested_count) / total_count); size = std::min(1., static_cast<double>(requested_count) / total_count);
LOG_DEBUG(log, "Relative sample size: " << size); LOG_DEBUG(log, "Selected relative sample size: " << size);
} }
UInt64 sampling_column_max = 0; UInt64 sampling_column_max = 0;
...@@ -869,6 +887,7 @@ BlockInputStreams StorageMergeTree::read( ...@@ -869,6 +887,7 @@ BlockInputStreams StorageMergeTree::read(
else else
throw Exception("Invalid sampling column type in storage parameters: " + type->getName() + ". Must be unsigned integer type.", ErrorCodes::ILLEGAL_TYPE_OF_COLUMN_FOR_FILTER); throw Exception("Invalid sampling column type in storage parameters: " + type->getName() + ". Must be unsigned integer type.", ErrorCodes::ILLEGAL_TYPE_OF_COLUMN_FOR_FILTER);
/// Добавим условие, чтобы отсечь еще что-нибудь при повторном просмотре индекса.
sampling_column_value_limit = static_cast<UInt64>(size * sampling_column_max); sampling_column_value_limit = static_cast<UInt64>(size * sampling_column_max);
if (!key_condition.addCondition(sampling_expression->getColumnName(), if (!key_condition.addCondition(sampling_expression->getColumnName(),
Range::RightBounded(sampling_column_value_limit, true))) Range::RightBounded(sampling_column_value_limit, true)))
...@@ -893,20 +912,8 @@ BlockInputStreams StorageMergeTree::read( ...@@ -893,20 +912,8 @@ BlockInputStreams StorageMergeTree::read(
column_names_to_read.erase(std::unique(column_names_to_read.begin(), column_names_to_read.end()), column_names_to_read.end()); column_names_to_read.erase(std::unique(column_names_to_read.begin(), column_names_to_read.end()), column_names_to_read.end());
} }
LOG_DEBUG(log, "key condition: " << key_condition.toString()); LOG_DEBUG(log, "Key condition: " << key_condition.toString());
LOG_DEBUG(log, "date condition: " << date_condition.toString()); LOG_DEBUG(log, "Date condition: " << date_condition.toString());
typedef std::vector<DataPartPtr> PartsList;
PartsList parts;
/// Выберем куски, в которых могут быть данные, удовлетворяющие key_condition.
{
Poco::ScopedLock<Poco::FastMutex> lock(data_parts_mutex);
for (DataParts::iterator it = data_parts.begin(); it != data_parts.end(); ++it)
if (date_condition.mayBeTrueInRange(Row(1, static_cast<UInt64>((*it)->left_date)),Row(1, static_cast<UInt64>((*it)->right_date))))
parts.push_back(*it);
}
RangesInDataParts parts_with_ranges; RangesInDataParts parts_with_ranges;
...@@ -918,9 +925,9 @@ BlockInputStreams StorageMergeTree::read( ...@@ -918,9 +925,9 @@ BlockInputStreams StorageMergeTree::read(
DataPartPtr & part = parts[i]; DataPartPtr & part = parts[i];
RangesInDataPart ranges(part); RangesInDataPart ranges(part);
ranges.ranges = MergeTreeBlockInputStream::markRangesFromPkRange(full_path + part->name + '/', ranges.ranges = MergeTreeBlockInputStream::markRangesFromPkRange(full_path + part->name + '/',
part->size, part->size,
*this, *this,
key_condition); key_condition);
if (!ranges.ranges.empty()) if (!ranges.ranges.empty())
{ {
parts_with_ranges.push_back(ranges); parts_with_ranges.push_back(ranges);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册