提交 2ec141d5 编写于 作者: P proller 提交者: alexey-milovidov

Speedup of CacheDictionary (part1) [#CLICKHOUSE-2176] (#474)

* Auto version update to [54146]

* Fixing dictionaries [#CLICKHOUSE-2176]

* fix

* Auto version update to [54194]

* Revert "Auto version update to [54194]"

This reverts commit 3367678a046b9c9f8676922a70ac4c12ff2fd02f.

* wip

* wip

* clean

* const

* fix

* wip

* wip

* wip

* wip

* wip

* wip

* clean

* Fixing dictionaries [#CLICKHOUSE-2176]

* style

* style

* style

* struct

* clean

* clean

* clean
上级 bc9f8721
......@@ -229,13 +229,30 @@ private:
Attribute & getAttribute(const std::string & attribute_name) const;
/// Outcome of a findCellIdx lookup; the fields are filled positionally, so their order matters.
struct FindResult
{
    /// True when a cell holding the requested id was found and has not expired.
    const bool valid;
    /// True when a cell holding the requested id was found but has already expired.
    const bool outdated;
    /// Index of the matching cell, or of the replacement candidate when the id was not found.
    const std::size_t cell_idx;
};
FindResult findCellIdx(const Key & id, const CellMetadata::time_point_t now) const;
const std::string name;
const DictionaryStructure dict_struct;
const DictionarySourcePtr source_ptr;
const DictionaryLifetime dict_lifetime;
mutable Poco::RWLock rw_lock;
// Actual size will be increased to match power of 2
const std::size_t size;
// Mask with all low bits set to 1: (size - 1), e.g. 0b1000 - 1 = 0b111
const std::size_t size_overlap_mask;
// Max number of cells probed for one key; indices wrap around via the mask: if size = 16 and start_cell = 10, cells 10,11,12,13,14,15,0,1,2,3 are tried
static constexpr std::size_t max_collision_length = 10;
const UInt64 zero_cell_idx{getCellIdx(0)};
std::map<std::string, std::size_t> attribute_index_by_name;
mutable std::vector<Attribute> attributes;
......
#include <functional>
#include <DB/Columns/ColumnsNumber.h>
#include <DB/Dictionaries/CacheDictionary.h>
#include <DB/Common/BitHelpers.h>
......@@ -34,7 +35,8 @@ CacheDictionary::CacheDictionary(const std::string & name, const DictionaryStruc
const std::size_t size)
: name{name}, dict_struct(dict_struct),
source_ptr{std::move(source_ptr)}, dict_lifetime(dict_lifetime),
size{roundUpToPowerOfTwoOrZero(size)},
size{roundUpToPowerOfTwoOrZero(std::max(size, size_t(max_collision_length)))},
size_overlap_mask{this->size - 1},
cells{this->size},
rnd_engine{randomSeed()}
{
......@@ -173,6 +175,46 @@ void CacheDictionary::getString(
}
/// Probes up to max_collision_length consecutive cells (indices wrapped with size_overlap_mask),
/// starting at getCellIdx(id), and reports {valid, outdated, cell_idx}:
///   true  false  - a cell with this id was found and has not expired
///   false true   - a cell with this id was found but has expired
///   false false  - no cell with this id in the probe window; cell_idx is the replacement candidate
///   true  true   - never returned
///
/// Replacement candidate: the first already-expired cell met during probing, otherwise the cell
/// with the earliest expiration time (the starting cell if no cell expires before time_point_t::max()).
///
/// todo: split this func to two: find_for_get and find_for_set
CacheDictionary::FindResult CacheDictionary::findCellIdx(const Key & id, const CellMetadata::time_point_t now) const
{
    const auto first_probe = getCellIdx(id);
    const auto probe_end = first_probe + max_collision_length;

    auto candidate_idx = first_probe;
    auto candidate_expiry = CellMetadata::time_point_t::max();

    for (auto probe = first_probe; probe < probe_end; ++probe)
    {
        const auto cell_idx = probe & size_overlap_mask;
        const auto & cell = cells[cell_idx];

        if (cell.id == id)
        {
            /// Our key lives here: the only remaining question is whether it is still fresh.
            const bool expired = cell.expiresAt() < now;
            return {!expired, expired, cell_idx};
        }

        /// Foreign key. Keep improving the candidate only while the current one is not yet expired,
        /// so that an expired cell close to the start is preferred (shorter collision chain on insert).
        if (candidate_expiry > now && candidate_expiry > cell.expiresAt())
        {
            candidate_expiry = cell.expiresAt();
            candidate_idx = cell_idx;
        }
    }

    return {false, false, candidate_idx};
}
void CacheDictionary::has(const PaddedPODArray<Key> & ids, PaddedPODArray<UInt8> & out) const
{
/// Mapping: <id> -> { all indices `i` of `ids` such that `ids[i]` = <id> }
......@@ -189,26 +231,20 @@ void CacheDictionary::has(const PaddedPODArray<Key> & ids, PaddedPODArray<UInt8>
for (const auto row : ext::range(0, rows))
{
const auto id = ids[row];
const auto cell_idx = getCellIdx(id);
const auto & cell = cells[cell_idx];
/** cell should be updated if either:
* 1. ids do not match,
* 2. cell has expired,
* 3. explicit defaults were specified and cell was set default. */
if (cell.id != id)
const auto find_result = findCellIdx(id, now);
const auto & cell_idx = find_result.cell_idx;
if (!find_result.valid)
{
++cache_not_found;
outdated_ids[id].push_back(row);
}
else if (cell.expiresAt() < now)
{
++cache_expired;
outdated_ids[id].push_back(row);
if (find_result.outdated)
++cache_expired;
else
++cache_not_found;
}
else
{
++cache_hit;
const auto & cell = cells[cell_idx];
out[row] = !cell.isDefault();
}
}
......@@ -381,26 +417,26 @@ void CacheDictionary::getItemsNumberImpl(
for (const auto row : ext::range(0, rows))
{
const auto id = ids[row];
const auto cell_idx = getCellIdx(id);
const auto & cell = cells[cell_idx];
/** cell should be updated if either:
* 1. ids do not match,
* 2. cell has expired,
* 3. explicit defaults were specified and cell was set default. */
if (cell.id != id)
{
++cache_not_found;
outdated_ids[id].push_back(row);
}
else if (cell.expiresAt() < now)
const auto find_result = findCellIdx(id, now);
if (!find_result.valid)
{
++cache_expired;
outdated_ids[id].push_back(row);
if (find_result.outdated)
++cache_expired;
else
++cache_not_found;
}
else
{
++cache_hit;
const auto & cell_idx = find_result.cell_idx;
const auto & cell = cells[cell_idx];
out[row] = cell.isDefault() ? get_default(row) : attribute_array[cell_idx];
}
}
......@@ -457,16 +493,17 @@ void CacheDictionary::getItemsString(
for (const auto row : ext::range(0, rows))
{
const auto id = ids[row];
const auto cell_idx = getCellIdx(id);
const auto & cell = cells[cell_idx];
if (cell.id != id || cell.expiresAt() < now)
const auto find_result = findCellIdx(id, now);
if (!find_result.valid)
{
found_outdated_values = true;
break;
}
else
{
const auto & cell_idx = find_result.cell_idx;
const auto & cell = cells[cell_idx];
const auto string_ref = cell.isDefault() ? get_default(row) : attribute_array[cell_idx];
out->insertData(string_ref.data, string_ref.size);
}
......@@ -499,22 +536,21 @@ void CacheDictionary::getItemsString(
for (const auto row : ext::range(0, ids.size()))
{
const auto id = ids[row];
const auto cell_idx = getCellIdx(id);
const auto & cell = cells[cell_idx];
if (cell.id != id)
{
++cache_not_found;
outdated_ids[id].push_back(row);
}
else if (cell.expiresAt() < now)
const auto find_result = findCellIdx(id, now);
if (!find_result.valid)
{
++cache_expired;
outdated_ids[id].push_back(row);
if (find_result.outdated)
++cache_expired;
else
++cache_not_found;
}
else
{
++cache_hit;
const auto & cell_idx = find_result.cell_idx;
const auto & cell = cells[cell_idx];
const auto string_ref = cell.isDefault() ? get_default(row) : attribute_array[cell_idx];
if (!cell.isDefault())
......@@ -524,6 +560,7 @@ void CacheDictionary::getItemsString(
}
}
}
ProfileEvents::increment(ProfileEvents::DictCacheKeysExpired, cache_expired);
ProfileEvents::increment(ProfileEvents::DictCacheKeysNotFound, cache_not_found);
ProfileEvents::increment(ProfileEvents::DictCacheKeysHit, cache_hit);
......@@ -583,6 +620,8 @@ void CacheDictionary::update(
auto stream = source_ptr->loadIds(requested_ids);
stream->readPrefix();
const auto now = std::chrono::system_clock::now();
while (const auto block = stream->read())
{
const auto id_column = typeid_cast<const ColumnUInt64 *>(block.safeGetByPosition(0).column.get());
......@@ -601,7 +640,10 @@ void CacheDictionary::update(
for (const auto i : ext::range(0, ids.size()))
{
const auto id = ids[i];
const auto cell_idx = getCellIdx(id);
const auto find_result = findCellIdx(id, now);
const auto & cell_idx = find_result.cell_idx;
auto & cell = cells[cell_idx];
for (const auto attribute_idx : ext::range(0, attributes.size()))
......@@ -637,6 +679,7 @@ void CacheDictionary::update(
size_t not_found_num = 0, found_num = 0;
const auto now = std::chrono::system_clock::now();
/// Check which ids have not been found and require setting null_value
for (const auto id_found_pair : remaining_ids)
{
......@@ -648,7 +691,10 @@ void CacheDictionary::update(
++not_found_num;
const auto id = id_found_pair.first;
const auto cell_idx = getCellIdx(id);
const auto find_result = findCellIdx(id, now);
const auto & cell_idx = find_result.cell_idx;
auto & cell = cells[cell_idx];
/// Set null_value for each attribute
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册