提交 7df722d7 编写于 作者: P proller 提交者: alexey-milovidov

Speedup of CacheDictionary (part2: complex) [#CLICKHOUSE-2176] (#502)

* Auto version update to [54146]

* Fixing dictionaries [#CLICKHOUSE-2176]

* fix

* Auto version update to [54194]

* Revert "Auto version update to [54194]"

This reverts commit 3367678a046b9c9f8676922a70ac4c12ff2fd02f.

* wip

* wip

* clean

* const

* fix

* wip

* wip

* wip

* wip

* wip

* wip

* clean

* Fixing dictionaries [#CLICKHOUSE-2176]

* style

* style

* style

* wip

* wip

* struct

* clean

* clean

* clean

* wip

* wip broken

* wip

* wip

* wip

* wip

* wip

* wip

* clean

* clean

* space

* tester

* wip

* fix

* clean

* style
上级 1fa8c17f
......@@ -229,10 +229,11 @@ private:
Attribute & getAttribute(const std::string & attribute_name) const;
struct FindResult {
struct FindResult
{
const size_t cell_idx;
const bool valid;
const bool outdated;
const size_t cell_idx;
};
FindResult findCellIdx(const Key & id, const CellMetadata::time_point_t now) const;
......@@ -244,13 +245,13 @@ private:
mutable Poco::RWLock rw_lock;
// Actual size will be increased to match power of 2
/// Actual size will be increased to match power of 2
const std::size_t size;
// all bits to 1 mask (size - 1) (0b1000 - 1 = 0b111)
/// all bits to 1 mask (size - 1) (0b1000 - 1 = 0b111)
const std::size_t size_overlap_mask;
// Max tries to find cell, overlaped with mask: if size = 16 and start_cell=10: will try cells: 10,11,12,13,14,15,0,1,2,3
/// Max tries to find cell, overlaped with mask: if size = 16 and start_cell=10: will try cells: 10,11,12,13,14,15,0,1,2,3
static constexpr std::size_t max_collision_length = 10;
const UInt64 zero_cell_idx{getCellIdx(0)};
......
......@@ -257,6 +257,20 @@ private:
static StringRef copyIntoArena(StringRef src, Arena & arena);
StringRef copyKey(const StringRef key) const;
struct FindResult
{
const size_t cell_idx;
const bool valid;
const bool outdated;
};
FindResult findCellIdx(const StringRef & key, const CellMetadata::time_point_t now, const size_t hash) const;
FindResult findCellIdx(const StringRef & key, const CellMetadata::time_point_t now) const
{
const auto hash = StringRefHash{}(key);
return findCellIdx(key, now, hash);
};
const std::string name;
const DictionaryStructure dict_struct;
const DictionarySourcePtr source_ptr;
......@@ -264,7 +278,16 @@ private:
const std::string key_description{dict_struct.getKeyDescription()};
mutable Poco::RWLock rw_lock;
/// Actual size will be increased to match power of 2
const std::size_t size;
/// all bits to 1 mask (size - 1) (0b1000 - 1 = 0b111)
const std::size_t size_overlap_mask;
/// Max tries to find cell, overlaped with mask: if size = 16 and start_cell=10: will try cells: 10,11,12,13,14,15,0,1,2,3
static constexpr std::size_t max_collision_length = 10;
const UInt64 zero_cell_idx{getCellIdx(StringRef{})};
std::map<std::string, std::size_t> attribute_index_by_name;
mutable std::vector<Attribute> attributes;
......
......@@ -25,7 +25,7 @@ namespace ErrorCodes
inline UInt64 CacheDictionary::getCellIdx(const Key id) const
{
const auto hash = intHash64(id);
const auto idx = hash & (size - 1);
const auto idx = hash & size_overlap_mask;
return idx;
}
......@@ -175,7 +175,7 @@ void CacheDictionary::getString(
}
/// returns 'cell is valid' flag, 'cell is outdated' flag, cell_idx
/// returns cell_idx (always valid for replacing), 'cell is valid' flag, 'cell is outdated' flag
/// true false found and valid
/// false true not found (something outdated, maybe our cell)
/// false false not found (other id stored with valid data)
......@@ -206,13 +206,13 @@ CacheDictionary::FindResult CacheDictionary::findCellIdx(const Key & id, const C
if (cell.expiresAt() < now)
{
return {false, true, cell_idx};
return {cell_idx, false, true};
}
return {true, false, cell_idx};
return {cell_idx, true, false};
}
return {false, false, oldest_id};
return {oldest_id, false, false};
}
void CacheDictionary::has(const PaddedPODArray<Key> & ids, PaddedPODArray<UInt8> & out) const
......
......@@ -5,7 +5,6 @@
#include <DB/Common/ProfilingScopedRWLock.h>
#include <ext/range.hpp>
namespace DB
{
......@@ -20,7 +19,7 @@ namespace ErrorCodes
inline UInt64 ComplexKeyCacheDictionary::getCellIdx(const StringRef key) const
{
const auto hash = StringRefHash{}(key);
const auto idx = hash & (size - 1);
const auto idx = hash & size_overlap_mask;
return idx;
}
......@@ -29,7 +28,9 @@ ComplexKeyCacheDictionary::ComplexKeyCacheDictionary(const std::string & name, c
DictionarySourcePtr source_ptr, const DictionaryLifetime dict_lifetime,
const size_t size)
: name{name}, dict_struct(dict_struct), source_ptr{std::move(source_ptr)}, dict_lifetime(dict_lifetime),
size{roundUpToPowerOfTwoOrZero(size)}, rnd_engine{randomSeed()}
size{roundUpToPowerOfTwoOrZero(std::max(size, size_t(max_collision_length)))},
size_overlap_mask{this->size - 1},
rnd_engine{randomSeed()}
{
if (!this->source_ptr->supportsSelectiveLoad())
throw Exception{
......@@ -174,6 +175,52 @@ void ComplexKeyCacheDictionary::getString(
getItemsString(attribute, key_columns, out, [&] (const size_t) { return StringRef{def}; });
}
/// returns cell_idx (always valid for replacing), 'cell is valid' flag, 'cell is outdated' flag,
/// true false found and valid
/// false true not found (something outdated, maybe our cell)
/// false false not found (other id stored with valid data)
/// true true impossible
///
/// todo: split this func to two: find_for_get and find_for_set
ComplexKeyCacheDictionary::FindResult ComplexKeyCacheDictionary::findCellIdx(const StringRef & key, const CellMetadata::time_point_t now, const size_t hash) const
{
auto pos = hash;
auto oldest_id = pos;
auto oldest_time = CellMetadata::time_point_t::max();
const auto stop = pos + max_collision_length;
for (; pos < stop; ++pos)
{
const auto cell_idx = pos & size_overlap_mask;
const auto & cell = cells[cell_idx];
if (cell.hash != hash || cell.key != key)
{
/// maybe we already found nearest expired cell
if (oldest_time > now && oldest_time > cell.expiresAt())
{
oldest_time = cell.expiresAt();
oldest_id = cell_idx;
}
continue;
}
if (cell.expiresAt() < now)
{
return {cell_idx, false, true};
}
return {cell_idx, true, false};
}
oldest_id &= size_overlap_mask;
return {oldest_id, false, false};
}
void ComplexKeyCacheDictionary::has(const ConstColumnPlainPtrs & key_columns, const DataTypes & key_types, PaddedPODArray<UInt8> & out) const
{
dict_struct.validateKeyTypes(key_types);
......@@ -181,11 +228,12 @@ void ComplexKeyCacheDictionary::has(const ConstColumnPlainPtrs & key_columns, co
/// Mapping: <key> -> { all indices `i` of `key_columns` such that `key_columns[i]` = <key> }
MapType<std::vector<size_t>> outdated_keys;
const auto rows = key_columns.front()->size();
const auto rows_num = key_columns.front()->size();
const auto keys_size = dict_struct.key.value().size();
StringRefs keys(keys_size);
Arena temporary_keys_pool;
PODArray<StringRef> keys_array(rows);
PODArray<StringRef> keys_array(rows_num);
size_t cache_expired = 0, cache_not_found = 0, cache_hit = 0;
{
......@@ -193,31 +241,28 @@ void ComplexKeyCacheDictionary::has(const ConstColumnPlainPtrs & key_columns, co
const auto now = std::chrono::system_clock::now();
/// fetch up-to-date values, decide which ones require update
for (const auto row : ext::range(0, rows))
for (const auto row : ext::range(0, rows_num))
{
const StringRef key = placeKeysInPool(row, key_columns, keys, temporary_keys_pool);
keys_array[row] = key;
const auto hash = StringRefHash{}(key);
const size_t cell_idx = hash & (size - 1);
const auto & cell = cells[cell_idx];
const auto find_result = findCellIdx(key, now);
const auto & cell_idx = find_result.cell_idx;
/** cell should be updated if either:
* 1. keys (or hash) do not match,
* 2. cell has expired,
* 3. explicit defaults were specified and cell was set default. */
if (cell.hash != hash || cell.key != key)
{
++cache_not_found;
outdated_keys[key].push_back(row);
}
else if (cell.expiresAt() < now)
if (!find_result.valid)
{
++cache_expired;
outdated_keys[key].push_back(row);
if (find_result.outdated)
++cache_expired;
else
++cache_not_found;
}
else
{
++cache_hit;
const auto & cell = cells[cell_idx];
out[row] = !cell.isDefault();
}
}
......@@ -226,8 +271,8 @@ void ComplexKeyCacheDictionary::has(const ConstColumnPlainPtrs & key_columns, co
ProfileEvents::increment(ProfileEvents::DictCacheKeysNotFound, cache_not_found);
ProfileEvents::increment(ProfileEvents::DictCacheKeysHit, cache_hit);
query_count.fetch_add(rows, std::memory_order_relaxed);
hit_count.fetch_add(rows - outdated_keys.size(), std::memory_order_release);
query_count.fetch_add(rows_num, std::memory_order_relaxed);
hit_count.fetch_add(rows_num - outdated_keys.size(), std::memory_order_release);
if (outdated_keys.empty())
return;
......@@ -376,11 +421,11 @@ void ComplexKeyCacheDictionary::getItemsNumberImpl(
MapType<std::vector<size_t>> outdated_keys;
auto & attribute_array = std::get<ContainerPtrType<AttributeType>>(attribute.arrays);
const auto rows = key_columns.front()->size();
const auto rows_num = key_columns.front()->size();
const auto keys_size = dict_struct.key.value().size();
StringRefs keys(keys_size);
Arena temporary_keys_pool;
PODArray<StringRef> keys_array(rows);
PODArray<StringRef> keys_array(rows_num);
size_t cache_expired = 0, cache_not_found = 0, cache_hit = 0;
{
......@@ -388,31 +433,30 @@ void ComplexKeyCacheDictionary::getItemsNumberImpl(
const auto now = std::chrono::system_clock::now();
/// fetch up-to-date values, decide which ones require update
for (const auto row : ext::range(0, rows))
for (const auto row : ext::range(0, rows_num))
{
const StringRef key = placeKeysInPool(row, key_columns, keys, temporary_keys_pool);
keys_array[row] = key;
const auto hash = StringRefHash{}(key);
const size_t cell_idx = hash & (size - 1);
const auto & cell = cells[cell_idx];
const auto find_result = findCellIdx(key, now);
/** cell should be updated if either:
* 1. keys (or hash) do not match,
* 2. cell has expired,
* 3. explicit defaults were specified and cell was set default. */
if (cell.hash != hash || cell.key != key)
{
++cache_not_found;
outdated_keys[key].push_back(row);
}
else if (cell.expiresAt() < now)
if (!find_result.valid)
{
++cache_expired;
outdated_keys[key].push_back(row);
if (find_result.outdated)
++cache_expired;
else
++cache_not_found;
}
else
{
++cache_hit;
const auto & cell_idx = find_result.cell_idx;
const auto & cell = cells[cell_idx];
out[row] = cell.isDefault() ? get_default(row) : attribute_array[cell_idx];
}
}
......@@ -420,9 +464,8 @@ void ComplexKeyCacheDictionary::getItemsNumberImpl(
ProfileEvents::increment(ProfileEvents::DictCacheKeysExpired, cache_expired);
ProfileEvents::increment(ProfileEvents::DictCacheKeysNotFound, cache_not_found);
ProfileEvents::increment(ProfileEvents::DictCacheKeysHit, cache_hit);
query_count.fetch_add(rows, std::memory_order_relaxed);
hit_count.fetch_add(rows - outdated_keys.size(), std::memory_order_release);
query_count.fetch_add(rows_num, std::memory_order_relaxed);
hit_count.fetch_add(rows_num - outdated_keys.size(), std::memory_order_release);
if (outdated_keys.empty())
return;
......@@ -451,9 +494,9 @@ void ComplexKeyCacheDictionary::getItemsString(
Attribute & attribute, const ConstColumnPlainPtrs & key_columns, ColumnString * out,
DefaultGetter && get_default) const
{
const auto rows = key_columns.front()->size();
const auto rows_num = key_columns.front()->size();
/// save on some allocations
out->getOffsets().reserve(rows);
out->getOffsets().reserve(rows_num);
const auto keys_size = dict_struct.key.value().size();
StringRefs keys(keys_size);
......@@ -469,21 +512,21 @@ void ComplexKeyCacheDictionary::getItemsString(
const auto now = std::chrono::system_clock::now();
/// fetch up-to-date values, discard on fail
for (const auto row : ext::range(0, rows))
for (const auto row : ext::range(0, rows_num))
{
const StringRef key = placeKeysInPool(row, key_columns, keys, temporary_keys_pool);
SCOPE_EXIT(temporary_keys_pool.rollback(key.size));
const auto hash = StringRefHash{}(key);
const size_t cell_idx = hash & (size - 1);
const auto & cell = cells[cell_idx];
const auto find_result = findCellIdx(key, now);
if (cell.hash != hash || cell.key != key || cell.expiresAt() < now)
if (!find_result.valid)
{
found_outdated_values = true;
break;
}
else
{
const auto & cell_idx = find_result.cell_idx;
const auto & cell = cells[cell_idx];
const auto string_ref = cell.isDefault() ? get_default(row) : attribute_array[cell_idx];
out->insertData(string_ref.data, string_ref.size);
}
......@@ -493,8 +536,8 @@ void ComplexKeyCacheDictionary::getItemsString(
/// optimistic code completed successfully
if (!found_outdated_values)
{
query_count.fetch_add(rows, std::memory_order_relaxed);
hit_count.fetch_add(rows, std::memory_order_release);
query_count.fetch_add(rows_num, std::memory_order_relaxed);
hit_count.fetch_add(rows_num, std::memory_order_release);
return;
}
......@@ -506,7 +549,7 @@ void ComplexKeyCacheDictionary::getItemsString(
MapType<std::vector<size_t>> outdated_keys;
/// we are going to store every string separately
MapType<StringRef> map;
PODArray<StringRef> keys_array(rows);
PODArray<StringRef> keys_array(rows_num);
size_t total_length = 0;
size_t cache_expired = 0, cache_not_found = 0, cache_hit = 0;
......@@ -514,27 +557,25 @@ void ComplexKeyCacheDictionary::getItemsString(
const ProfilingScopedReadRWLock read_lock{rw_lock, ProfileEvents::DictCacheLockReadNs};
const auto now = std::chrono::system_clock::now();
for (const auto row : ext::range(0, rows))
for (const auto row : ext::range(0, rows_num))
{
const StringRef key = placeKeysInPool(row, key_columns, keys, temporary_keys_pool);
keys_array[row] = key;
const auto hash = StringRefHash{}(key);
const size_t cell_idx = hash & (size - 1);
const auto & cell = cells[cell_idx];
const auto find_result = findCellIdx(key, now);
if (cell.hash != hash || cell.key != key)
{
++cache_not_found;
outdated_keys[key].push_back(row);
}
else if (cell.expiresAt() < now)
if (!find_result.valid)
{
++cache_expired;
outdated_keys[key].push_back(row);
if (find_result.outdated)
++cache_expired;
else
++cache_not_found;
}
else
{
++cache_hit;
const auto & cell_idx = find_result.cell_idx;
const auto & cell = cells[cell_idx];
const auto string_ref = cell.isDefault() ? get_default(row) : attribute_array[cell_idx];
if (!cell.isDefault())
......@@ -548,8 +589,8 @@ void ComplexKeyCacheDictionary::getItemsString(
ProfileEvents::increment(ProfileEvents::DictCacheKeysNotFound, cache_not_found);
ProfileEvents::increment(ProfileEvents::DictCacheKeysHit, cache_hit);
query_count.fetch_add(rows, std::memory_order_relaxed);
hit_count.fetch_add(rows - outdated_keys.size(), std::memory_order_release);
query_count.fetch_add(rows_num, std::memory_order_relaxed);
hit_count.fetch_add(rows_num - outdated_keys.size(), std::memory_order_release);
/// request new values
if (!outdated_keys.empty())
......@@ -614,6 +655,7 @@ void ComplexKeyCacheDictionary::update(
StringRefs keys(keys_size);
const auto attributes_size = attributes.size();
const auto now = std::chrono::system_clock::now();
while (const auto block = stream->read())
{
......@@ -632,13 +674,14 @@ void ComplexKeyCacheDictionary::update(
return block.safeGetByPosition(keys_size + attribute_idx).column.get();
});
const auto rows = block.rows();
const auto rows_num = block.rows();
for (const auto row : ext::range(0, rows))
for (const auto row : ext::range(0, rows_num))
{
auto key = allocKey(row, key_columns, keys);
const auto hash = StringRefHash{}(key);
const size_t cell_idx = hash & (size - 1);
const auto find_result = findCellIdx(key, now, hash);
const auto & cell_idx = find_result.cell_idx;
auto & cell = cells[cell_idx];
for (const auto attribute_idx : ext::range(0, attributes.size()))
......@@ -691,6 +734,8 @@ void ComplexKeyCacheDictionary::update(
size_t found_num = 0;
size_t not_found_num = 0;
const auto now = std::chrono::system_clock::now();
/// Check which ids have not been found and require setting null_value
for (const auto key_found_pair : remaining_keys)
{
......@@ -704,7 +749,8 @@ void ComplexKeyCacheDictionary::update(
auto key = key_found_pair.first;
const auto hash = StringRefHash{}(key);
const size_t cell_idx = hash & (size - 1);
const auto find_result = findCellIdx(key, now, hash);
const auto & cell_idx = find_result.cell_idx;
auto & cell = cells[cell_idx];
/// Set null_value for each attribute
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册