From 7df722d731b57398b5c119ae177cb7f03f57e23a Mon Sep 17 00:00:00 2001 From: proller Date: Fri, 17 Feb 2017 23:37:03 +0300 Subject: [PATCH] Speedup of CacheDictionary (part2: complex) [#CLICKHOUSE-2176] (#502) * Auto version update to [54146] * Fixing dictionaries [#CLICKHOUSE-2176] * fix * Auto version update to [54194] * Revert "Auto version update to [54194]" This reverts commit 3367678a046b9c9f8676922a70ac4c12ff2fd02f. * wip * wip * clean * const * fix * wip * wip * wip * wip * wip * wip * clean * Fixing dictionaries [#CLICKHOUSE-2176] * style * style * style * wip * wip * struct * clean * clean * clean * wip * wip broken * wip * wip * wip * wip * wip * wip * clean * clean * space * tester * wip * fix * clean * style --- .../include/DB/Dictionaries/CacheDictionary.h | 11 +- .../Dictionaries/ComplexKeyCacheDictionary.h | 23 +++ dbms/src/Dictionaries/CacheDictionary.cpp | 10 +- .../ComplexKeyCacheDictionary.cpp | 170 +++++++++++------- 4 files changed, 142 insertions(+), 72 deletions(-) diff --git a/dbms/include/DB/Dictionaries/CacheDictionary.h b/dbms/include/DB/Dictionaries/CacheDictionary.h index 99b1ec54b8..186882b6d9 100644 --- a/dbms/include/DB/Dictionaries/CacheDictionary.h +++ b/dbms/include/DB/Dictionaries/CacheDictionary.h @@ -229,10 +229,11 @@ private: Attribute & getAttribute(const std::string & attribute_name) const; - struct FindResult { + struct FindResult + { + const size_t cell_idx; const bool valid; const bool outdated; - const size_t cell_idx; }; FindResult findCellIdx(const Key & id, const CellMetadata::time_point_t now) const; @@ -244,13 +245,13 @@ private: mutable Poco::RWLock rw_lock; - // Actual size will be increased to match power of 2 + /// Actual size will be increased to match power of 2 const std::size_t size; - // all bits to 1 mask (size - 1) (0b1000 - 1 = 0b111) + /// all bits to 1 mask (size - 1) (0b1000 - 1 = 0b111) const std::size_t size_overlap_mask; - // Max tries to find cell, overlaped with mask: if size = 16 and 
start_cell=10: will try cells: 10,11,12,13,14,15,0,1,2,3 + /// Max tries to find cell, overlapped with mask: if size = 16 and start_cell=10: will try cells: 10,11,12,13,14,15,0,1,2,3 static constexpr std::size_t max_collision_length = 10; const UInt64 zero_cell_idx{getCellIdx(0)}; diff --git a/dbms/include/DB/Dictionaries/ComplexKeyCacheDictionary.h b/dbms/include/DB/Dictionaries/ComplexKeyCacheDictionary.h index 6ee9d6a433..e81d6cc510 100644 --- a/dbms/include/DB/Dictionaries/ComplexKeyCacheDictionary.h +++ b/dbms/include/DB/Dictionaries/ComplexKeyCacheDictionary.h @@ -257,6 +257,20 @@ private: static StringRef copyIntoArena(StringRef src, Arena & arena); StringRef copyKey(const StringRef key) const; + struct FindResult + { + const size_t cell_idx; + const bool valid; + const bool outdated; + }; + + FindResult findCellIdx(const StringRef & key, const CellMetadata::time_point_t now, const size_t hash) const; + FindResult findCellIdx(const StringRef & key, const CellMetadata::time_point_t now) const + { + const auto hash = StringRefHash{}(key); + return findCellIdx(key, now, hash); + }; + const std::string name; const DictionaryStructure dict_struct; const DictionarySourcePtr source_ptr; @@ -264,7 +278,16 @@ private: const std::string key_description{dict_struct.getKeyDescription()}; mutable Poco::RWLock rw_lock; + + /// Actual size will be increased to match power of 2 const std::size_t size; + + /// all bits to 1 mask (size - 1) (0b1000 - 1 = 0b111) + const std::size_t size_overlap_mask; + + /// Max tries to find cell, overlapped with mask: if size = 16 and start_cell=10: will try cells: 10,11,12,13,14,15,0,1,2,3 + static constexpr std::size_t max_collision_length = 10; + const UInt64 zero_cell_idx{getCellIdx(StringRef{})}; std::map attribute_index_by_name; mutable std::vector attributes; diff --git a/dbms/src/Dictionaries/CacheDictionary.cpp b/dbms/src/Dictionaries/CacheDictionary.cpp index 59a9855141..b85aad45cf 100644 ---
a/dbms/src/Dictionaries/CacheDictionary.cpp +++ b/dbms/src/Dictionaries/CacheDictionary.cpp @@ -25,7 +25,7 @@ namespace ErrorCodes inline UInt64 CacheDictionary::getCellIdx(const Key id) const { const auto hash = intHash64(id); - const auto idx = hash & (size - 1); + const auto idx = hash & size_overlap_mask; return idx; } @@ -175,7 +175,7 @@ void CacheDictionary::getString( } -/// returns 'cell is valid' flag, 'cell is outdated' flag, cell_idx +/// returns cell_idx (always valid for replacing), 'cell is valid' flag, 'cell is outdated' flag /// true false found and valid /// false true not found (something outdated, maybe our cell) /// false false not found (other id stored with valid data) @@ -206,13 +206,13 @@ CacheDictionary::FindResult CacheDictionary::findCellIdx(const Key & id, const C if (cell.expiresAt() < now) { - return {false, true, cell_idx}; + return {cell_idx, false, true}; } - return {true, false, cell_idx}; + return {cell_idx, true, false}; } - return {false, false, oldest_id}; + return {oldest_id, false, false}; } void CacheDictionary::has(const PaddedPODArray & ids, PaddedPODArray & out) const diff --git a/dbms/src/Dictionaries/ComplexKeyCacheDictionary.cpp b/dbms/src/Dictionaries/ComplexKeyCacheDictionary.cpp index d4d2302d8c..ddc8d5e483 100644 --- a/dbms/src/Dictionaries/ComplexKeyCacheDictionary.cpp +++ b/dbms/src/Dictionaries/ComplexKeyCacheDictionary.cpp @@ -5,7 +5,6 @@ #include #include - namespace DB { @@ -20,7 +19,7 @@ namespace ErrorCodes inline UInt64 ComplexKeyCacheDictionary::getCellIdx(const StringRef key) const { const auto hash = StringRefHash{}(key); - const auto idx = hash & (size - 1); + const auto idx = hash & size_overlap_mask; return idx; } @@ -29,7 +28,9 @@ ComplexKeyCacheDictionary::ComplexKeyCacheDictionary(const std::string & name, c DictionarySourcePtr source_ptr, const DictionaryLifetime dict_lifetime, const size_t size) : name{name}, dict_struct(dict_struct), source_ptr{std::move(source_ptr)}, 
dict_lifetime(dict_lifetime), - size{roundUpToPowerOfTwoOrZero(size)}, rnd_engine{randomSeed()} + size{roundUpToPowerOfTwoOrZero(std::max(size, size_t(max_collision_length)))}, + size_overlap_mask{this->size - 1}, + rnd_engine{randomSeed()} { if (!this->source_ptr->supportsSelectiveLoad()) throw Exception{ @@ -174,6 +175,52 @@ void ComplexKeyCacheDictionary::getString( getItemsString(attribute, key_columns, out, [&] (const size_t) { return StringRef{def}; }); } + + +/// returns cell_idx (always valid for replacing), 'cell is valid' flag, 'cell is outdated' flag, +/// true false found and valid +/// false true not found (something outdated, maybe our cell) +/// false false not found (other id stored with valid data) +/// true true impossible +/// +/// todo: split this func to two: find_for_get and find_for_set +ComplexKeyCacheDictionary::FindResult ComplexKeyCacheDictionary::findCellIdx(const StringRef & key, const CellMetadata::time_point_t now, const size_t hash) const +{ + auto pos = hash; + auto oldest_id = pos; + auto oldest_time = CellMetadata::time_point_t::max(); + const auto stop = pos + max_collision_length; + + for (; pos < stop; ++pos) + { + const auto cell_idx = pos & size_overlap_mask; + const auto & cell = cells[cell_idx]; + + if (cell.hash != hash || cell.key != key) + { + /// maybe we already found nearest expired cell + if (oldest_time > now && oldest_time > cell.expiresAt()) + { + oldest_time = cell.expiresAt(); + oldest_id = cell_idx; + } + + continue; + } + + if (cell.expiresAt() < now) + { + return {cell_idx, false, true}; + } + + return {cell_idx, true, false}; + } + + oldest_id &= size_overlap_mask; + return {oldest_id, false, false}; +} + + void ComplexKeyCacheDictionary::has(const ConstColumnPlainPtrs & key_columns, const DataTypes & key_types, PaddedPODArray & out) const { dict_struct.validateKeyTypes(key_types); @@ -181,11 +228,12 @@ void ComplexKeyCacheDictionary::has(const ConstColumnPlainPtrs & key_columns, co /// Mapping: -> { all 
indices `i` of `key_columns` such that `key_columns[i]` = } MapType> outdated_keys; - const auto rows = key_columns.front()->size(); + + const auto rows_num = key_columns.front()->size(); const auto keys_size = dict_struct.key.value().size(); StringRefs keys(keys_size); Arena temporary_keys_pool; - PODArray keys_array(rows); + PODArray keys_array(rows_num); size_t cache_expired = 0, cache_not_found = 0, cache_hit = 0; { @@ -193,31 +241,28 @@ void ComplexKeyCacheDictionary::has(const ConstColumnPlainPtrs & key_columns, co const auto now = std::chrono::system_clock::now(); /// fetch up-to-date values, decide which ones require update - for (const auto row : ext::range(0, rows)) + for (const auto row : ext::range(0, rows_num)) { const StringRef key = placeKeysInPool(row, key_columns, keys, temporary_keys_pool); keys_array[row] = key; - const auto hash = StringRefHash{}(key); - const size_t cell_idx = hash & (size - 1); - const auto & cell = cells[cell_idx]; - + const auto find_result = findCellIdx(key, now); + const auto & cell_idx = find_result.cell_idx; /** cell should be updated if either: * 1. keys (or hash) do not match, * 2. cell has expired, * 3. explicit defaults were specified and cell was set default. 
*/ - if (cell.hash != hash || cell.key != key) - { - ++cache_not_found; - outdated_keys[key].push_back(row); - } - else if (cell.expiresAt() < now) + if (!find_result.valid) { - ++cache_expired; outdated_keys[key].push_back(row); + if (find_result.outdated) + ++cache_expired; + else + ++cache_not_found; } else { ++cache_hit; + const auto & cell = cells[cell_idx]; out[row] = !cell.isDefault(); } } @@ -226,8 +271,8 @@ void ComplexKeyCacheDictionary::has(const ConstColumnPlainPtrs & key_columns, co ProfileEvents::increment(ProfileEvents::DictCacheKeysNotFound, cache_not_found); ProfileEvents::increment(ProfileEvents::DictCacheKeysHit, cache_hit); - query_count.fetch_add(rows, std::memory_order_relaxed); - hit_count.fetch_add(rows - outdated_keys.size(), std::memory_order_release); + query_count.fetch_add(rows_num, std::memory_order_relaxed); + hit_count.fetch_add(rows_num - outdated_keys.size(), std::memory_order_release); if (outdated_keys.empty()) return; @@ -376,11 +421,11 @@ void ComplexKeyCacheDictionary::getItemsNumberImpl( MapType> outdated_keys; auto & attribute_array = std::get>(attribute.arrays); - const auto rows = key_columns.front()->size(); + const auto rows_num = key_columns.front()->size(); const auto keys_size = dict_struct.key.value().size(); StringRefs keys(keys_size); Arena temporary_keys_pool; - PODArray keys_array(rows); + PODArray keys_array(rows_num); size_t cache_expired = 0, cache_not_found = 0, cache_hit = 0; { @@ -388,31 +433,30 @@ void ComplexKeyCacheDictionary::getItemsNumberImpl( const auto now = std::chrono::system_clock::now(); /// fetch up-to-date values, decide which ones require update - for (const auto row : ext::range(0, rows)) + for (const auto row : ext::range(0, rows_num)) { const StringRef key = placeKeysInPool(row, key_columns, keys, temporary_keys_pool); keys_array[row] = key; - const auto hash = StringRefHash{}(key); - const size_t cell_idx = hash & (size - 1); - const auto & cell = cells[cell_idx]; + const auto find_result 
= findCellIdx(key, now); /** cell should be updated if either: * 1. keys (or hash) do not match, * 2. cell has expired, * 3. explicit defaults were specified and cell was set default. */ - if (cell.hash != hash || cell.key != key) - { - ++cache_not_found; - outdated_keys[key].push_back(row); - } - else if (cell.expiresAt() < now) + + if (!find_result.valid) { - ++cache_expired; outdated_keys[key].push_back(row); + if (find_result.outdated) + ++cache_expired; + else + ++cache_not_found; } else { ++cache_hit; + const auto & cell_idx = find_result.cell_idx; + const auto & cell = cells[cell_idx]; out[row] = cell.isDefault() ? get_default(row) : attribute_array[cell_idx]; } } @@ -420,9 +464,8 @@ void ComplexKeyCacheDictionary::getItemsNumberImpl( ProfileEvents::increment(ProfileEvents::DictCacheKeysExpired, cache_expired); ProfileEvents::increment(ProfileEvents::DictCacheKeysNotFound, cache_not_found); ProfileEvents::increment(ProfileEvents::DictCacheKeysHit, cache_hit); - - query_count.fetch_add(rows, std::memory_order_relaxed); - hit_count.fetch_add(rows - outdated_keys.size(), std::memory_order_release); + query_count.fetch_add(rows_num, std::memory_order_relaxed); + hit_count.fetch_add(rows_num - outdated_keys.size(), std::memory_order_release); if (outdated_keys.empty()) return; @@ -451,9 +494,9 @@ void ComplexKeyCacheDictionary::getItemsString( Attribute & attribute, const ConstColumnPlainPtrs & key_columns, ColumnString * out, DefaultGetter && get_default) const { - const auto rows = key_columns.front()->size(); + const auto rows_num = key_columns.front()->size(); /// save on some allocations - out->getOffsets().reserve(rows); + out->getOffsets().reserve(rows_num); const auto keys_size = dict_struct.key.value().size(); StringRefs keys(keys_size); @@ -469,21 +512,21 @@ void ComplexKeyCacheDictionary::getItemsString( const auto now = std::chrono::system_clock::now(); /// fetch up-to-date values, discard on fail - for (const auto row : ext::range(0, rows)) + for 
(const auto row : ext::range(0, rows_num)) { const StringRef key = placeKeysInPool(row, key_columns, keys, temporary_keys_pool); SCOPE_EXIT(temporary_keys_pool.rollback(key.size)); - const auto hash = StringRefHash{}(key); - const size_t cell_idx = hash & (size - 1); - const auto & cell = cells[cell_idx]; + const auto find_result = findCellIdx(key, now); - if (cell.hash != hash || cell.key != key || cell.expiresAt() < now) + if (!find_result.valid) { found_outdated_values = true; break; } else { + const auto & cell_idx = find_result.cell_idx; + const auto & cell = cells[cell_idx]; const auto string_ref = cell.isDefault() ? get_default(row) : attribute_array[cell_idx]; out->insertData(string_ref.data, string_ref.size); } @@ -493,8 +536,8 @@ void ComplexKeyCacheDictionary::getItemsString( /// optimistic code completed successfully if (!found_outdated_values) { - query_count.fetch_add(rows, std::memory_order_relaxed); - hit_count.fetch_add(rows, std::memory_order_release); + query_count.fetch_add(rows_num, std::memory_order_relaxed); + hit_count.fetch_add(rows_num, std::memory_order_release); return; } @@ -506,7 +549,7 @@ void ComplexKeyCacheDictionary::getItemsString( MapType> outdated_keys; /// we are going to store every string separately MapType map; - PODArray keys_array(rows); + PODArray keys_array(rows_num); size_t total_length = 0; size_t cache_expired = 0, cache_not_found = 0, cache_hit = 0; @@ -514,27 +557,25 @@ void ComplexKeyCacheDictionary::getItemsString( const ProfilingScopedReadRWLock read_lock{rw_lock, ProfileEvents::DictCacheLockReadNs}; const auto now = std::chrono::system_clock::now(); - for (const auto row : ext::range(0, rows)) + for (const auto row : ext::range(0, rows_num)) { const StringRef key = placeKeysInPool(row, key_columns, keys, temporary_keys_pool); keys_array[row] = key; - const auto hash = StringRefHash{}(key); - const size_t cell_idx = hash & (size - 1); - const auto & cell = cells[cell_idx]; + const auto find_result = 
findCellIdx(key, now); - if (cell.hash != hash || cell.key != key) - { - ++cache_not_found; - outdated_keys[key].push_back(row); - } - else if (cell.expiresAt() < now) + if (!find_result.valid) { - ++cache_expired; outdated_keys[key].push_back(row); + if (find_result.outdated) + ++cache_expired; + else + ++cache_not_found; } else { ++cache_hit; + const auto & cell_idx = find_result.cell_idx; + const auto & cell = cells[cell_idx]; const auto string_ref = cell.isDefault() ? get_default(row) : attribute_array[cell_idx]; if (!cell.isDefault()) @@ -548,8 +589,8 @@ void ComplexKeyCacheDictionary::getItemsString( ProfileEvents::increment(ProfileEvents::DictCacheKeysNotFound, cache_not_found); ProfileEvents::increment(ProfileEvents::DictCacheKeysHit, cache_hit); - query_count.fetch_add(rows, std::memory_order_relaxed); - hit_count.fetch_add(rows - outdated_keys.size(), std::memory_order_release); + query_count.fetch_add(rows_num, std::memory_order_relaxed); + hit_count.fetch_add(rows_num - outdated_keys.size(), std::memory_order_release); /// request new values if (!outdated_keys.empty()) @@ -614,6 +655,7 @@ void ComplexKeyCacheDictionary::update( StringRefs keys(keys_size); const auto attributes_size = attributes.size(); + const auto now = std::chrono::system_clock::now(); while (const auto block = stream->read()) { @@ -632,13 +674,14 @@ void ComplexKeyCacheDictionary::update( return block.safeGetByPosition(keys_size + attribute_idx).column.get(); }); - const auto rows = block.rows(); + const auto rows_num = block.rows(); - for (const auto row : ext::range(0, rows)) + for (const auto row : ext::range(0, rows_num)) { auto key = allocKey(row, key_columns, keys); const auto hash = StringRefHash{}(key); - const size_t cell_idx = hash & (size - 1); + const auto find_result = findCellIdx(key, now, hash); + const auto & cell_idx = find_result.cell_idx; auto & cell = cells[cell_idx]; for (const auto attribute_idx : ext::range(0, attributes.size())) @@ -691,6 +734,8 @@ void 
ComplexKeyCacheDictionary::update( size_t found_num = 0; size_t not_found_num = 0; + const auto now = std::chrono::system_clock::now(); + /// Check which ids have not been found and require setting null_value for (const auto key_found_pair : remaining_keys) { @@ -704,7 +749,8 @@ void ComplexKeyCacheDictionary::update( auto key = key_found_pair.first; const auto hash = StringRefHash{}(key); - const size_t cell_idx = hash & (size - 1); + const auto find_result = findCellIdx(key, now, hash); + const auto & cell_idx = find_result.cell_idx; auto & cell = cells[cell_idx]; /// Set null_value for each attribute -- GitLab