diff --git a/dbms/src/Common/ColumnsHashing.h b/dbms/src/Common/ColumnsHashing.h index 0a6d5464341c58009f33fd5fe4003739db562408..61c3d71cbbc7b2829f961de61feebfaf3b75f823 100644 --- a/dbms/src/Common/ColumnsHashing.h +++ b/dbms/src/Common/ColumnsHashing.h @@ -1,14 +1,17 @@ #pragma once -#include -#include -#include -#include -#include + + +#include #include -#include #include #include -#include + +#include +#include +#include + +#include +#include namespace DB { @@ -32,118 +35,12 @@ public: using HashMethodContextPtr = std::shared_ptr; -template -struct MappedTraits -{ - using Type = void *; - static Type getMapped(T &) { return nullptr; } - static T & getKey(T & key) { return key; } -}; - -template -struct MappedTraits> -{ - using Type = Second *; - static Type getMapped(PairNoInit & value) { return &value.second; } - static First & getKey(PairNoInit & value) { return value.first; } -}; - -template -struct HashTableTraits -{ - using Value = typename Data::value_type; - using Mapped = typename MappedTraits::Type; - - static Mapped getMapped(Value & value) { return MappedTraits::getMapped(value); } - static auto & getKey(Value & value) { return MappedTraits::getKey(value); } -}; - -template -struct LastElementCache -{ - static constexpr bool consecutive_keys_optimization = consecutive_keys_optimization_; - using Value = typename HashTableTraits::Value; - Value value; - bool empty = true; - bool found = false; - - auto getMapped() { return HashTableTraits::getMapped(value); } - auto & getKey() { return HashTableTraits::getKey(value); } -}; - -template -struct LastElementCache -{ - static constexpr bool consecutive_keys_optimization = false; -}; - -template -inline ALWAYS_INLINE typename HashTableTraits::Value & emplaceKeyImpl( - Key key, Data & data, bool & inserted, Cache & cache [[maybe_unused]]) -{ - if constexpr (Cache::consecutive_keys_optimization) - { - if (!cache.empty && cache.found && cache.getKey() == key) - { - inserted = false; - return cache.value; - } - } - - typename Data::iterator it; - data.emplace(key, it, inserted); - auto & value = *it; - - if constexpr (Cache::consecutive_keys_optimization) - { - cache.value = value; - cache.empty = false; - cache.found = true; - } - - return value; -} - -template -inline ALWAYS_INLINE typename HashTableTraits::Mapped findKeyImpl( - Key key, Data & data, bool & found, Cache & cache [[maybe_unused]]) -{ - if constexpr (Cache::consecutive_keys_optimization) - { - if (!cache.empty && cache.getKey() == key) - { - found = cache.found; - return found ? cache.getMapped() : nullptr; - } - } - - auto it = data.find(key); - - found = it != data.end(); - auto mapped = found ? HashTableTraits::getMapped(*it) - : nullptr; - - if constexpr (Cache::consecutive_keys_optimization) - { - if (found) - cache.value = *it; - else - cache.getKey() = key; - - cache.empty = false; - cache.found = found; - } - - return mapped; -} - - /// For the case where there is one numeric key. -template /// UInt8/16/32/64 for any type with corresponding bit width. -struct HashMethodOneNumber +template /// UInt8/16/32/64 for any type with corresponding bit width. +struct HashMethodOneNumber : public columns_hashing_impl::HashMethodBase { + using Base = columns_hashing_impl::HashMethodBase; const char * vec; - LastElementCache last_elem_cache; /// If the keys of a fixed length then key_sizes contains their lengths, empty otherwise. HashMethodOneNumber(const ColumnRawPtrs & key_columns, const Sizes & /*key_sizes*/, const HashMethodContextPtr &) @@ -158,27 +55,20 @@ struct HashMethodOneNumber /// Emplace key into HashTable or HashMap. If Data is HashMap, returns ptr to value, otherwise nullptr. template - ALWAYS_INLINE typename HashTableTraits::Mapped emplaceKey( + ALWAYS_INLINE typename Base::EmplaceResult emplaceKey( Data & data, /// HashTable size_t row, /// From which row of the block insert the key - bool & inserted, Arena & /*pool*/) /// For Serialized method, key may be placed in pool. { - return HashTableTraits::getMapped(emplaceKeyImpl(getKey(row), data, inserted, last_elem_cache)); + typename Data::iterator it; + return Base::emplaceKeyImpl(getKey(row), data, it); } /// Find key into HashTable or HashMap. If Data is HashMap and key was found, returns ptr to value, otherwise nullptr. template - ALWAYS_INLINE typename HashTableTraits::Mapped findKey(Data & data, size_t row, bool & found, Arena & /*pool*/) - { - return findKeyImpl(getKey(row), data, found, last_elem_cache); - } - - /// Insert the key from the hash table into columns. - template - static void insertKeyIntoColumns(const Value & value, MutableColumns & key_columns, const Sizes & /*key_sizes*/) + ALWAYS_INLINE typename Base::FindResult findKey(Data & data, size_t row, Arena & /*pool*/) { - static_cast(key_columns[0].get())->insertRawData(reinterpret_cast(&value.first)); + return Base::findKeyImpl(getKey(row), data); } /// Get hash value of row. @@ -189,34 +79,24 @@ struct HashMethodOneNumber } /// Get StringRef from value which can be inserted into column. - template static StringRef getValueRef(const Value & value) { return StringRef(reinterpret_cast(&value.first), sizeof(value.first)); } - /// Cache last result if key was inserted. - template - ALWAYS_INLINE void cacheData(size_t /*row*/, Mapped mapped) - { - *last_elem_cache.getMapped() = mapped; - } - protected: - template static ALWAYS_INLINE void onNewKey(Value & /*value*/, Arena & /*pool*/) {} }; /// For the case where there is one string key. -template -struct HashMethodString +template +struct HashMethodString : public columns_hashing_impl::HashMethodBase { + using Base = columns_hashing_impl::HashMethodBase; const IColumn::Offset * offsets; const UInt8 * chars; - LastElementCache last_elem_cache; - HashMethodString(const ColumnRawPtrs & key_columns, const Sizes & /*key_sizes*/, const HashMethodContextPtr &) { const IColumn & column = *key_columns[0]; @@ -230,28 +110,23 @@ struct HashMethodString StringRef getKey(size_t row) const { return StringRef(chars + offsets[row - 1], offsets[row] - offsets[row - 1] - 1); } template - ALWAYS_INLINE typename HashTableTraits::Mapped emplaceKey(Data & data, size_t row, bool & inserted, Arena & pool) + ALWAYS_INLINE typename Base::EmplaceResult emplaceKey(Data & data, size_t row, Arena & pool) { - auto & value = emplaceKeyImpl(getKey(row), data, inserted, last_elem_cache); - if (inserted) + auto key = getKey(row); + typename Data::iterator it; + auto result = Base::emplaceKeyImpl(key, data, it); + if (result.isInserted()) { - auto & key = HashTableTraits::getKey(value); if (key.size) - key.data = pool.insert(key.data, key.size); + it->first.data = pool.insert(key.data, key.size); } - return HashTableTraits::getMapped(value); + return result; } template - ALWAYS_INLINE typename HashTableTraits::Mapped findKey(Data & data, size_t row, bool & found, Arena & /*pool*/) - { - return findKeyImpl(getKey(row), data, found, last_elem_cache); - } - - template - static void insertKeyIntoColumns(const Value & value, MutableColumns & key_columns, const Sizes & /*key_sizes*/) + ALWAYS_INLINE typename Base::FindResult findKey(Data & data, size_t row, Arena & /*pool*/) { - key_columns[0]->insertData(value.first.data, value.first.size); + return Base::findKeyImpl(getKey(row), data); } template @@ -260,20 +135,12 @@ struct HashMethodString return data.hash(getKey(row)); } - template static StringRef getValueRef(const Value & value) { return StringRef(value.first.data, value.first.size); } - template - ALWAYS_INLINE void cacheData(size_t /*row*/, Mapped mapped) - { - *last_elem_cache.getMapped() = mapped; - } - protected: - template static ALWAYS_INLINE void onNewKey(Value & value, Arena & pool) { if (value.first.size) @@ -283,14 +150,13 @@ protected: /// For the case where there is one fixed-length string key. -template -struct HashMethodFixedString +template +struct HashMethodFixedString : public columns_hashing_impl::HashMethodBase { + using Base = columns_hashing_impl::HashMethodBase; size_t n; const ColumnFixedString::Chars * chars; - LastElementCache last_elem_cache; - HashMethodFixedString(const ColumnRawPtrs & key_columns, const Sizes & /*key_sizes*/, const HashMethodContextPtr &) { const IColumn & column = *key_columns[0]; @@ -304,27 +170,21 @@ struct HashMethodFixedString StringRef getKey(size_t row) const { return StringRef(&(*chars)[row * n], n); } template - ALWAYS_INLINE typename HashTableTraits::Mapped emplaceKey(Data & data, size_t row, bool & inserted, Arena & pool) + ALWAYS_INLINE typename Base::EmplaceResult emplaceKey(Data & data, size_t row, Arena & pool) { - auto & value = emplaceKeyImpl(getKey(row), data, inserted, last_elem_cache); - if (inserted) - { - auto & key = HashTableTraits::getKey(value); - key.data = pool.insert(key.data, key.size); - } - return HashTableTraits::getMapped(value); - } + auto key = getKey(row); + typename Data::iterator it; + auto res = Base::emplaceKeyImpl(key, data, it); + if (res.isInserted()) + it->first.data = pool.insert(key.data, key.size); - template - ALWAYS_INLINE typename HashTableTraits::Mapped findKey(Data & data, size_t row, bool & found, Arena & /*pool*/) - { - return findKeyImpl(getKey(row), data, found, last_elem_cache); + return res; } - template - static void insertKeyIntoColumns(const Value & value, MutableColumns & key_columns, const Sizes & /*key_sizes*/) + template + ALWAYS_INLINE typename Base::FindResult findKey(Data & data, size_t row, Arena & /*pool*/) { - key_columns[0]->insertData(value.first.data, value.first.size); + return Base::findKeyImpl(getKey(row), data); } template @@ -333,20 +193,12 @@ struct HashMethodFixedString return data.hash(getKey(row)); } - template static StringRef getValueRef(const Value & value) { return StringRef(value.first.data, value.first.size); } - template - ALWAYS_INLINE void cacheData(size_t /*row*/, Mapped mapped) - { - *last_elem_cache.getMapped() = mapped; - } - protected: - template static ALWAYS_INLINE void onNewKey(Value & value, Arena & pool) { value.first.data = pool.insert(value.first.data, value.first.size); @@ -400,12 +252,24 @@ private: Cache cache; }; + /// Single low cardinality column. -template +template struct HashMethodSingleLowCardinalityColumn : public SingleColumnMethod { using Base = SingleColumnMethod; + enum class VisitValue + { + Empty = 0, + Found = 1, + NotFound = 2, + }; + + static constexpr bool has_mapped = !std::is_same::value; + using EmplaceResult = columns_hashing_impl::EmplaceResultImpl; + using FindResult = columns_hashing_impl::FindResultImpl; + static HashMethodContextPtr createContext(const HashMethodContext::Settings & settings) { return std::make_shared(settings); @@ -421,7 +285,8 @@ struct HashMethodSingleLowCardinalityColumn : public SingleColumnMethod ColumnPtr dictionary_holder; /// Cache AggregateDataPtr for current column in order to decrease the number of hash table usages. - PaddedPODArray aggregate_data_cache; + columns_hashing_impl::MappedCache mapped_cache; + PaddedPODArray visit_cache; /// If initialized column is nullable. bool is_nullable = false; @@ -495,8 +360,11 @@ struct HashMethodSingleLowCardinalityColumn : public SingleColumnMethod } } - AggregateDataPtr default_data = nullptr; - aggregate_data_cache.assign(key_columns[0]->size(), default_data); + if constexpr (has_mapped) + mapped_cache.resize(key_columns[0]->size()); + + VisitValue empty(VisitValue::Empty); + visit_cache.assign(key_columns[0]->size(), empty); size_of_index_type = column->getSizeOfIndexType(); positions = column->getIndexesPtr().get(); @@ -521,41 +389,45 @@ struct HashMethodSingleLowCardinalityColumn : public SingleColumnMethod } template - ALWAYS_INLINE typename HashTableTraits::Mapped emplaceKey(Data & data, size_t row_, bool & inserted, Arena & pool) + ALWAYS_INLINE EmplaceResult emplaceKey(Data & data, size_t row_, Arena & pool) { size_t row = getIndexAt(row_); if (is_nullable && row == 0) { - inserted = !data.hasNullKeyData(); - data.hasNullKeyData() = true; - return &data.getNullKeyData(); + visit_cache[row] = VisitValue::Found; + if constexpr (has_mapped) + return EmplaceResult(data.getNullKeyData(), mapped_cache[0], !data.hasNullKeyData()); + else + return EmplaceResult(!data.hasNullKeyData()); } - if constexpr (use_cache) + if (visit_cache[row] == VisitValue::Found) { - if (aggregate_data_cache[row]) - { - inserted = false; - return &aggregate_data_cache[row]; - } + if constexpr (has_mapped) + return EmplaceResult(mapped_cache[row], mapped_cache[row], false); + else + return EmplaceResult(false); } - Sizes key_sizes; auto key = getKey(row_); + bool inserted = false; typename Data::iterator it; if (saved_hash) data.emplace(key, it, inserted, saved_hash[row]); else data.emplace(key, it, inserted); + visit_cache[row] = VisitValue::Found; + if (inserted) Base::onNewKey(*it, pool); - else if constexpr (use_cache) - aggregate_data_cache[row] = it->second; - return HashTableTraits::getMapped(*it); + if constexpr (has_mapped) + return EmplaceResult(it->second, mapped_cache[row], inserted); + else + return EmplaceResult(inserted); } ALWAYS_INLINE bool isNullAt(size_t i) @@ -566,25 +438,25 @@ struct HashMethodSingleLowCardinalityColumn : public SingleColumnMethod return getIndexAt(i) == 0; } - template - ALWAYS_INLINE void cacheData(size_t i, Mapped mapped) - { - size_t row = getIndexAt(i); - aggregate_data_cache[row] = mapped; - } - template - ALWAYS_INLINE typename HashTableTraits::Mapped findFromRow(Data & data, size_t row_, bool & found, Arena &) + ALWAYS_INLINE FindResult findFromRow(Data & data, size_t row_, Arena &) { size_t row = getIndexAt(row_); if (is_nullable && row == 0) - return data.hasNullKeyData() ? &data.getNullKeyData() : nullptr; + { + if constexpr (has_mapped) + return FindResult(data.hasNullKeyData() ? data.getNullKeyData() : Mapped(), data.hasNullKeyData()); + else + return FindResult(data.hasNullKeyData()); + } - if constexpr (use_cache) + if (visit_cache[row] != VisitValue::Empty) { - if (aggregate_data_cache[row]) - return &aggregate_data_cache[row]; + if constexpr (has_mapped) + return FindResult(mapped_cache[row], visit_cache[row] == VisitValue::Found); + else + return FindResult(visit_cache[row] == VisitValue::Found); } auto key = getKey(row_); @@ -595,14 +467,19 @@ struct HashMethodSingleLowCardinalityColumn : public SingleColumnMethod else it = data.find(key); - found = it != data.end(); - if constexpr (use_cache) + bool found = it != data.end(); + visit_cache[row] = found ? VisitValue::Found : VisitValue::NotFound; + + if constexpr (has_mapped) { if (found) - aggregate_data_cache[row] = it->second; + mapped_cache[row] = it->second; } - return typename HashTableTraits::getMapped(*it); + if constexpr (has_mapped) + return FindResult(mapped_cache[row], found); + else + return FindResult(found); } template @@ -614,107 +491,8 @@ struct HashMethodSingleLowCardinalityColumn : public SingleColumnMethod return Base::getHash(data, row, pool); } - - template - static void insertKeyIntoColumns(const Value & value, MutableColumns & key_columns_low_cardinality, const Sizes & /*key_sizes*/) - { - auto ref = Base::getValueRef(value); - static_cast(key_columns_low_cardinality[0].get())->insertData(ref.data, ref.size); - } -}; - - -namespace columns_hashing_impl -{ - -/// This class is designed to provide the functionality that is required for -/// supporting nullable keys in HashMethodKeysFixed. If there are -/// no nullable keys, this class is merely implemented as an empty shell. -template -class BaseStateKeysFixed; - -/// Case where nullable keys are supported. -template -class BaseStateKeysFixed -{ -protected: - void init(const ColumnRawPtrs & key_columns) - { - null_maps.reserve(key_columns.size()); - actual_columns.reserve(key_columns.size()); - - for (const auto & col : key_columns) - { - if (col->isColumnNullable()) - { - const auto & nullable_col = static_cast(*col); - actual_columns.push_back(&nullable_col.getNestedColumn()); - null_maps.push_back(&nullable_col.getNullMapColumn()); - } - else - { - actual_columns.push_back(col); - null_maps.push_back(nullptr); - } - } - } - - /// Return the columns which actually contain the values of the keys. - /// For a given key column, if it is nullable, we return its nested - /// column. Otherwise we return the key column itself. - inline const ColumnRawPtrs & getActualColumns() const - { - return actual_columns; - } - - /// Create a bitmap that indicates whether, for a particular row, - /// a key column bears a null value or not. - KeysNullMap createBitmap(size_t row) const - { - KeysNullMap bitmap{}; - - for (size_t k = 0; k < null_maps.size(); ++k) - { - if (null_maps[k] != nullptr) - { - const auto & null_map = static_cast(*null_maps[k]).getData(); - if (null_map[row] == 1) - { - size_t bucket = k / 8; - size_t offset = k % 8; - bitmap[bucket] |= UInt8(1) << offset; - } - } - } - - return bitmap; - } - -private: - ColumnRawPtrs actual_columns; - ColumnRawPtrs null_maps; -}; - -/// Case where nullable keys are not supported. -template -class BaseStateKeysFixed -{ -protected: - void init(const ColumnRawPtrs & columns) { actual_columns = columns; } - - const ColumnRawPtrs & getActualColumns() const { return actual_columns; } - - KeysNullMap createBitmap(size_t) const - { - throw Exception{"Internal error: calling createBitmap() for non-nullable keys" - " is forbidden", ErrorCodes::LOGICAL_ERROR}; - } - -private: - ColumnRawPtrs actual_columns; }; -} // Optional mask for low cardinality columns. template @@ -729,11 +507,11 @@ template <> struct LowCardinalityKeys {}; /// For the case where all keys are of fixed length, and they fit in N (for example, 128) bits. -template -struct HashMethodKeysFixed : private columns_hashing_impl::BaseStateKeysFixed +template +struct HashMethodKeysFixed + : private columns_hashing_impl::BaseStateKeysFixed + , public columns_hashing_impl::HashMethodBase { - using Key = typename TData::key_type; - static constexpr bool has_nullable_keys = has_nullable_keys_; static constexpr bool has_low_cardinality = has_low_cardinality_; @@ -741,9 +519,8 @@ struct HashMethodKeysFixed : private columns_hashing_impl::BaseStateKeysFixed last_elem_cache; - using Base = columns_hashing_impl::BaseStateKeysFixed; + using BaseHashed = columns_hashing_impl::HashMethodBase; HashMethodKeysFixed(const ColumnRawPtrs & key_columns, const Sizes & key_sizes, const HashMethodContextPtr &) : key_sizes(std::move(key_sizes)), keys_size(key_columns.size()) @@ -789,21 +566,16 @@ struct HashMethodKeysFixed : private columns_hashing_impl::BaseStateKeysFixed - ALWAYS_INLINE typename HashTableTraits::Mapped emplaceKey(Data & data, size_t row, bool & inserted, Arena & /*pool*/) + ALWAYS_INLINE typename BaseHashed::EmplaceResult emplaceKey(Data & data, size_t row, Arena & /*pool*/) { - return HashTableTraits::getMapped(emplaceKeyImpl(getKey(row), data, inserted, last_elem_cache)); + typename Data::iterator it; + return BaseHashed::emplaceKeyImpl(getKey(row), data, it); } template - ALWAYS_INLINE typename HashTableTraits::Mapped findKey(Data & data, size_t row, bool & found, Arena & /*pool*/) + ALWAYS_INLINE typename BaseHashed::FindResult findKey(Data & data, size_t row, Arena & /*pool*/) { - return findKeyImpl(getKey(row), data, found, last_elem_cache); - } - - template - static StringRef getValueRef(const Value & value) - { - return StringRef(value.first.data, value.first.size); + return BaseHashed::findKeyImpl(getKey(row), data); } template @@ -811,12 +583,6 @@ struct HashMethodKeysFixed : private columns_hashing_impl::BaseStateKeysFixed - ALWAYS_INLINE void cacheData(size_t /*row*/, Mapped mapped) - { - *last_elem_cache.getMapped() = mapped; - } }; /** Hash by concatenating serialized key values. @@ -824,12 +590,12 @@ struct HashMethodKeysFixed : private columns_hashing_impl::BaseStateKeysFixed -struct HashMethodSerialized +template +struct HashMethodSerialized : public columns_hashing_impl::HashMethodBase { + using Base = columns_hashing_impl::HashMethodBase; ColumnRawPtrs key_columns; size_t keys_size; - LastElementCache last_elem_cache; HashMethodSerialized(const ColumnRawPtrs & key_columns, const Sizes & /*key_sizes*/, const HashMethodContextPtr &) : key_columns(key_columns), keys_size(key_columns.size()) {} @@ -837,24 +603,25 @@ struct HashMethodSerialized static HashMethodContextPtr createContext(const HashMethodContext::Settings &) { return nullptr; } template - ALWAYS_INLINE typename HashTableTraits::Mapped emplaceKey(Data & data, size_t row, bool & inserted, Arena & pool) + ALWAYS_INLINE typename Base::EmplaceResult emplaceKey(Data & data, size_t row, Arena & pool) { auto key = getKey(row, pool); - auto & value = emplaceKeyImpl(key, data, inserted, last_elem_cache); - if (!inserted) + typename Data::iterator it; + auto res = Base::emplaceKeyImpl(key, data, it); + if (!res.isInserted()) pool.rollback(key.size); - return HashTableTraits::getMapped(value); + return res; } template - ALWAYS_INLINE typename HashTableTraits::Mapped findKey(Data & data, size_t row, bool & found, Arena & pool) + ALWAYS_INLINE typename Base::FindResult findKey(Data & data, size_t row, Arena & pool) { auto key = getKey(row, pool); - auto mapped = findKeyImpl(key, data, found, last_elem_cache); + auto res = Base::findKeyImpl(key, data); pool.rollback(key.size); - return mapped; + return res; } template @@ -867,9 +634,6 @@ struct HashMethodSerialized return hash; } - template - ALWAYS_INLINE void cacheData(size_t /*row*/, Mapped /*mapped*/) {} - protected: ALWAYS_INLINE StringRef getKey(size_t row, Arena & pool) const { diff --git a/dbms/src/Common/ColumnsHashingImpl.h b/dbms/src/Common/ColumnsHashingImpl.h new file mode 100644 index 0000000000000000000000000000000000000000..565940b33385f2057c99359e6122fdca3717619b --- /dev/null +++ b/dbms/src/Common/ColumnsHashingImpl.h @@ -0,0 +1,276 @@ +#pragma once + +#include +#include + +namespace DB +{ + +namespace ColumnsHashing +{ + +namespace columns_hashing_impl +{ + +template +struct LastElementCache +{ + static constexpr bool consecutive_keys_optimization = consecutive_keys_optimization_; + Value value; + bool empty = true; + bool found = false; + + bool check(const Value & value_) { return !empty && value == value_; } + + template + bool check(const Key & key) { return !empty && value.first == key; } +}; + +template +struct LastElementCache +{ + static constexpr bool consecutive_keys_optimization = false; +}; + +template +class EmplaceResultImpl +{ + Mapped & value; + Mapped & cached_value; + bool inserted; + +public: + EmplaceResultImpl(Mapped & value, Mapped & cached_value, bool inserted) + : value(value), cached_value(cached_value), inserted(inserted) {} + + bool isInserted() const { return inserted; } + const auto & getMapped() const { return value; } + void setMapped(const Mapped & mapped) { value = cached_value = mapped; } +}; + +template <> +class EmplaceResultImpl +{ + bool inserted; + +public: + explicit EmplaceResultImpl(bool inserted) : inserted(inserted) {} + bool isInserted() const { return inserted; } +}; + +template +class FindResultImpl +{ + Mapped value; + bool found; + +public: + FindResultImpl(Mapped value, bool found) : value(value), found(found) {} + bool isFound() const { return found; } + const Mapped & getMapped() const { return value; } +}; + +template <> +class FindResultImpl +{ + bool found; + +public: + explicit FindResultImpl(bool found) : found(found) {} + bool isFound() const { return found; } +}; + +template +struct HashMethodBase +{ + using EmplaceResult = EmplaceResultImpl; + using FindResult = FindResultImpl; + static constexpr bool has_mapped = !std::is_same::value; + using Cache = LastElementCache; + +protected: + Cache cache; + + HashMethodBase() + { + if constexpr (has_mapped && consecutive_keys_optimization) + { + /// Init PairNoInit elements. + cache.value.second = Mapped(); + using Key = decltype(cache.value.first); + cache.value.first = Key(); + } + } + + template + ALWAYS_INLINE EmplaceResult emplaceKeyImpl(Key key, Data & data, typename Data::iterator & it) + { + if constexpr (Cache::consecutive_keys_optimization) + { + if (cache.found && cache.check(key)) + { + if constexpr (has_mapped) + return EmplaceResult(cache.value.second, cache.value.second, false); + else + return EmplaceResult(false); + } + } + + bool inserted = false; + data.emplace(key, it, inserted); + Mapped * cached = &it->second; + + if constexpr (consecutive_keys_optimization) + { + cache.value = *it; + cache.found = true; + cache.empty = false; + cached = &cache.value.second; + } + + if constexpr (has_mapped) + return EmplaceResult(it->second, *cached, inserted); + else + return EmplaceResult(inserted); + } + + template + ALWAYS_INLINE FindResult findKeyImpl(Key key, Data & data) + { + if constexpr (Cache::consecutive_keys_optimization) + { + if (cache.check(key)) + { + if constexpr (has_mapped) + return FindResult(cache.found ? cache.value.second : Mapped(), cache.found); + else + return FindResult(cache.found); + } + } + + auto it = data.find(key); + bool found = it != data.end(); + + if constexpr (consecutive_keys_optimization) + { + cache.found = found; + cache.empty = false; + + if (found) + cache.value = *it; + else + { + if constexpr (has_mapped) + cache.value.first = key; + else + cache.value = key; + } + } + + if constexpr (has_mapped) + return FindResult(found ? it->second : Mapped(), found); + else + return FindResult(found); + } +}; + + +template +struct MappedCache : public PaddedPODArray {}; + +template <> +struct MappedCache {}; + + +/// This class is designed to provide the functionality that is required for +/// supporting nullable keys in HashMethodKeysFixed. If there are +/// no nullable keys, this class is merely implemented as an empty shell. +template +class BaseStateKeysFixed; + +/// Case where nullable keys are supported. +template +class BaseStateKeysFixed +{ +protected: + void init(const ColumnRawPtrs & key_columns) + { + null_maps.reserve(key_columns.size()); + actual_columns.reserve(key_columns.size()); + + for (const auto & col : key_columns) + { + if (col->isColumnNullable()) + { + const auto & nullable_col = static_cast(*col); + actual_columns.push_back(&nullable_col.getNestedColumn()); + null_maps.push_back(&nullable_col.getNullMapColumn()); + } + else + { + actual_columns.push_back(col); + null_maps.push_back(nullptr); + } + } + } + + /// Return the columns which actually contain the values of the keys. + /// For a given key column, if it is nullable, we return its nested + /// column. Otherwise we return the key column itself. + inline const ColumnRawPtrs & getActualColumns() const + { + return actual_columns; + } + + /// Create a bitmap that indicates whether, for a particular row, + /// a key column bears a null value or not. + KeysNullMap createBitmap(size_t row) const + { + KeysNullMap bitmap{}; + + for (size_t k = 0; k < null_maps.size(); ++k) + { + if (null_maps[k] != nullptr) + { + const auto & null_map = static_cast(*null_maps[k]).getData(); + if (null_map[row] == 1) + { + size_t bucket = k / 8; + size_t offset = k % 8; + bitmap[bucket] |= UInt8(1) << offset; + } + } + } + + return bitmap; + } + +private: + ColumnRawPtrs actual_columns; + ColumnRawPtrs null_maps; +}; + +/// Case where nullable keys are not supported. +template +class BaseStateKeysFixed +{ +protected: + void init(const ColumnRawPtrs & columns) { actual_columns = columns; } + + const ColumnRawPtrs & getActualColumns() const { return actual_columns; } + + KeysNullMap createBitmap(size_t) const + { + throw Exception{"Internal error: calling createBitmap() for non-nullable keys" + " is forbidden", ErrorCodes::LOGICAL_ERROR}; + } + +private: + ColumnRawPtrs actual_columns; +}; + +} + +} + +} diff --git a/dbms/src/Interpreters/Aggregator.cpp b/dbms/src/Interpreters/Aggregator.cpp index 785345f9400e939ae603864449b4edcd056b0390..e26d94f53e416a93cea76ca5b9380a126ed3964f 100644 --- a/dbms/src/Interpreters/Aggregator.cpp +++ b/dbms/src/Interpreters/Aggregator.cpp @@ -609,20 +609,34 @@ void NO_INLINE Aggregator::executeImplCase( /// NOTE When editing this code, also pay attention to SpecializedAggregator.h. /// For all rows. - AggregateDataPtr value = nullptr; for (size_t i = 0; i < rows; ++i) { - bool inserted = false; /// Inserted a new key, or was this key already? - - AggregateDataPtr * aggregate_data = nullptr; + AggregateDataPtr aggregate_data = nullptr; if constexpr (!no_more_keys) /// Insert. - aggregate_data = state.emplaceKey(method.data, i, inserted, *aggregates_pool); + { + auto emplace_result = state.emplaceKey(method.data, i, *aggregates_pool); + + /// If a new key is inserted, initialize the states of the aggregate functions, and possibly something related to the key. + if (emplace_result.isInserted()) + { + /// exception-safety - if you can not allocate memory or create states, then destructors will not be called. + emplace_result.setMapped(nullptr); + + aggregate_data = aggregates_pool->alignedAlloc(total_size_of_aggregate_states, align_aggregate_states); + createAggregateStates(aggregate_data); + + emplace_result.setMapped(aggregate_data); + } + else + aggregate_data = emplace_result.getMapped(); + } else { /// Add only if the key already exists. - bool found = false; - aggregate_data = state.findKey(method.data, i, found, *aggregates_pool); + auto find_result = state.findKey(method.data, i, *aggregates_pool); + if (find_result.isFound()) + aggregate_data = find_result.getMapped(); } /// aggregate_date == nullptr means that the new key did not fit in the hash table because of no_more_keys. @@ -631,20 +645,7 @@ void NO_INLINE Aggregator::executeImplCase( if (!aggregate_data && !overflow_row) continue; - /// If a new key is inserted, initialize the states of the aggregate functions, and possibly something related to the key. - if (inserted) - { - /// exception-safety - if you can not allocate memory or create states, then destructors will not be called. - *aggregate_data = nullptr; - - AggregateDataPtr place = aggregates_pool->alignedAlloc(total_size_of_aggregate_states, align_aggregate_states); - createAggregateStates(place); - *aggregate_data = place; - - state.cacheData(i, place); - } - - value = aggregate_data ? *aggregate_data : overflow_row; + AggregateDataPtr value = aggregate_data ? aggregate_data : overflow_row; /// Add values to the aggregate functions. for (AggregateFunctionInstruction * inst = aggregate_instructions; inst->that; ++inst) @@ -1951,17 +1952,28 @@ void NO_INLINE Aggregator::mergeStreamsImplCase( size_t rows = block.rows(); for (size_t i = 0; i < rows; ++i) { - typename Table::iterator it; - AggregateDataPtr * aggregate_data = nullptr; - - bool inserted = false; /// Inserted a new key, or was this key already? + AggregateDataPtr aggregate_data = nullptr; if (!no_more_keys) - aggregate_data = state.emplaceKey(data, i, inserted, *aggregates_pool); + { + auto emplace_result = state.emplaceKey(data, i, *aggregates_pool); + if (emplace_result.isInserted()) + { + emplace_result.setMapped(nullptr); + + aggregate_data = aggregates_pool->alignedAlloc(total_size_of_aggregate_states, align_aggregate_states); + createAggregateStates(aggregate_data); + + emplace_result.setMapped(aggregate_data); + } + else + aggregate_data = emplace_result.getMapped(); + } else { - bool found; - aggregate_data = state.findKey(data, i, found, *aggregates_pool); + auto find_result = state.findKey(data, i, *aggregates_pool); + if (find_result.isFound()) + aggregate_data = find_result.getMapped(); } /// aggregate_date == nullptr means that the new key did not fit in the hash table because of no_more_keys. @@ -1970,19 +1982,7 @@ void NO_INLINE Aggregator::mergeStreamsImplCase( if (!aggregate_data && !overflow_row) continue; - /// If a new key is inserted, initialize the states of the aggregate functions, and possibly something related to the key. - if (inserted) - { - *aggregate_data = nullptr; - - AggregateDataPtr place = aggregates_pool->alignedAlloc(total_size_of_aggregate_states, align_aggregate_states); - createAggregateStates(place); - *aggregate_data = place; - - state.cacheData(i, place); - } - - AggregateDataPtr value = aggregate_data ? *aggregate_data : overflow_row; + AggregateDataPtr value = aggregate_data ? aggregate_data : overflow_row; /// Merge state of aggregate functions. for (size_t j = 0; j < params.aggregates_size; ++j) diff --git a/dbms/src/Interpreters/Aggregator.h b/dbms/src/Interpreters/Aggregator.h index 0b40f4e6a25f23980926fe25e37ce2a9004aacbf..9112e0265cb0fe2b319e03d094b54fff23af5402 100644 --- a/dbms/src/Interpreters/Aggregator.h +++ b/dbms/src/Interpreters/Aggregator.h @@ -158,7 +158,7 @@ struct AggregationMethodOneNumber AggregationMethodOneNumber(const Other & other) : data(other.data) {} /// To use one `Method` in different threads, use different `State`. - using State = ColumnsHashing::HashMethodOneNumber; + using State = ColumnsHashing::HashMethodOneNumber; /// Use optimization for low cardinality. static const bool low_cardinality_optimization = false; @@ -188,7 +188,7 @@ struct AggregationMethodString template AggregationMethodString(const Other & other) : data(other.data) {} - using State = ColumnsHashing::HashMethodString; + using State = ColumnsHashing::HashMethodString; static const bool low_cardinality_optimization = false; @@ -216,7 +216,7 @@ struct AggregationMethodFixedString template AggregationMethodFixedString(const Other & other) : data(other.data) {} - using State = ColumnsHashing::HashMethodFixedString; + using State = ColumnsHashing::HashMethodFixedString; static const bool low_cardinality_optimization = false; @@ -246,7 +246,7 @@ struct AggregationMethodSingleLowCardinalityColumn : public SingleColumnMethod template explicit AggregationMethodSingleLowCardinalityColumn(const Other & other) : Base(other) {} - using State = ColumnsHashing::HashMethodSingleLowCardinalityColumn; + using State = ColumnsHashing::HashMethodSingleLowCardinalityColumn; static const bool low_cardinality_optimization = true; @@ -277,7 +277,7 @@ struct AggregationMethodKeysFixed template AggregationMethodKeysFixed(const Other & other) : data(other.data) {} - using State = ColumnsHashing::HashMethodKeysFixed; + using State = ColumnsHashing::HashMethodKeysFixed; static const bool low_cardinality_optimization = false; @@ -355,7 +355,7 @@ struct AggregationMethodSerialized template AggregationMethodSerialized(const Other & other) : data(other.data) {} - using State = ColumnsHashing::HashMethodSerialized; + using State = ColumnsHashing::HashMethodSerialized; static const bool low_cardinality_optimization = false;