ColumnUnique.h 12.1 KB
Newer Older
1
#pragma once
2 3
#include <Columns/IColumnUnique.h>
#include <Common/HashTable/HashMap.h>
4 5 6 7
#include <ext/range.h>
#include <Common/typeid_cast.h>
#include <Columns/ColumnVector.h>
#include <Columns/ColumnNullable.h>
8
#include <Columns/ColumnString.h>
9

10
class NullMap;
11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46


template <typename ColumnType>
struct StringRefWrapper
{
    const ColumnType * column = nullptr;
    size_t row = 0;

    StringRef ref;

    StringRefWrapper(const ColumnType * column, size_t row) : column(column), row(row) {}
    StringRefWrapper(StringRef ref) : ref(ref) {}
    StringRefWrapper(const StringRefWrapper & other) = default;
    StringRefWrapper & operator =(int) { column = nullptr; return *this; }
    bool operator ==(int) const { return nullptr == column; }
    StringRefWrapper() {}

    operator StringRef() const { return column ? column->getDataAt(row) : ref; }

    bool operator==(const StringRefWrapper<ColumnType> & other) const
    {
        return (column && column == other.column && row == other.row) || StringRef(*this) == other;
    }

};

namespace ZeroTraits
{
    template <typename ColumnType>
    bool check(const StringRefWrapper<ColumnType> x) { return nullptr == x.column; }

    template <typename ColumnType>
    void set(StringRefWrapper<ColumnType> & x) { x.column = nullptr; }
};


47 48 49
namespace DB
{

50
template <typename ColumnType, typename IndexType>
51
class ColumnUnique final : public COWPtrHelper<IColumnUnique, ColumnUnique<ColumnType, IndexType>>
52
{
53
    friend class COWPtrHelper<IColumnUnique, ColumnUnique<ColumnType, IndexType>>;
54 55

private:
56 57 58 59 60
    explicit ColumnUnique(MutableColumnPtr && holder);
    explicit ColumnUnique(const DataTypePtr & type) : is_nullable(type->isNullable())
    {
        column_holder = removeNullable(type)->createColumn()->cloneResized(numSpecialValues());
    }
61 62
    ColumnUnique(const ColumnUnique & other) : column_holder(other.column_holder), is_nullable(other.is_nullable) {}

63
public:
64
    const ColumnPtr & getNestedColumn() const override { return column_holder; }
65 66 67 68 69 70
    size_t uniqueInsert(const Field & x) override;
    size_t uniqueInsertFrom(const IColumn & src, size_t n) override;
    ColumnPtr uniqueInsertRangeFrom(const IColumn & src, size_t start, size_t length) override;
    size_t uniqueInsertData(const char * pos, size_t length) override;
    size_t uniqueInsertDataWithTerminatingZero(const char * pos, size_t length) override;
    size_t uniqueDeserializeAndInsertFromArena(const char * pos, const char *& new_pos) override;
71

72 73 74 75 76 77 78 79 80 81 82 83 84 85 86
    size_t getDefaultValueIndex() const override { return is_nullable ? 1 : 0; }
    size_t getNullValueIndex() const override;
    bool canContainNulls() const override { return is_nullable; }

    Field operator[](size_t n) const override { return (*column_holder)[n]; }
    void get(size_t n, Field & res) const override { column_holder->get(n, res); }
    StringRef getDataAt(size_t n) const override { return column_holder->getDataAt(n); }
    StringRef getDataAtWithTerminatingZero(size_t n) const override { return column_holder->getDataAtWithTerminatingZero(n); }
    UInt64 get64(size_t n) const override { return column_holder->get64(n); }
    UInt64 getUInt(size_t n) const override { return column_holder->getUInt(n); }
    Int64 getInt(size_t n) const override { return column_holder->getInt(n); }
    bool isNullAt(size_t n) const override { return column_holder->isNullAt(n); }
    MutableColumnPtr cut(size_t start, size_t length) const override { return column_holder->cut(start, length); }
    StringRef serializeValueIntoArena(size_t n, Arena & arena, char const *& begin) const override { return column_holder->serializeValueIntoArena(n, arena, begin); }
    void updateHashWithValue(size_t n, SipHash & hash) const override { return column_holder->updateHashWithValue(n, hash); }
87 88 89 90 91 92 93 94 95 96
    MutableColumnPtr filter(const IColumn::Filter & filt, ssize_t result_size_hint) const override { return column_holder->filter(filt, result_size_hint); }
    MutableColumnPtr permute(const IColumn::Permutation & perm, size_t limit) const override { return column_holder->permute(perm, limit); }
    int compareAt(size_t n, size_t m, const IColumn & rhs, int nan_direction_hint) const override { return column_holder->compareAt(n, m, rhs, nan_direction_hint); }
    void getPermutation(bool reverse, size_t limit, int nan_direction_hint, IColumn::Permutation & res) const override { column_holder->getPermutation(reverse, limit, nan_direction_hint, res); }
    MutableColumnPtr replicate(const IColumn::Offsets & offsets) const override
    {
        auto holder = column_holder;
        return std::move(holder)->mutate()->replicate(offsets);
    }
    std::vector<MutableColumnPtr> scatter(IColumn::ColumnIndex num_columns, const IColumn::Selector & selector) const override { return column_holder->scatter(num_columns, selector); }
97 98 99 100 101 102 103 104
    void getExtremes(Field & min, Field & max) const override { column_holder->getExtremes(min, max); }
    bool valuesHaveFixedSize() const override { return column_holder->valuesHaveFixedSize(); }
    bool isFixedAndContiguous() const override { return column_holder->isFixedAndContiguous(); }
    size_t sizeOfValueIfFixed() const override { return column_holder->sizeOfValueIfFixed(); }
    bool isNumeric() const override { return column_holder->isNumeric(); }

    size_t byteSize() const override { return column_holder->byteSize(); }
    size_t allocatedBytes() const override { return column_holder->allocatedBytes() + (index ? index->getBufferSizeInBytes() : 0); }
105
    void forEachSubcolumn(IColumn::ColumnCallback callback) override { callback(column_holder); }
106 107 108

private:

109
    using IndexMapType = HashMap<StringRefWrapper<ColumnType>, IndexType, StringRefHash>;
110

111
    ColumnPtr column_holder;
112
    /// Lazy initialized.
113 114 115
    std::unique_ptr<IndexMapType> index;

    bool is_nullable;
116

117 118 119
    size_t numSpecialValues() const { return is_nullable ? 2 : 1; }

    void buildIndex();
120 121 122
    ColumnType * getRawColumnPtr() { return static_cast<ColumnType *>(column_holder->assumeMutable().get()); }
    const ColumnType * getRawColumnPtr() const { return static_cast<ColumnType *>(column_holder.get()); }
    IndexType insert(const StringRefWrapper<ColumnType> & ref, IndexType value);
123 124 125

};

126
template <typename ColumnType, typename IndexType>
127
ColumnUnique<ColumnType, IndexType>::ColumnUnique(MutableColumnPtr && holder) : column_holder(std::move(holder))
128 129 130 131 132 133 134 135 136 137
{
    if (column_holder->isColumnNullable())
    {
        auto column_nullable = static_cast<const ColumnNullable *>(column_holder.get());
        column_holder = column_nullable->getNestedColumnPtr();
        is_nullable = true;
    }
}

template <typename ColumnType, typename IndexType>
138
size_t ColumnUnique<ColumnType, IndexType>::getNullValueIndex() const
139 140 141 142 143 144 145 146
{
    if (!is_nullable)
        throw Exception("ColumnUnique can't contain null values.");

    return 0;
}

template <typename ColumnType, typename IndexType>
147
void ColumnUnique<ColumnType, IndexType>::buildIndex()
148 149 150 151 152 153 154 155 156
{
    if (index)
        return;

    auto column = getRawColumnPtr();
    index = std::make_unique<IndexMapType>();

    for (auto row : ext::range(numSpecialValues(), column->size()))
    {
157
        (*index)[StringRefWrapper<ColumnType>(column, row)] = row;
158 159 160 161
    }
}

template <typename ColumnType, typename IndexType>
162
IndexType ColumnUnique<ColumnType, IndexType>::insert(const StringRefWrapper<ColumnType> & ref, IndexType value)
163 164 165 166
{
    if (!index)
        buildIndex();

167 168
    using IteratorType = typename IndexMapType::iterator;
    IteratorType it;
169 170 171 172 173 174 175 176 177 178
    bool inserted;
    index->emplace(ref, it, inserted);

    if (inserted)
        it->second = value;

    return it->second;
}

template <typename ColumnType, typename IndexType>
179
size_t ColumnUnique<ColumnType, IndexType>::uniqueInsert(const Field & x)
180 181 182 183 184
{
    if (x.getType() == Field::Types::Null)
        return getNullValueIndex();

    auto column = getRawColumnPtr();
185
    auto prev_size = static_cast<IndexType>(column->size());
186 187 188 189 190

    if ((*column)[getDefaultValueIndex()] == x)
        return getDefaultValueIndex();

    column->insert(x);
191
    auto pos = insert(StringRefWrapper<ColumnType>(column, prev_size), prev_size);
192 193 194 195 196 197 198
    if (pos != prev_size)
        column->popBack(1);

    return static_cast<size_t>(pos);
}

template <typename ColumnType, typename IndexType>
199
size_t ColumnUnique<ColumnType, IndexType>::uniqueInsertFrom(const IColumn & src, size_t n)
200 201 202 203 204 205
{
    auto ref = src.getDataAt(n);
    return uniqueInsertData(ref.data, ref.size);
}

template <typename ColumnType, typename IndexType>
206
size_t ColumnUnique<ColumnType, IndexType>::uniqueInsertData(const char * pos, size_t length)
207 208 209
{
    auto column = getRawColumnPtr();

210
    if (column->getDataAt(getDefaultValueIndex()) == StringRef(pos, length))
211 212
        return getDefaultValueIndex();

213
    auto size = static_cast<IndexType>(column->size());
214

215
    if (!index->has(StringRefWrapper<ColumnType>(StringRef(pos, length))))
216
    {
217 218
        column->insertData(pos, length);
        return static_cast<size_t>(insert(StringRefWrapper<ColumnType>(StringRef(pos, length)), size));
219 220 221 222 223 224
    }

    return size;
}

template <typename ColumnType, typename IndexType>
225
size_t ColumnUnique<ColumnType, IndexType>::uniqueInsertDataWithTerminatingZero(const char * pos, size_t length)
226 227
{
    if (std::is_same<ColumnType, ColumnString>::value)
228
        return uniqueInsertData(pos, length - 1);
229 230

    if (column_holder->valuesHaveFixedSize())
231
        return uniqueInsertData(pos, length);
232 233 234 235 236

    /// Don't know if data actually has terminating zero. So, insert it firstly.

    auto column = getRawColumnPtr();
    size_t prev_size = column->size();
237
    column->insertDataWithTerminatingZero(pos, length);
238 239 240 241 242 243 244

    if (column->compareAt(getDefaultValueIndex(), prev_size, *column, 1) == 0)
    {
        column->popBack(1);
        return getDefaultValueIndex();
    }

245 246
    auto position = insert(StringRefWrapper<ColumnType>(column, prev_size), prev_size);
    if (position != prev_size)
247 248
        column->popBack(1);

249
    return static_cast<size_t>(position);
250 251 252
}

template <typename ColumnType, typename IndexType>
253
size_t ColumnUnique<ColumnType, IndexType>::uniqueDeserializeAndInsertFromArena(const char * pos, const char *& new_pos)
254 255 256 257 258 259 260 261 262 263 264
{
    auto column = getRawColumnPtr();
    size_t prev_size = column->size();
    new_pos = column->deserializeAndInsertFromArena(pos);

    if (column->compareAt(getDefaultValueIndex(), prev_size, *column, 1) == 0)
    {
        column->popBack(1);
        return getDefaultValueIndex();
    }

265
    auto index_pos = insert(StringRefWrapper<ColumnType>(column, prev_size), prev_size);
266 267 268 269 270 271 272
    if (index_pos != prev_size)
        column->popBack(1);

    return static_cast<size_t>(index_pos);
}

template <typename ColumnType, typename IndexType>
273
ColumnPtr ColumnUnique<ColumnType, IndexType>::uniqueInsertRangeFrom(const IColumn & src, size_t start, size_t length)
274 275 276 277 278 279 280
{
    if (!index)
        buildIndex();

    const ColumnType * src_column;
    const NullMap * null_map = nullptr;

281
    if (src.isColumnNullable())
282 283 284 285 286 287 288 289 290 291
    {
        auto nullable_column = static_cast<const ColumnNullable *>(&src);
        src_column = static_cast<const ColumnType *>(&nullable_column->getNestedColumn());
        null_map = &nullable_column->getNullMapData();
    }
    else
        src_column = static_cast<const ColumnType *>(&src);

    auto column = getRawColumnPtr();
    auto positions_column = ColumnVector<IndexType>::create(length);
292
    auto & positions = positions_column->getData();
293 294 295 296 297 298 299 300 301 302 303 304

    size_t next_position = column->size();
    for (auto i : ext::range(0, length))
    {
        auto row = start + i;

        if (column->compareAt(getDefaultValueIndex(), row, *src_column, 1) == 0)
            positions[i] = getDefaultValueIndex();
        else if (null_map && (*null_map)[row])
            positions[i] = getNullValueIndex();
        else
        {
305
            auto it = index->find(StringRefWrapper<ColumnType>(src_column, row));
306 307 308
            if (it == index->end())
            {
                positions[i] = next_position;
309 310 311
                auto ref = src_column->getDataAt(row);
                column->insertData(ref.data, ref.size);
                (*index)[StringRefWrapper<ColumnType>(column, next_position)] = next_position;
312 313 314 315 316 317 318 319 320 321
                ++next_position;
            }
            else
                positions[i] = it->second;
        }
    }

    return positions_column;
}

322
}