ColumnUnique.h 20.4 KB
Newer Older
1
#pragma once
2
#include <Columns/IColumnUnique.h>
3 4
#include <Columns/ReverseIndex.h>

5 6
#include <Columns/ColumnVector.h>
#include <Columns/ColumnNullable.h>
7
#include <Columns/ColumnString.h>
8
#include <Columns/ColumnFixedString.h>
9

N
Nikolai Kochetov 已提交
10
#include <DataTypes/DataTypeNullable.h>
11
#include <DataTypes/NumberTraits.h>
12

13 14
#include <Common/typeid_cast.h>
#include <ext/range.h>
15

A
Alexey Milovidov 已提交
16 17 18
#include <common/unaligned.h>


19 20 21
namespace DB
{

P
proller 已提交
22 23 24 25
/// Error codes thrown from this header. Both are declared here so the header does not
/// depend on another include happening to declare LOGICAL_ERROR first.
namespace ErrorCodes
{
    extern const int ILLEGAL_COLUMN;
    extern const int LOGICAL_ERROR;
}
26

27
template <typename ColumnType>
28
class ColumnUnique final : public COWHelper<IColumnUnique, ColumnUnique<ColumnType>>
29
{
30
    friend class COWHelper<IColumnUnique, ColumnUnique<ColumnType>>;
31 32

private:
33
    explicit ColumnUnique(MutableColumnPtr && holder, bool is_nullable);
34
    explicit ColumnUnique(const IDataType & type);
35
    ColumnUnique(const ColumnUnique & other);
36

37
public:
38 39 40
    MutableColumnPtr cloneEmpty() const override;

    const ColumnPtr & getNestedColumn() const override;
41
    const ColumnPtr & getNestedNotNullableColumn() const override { return column_holder; }
42
    bool nestedColumnIsNullable() const override { return is_nullable; }
43

44 45
    size_t uniqueInsert(const Field & x) override;
    size_t uniqueInsertFrom(const IColumn & src, size_t n) override;
46 47 48
    MutableColumnPtr uniqueInsertRangeFrom(const IColumn & src, size_t start, size_t length) override;
    IColumnUnique::IndexesWithOverflow uniqueInsertRangeWithOverflow(const IColumn & src, size_t start, size_t length,
                                                                     size_t max_dictionary_size) override;
49 50
    size_t uniqueInsertData(const char * pos, size_t length) override;
    size_t uniqueDeserializeAndInsertFromArena(const char * pos, const char *& new_pos) override;
51

52
    size_t getDefaultValueIndex() const override { return 0; }
53
    size_t getNullValueIndex() const override;
54
    size_t getNestedTypeDefaultValueIndex() const override { return is_nullable ? 1 : 0; }
55 56
    bool canContainNulls() const override { return is_nullable; }

57 58 59
    Field operator[](size_t n) const override { return (*getNestedColumn())[n]; }
    void get(size_t n, Field & res) const override { getNestedColumn()->get(n, res); }
    StringRef getDataAt(size_t n) const override { return getNestedColumn()->getDataAt(n); }
N
Nikolai Kochetov 已提交
60 61
    StringRef getDataAtWithTerminatingZero(size_t n) const override
    {
62
        return getNestedColumn()->getDataAtWithTerminatingZero(n);
N
Nikolai Kochetov 已提交
63
    }
64 65 66
    UInt64 get64(size_t n) const override { return getNestedColumn()->get64(n); }
    UInt64 getUInt(size_t n) const override { return getNestedColumn()->getUInt(n); }
    Int64 getInt(size_t n) const override { return getNestedColumn()->getInt(n); }
67
    bool isNullAt(size_t n) const override { return is_nullable && n == getNullValueIndex(); }
68
    StringRef serializeValueIntoArena(size_t n, Arena & arena, char const *& begin) const override;
69
    void updateHashWithValue(size_t n, SipHash & hash_func) const override
N
Nikolai Kochetov 已提交
70
    {
71
        return getNestedColumn()->updateHashWithValue(n, hash_func);
N
Nikolai Kochetov 已提交
72
    }
73

74
    int compareAt(size_t n, size_t m, const IColumn & rhs, int nan_direction_hint) const override;
75

76 77 78 79 80 81 82
    void getExtremes(Field & min, Field & max) const override { column_holder->getExtremes(min, max); }
    bool valuesHaveFixedSize() const override { return column_holder->valuesHaveFixedSize(); }
    bool isFixedAndContiguous() const override { return column_holder->isFixedAndContiguous(); }
    size_t sizeOfValueIfFixed() const override { return column_holder->sizeOfValueIfFixed(); }
    bool isNumeric() const override { return column_holder->isNumeric(); }

    size_t byteSize() const override { return column_holder->byteSize(); }
83
    void protect() override { column_holder->protect(); }
N
Nikolai Kochetov 已提交
84 85
    size_t allocatedBytes() const override
    {
86
        return column_holder->allocatedBytes()
87
               + index.allocatedBytes()
88
               + (nested_null_mask ? nested_null_mask->allocatedBytes() : 0);
89 90 91
    }
    void forEachSubcolumn(IColumn::ColumnCallback callback) override
    {
92
        callback(column_holder);
93
        index.setColumn(getRawColumnPtr());
94 95
        if (is_nullable)
            nested_column_nullable = ColumnNullable::create(column_holder, nested_null_mask);
N
Nikolai Kochetov 已提交
96
    }
97

98 99 100 101 102 103 104
    bool structureEquals(const IColumn & rhs) const override
    {
        if (auto rhs_concrete = typeid_cast<const ColumnUnique *>(&rhs))
            return column_holder->structureEquals(*rhs_concrete->column_holder);
        return false;
    }

105 106
    const UInt64 * tryGetSavedHash() const override { return index.tryGetSavedHash(); }

107 108
    UInt128 getHash() const override { return hash.getHash(*getRawColumnPtr()); }

109 110
private:

111
    IColumn::WrappedPtr column_holder;
112
    bool is_nullable;
113
    size_t size_of_value_if_fixed = 0;
114
    ReverseIndex<UInt64, ColumnType> index;
N
Nikolai Kochetov 已提交
115

116
    /// For DataTypeNullable, stores null map.
117 118
    IColumn::WrappedPtr nested_null_mask;
    IColumn::WrappedPtr nested_column_nullable;
N
Nikolai Kochetov 已提交
119

120 121 122 123 124 125 126 127 128 129 130 131 132 133 134
    class IncrementalHash
    {
    private:
        UInt128 hash;
        std::atomic<size_t> num_added_rows;

        std::mutex mutex;
    public:
        IncrementalHash() : num_added_rows(0) {}

        UInt128 getHash(const ColumnType & column);
    };

    mutable IncrementalHash hash;

135
    void createNullMask();
136 137
    void updateNullMask();

138 139
    static size_t numSpecialValues(bool is_nullable) { return is_nullable ? 2 : 1; }
    size_t numSpecialValues() const { return numSpecialValues(is_nullable); }
140

141
    ColumnType * getRawColumnPtr() { return static_cast<ColumnType *>(column_holder.get()); }
142
    const ColumnType * getRawColumnPtr() const { return static_cast<const ColumnType *>(column_holder.get()); }
143

144 145
    template <typename IndexType>
    MutableColumnPtr uniqueInsertRangeImpl(
146 147 148
        const IColumn & src,
        size_t start,
        size_t length,
149
        size_t num_added_rows,
150
        typename ColumnVector<IndexType>::MutablePtr && positions_column,
151
        ReverseIndex<UInt64, ColumnType> * secondary_index,
152
        size_t max_dictionary_size);
153 154
};

155 156 157 158 159 160
/// Creates a ColumnUnique with the same nullability whose holder is a copy of this holder
/// resized to numSpecialValues() rows.
template <typename ColumnType>
MutableColumnPtr ColumnUnique<ColumnType>::cloneEmpty() const
{
    auto special_values_only = column_holder->cloneResized(numSpecialValues());
    return ColumnUnique<ColumnType>::create(std::move(special_values_only), is_nullable);
}

161
template <typename ColumnType>
162 163 164
ColumnUnique<ColumnType>::ColumnUnique(const ColumnUnique & other)
    : column_holder(other.column_holder)
    , is_nullable(other.is_nullable)
165
    , size_of_value_if_fixed (other.size_of_value_if_fixed)
166 167 168
    , index(numSpecialValues(is_nullable), 0)
{
    index.setColumn(getRawColumnPtr());
169
    createNullMask();
170 171 172 173 174 175
}

/// Builds a dictionary for `type`. For a nullable type the holder is created from the nested
/// (non-nullable) type; in both cases it starts with numSpecialValues() rows.
template <typename ColumnType>
ColumnUnique<ColumnType>::ColumnUnique(const IDataType & type)
    : is_nullable(type.isNullable())
    , index(numSpecialValues(is_nullable), 0)
{
    /// Pick the type the holder column is created from.
    const IDataType * holder_type = &type;
    if (is_nullable)
        holder_type = &*static_cast<const DataTypeNullable &>(type).getNestedType();

    column_holder = holder_type->createColumn()->cloneResized(numSpecialValues());
    index.setColumn(getRawColumnPtr());
    createNullMask();

    /// Remember the value width so fixed-size values can be inserted from raw memory.
    const bool fixed_size = column_holder->valuesHaveFixedSize();
    if (fixed_size)
        size_of_value_if_fixed = column_holder->sizeOfValueIfFixed();
}

186 187
/// Takes ownership of an existing holder column.
/// Throws ILLEGAL_COLUMN if the holder has fewer rows than the reserved special values
/// or if the holder itself is a nullable column.
template <typename ColumnType>
ColumnUnique<ColumnType>::ColumnUnique(MutableColumnPtr && holder, bool is_nullable)
    : column_holder(std::move(holder))
    , is_nullable(is_nullable)
    , index(numSpecialValues(is_nullable), 0)
{
    const size_t reserved_rows = numSpecialValues();
    if (column_holder->size() < reserved_rows)
        throw Exception("Too small holder column for ColumnUnique.", ErrorCodes::ILLEGAL_COLUMN);

    /// Nullability is tracked by this class through a separate null mask.
    if (column_holder->isColumnNullable())
        throw Exception("Holder column for ColumnUnique can't be nullable.", ErrorCodes::ILLEGAL_COLUMN);

    index.setColumn(getRawColumnPtr());
    createNullMask();

    const bool fixed_size = column_holder->valuesHaveFixedSize();
    if (fixed_size)
        size_of_value_if_fixed = column_holder->sizeOfValueIfFixed();
}

204
template <typename ColumnType>
205
void ColumnUnique<ColumnType>::createNullMask()
N
Nikolai Kochetov 已提交
206 207 208
{
    if (is_nullable)
    {
209
        size_t size = getRawColumnPtr()->size();
210
        if (!nested_null_mask)
211 212 213
        {
            ColumnUInt8::MutablePtr null_mask = ColumnUInt8::create(size, UInt8(0));
            null_mask->getData()[getNullValueIndex()] = 1;
214 215
            nested_null_mask = std::move(null_mask);
            nested_column_nullable = ColumnNullable::create(column_holder, nested_null_mask);
216
        }
217 218 219 220 221 222 223 224 225 226 227 228 229 230
        else
            throw Exception("Null mask for ColumnUnique is already created.", ErrorCodes::LOGICAL_ERROR);
    }
}

/// For nullable columns: brings the null mask to the current size of the holder column
/// (called after values were inserted). Does nothing for non-nullable columns.
/// Throws LOGICAL_ERROR if the mask has not been created yet.
template <typename ColumnType>
void ColumnUnique<ColumnType>::updateNullMask()
{
    if (!is_nullable)
        return;

    if (!nested_null_mask)
        throw Exception("Null mask for ColumnUnique was not created.", ErrorCodes::LOGICAL_ERROR);

    const size_t size = getRawColumnPtr()->size();

    /// Only touch the mask when the holder size actually changed.
    if (nested_null_mask->size() != size)
        static_cast<ColumnUInt8 &>(*nested_null_mask).getData().resize_fill(size);
}

/// Returns the plain holder for non-nullable columns and the nullable wrapper otherwise.
template <typename ColumnType>
const ColumnPtr & ColumnUnique<ColumnType>::getNestedColumn() const
{
    if (!is_nullable)
        return column_holder;

    return nested_column_nullable;
}

246 247
/// Position of NULL in the dictionary: always 0 for a nullable column.
/// Throws LOGICAL_ERROR when the column is not nullable.
template <typename ColumnType>
size_t ColumnUnique<ColumnType>::getNullValueIndex() const
{
    if (is_nullable)
        return 0;

    throw Exception("ColumnUnique can't contain null values.", ErrorCodes::LOGICAL_ERROR);
}

255 256
/// Inserts the value held by a Field and returns its position.
/// A Null field maps to the NULL position; fixed-size values are inserted from the Field's
/// raw storage; everything else is read from the Field as a String.
template <typename ColumnType>
size_t ColumnUnique<ColumnType>::uniqueInsert(const Field & x)
{
    const bool field_is_null = (x.getType() == Field::Types::Null);
    if (field_is_null)
        return getNullValueIndex();

    if (size_of_value_if_fixed)
    {
        const char * raw_value = &x.get<char>();
        return uniqueInsertData(raw_value, size_of_value_if_fixed);
    }

    const auto & str = x.get<String>();
    return uniqueInsertData(str.data(), str.size());
}

268 269
/// Inserts row `n` of `src` and returns its position.
/// NULL rows map to the NULL position (nullable columns only); a nullable source is unwrapped
/// and its nested column is used instead.
template <typename ColumnType>
size_t ColumnUnique<ColumnType>::uniqueInsertFrom(const IColumn & src, size_t n)
{
    const bool row_is_null = is_nullable && src.isNullAt(n);
    if (row_is_null)
        return getNullValueIndex();

    auto * nullable = getNullableColumn(src);
    if (nullable)
        return uniqueInsertFrom(nullable->getNestedColumn(), n);

    const auto value = src.getDataAt(n);
    return uniqueInsertData(value.data, value.size);
}

281 282
/// Inserts the value given as raw bytes [pos, pos + length) and returns its position.
/// A value equal to the stored default of the nested type is never put into the index:
/// the reserved default position is returned instead.
template <typename ColumnType>
size_t ColumnUnique<ColumnType>::uniqueInsertData(const char * pos, size_t length)
{
    auto holder = getRawColumnPtr();
    const StringRef value(pos, length);

    if (holder->getDataAt(getNestedTypeDefaultValueIndex()) == value)
        return getNestedTypeDefaultValueIndex();

    auto position = index.insert(value);

    /// The holder may have grown by one row; keep the null mask in sync.
    updateNullMask();

    return position;
}

296 297 298 299 300 301 302 303 304 305 306 307 308 309
/// Serializes value `n` into the arena, continuing the region that starts at `begin`.
/// For nullable columns a one-byte flag is written first (1 for NULL, 0 otherwise) and the
/// nested value follows only when the value is not NULL.
/// Returns a reference to the bytes written by this call.
template <typename ColumnType>
StringRef ColumnUnique<ColumnType>::serializeValueIntoArena(size_t n, Arena & arena, char const *& begin) const
{
    if (is_nullable)
    {
        static constexpr auto flag_size = sizeof(UInt8);

        const bool value_is_null = (n == getNullValueIndex());
        const UInt8 flag = value_is_null ? 1 : 0;

        auto pos = arena.allocContinue(flag_size, begin);
        memcpy(pos, &flag, flag_size);

        if (value_is_null)
            return StringRef(pos, flag_size);

        auto nested_ref = column_holder->serializeValueIntoArena(n, arena, begin);

        /// The nested call continues the same region and may relocate it, which would leave
        /// `pos` pointing into the old memory. The flag byte sits immediately before the
        /// nested data, so compute the start of our output from the nested result instead.
        return StringRef(nested_ref.data - flag_size, nested_ref.size + flag_size);
    }

    return column_holder->serializeValueIntoArena(n, arena, begin);
}

319 320
/// Reads one serialized value starting at `pos`, inserts it and returns its position.
/// `new_pos` is set to the first byte after the consumed data.
/// Layout read here: [null flag byte, nullable columns only] then either a fixed-size value
/// or a size_t length followed by that many bytes.
template <typename ColumnType>
size_t ColumnUnique<ColumnType>::uniqueDeserializeAndInsertFromArena(const char * pos, const char *& new_pos)
{
    if (is_nullable)
    {
        const UInt8 null_flag = *reinterpret_cast<const UInt8 *>(pos);
        pos += sizeof(null_flag);

        /// A NULL has no payload after the flag.
        if (null_flag)
        {
            new_pos = pos;
            return getNullValueIndex();
        }
    }

    /// Numbers, FixedString
    if (size_of_value_if_fixed)
    {
        new_pos = pos + size_of_value_if_fixed;
        return uniqueInsertData(pos, size_of_value_if_fixed);
    }

    /// String
    const size_t serialized_size = unalignedLoad<size_t>(pos);
    pos += sizeof(serialized_size);
    new_pos = pos + serialized_size;

    /// -1 because of terminating zero
    const size_t payload_size = serialized_size - 1;
    return uniqueInsertData(pos, payload_size);
}

350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371
/// Compares value `n` of this dictionary with value `m` of `rhs` (which is treated as an
/// IColumnUnique). For nullable columns NULL positions are handled first, using
/// nan_direction_hint to order NULL against non-NULL; otherwise nested columns are compared.
template <typename ColumnType>
int ColumnUnique<ColumnType>::compareAt(size_t n, size_t m, const IColumn & rhs, int nan_direction_hint) const
{
    if (is_nullable)
    {
        /// See ColumnNullable::compareAt
        const size_t null_index = getNullValueIndex();
        const bool left_is_null = (n == null_index);
        const bool right_is_null = (m == null_index);

        if (unlikely(left_is_null || right_is_null))
        {
            if (left_is_null && right_is_null)
                return 0;

            if (left_is_null)
                return nan_direction_hint;

            return -nan_direction_hint;
        }
    }

    const auto & rhs_unique = static_cast<const IColumnUnique &>(rhs);
    const auto & rhs_nested = *rhs_unique.getNestedColumn();
    return getNestedColumn()->compareAt(n, m, rhs_nested, nan_direction_hint);
}

372 373 374 375 376 377 378 379 380 381 382 383 384 385 386
/// Verifies that every index in `indexes` is strictly less than `max_dictionary_size`.
/// Throws LOGICAL_ERROR naming the first offending index and its position.
template <typename IndexType>
static void checkIndexes(const ColumnVector<IndexType> & indexes, size_t max_dictionary_size)
{
    auto & data = indexes.getData();
    for (size_t i = 0; i < data.size(); ++i)
    {
        if (data[i] >= max_dictionary_size)
        {
            throw Exception("Found index " + toString(data[i]) + " at position " + toString(i)
                            + " which is greater than or equal to dictionary size " + toString(max_dictionary_size),
                            ErrorCodes::LOGICAL_ERROR);
        }
    }
}

387 388 389
/// Inserts rows [start + num_added_rows, start + length) of `src` and writes the position of
/// each row into `positions_column` (element type IndexType).
///
/// - NULL rows of a nullable source get the NULL position; rows equal to the stored default of
///   the nested type get the reserved default position; neither touches an index.
/// - When `secondary_index` is given and the next free position has reached
///   `max_dictionary_size`, keys missing from the main index are inserted into the secondary one.
/// - When the next free position no longer fits into IndexType, the positions written so far are
///   copied into a column of the next wider index type and the function continues recursively
///   from the current row; the wider column is what gets returned.
///
/// Throws ILLEGAL_COLUMN if `src` (after unwrapping Nullable) is not a ColumnType.
template <typename ColumnType>
template <typename IndexType>
MutableColumnPtr ColumnUnique<ColumnType>::uniqueInsertRangeImpl(
    const IColumn & src,
    size_t start,
    size_t length,
    size_t num_added_rows,
    typename ColumnVector<IndexType>::MutablePtr && positions_column,
    ReverseIndex<UInt64, ColumnType> * secondary_index,
    size_t max_dictionary_size)
{
    auto & positions = positions_column->getData();

    /// Resolve the concrete source column and, for a nullable source, its null map.
    const ColumnType * src_column = nullptr;
    const NullMap * null_map = nullptr;

    if (auto * nullable_column = getNullableColumn(src))
    {
        src_column = typeid_cast<const ColumnType *>(&nullable_column->getNestedColumn());
        null_map = &nullable_column->getNullMapData();
    }
    else
        src_column = typeid_cast<const ColumnType *>(&src);

    if (src_column == nullptr)
        throw Exception("Invalid column type for ColumnUnique::insertRangeFrom. Expected " + column_holder->getName() +
                        ", got " + src.getName(), ErrorCodes::ILLEGAL_COLUMN);

    auto column = getRawColumnPtr();

    /// Position the next newly inserted key is expected to receive.
    UInt64 next_position = column->size();
    if (secondary_index)
        next_position += secondary_index->size();

    /// Advances the counter after a new key was added. Returns nullptr while IndexType is still
    /// wide enough; otherwise returns the result of continuing with a wider index type.
    auto advance_position = [&](UInt64 & position_counter) -> MutableColumnPtr
    {
        constexpr auto next_size = NumberTraits::nextSize(sizeof(IndexType));
        using SuperiorIndexType = typename NumberTraits::Construct<false, false, next_size>::Type;

        ++position_counter;

        if (position_counter <= std::numeric_limits<IndexType>::max())
            return nullptr;

        if (sizeof(SuperiorIndexType) == sizeof(IndexType))
            throw Exception("Can't find superior index type for type " + demangle(typeid(IndexType).name()),
                            ErrorCodes::LOGICAL_ERROR);

        /// Carry over what has been written so far into the wider positions column.
        auto wider_column = ColumnVector<SuperiorIndexType>::create(length);
        auto & wider_data = wider_column->getData();
        for (size_t i = 0; i < num_added_rows; ++i)
            wider_data[i] = positions[i];

        return uniqueInsertRangeImpl<SuperiorIndexType>(
                src,
                start,
                length,
                num_added_rows,
                std::move(wider_column),
                secondary_index,
                max_dictionary_size);
    };

    /// Inserts a key into the given index and records its position for the current row.
    /// A position equal to next_position means the key was new.
    auto insert_into = [&](const StringRef & key, ReverseIndex<UInt64, ColumnType> & target_index) -> MutableColumnPtr
    {
        auto inserted_pos = target_index.insert(key);
        positions[num_added_rows] = inserted_pos;

        if (inserted_pos != next_position)
            return nullptr;

        return advance_position(next_position);
    };

    for (; num_added_rows < length; ++num_added_rows)
    {
        auto row = start + num_added_rows;

        if (null_map && (*null_map)[row])
        {
            positions[num_added_rows] = getNullValueIndex();
            continue;
        }

        if (column->compareAt(getNestedTypeDefaultValueIndex(), row, *src_column, 1) == 0)
        {
            positions[num_added_rows] = getNestedTypeDefaultValueIndex();
            continue;
        }

        auto key = src_column->getDataAt(row);
        MutableColumnPtr widened_result = nullptr;

        const bool main_dictionary_is_full = secondary_index && next_position >= max_dictionary_size;
        if (main_dictionary_is_full)
        {
            /// Only look the key up in the main index; new keys go to the secondary index.
            auto insertion_point = index.getInsertionPoint(key);
            if (insertion_point == index.lastInsertionPoint())
                widened_result = insert_into(key, *secondary_index);
            else
                positions[num_added_rows] = insertion_point;
        }
        else
            widened_result = insert_into(key, index);

        /// Non-null means the remaining rows were already processed with a wider index type.
        if (widened_result)
            return widened_result;
    }

    // checkIndexes(*positions_column, column->size() + (overflowed_keys ? overflowed_keys->size() : 0));
    return std::move(positions_column);
}

494 495
/// Inserts rows [start, start + length) of `src` and returns a column with the position of
/// every inserted row. Index types UInt8, UInt16, UInt32 and UInt64 are tried in that order;
/// a type is used when the current holder size does not exceed its maximum.
template <typename ColumnType>
MutableColumnPtr ColumnUnique<ColumnType>::uniqueInsertRangeFrom(const IColumn & src, size_t start, size_t length)
{
    /// Returns nullptr when the index type of `x` is too narrow for the current holder size.
    auto try_index_type = [this, &src, start, length](auto x) -> MutableColumnPtr
    {
        using IndexType = decltype(x);

        const size_t holder_size = getRawColumnPtr()->size();
        if (holder_size > std::numeric_limits<IndexType>::max())
            return nullptr;

        auto positions = ColumnVector<IndexType>::create(length);
        return this->uniqueInsertRangeImpl<IndexType>(src, start, length, 0, std::move(positions), nullptr, 0);
    };

    MutableColumnPtr result = try_index_type(UInt8());
    if (!result)
        result = try_index_type(UInt16());
    if (!result)
        result = try_index_type(UInt32());
    if (!result)
        result = try_index_type(UInt64());
    if (!result)
        throw Exception("Can't find index type for ColumnUnique", ErrorCodes::LOGICAL_ERROR);

    /// New keys may have been appended to the holder.
    updateNullMask();

    return result;
}

528 529
/// Same as uniqueInsertRangeFrom, but with a secondary index over a separate, initially empty
/// keys column: uniqueInsertRangeImpl routes new keys there once the dictionary has reached
/// `max_dictionary_size`. Returns both the positions column and the overflowed keys column.
/// Throws LOGICAL_ERROR if the empty clone of the holder is not a ColumnType.
template <typename ColumnType>
IColumnUnique::IndexesWithOverflow ColumnUnique<ColumnType>::uniqueInsertRangeWithOverflow(
    const IColumn & src,
    size_t start,
    size_t length,
    size_t max_dictionary_size)
{
    auto overflowed_keys = column_holder->cloneEmpty();
    auto overflowed_keys_ptr = typeid_cast<ColumnType *>(overflowed_keys.get());
    if (!overflowed_keys_ptr)
        throw Exception("Invalid keys type for ColumnUnique.", ErrorCodes::LOGICAL_ERROR);

    /// Returns nullptr when the index type of `x` is too narrow for the current holder size.
    auto try_index_type = [this, &src, start, length, overflowed_keys_ptr, max_dictionary_size](auto x) -> MutableColumnPtr
    {
        using IndexType = decltype(x);

        const size_t holder_size = getRawColumnPtr()->size();
        if (holder_size > std::numeric_limits<IndexType>::max())
            return nullptr;

        auto positions = ColumnVector<IndexType>::create(length);

        ReverseIndex<UInt64, ColumnType> secondary_index(0, max_dictionary_size);
        secondary_index.setColumn(overflowed_keys_ptr);

        return this->uniqueInsertRangeImpl<IndexType>(src, start, length, 0, std::move(positions),
                                                      &secondary_index, max_dictionary_size);
    };

    MutableColumnPtr positions_result = try_index_type(UInt8());
    if (!positions_result)
        positions_result = try_index_type(UInt16());
    if (!positions_result)
        positions_result = try_index_type(UInt32());
    if (!positions_result)
        positions_result = try_index_type(UInt64());
    if (!positions_result)
        throw Exception("Can't find index type for ColumnUnique", ErrorCodes::LOGICAL_ERROR);

    /// New keys may have been appended to the holder.
    updateNullMask();

    IColumnUnique::IndexesWithOverflow result;
    result.indexes = std::move(positions_result);
    result.overflowed_keys = std::move(overflowed_keys);
    return result;
}

577 578 579 580
/// Returns the hash of all values of `column`.
/// The stored hash is reused when the column size equals the size recorded at the previous
/// computation; otherwise every row is hashed again (outside the lock) and the stored hash
/// and recorded size are replaced under the mutex.
template <typename ColumnType>
UInt128 ColumnUnique<ColumnType>::IncrementalHash::getHash(const ColumnType & column)
{
    const size_t rows = column.size();

    if (rows == num_added_rows.load())
    {
        std::lock_guard lock(mutex);
        return hash;
    }

    SipHash hasher;
    for (size_t row = 0; row < rows; ++row)
        column.updateHashWithValue(row, hasher);

    std::lock_guard lock(mutex);
    hasher.get128(hash.low, hash.high);
    num_added_rows.store(rows);
    return hash;
}

603
}