ColumnUnique.h 13.2 KB
Newer Older
1
#pragma once
2 3
#include <Columns/IColumnUnique.h>
#include <Common/HashTable/HashMap.h>
4 5 6 7
#include <ext/range.h>
#include <Common/typeid_cast.h>
#include <Columns/ColumnVector.h>
#include <Columns/ColumnNullable.h>
8
#include <Columns/ColumnString.h>
N
Nikolai Kochetov 已提交
9
#include <DataTypes/DataTypeNullable.h>
10

11
class NullMap;
12 13 14 15 16 17 18 19 20 21 22 23 24


template <typename ColumnType>
struct StringRefWrapper
{
    const ColumnType * column = nullptr;
    size_t row = 0;

    StringRef ref;

    StringRefWrapper(const ColumnType * column, size_t row) : column(column), row(row) {}
    StringRefWrapper(StringRef ref) : ref(ref) {}
    StringRefWrapper(const StringRefWrapper & other) = default;
25 26
    StringRefWrapper & operator =(int) { column = nullptr; ref.data = nullptr; return *this; }
    bool operator ==(int) const { return nullptr == column && nullptr == ref.data; }
27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
    StringRefWrapper() {}

    operator StringRef() const { return column ? column->getDataAt(row) : ref; }

    bool operator==(const StringRefWrapper<ColumnType> & other) const
    {
        return (column && column == other.column && row == other.row) || StringRef(*this) == other;
    }

};

namespace ZeroTraits
{
    template <typename ColumnType>
    bool check(const StringRefWrapper<ColumnType> x) { return nullptr == x.column; }

    template <typename ColumnType>
    void set(StringRefWrapper<ColumnType> & x) { x.column = nullptr; }
};


48 49 50
namespace DB
{

51
template <typename ColumnType, typename IndexType>
52
class ColumnUnique final : public COWPtrHelper<IColumnUnique, ColumnUnique<ColumnType, IndexType>>
53
{
54
    friend class COWPtrHelper<IColumnUnique, ColumnUnique<ColumnType, IndexType>>;
55 56

private:
57
    explicit ColumnUnique(MutableColumnPtr && holder);
N
Nikolai Kochetov 已提交
58
    explicit ColumnUnique(const DataTypePtr & type);
59 60 61
    ColumnUnique(const ColumnUnique & other)
            : column_holder(other.column_holder), nullable_column(other.nullable_column)
            , nullable_column_map(other.nullable_column_map), is_nullable(other.is_nullable) {}
62

63
public:
N
Nikolai Kochetov 已提交
64
    const ColumnPtr & getNestedColumn() const override;
65 66 67 68 69 70
    size_t uniqueInsert(const Field & x) override;
    size_t uniqueInsertFrom(const IColumn & src, size_t n) override;
    ColumnPtr uniqueInsertRangeFrom(const IColumn & src, size_t start, size_t length) override;
    size_t uniqueInsertData(const char * pos, size_t length) override;
    size_t uniqueInsertDataWithTerminatingZero(const char * pos, size_t length) override;
    size_t uniqueDeserializeAndInsertFromArena(const char * pos, const char *& new_pos) override;
71

72 73 74 75
    size_t getDefaultValueIndex() const override { return is_nullable ? 1 : 0; }
    size_t getNullValueIndex() const override;
    bool canContainNulls() const override { return is_nullable; }

76 77 78
    Field operator[](size_t n) const override { return (*getNestedColumn())[n]; }
    void get(size_t n, Field & res) const override { getNestedColumn()->get(n, res); }
    StringRef getDataAt(size_t n) const override { return getNestedColumn()->getDataAt(n); }
N
Nikolai Kochetov 已提交
79 80
    StringRef getDataAtWithTerminatingZero(size_t n) const override
    {
81
        return getNestedColumn()->getDataAtWithTerminatingZero(n);
N
Nikolai Kochetov 已提交
82
    }
83 84 85
    UInt64 get64(size_t n) const override { return getNestedColumn()->get64(n); }
    UInt64 getUInt(size_t n) const override { return getNestedColumn()->getUInt(n); }
    Int64 getInt(size_t n) const override { return getNestedColumn()->getInt(n); }
86
    bool isNullAt(size_t n) const override { return is_nullable && n == getNullValueIndex(); }
N
Nikolai Kochetov 已提交
87 88 89 90 91 92
    StringRef serializeValueIntoArena(size_t n, Arena & arena, char const *& begin) const override
    {
        return column_holder->serializeValueIntoArena(n, arena, begin);
    }
    void updateHashWithValue(size_t n, SipHash & hash) const override
    {
93
        return getNestedColumn()->updateHashWithValue(n, hash);
N
Nikolai Kochetov 已提交
94
    }
95

N
Nikolai Kochetov 已提交
96 97
    int compareAt(size_t n, size_t m, const IColumn & rhs, int nan_direction_hint) const override
    {
98 99
        auto & column_unique = static_cast<const IColumnUnique&>(rhs);
        return getNestedColumn()->compareAt(n, m, *column_unique.getNestedColumn(), nan_direction_hint);
100
    }
101

102 103 104 105 106 107 108
    void getExtremes(Field & min, Field & max) const override { column_holder->getExtremes(min, max); }
    bool valuesHaveFixedSize() const override { return column_holder->valuesHaveFixedSize(); }
    bool isFixedAndContiguous() const override { return column_holder->isFixedAndContiguous(); }
    size_t sizeOfValueIfFixed() const override { return column_holder->sizeOfValueIfFixed(); }
    bool isNumeric() const override { return column_holder->isNumeric(); }

    size_t byteSize() const override { return column_holder->byteSize(); }
N
Nikolai Kochetov 已提交
109 110
    size_t allocatedBytes() const override
    {
111 112 113 114 115 116 117 118 119 120 121 122 123 124
        return column_holder->allocatedBytes()
               + (index ? index->getBufferSizeInBytes() : 0)
               + (nullable_column ? nullable_column->allocatedBytes() : 0);
    }
    void forEachSubcolumn(IColumn::ColumnCallback callback) override
    {
        callback(is_nullable ? nullable_column : column_holder);
        /// If column was mutated, we need to restore ptrs.
        if (is_nullable)
        {
            auto & column_nullable = static_cast<ColumnNullable &>(nullable_column->assumeMutableRef());
            column_holder = column_nullable.getNestedColumnPtr();
            nullable_column_map = &column_nullable.getNullMapData();
        }
N
Nikolai Kochetov 已提交
125
    }
126 127 128

private:

129
    using IndexMapType = HashMap<StringRefWrapper<ColumnType>, IndexType, StringRefHash>;
130

131
    ColumnPtr column_holder;
N
Nikolai Kochetov 已提交
132 133 134 135 136

    /// For DataTypeNullable, nullptr otherwise.
    ColumnPtr nullable_column;
    NullMap * nullable_column_map = nullptr;

137
    /// Lazy initialized.
138 139 140
    std::unique_ptr<IndexMapType> index;

    bool is_nullable;
141

142 143 144
    size_t numSpecialValues() const { return is_nullable ? 2 : 1; }

    void buildIndex();
145 146
    ColumnType * getRawColumnPtr() { return static_cast<ColumnType *>(column_holder->assumeMutable().get()); }
    const ColumnType * getRawColumnPtr() const { return static_cast<ColumnType *>(column_holder.get()); }
147
    IndexType insertIntoMap(const StringRefWrapper<ColumnType> & ref, IndexType value);
148 149 150

};

N
Nikolai Kochetov 已提交
151 152 153 154 155 156 157 158 159
template <typename ColumnType, typename IndexType>
ColumnUnique<ColumnType, IndexType>::ColumnUnique(const DataTypePtr & type) : is_nullable(type->isNullable())
{
    if (is_nullable)
    {
        nullable_column = type->createColumn()->cloneResized(numSpecialValues());
        auto & column_nullable = static_cast<ColumnNullable &>(nullable_column->assumeMutableRef());
        column_holder = column_nullable.getNestedColumnPtr();
        nullable_column_map = &column_nullable.getNullMapData();
160
        (*nullable_column_map)[getDefaultValueIndex()] = 0;
N
Nikolai Kochetov 已提交
161 162 163 164 165
    }
    else
        column_holder = type->createColumn()->cloneResized(numSpecialValues());
}

166
template <typename ColumnType, typename IndexType>
167
ColumnUnique<ColumnType, IndexType>::ColumnUnique(MutableColumnPtr && holder) : column_holder(std::move(holder))
168 169 170
{
    if (column_holder->isColumnNullable())
    {
N
Nikolai Kochetov 已提交
171 172 173 174
        nullable_column = std::move(column_holder);
        auto & column_nullable = static_cast<ColumnNullable &>(nullable_column->assumeMutableRef());
        column_holder = column_nullable.getNestedColumnPtr();
        nullable_column_map = &column_nullable.getNullMapData();
175 176 177 178
        is_nullable = true;
    }
}

N
Nikolai Kochetov 已提交
179 180 181 182 183 184 185 186 187 188 189
template <typename ColumnType, typename IndexType>
const ColumnPtr& ColumnUnique<ColumnType, IndexType>::getNestedColumn() const
{
    if (is_nullable)
    {
        nullable_column_map->resize_fill(column_holder->size());
        return nullable_column;
    }
    return column_holder;
}

190
template <typename ColumnType, typename IndexType>
191
size_t ColumnUnique<ColumnType, IndexType>::getNullValueIndex() const
192 193 194 195 196 197 198 199
{
    if (!is_nullable)
        throw Exception("ColumnUnique can't contain null values.");

    return 0;
}

template <typename ColumnType, typename IndexType>
200
void ColumnUnique<ColumnType, IndexType>::buildIndex()
201 202 203 204 205 206 207 208 209
{
    if (index)
        return;

    auto column = getRawColumnPtr();
    index = std::make_unique<IndexMapType>();

    for (auto row : ext::range(numSpecialValues(), column->size()))
    {
210
        (*index)[StringRefWrapper<ColumnType>(column, row)] = row;
211 212 213 214
    }
}

template <typename ColumnType, typename IndexType>
215
IndexType ColumnUnique<ColumnType, IndexType>::insertIntoMap(const StringRefWrapper<ColumnType> & ref, IndexType value)
216 217 218 219
{
    if (!index)
        buildIndex();

220 221
    using IteratorType = typename IndexMapType::iterator;
    IteratorType it;
222 223 224 225 226 227 228 229 230 231
    bool inserted;
    index->emplace(ref, it, inserted);

    if (inserted)
        it->second = value;

    return it->second;
}

template <typename ColumnType, typename IndexType>
232
size_t ColumnUnique<ColumnType, IndexType>::uniqueInsert(const Field & x)
233 234 235 236 237
{
    if (x.getType() == Field::Types::Null)
        return getNullValueIndex();

    auto column = getRawColumnPtr();
238
    auto prev_size = static_cast<IndexType>(column->size());
239 240 241 242 243

    if ((*column)[getDefaultValueIndex()] == x)
        return getDefaultValueIndex();

    column->insert(x);
244
    auto pos = insertIntoMap(StringRefWrapper<ColumnType>(column, prev_size), prev_size);
245 246 247
    if (pos != prev_size)
        column->popBack(1);

248
    return pos;
249 250 251
}

template <typename ColumnType, typename IndexType>
252
size_t ColumnUnique<ColumnType, IndexType>::uniqueInsertFrom(const IColumn & src, size_t n)
253
{
254 255 256
    if (is_nullable && src.isNullAt(n))
        return getNullValueIndex();

257 258 259 260 261
    auto ref = src.getDataAt(n);
    return uniqueInsertData(ref.data, ref.size);
}

template <typename ColumnType, typename IndexType>
262
size_t ColumnUnique<ColumnType, IndexType>::uniqueInsertData(const char * pos, size_t length)
263
{
264 265 266
    if (!index)
        buildIndex();

267 268
    auto column = getRawColumnPtr();

269
    if (column->getDataAt(getDefaultValueIndex()) == StringRef(pos, length))
270 271
        return getDefaultValueIndex();

272
    auto size = static_cast<IndexType>(column->size());
273
    auto iter = index->find(StringRefWrapper<ColumnType>(StringRef(pos, length)));
274

275
    if (iter == index->end())
276
    {
277
        column->insertData(pos, length);
278
        return insertIntoMap(StringRefWrapper<ColumnType>(column, size), size);
279 280
    }

281
    return iter->second;
282 283 284
}

template <typename ColumnType, typename IndexType>
285
size_t ColumnUnique<ColumnType, IndexType>::uniqueInsertDataWithTerminatingZero(const char * pos, size_t length)
286 287
{
    if (std::is_same<ColumnType, ColumnString>::value)
288
        return uniqueInsertData(pos, length - 1);
289 290

    if (column_holder->valuesHaveFixedSize())
291
        return uniqueInsertData(pos, length);
292 293 294 295 296

    /// Don't know if data actually has terminating zero. So, insert it firstly.

    auto column = getRawColumnPtr();
    size_t prev_size = column->size();
297
    column->insertDataWithTerminatingZero(pos, length);
298 299 300 301 302 303 304

    if (column->compareAt(getDefaultValueIndex(), prev_size, *column, 1) == 0)
    {
        column->popBack(1);
        return getDefaultValueIndex();
    }

305
    auto position = insertIntoMap(StringRefWrapper<ColumnType>(column, prev_size), prev_size);
306
    if (position != prev_size)
307 308
        column->popBack(1);

309
    return static_cast<size_t>(position);
310 311 312
}

template <typename ColumnType, typename IndexType>
313
size_t ColumnUnique<ColumnType, IndexType>::uniqueDeserializeAndInsertFromArena(const char * pos, const char *& new_pos)
314 315 316 317 318 319 320 321 322 323 324
{
    auto column = getRawColumnPtr();
    size_t prev_size = column->size();
    new_pos = column->deserializeAndInsertFromArena(pos);

    if (column->compareAt(getDefaultValueIndex(), prev_size, *column, 1) == 0)
    {
        column->popBack(1);
        return getDefaultValueIndex();
    }

325
    auto index_pos = insertIntoMap(StringRefWrapper<ColumnType>(column, prev_size), prev_size);
326 327 328 329 330 331 332
    if (index_pos != prev_size)
        column->popBack(1);

    return static_cast<size_t>(index_pos);
}

template <typename ColumnType, typename IndexType>
333
ColumnPtr ColumnUnique<ColumnType, IndexType>::uniqueInsertRangeFrom(const IColumn & src, size_t start, size_t length)
334 335 336 337 338 339 340
{
    if (!index)
        buildIndex();

    const ColumnType * src_column;
    const NullMap * null_map = nullptr;

341
    if (src.isColumnNullable())
342 343 344 345 346 347 348 349 350 351
    {
        auto nullable_column = static_cast<const ColumnNullable *>(&src);
        src_column = static_cast<const ColumnType *>(&nullable_column->getNestedColumn());
        null_map = &nullable_column->getNullMapData();
    }
    else
        src_column = static_cast<const ColumnType *>(&src);

    auto column = getRawColumnPtr();
    auto positions_column = ColumnVector<IndexType>::create(length);
352
    auto & positions = positions_column->getData();
353 354 355 356 357 358

    size_t next_position = column->size();
    for (auto i : ext::range(0, length))
    {
        auto row = start + i;

N
Nikolai Kochetov 已提交
359
        if (null_map && (*null_map)[row])
360
            positions[i] = getNullValueIndex();
N
Nikolai Kochetov 已提交
361 362
        else if (column->compareAt(getDefaultValueIndex(), row, *src_column, 1) == 0)
            positions[i] = getDefaultValueIndex();
363 364
        else
        {
365
            auto it = index->find(StringRefWrapper<ColumnType>(src_column, row));
366 367 368
            if (it == index->end())
            {
                positions[i] = next_position;
369 370 371
                auto ref = src_column->getDataAt(row);
                column->insertData(ref.data, ref.size);
                (*index)[StringRefWrapper<ColumnType>(column, next_position)] = next_position;
372 373 374 375 376 377 378 379 380 381
                ++next_position;
            }
            else
                positions[i] = it->second;
        }
    }

    return positions_column;
}

382
}