IColumn.h 19.3 KB
Newer Older
1
#pragma once
A
Alexey Milovidov 已提交
2

3
#include <Core/Field.h>
4
#include <Common/COW.h>
5
#include <Common/PODArray.h>
6
#include <Common/Exception.h>
C
chertus 已提交
7
#include <Common/typeid_cast.h>
P
proller 已提交
8
#include <common/StringRef.h>
A
Alexey Milovidov 已提交
9

10

11 12 13
class SipHash;


A
Alexey Milovidov 已提交
14 15 16
namespace DB
{

17 18
namespace ErrorCodes
{
19 20 21
    extern const int CANNOT_GET_SIZE_OF_FIELD;
    extern const int NOT_IMPLEMENTED;
    extern const int SIZES_OF_COLUMNS_DOESNT_MATCH;
22 23
}

24
class Arena;
25
class ColumnGathererStream;
26

27
/// Declares interface to store columns in memory.
28
class IColumn : public COW<IColumn>
A
Alexey Milovidov 已提交
29
{
30
private:
31
    friend class COW<IColumn>;
32 33

    /// Creates the same column with the same data.
34
    /// This is internal method to use from COW.
A
Alexey Milovidov 已提交
35 36
    /// It performs shallow copy with copy-ctor and not useful from outside.
    /// If you want to copy column for modification, look at 'mutate' method.
37
    virtual MutablePtr clone() const = 0;
38

39
public:
40
    /// Name of a Column. It is used in info messages.
41
    virtual std::string getName() const { return getFamilyName(); }
42 43 44

    /// Name of a Column kind, without parameters (example: FixedString, Array).
    virtual const char * getFamilyName() const = 0;
45 46

    /** If column isn't constant, returns nullptr (or itself).
M
maiha 已提交
47
      * If column is constant, transforms constant to full column (if column type allows such transform) and return it.
48
      */
49
    virtual Ptr convertToFullColumnIfConst() const { return getPtr(); }
50

51 52 53
    /// If column isn't ColumnLowCardinality, return itself.
    /// If column is ColumnLowCardinality, transforms is to full column.
    virtual Ptr convertToFullColumnIfLowCardinality() const { return getPtr(); }
54

55
    /// Creates empty column with the same type.
56
    virtual MutablePtr cloneEmpty() const { return cloneResized(0); }
57 58 59 60

    /// Creates column with the same type and specified size.
    /// If size is less current size, then data is cut.
    /// If size is greater, than default values are appended.
61
    virtual MutablePtr cloneResized(size_t /*size*/) const { throw Exception("Cannot cloneResized() column " + getName(), ErrorCodes::NOT_IMPLEMENTED); }
62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87

    /// Returns number of values in column.
    virtual size_t size() const = 0;

    /// There are no values in columns.
    bool empty() const { return size() == 0; }

    /// Returns value of n-th element in universal Field representation.
    /// Is used in rare cases, since creation of Field instance is expensive usually.
    virtual Field operator[](size_t n) const = 0;

    /// Like the previous one, but avoids extra copying if Field is in a container, for example.
    virtual void get(size_t n, Field & res) const = 0;

    /// If possible, returns pointer to memory chunk which contains n-th element (if it isn't possible, throws an exception)
    /// Is used to optimize some computations (in aggregation, for example).
    virtual StringRef getDataAt(size_t n) const = 0;

    /// Like getData, but has special behavior for columns that contain variable-length strings.
    /// Returns zero-ending memory chunk (i.e. its size is 1 byte longer).
    virtual StringRef getDataAtWithTerminatingZero(size_t n) const
    {
        return getDataAt(n);
    }

    /// If column stores integers, it returns n-th element transformed to UInt64 using static_cast.
88
    /// If column stores floating point numbers, bits of n-th elements are copied to lower bits of UInt64, the remaining bits are zeros.
89
    /// Is used to optimize some computations (in aggregation, for example).
A
Alexey Milovidov 已提交
90
    virtual UInt64 get64(size_t /*n*/) const
91 92 93 94
    {
        throw Exception("Method get64 is not supported for " + getName(), ErrorCodes::NOT_IMPLEMENTED);
    }

95 96 97 98 99 100 101
    /// If column stores native numeric type, it returns n-th element casted to Float64
    /// Is used in regression methods to cast each features into uniform type
    virtual Float64 getFloat64(size_t /*n*/) const
    {
        throw Exception("Method getFloat64 is not supported for " + getName(), ErrorCodes::NOT_IMPLEMENTED);
    }

102
    /** If column is numeric, return value of n-th element, casted to UInt64.
A
Alexey Milovidov 已提交
103
      * For NULL values of Nullable column it is allowed to return arbitrary value.
104 105
      * Otherwise throw an exception.
      */
A
Alexey Milovidov 已提交
106
    virtual UInt64 getUInt(size_t /*n*/) const
107 108 109 110
    {
        throw Exception("Method getUInt is not supported for " + getName(), ErrorCodes::NOT_IMPLEMENTED);
    }

A
Alexey Milovidov 已提交
111
    virtual Int64 getInt(size_t /*n*/) const
112 113 114 115
    {
        throw Exception("Method getInt is not supported for " + getName(), ErrorCodes::NOT_IMPLEMENTED);
    }

116
    virtual bool isDefaultAt(size_t n) const { return get64(n) == 0; }
A
Alexey Milovidov 已提交
117 118
    virtual bool isNullAt(size_t /*n*/) const { return false; }

A
Alexey Milovidov 已提交
119 120 121 122 123 124 125 126 127
    /** If column is numeric, return value of n-th element, casted to bool.
      * For NULL values of Nullable column returns false.
      * Otherwise throw an exception.
      */
    virtual bool getBool(size_t /*n*/) const
    {
        throw Exception("Method getBool is not supported for " + getName(), ErrorCodes::NOT_IMPLEMENTED);
    }

128 129
    /// Removes all elements outside of specified range.
    /// Is used in LIMIT operation, for example.
130
    virtual Ptr cut(size_t start, size_t length) const
131
    {
132
        MutablePtr res = cloneEmpty();
133
        res->insertRangeFrom(*this, start, length);
134
        return res;
135 136 137 138 139 140 141 142 143 144
    }

    /// Appends new value at the end of column (column's size is increased by 1).
    /// Is used to transform raw strings to Blocks (for example, inside input format parsers)
    virtual void insert(const Field & x) = 0;

    /// Appends n-th element from other column with the same type.
    /// Is used in merge-sort and merges. It could be implemented in inherited classes more optimally than default implementation.
    virtual void insertFrom(const IColumn & src, size_t n) { insert(src[n]); }

145
    /// Appends range of elements from other column with the same type.
146 147 148 149 150
    /// Could be used to concatenate columns.
    virtual void insertRangeFrom(const IColumn & src, size_t start, size_t length) = 0;

    /// Appends data located in specified memory chunk if it is possible (throws an exception if it cannot be implemented).
    /// Is used to optimize some computations (in aggregation, for example).
151
    /// Parameter length could be ignored if column values have fixed size.
A
alesapin 已提交
152
    /// All data will be inserted as single element
153 154 155 156 157 158 159 160
    virtual void insertData(const char * pos, size_t length) = 0;

    /// Appends "default value".
    /// Is used when there are need to increase column size, but inserting value doesn't make sense.
    /// For example, ColumnNullable(Nested) absolutely ignores values of nested column if it is marked as NULL.
    virtual void insertDefault() = 0;

    /** Removes last n elements.
M
maiha 已提交
161
      * Is used to support exception-safety of several operations.
162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179
      *  For example, sometimes insertion should be reverted if we catch an exception during operation processing.
      * If column has less than n elements or n == 0 - undefined behavior.
      */
    virtual void popBack(size_t n) = 0;

    /** Serializes n-th element. Serialized element should be placed continuously inside Arena's memory.
      * Serialized value can be deserialized to reconstruct original object. Is used in aggregation.
      * The method is similar to getDataAt(), but can work when element's value cannot be mapped to existing continuous memory chunk,
      *  For example, to obtain unambiguous representation of Array of strings, strings data should be interleaved with their sizes.
      * Parameter begin should be used with Arena::allocContinue.
      */
    virtual StringRef serializeValueIntoArena(size_t n, Arena & arena, char const *& begin) const = 0;

    /// Deserializes a value that was serialized using IColumn::serializeValueIntoArena method.
    /// Returns pointer to the position after the read data.
    virtual const char * deserializeAndInsertFromArena(const char * pos) = 0;

    /// Update state of hash function with value of n-th element.
A
Alexey Milovidov 已提交
180
    /// On subsequent calls of this method for sequence of column values of arbitrary types,
181 182 183 184 185 186 187 188 189 190
    ///  passed bytes to hash must identify sequence of values unambiguously.
    virtual void updateHashWithValue(size_t n, SipHash & hash) const = 0;

    /** Removes elements that don't match the filter.
      * Is used in WHERE and HAVING operations.
      * If result_size_hint > 0, then makes advance reserve(result_size_hint) for the result column;
      *  if 0, then don't makes reserve(),
      *  otherwise (i.e. < 0), makes reserve() using size of source column.
      */
    using Filter = PaddedPODArray<UInt8>;
191
    virtual Ptr filter(const Filter & filt, ssize_t result_size_hint) const = 0;
192 193 194 195

    /// Permutes elements using specified permutation. Is used in sortings.
    /// limit - if it isn't 0, puts only first limit elements in the result.
    using Permutation = PaddedPODArray<size_t>;
196
    virtual Ptr permute(const Permutation & perm, size_t limit) const = 0;
197

N
Nikolai Kochetov 已提交
198 199
    /// Creates new column with values column[indexes[:limit]]. If limit is 0, all indexes are used.
    /// Indexes must be one of the ColumnUInt. For default implementation, see selectIndexImpl from ColumnsCommon.h
200
    virtual Ptr index(const IColumn & indexes, size_t limit) const = 0;
N
Nikolai Kochetov 已提交
201

202
    /** Compares (*this)[n] and rhs[m]. Column rhs should have the same type.
203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226
      * Returns negative number, 0, or positive number (*this)[n] is less, equal, greater than rhs[m] respectively.
      * Is used in sortings.
      *
      * If one of element's value is NaN or NULLs, then:
      * - if nan_direction_hint == -1, NaN and NULLs are considered as least than everything other;
      * - if nan_direction_hint ==  1, NaN and NULLs are considered as greatest than everything other.
      * For example, if nan_direction_hint == -1 is used by descending sorting, NaNs will be at the end.
      *
      * For non Nullable and non floating point types, nan_direction_hint is ignored.
      */
    virtual int compareAt(size_t n, size_t m, const IColumn & rhs, int nan_direction_hint) const = 0;

    /** Returns a permutation that sorts elements of this column,
      *  i.e. perm[i]-th element of source column should be i-th element of sorted column.
      * reverse - reverse ordering (acsending).
      * limit - if isn't 0, then only first limit elements of the result column could be sorted.
      * nan_direction_hint - see above.
      */
    virtual void getPermutation(bool reverse, size_t limit, int nan_direction_hint, Permutation & res) const = 0;

    /** Copies each element according offsets parameter.
      * (i-th element should be copied offsets[i] - offsets[i - 1] times.)
      * It is necessary in ARRAY JOIN operation.
      */
227 228
    using Offset = UInt64;
    using Offsets = PaddedPODArray<Offset>;
229
    virtual Ptr replicate(const Offsets & offsets) const = 0;
230 231 232 233 234 235 236

    /** Split column to smaller columns. Each value goes to column index, selected by corresponding element of 'selector'.
      * Selector must contain values from 0 to num_columns - 1.
      * For default implementation, see scatterImpl.
      */
    using ColumnIndex = UInt64;
    using Selector = PaddedPODArray<ColumnIndex>;
237
    virtual std::vector<MutablePtr> scatter(ColumnIndex num_columns, const Selector & selector) const = 0;
238

239 240 241 242 243 244
    /// Insert data from several other columns according to source mask (used in vertical merge).
    /// For now it is a helper to de-virtualize calls to insert*() functions inside gather loop
    /// (descendants should call gatherer_stream.gather(*this) to implement this function.)
    /// TODO: interface decoupled from ColumnGathererStream that allows non-generic specializations.
    virtual void gather(ColumnGathererStream & gatherer_stream) = 0;

245
    /** Computes minimum and maximum element of the column.
M
maiha 已提交
246 247
      * In addition to numeric types, the function is completely implemented for Date and DateTime.
      * For strings and arrays function should return default value.
248 249 250 251 252 253 254
      *  (except for constant columns; they should return value of the constant).
      * If column is empty function should return default value.
      */
    virtual void getExtremes(Field & min, Field & max) const = 0;

    /// Reserves memory for specified amount of elements. If reservation isn't possible, does nothing.
    /// It affects performance only (not correctness).
255
    virtual void reserve(size_t /*n*/) {}
256 257 258 259 260 261

    /// Size of column data in memory (may be approximate) - for profiling. Zero, if could not be determined.
    virtual size_t byteSize() const = 0;

    /// Size of memory, allocated for column.
    /// This is greater or equals to byteSize due to memory reservation in containers.
262
    /// Zero, if could not be determined.
263
    virtual size_t allocatedBytes() const = 0;
264

265 266 267 268
    /// Make memory region readonly with mprotect if it is large enough.
    /// The operation is slow and performed only for debug builds.
    virtual void protect() {}

A
Alexey Milovidov 已提交
269 270
    /// If the column contains subcolumns (such as Array, Nullable, etc), do callback on them.
    /// Shallow: doesn't do recursive calls; don't do call for itself.
271
    using ColumnCallback = std::function<void(WrappedPtr&)>;
272 273
    virtual void forEachSubcolumn(ColumnCallback) {}

274 275 276 277 278 279 280
    /// Columns have equal structure.
    /// If true - you can use "compareAt", "insertFrom", etc. methods.
    virtual bool structureEquals(const IColumn &) const
    {
        throw Exception("Method structureEquals is not supported for " + getName(), ErrorCodes::NOT_IMPLEMENTED);
    }

281

282
    MutablePtr mutate() const &&
283
    {
284
        MutablePtr res = shallowMutate();
285
        res->forEachSubcolumn([](WrappedPtr & subcolumn) { subcolumn = std::move(*subcolumn).mutate(); });
286 287 288 289
        return res;
    }


290 291 292 293 294 295 296
    /** Some columns can contain another columns inside.
      * So, we have a tree of columns. But not all combinations are possible.
      * There are the following rules:
      *
      * ColumnConst may be only at top. It cannot be inside any column.
      * ColumnNullable can contain only simple columns.
      */
297 298 299

    /// Various properties on behaviour of column type.

C
chertus 已提交
300 301 302
    /// True if column contains something nullable inside. It's true for ColumnNullable, can be true or false for ColumnConst, etc.
    virtual bool isNullable() const { return false; }

303 304
    /// It's a special kind of column, that contain single value, but is not a ColumnConst.
    virtual bool isDummy() const { return false; }
305

A
Alexey Milovidov 已提交
306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322
    /** Memory layout properties.
      *
      * Each value of a column can be placed in memory contiguously or not.
      *
      * Example: simple columns like UInt64 or FixedString store their values contiguously in single memory buffer.
      *
      * Example: Tuple store values of each component in separate subcolumn, so the values of Tuples with at least two components are not contiguous.
      * Another example is Nullable. Each value have null flag, that is stored separately, so the value is not contiguous in memory.
      *
      * There are some important cases, when values are not stored contiguously, but for each value, you can get contiguous memory segment,
      *  that will unambiguously identify the value. In this case, methods getDataAt and insertData are implemented.
      * Example: String column: bytes of strings are stored concatenated in one memory buffer
      *  and offsets to that buffer are stored in another buffer. The same is for Array of fixed-size contiguous elements.
      *
      * To avoid confusion between these cases, we don't have isContiguous method.
      */

323 324
    /// Values in column have fixed size (including the case when values span many memory segments).
    virtual bool valuesHaveFixedSize() const { return isFixedAndContiguous(); }
325

326 327
    /// Values in column are represented as continuous memory segment of fixed size. Implies valuesHaveFixedSize.
    virtual bool isFixedAndContiguous() const { return false; }
328

329 330 331
    /// If isFixedAndContiguous, returns the underlying data array, otherwise throws an exception.
    virtual StringRef getRawData() const { throw Exception("Column " + getName() + " is not a contiguous block of memory", ErrorCodes::NOT_IMPLEMENTED); }

332 333 334 335 336 337
    /// If valuesHaveFixedSize, returns size of value, otherwise throw an exception.
    virtual size_t sizeOfValueIfFixed() const { throw Exception("Values of column " + getName() + " are not fixed size.", ErrorCodes::CANNOT_GET_SIZE_OF_FIELD); }

    /// Column is ColumnVector of numbers or ColumnConst of it. Note that Nullable columns are not numeric.
    /// Implies isFixedAndContiguous.
    virtual bool isNumeric() const { return false; }
338

339 340 341
    /// If the only value column can contain is NULL.
    /// Does not imply type of object, because it can be ColumnNullable(ColumnNothing) or ColumnConst(ColumnNullable(ColumnNothing))
    virtual bool onlyNull() const { return false; }
342

343 344
    /// Can be inside ColumnNullable.
    virtual bool canBeInsideNullable() const { return false; }
345

346
    virtual bool lowCardinality() const { return false; }
347

348

349 350 351
    virtual ~IColumn() = default;
    IColumn() = default;
    IColumn(const IColumn &) = default;
352

A
Alexey Milovidov 已提交
353 354
    /** Print column name, size, and recursively print all subcolumns.
      */
355 356
    String dumpStructure() const;

357 358
protected:

359 360 361
    /// Template is to devirtualize calls to insertFrom method.
    /// In derived classes (that use final keyword), implement scatter method as call to scatterImpl.
    template <typename Derived>
362
    std::vector<MutablePtr> scatterImpl(ColumnIndex num_columns, const Selector & selector) const
363 364
    {
        size_t num_rows = size();
365

366
        if (num_rows != selector.size())
367 368 369
            throw Exception(
                    "Size of selector: " + std::to_string(selector.size()) + " doesn't match size of column: " + std::to_string(num_rows),
                    ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH);
370

371
        std::vector<MutablePtr> columns(num_columns);
372 373
        for (auto & column : columns)
            column = cloneEmpty();
374

375
        {
376
            size_t reserve_size = num_rows * 1.1 / num_columns;    /// 1.1 is just a guess. Better to use n-sigma rule.
377

378 379 380 381
            if (reserve_size > 1)
                for (auto & column : columns)
                    column->reserve(reserve_size);
        }
382

383 384
        for (size_t i = 0; i < num_rows; ++i)
            static_cast<Derived &>(*columns[selector[i]]).insertFrom(*this, i);
385

386 387
        return columns;
    }
A
Alexey Milovidov 已提交
388 389
};

390
using ColumnPtr = IColumn::Ptr;
A
Alexey Milovidov 已提交
391
using MutableColumnPtr = IColumn::MutablePtr;
392
using Columns = std::vector<ColumnPtr>;
393
using MutableColumns = std::vector<MutableColumnPtr>;
394 395

using ColumnRawPtrs = std::vector<const IColumn *>;
396
//using MutableColumnRawPtrs = std::vector<IColumn *>;
A
Alexey Milovidov 已提交
397

N
Nikolai Kochetov 已提交
398 399 400 401 402 403 404 405 406 407 408 409
template <typename ... Args>
struct IsMutableColumns;

template <typename Arg, typename ... Args>
struct IsMutableColumns<Arg, Args ...>
{
    static const bool value = std::is_assignable<MutableColumnPtr &&, Arg>::value && IsMutableColumns<Args ...>::value;
};

template <>
struct IsMutableColumns<> { static const bool value = true; };

C
chertus 已提交
410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434

template <typename Type>
const Type * checkAndGetColumn(const IColumn & column)
{
    return typeid_cast<const Type *>(&column);
}

template <typename Type>
const Type * checkAndGetColumn(const IColumn * column)
{
    return typeid_cast<const Type *>(column);
}

template <typename Type>
bool checkColumn(const IColumn & column)
{
    return checkAndGetColumn<Type>(&column);
}

template <typename Type>
bool checkColumn(const IColumn * column)
{
    return checkAndGetColumn<Type>(column);
}

C
chertus 已提交
435 436 437 438 439
/// True if column's an ColumnConst instance. It's just a syntax sugar for type check.
bool isColumnConst(const IColumn & column);

/// True if column's an ColumnNullable instance. It's just a syntax sugar for type check.
bool isColumnNullable(const IColumn & column);
440

A
Alexey Milovidov 已提交
441
}