diff --git a/base/common/CMakeLists.txt b/base/common/CMakeLists.txt index b4bf4f55466ab6ccced2429ee0b9230d23c8c940..7dfb9bc10c0e1a91790d64c70583f61a4d53c5e8 100644 --- a/base/common/CMakeLists.txt +++ b/base/common/CMakeLists.txt @@ -47,6 +47,10 @@ endif() target_include_directories(common PUBLIC .. ${CMAKE_CURRENT_BINARY_DIR}/..) +if (OS_DARWIN AND NOT MAKE_STATIC_LIBRARIES) + target_link_libraries(common PUBLIC -Wl,-U,_inside_main) +endif() + # Allow explicit fallback to readline if (NOT ENABLE_REPLXX AND ENABLE_READLINE) message (STATUS "Attempt to fallback to readline explicitly") diff --git a/base/common/DateLUTImpl.h b/base/common/DateLUTImpl.h index 8d393465b82c98c9d4d2a83fc740d3ed4e290d1c..363f281584eff97830d08bfc3deee0080bd6063b 100644 --- a/base/common/DateLUTImpl.h +++ b/base/common/DateLUTImpl.h @@ -853,15 +853,43 @@ public: { if (hours == 1) return toStartOfHour(t); + + /** We round the hour number since midnight. + * This may split the day into non-equal intervals. + * For example, when rounding to an 11-hour interval, + * the day is split into the intervals 00:00:00..10:59:59, 11:00:00..21:59:59, 22:00:00..23:59:59. + * In case of daylight saving time or other transitions, + * the intervals can be shortened or prolonged by the amount of the transition. + */ + UInt64 seconds = hours * 3600; - t = roundDown(t, seconds); + const LUTIndex index = findIndex(t); + const Values & values = lut[index]; - if (t >= 0 && offset_is_whole_number_of_hours_during_epoch) - return t; + time_t time = t - values.date; + if (time >= values.time_at_offset_change()) + { + /// Align to the new hour numbers before rounding. + time += values.amount_of_offset_change(); + time = time / seconds * seconds; - /// TODO check if it's correct. - return toStartOfHour(t); + /// Subtract the shift back, but only if the rounded time is not before the shift. + if (time >= values.time_at_offset_change()) + { + time -= values.amount_of_offset_change(); + + /// With cutoff at the time of the shift. Otherwise we may end up with something like 23:00 of the previous day. + if (time < values.time_at_offset_change()) + time = values.time_at_offset_change(); + } + } + else + { + time = time / seconds * seconds; + } + + return values.date + time; } inline time_t toStartOfMinuteInterval(time_t t, UInt64 minutes) const @@ -869,6 +897,14 @@ public: if (minutes == 1) return toStartOfMinute(t); + + /** In contrast to the "toStartOfHourInterval" function above, + * the minute intervals are not aligned to midnight. + * You will get unexpected results if, for example, you round down to a 60-minute interval + * and there was a time shift of 30 minutes. + * + * But this is not specified in the docs and can be changed in the future. + */ + UInt64 seconds = 60 * minutes; return roundDown(t, seconds); } diff --git a/base/daemon/CMakeLists.txt b/base/daemon/CMakeLists.txt index 26d59a57e7f06d3254ee4dd8768f8deec038e485..6ef87db6a6148cbd27389586be9b8378dfd79d1e 100644 --- a/base/daemon/CMakeLists.txt +++ b/base/daemon/CMakeLists.txt @@ -5,6 +5,11 @@ add_library (daemon ) target_include_directories (daemon PUBLIC ..)
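To make the new hour-interval semantics above concrete, here is a quick check that can be run with clickhouse-local. A sketch only: it assumes `toStartOfInterval` with an `INTERVAL ... HOUR` argument dispatches to `toStartOfHourInterval`, and uses UTC so that no offset change is involved.

```bash
# 11-hour buckets counted from midnight: 00:00-10:59, 11:00-21:59, 22:00-23:59.
# 22:30 therefore rounds down to 22:00, the start of the last (shortened) bucket.
clickhouse-local -q "SELECT toStartOfInterval(toDateTime('2021-03-15 22:30:00', 'UTC'), INTERVAL 11 HOUR)"
# Expected output: 2021-03-15 22:00:00
```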
+ +if (OS_DARWIN AND NOT MAKE_STATIC_LIBRARIES) + target_link_libraries (daemon PUBLIC -Wl,-undefined,dynamic_lookup) +endif() + target_link_libraries (daemon PUBLIC loggers PRIVATE clickhouse_common_io clickhouse_common_config common ${EXECINFO_LIBRARIES}) if (USE_SENTRY) diff --git a/docker/server/Dockerfile b/docker/server/Dockerfile index 414eb23d04445f10bfd54dfbe2aeab7c1ade89a8..d22516eab0add60344a7c33f4740cad90d939c51 100644 --- a/docker/server/Dockerfile +++ b/docker/server/Dockerfile @@ -4,6 +4,21 @@ ARG repository="deb https://repo.clickhouse.tech/deb/stable/ main/" ARG version=21.4.1.* ARG gosu_ver=1.10 +# set a non-empty deb_location_url to create a docker image +# from debs created by CI build, for example: +# docker build . --network host --build-arg version="21.4.1.6282" --build-arg deb_location_url="https://clickhouse-builds.s3.yandex.net/21852/069cfbff388b3d478d1a16dc7060b48073f5d522/clickhouse_build_check/clang-11_relwithdebuginfo_none_bundled_unsplitted_disable_False_deb/" -t filimonovq/clickhouse-server:pr21852 +ARG deb_location_url="" + +# set a non-empty single_binary_location_url to create a docker image +# from a single binary url (useful for non-standard builds - with sanitizers, for arm64). +# for example (run on aarch64 server): +# docker build . --network host --build-arg single_binary_location_url="https://builds.clickhouse.tech/master/aarch64/clickhouse" -t altinity/clickhouse-server:master-testing-arm +# note: clickhouse-odbc-bridge is not supported in this case. +ARG single_binary_location_url="" + +# see https://github.com/moby/moby/issues/4032#issuecomment-192327844 +ARG DEBIAN_FRONTEND=noninteractive + # user/group precreated explicitly with fixed uid/gid on purpose. # It is especially important for rootless containers: in that case entrypoint # can't do chown and owners of mounted volumes should be configured externally.
@@ -19,20 +34,37 @@ RUN groupadd -r clickhouse --gid=101 \ ca-certificates \ dirmngr \ gnupg \ + locales \ + wget \ + tzdata \ && mkdir -p /etc/apt/sources.list.d \ && apt-key adv --keyserver keyserver.ubuntu.com --recv E0C56BD4 \ && echo $repository > /etc/apt/sources.list.d/clickhouse.list \ - && apt-get update \ - && env DEBIAN_FRONTEND=noninteractive \ - apt-get --yes -o "Dpkg::Options::=--force-confdef" -o "Dpkg::Options::=--force-confold" upgrade \ - && env DEBIAN_FRONTEND=noninteractive \ - apt-get install --allow-unauthenticated --yes --no-install-recommends \ - clickhouse-common-static=$version \ - clickhouse-client=$version \ - clickhouse-server=$version \ - locales \ - wget \ - tzdata \ + && if [ -n "$deb_location_url" ]; then \ + echo "installing from custom url with deb packages: $deb_location_url" \ + && rm -rf /tmp/clickhouse_debs \ + && mkdir -p /tmp/clickhouse_debs \ + && wget --progress=bar:force:noscroll "${deb_location_url}/clickhouse-common-static_${version}_amd64.deb" -P /tmp/clickhouse_debs \ + && wget --progress=bar:force:noscroll "${deb_location_url}/clickhouse-client_${version}_all.deb" -P /tmp/clickhouse_debs \ + && wget --progress=bar:force:noscroll "${deb_location_url}/clickhouse-server_${version}_all.deb" -P /tmp/clickhouse_debs \ + && dpkg -i /tmp/clickhouse_debs/*.deb ; \ + elif [ -n "$single_binary_location_url" ]; then \ + echo "installing from single binary url: $single_binary_location_url" \ + && rm -rf /tmp/clickhouse_binary \ + && mkdir -p /tmp/clickhouse_binary \ + && wget --progress=bar:force:noscroll "$single_binary_location_url" -O /tmp/clickhouse_binary/clickhouse \ + && chmod +x /tmp/clickhouse_binary/clickhouse \ + && /tmp/clickhouse_binary/clickhouse install --user "clickhouse" --group "clickhouse" ; \ + else \ + echo "installing from repository: $repository" \ + && apt-get update \ + && apt-get --yes -o "Dpkg::Options::=--force-confdef" -o "Dpkg::Options::=--force-confold" upgrade \ + && apt-get install --allow-unauthenticated --yes --no-install-recommends \ + clickhouse-common-static=$version \ + clickhouse-client=$version \ + clickhouse-server=$version ; \ + fi \ + && clickhouse-local -q 'SELECT * FROM system.build_options' \ && rm -rf \ /var/lib/apt/lists/* \ /var/cache/debconf \ diff --git a/docker/server/entrypoint.sh b/docker/server/entrypoint.sh index 81e04bd7874e0f6138eadb0a518f7745b95f64b3..4486b0d9d7fd1360cafe1961be45a74cf16a7fc0 100755 --- a/docker/server/entrypoint.sh +++ b/docker/server/entrypoint.sh @@ -38,9 +38,6 @@ if ! $gosu test -f "$CLICKHOUSE_CONFIG" -a -r "$CLICKHOUSE_CONFIG"; then exit 1 fi -# port is needed to check if clickhouse-server is ready for connections -HTTP_PORT="$(clickhouse extract-from-config --config-file "$CLICKHOUSE_CONFIG" --key=http_port)" - # get CH directories locations DATA_DIR="$(clickhouse extract-from-config --config-file "$CLICKHOUSE_CONFIG" --key=path || true)" TMP_DIR="$(clickhouse extract-from-config --config-file "$CLICKHOUSE_CONFIG" --key=tmp_path || true)" @@ -108,6 +105,9 @@ EOT fi if [ -n "$(ls /docker-entrypoint-initdb.d/)" ] || [ -n "$CLICKHOUSE_DB" ]; then + # port is needed to check if clickhouse-server is ready for connections + HTTP_PORT="$(clickhouse extract-from-config --config-file "$CLICKHOUSE_CONFIG" --key=http_port)" + # Listen only on localhost until the initialization is done $gosu /usr/bin/clickhouse-server --config-file="$CLICKHOUSE_CONFIG" -- --listen_host=127.0.0.1 & pid="$!"
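The `clickhouse-local -q 'SELECT * FROM system.build_options'` step added to the Dockerfile above doubles as a build-time smoke test: the image build fails if the freshly installed binary cannot start. The same check can be run by hand against any of the three install paths; a sketch (the exact option names present in `system.build_options` vary between builds):

```bash
# Verify that the installed binary runs, and report what was actually installed.
clickhouse-local -q "SELECT name, value FROM system.build_options WHERE name LIKE 'VERSION%' OR name = 'BUILD_TYPE'"
```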
diff --git a/docs/en/engines/database-engines/materialize-mysql.md b/docs/en/engines/database-engines/materialize-mysql.md index 2e361cc82f0c8cec96294954237c3061ba7a8992..69d3122c2681fcbc6c09e414f6254e6795449c88 100644 --- a/docs/en/engines/database-engines/materialize-mysql.md +++ b/docs/en/engines/database-engines/materialize-mysql.md @@ -69,7 +69,7 @@ MySQL DDL queries are converted into the corresponding ClickHouse DDL queries ([ - MySQL `INSERT` query is converted into `INSERT` with `_sign=1`. -- MySQl `DELETE` query is converted into `INSERT` with `_sign=-1`. +- MySQL `DELETE` query is converted into `INSERT` with `_sign=-1`. - MySQL `UPDATE` query is converted into `INSERT` with `_sign=-1` and `INSERT` with `_sign=1`. diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index 92587dbcf8f4970bb000e9dbc61f4b522145ba40..89ccee691e493d71bc50db3890b5d24ac863bdfb 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -1514,6 +1514,14 @@ FORMAT PrettyCompactMonoBlock Default value: 0 +## optimize_skip_unused_shards_limit {#optimize-skip-unused-shards-limit} + +Limit for the number of sharding key values; turns off `optimize_skip_unused_shards` if the limit is reached. + +Too many values may require a significant amount of processing, while the benefit is doubtful: if you have a huge number of values in `IN (...)`, the query will most likely be sent to all shards anyway. + +Default value: `1000`. + ## optimize_skip_unused_shards {#optimize-skip-unused-shards} Enables or disables skipping of unused shards for [SELECT](../../sql-reference/statements/select/index.md) queries that have sharding key condition in `WHERE/PREWHERE` (assuming that the data is distributed by sharding key, otherwise does nothing). @@ -2728,11 +2736,11 @@ Default value: `0`. ## engine_file_truncate_on_insert {#engine-file-truncate-on-insert} -Enables or disables truncate before insert in file engine tables. +Enables or disables truncate before insert in [File](../../engines/table-engines/special/file.md) engine tables. Possible values: -- 0 — Disabled. -- 1 — Enabled. +- 0 — `INSERT` query appends new data to the end of the file. +- 1 — `INSERT` replaces existing content of the file with the new data. Default value: `0`. @@ -2747,4 +2755,39 @@ Possible values: Default value: `0`. +## allow_experimental_live_view {#allow-experimental-live-view} + +Allows creation of experimental [live views](../../sql-reference/statements/create/view.md#live-view). + +Possible values: + +- 0 — Working with live views is disabled. +- 1 — Working with live views is enabled. + +Default value: `0`. + +## live_view_heartbeat_interval {#live-view-heartbeat-interval} + +Sets the heartbeat interval in seconds to indicate that a [live view](../../sql-reference/statements/create/view.md#live-view) is alive. + +Default value: `15`. + +## max_live_view_insert_blocks_before_refresh {#max-live-view-insert-blocks-before-refresh} + +Sets the maximum number of inserted blocks after which mergeable blocks are dropped and the query for a [live view](../../sql-reference/statements/create/view.md#live-view) is re-executed. + +Default value: `64`. + +## temporary_live_view_timeout {#temporary-live-view-timeout} + +Sets the interval in seconds after which a [live view](../../sql-reference/statements/create/view.md#live-view) with timeout is deleted. + +Default value: `5`.
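A minimal session sketch of how the timeout setting above interacts with `WITH TIMEOUT` (per the `CREATE LIVE VIEW` documentation in this patch, the clause falls back to this setting when no explicit value is given; the view name is arbitrary):

```bash
clickhouse-client -n -q "
    SET allow_experimental_live_view = 1;
    SET temporary_live_view_timeout = 30;
    -- No explicit timeout value: the 30-second setting above applies, so the
    -- view is dropped 30 seconds after the last WATCH query on it ends.
    CREATE LIVE VIEW lv WITH TIMEOUT AS SELECT count() FROM system.tables;
"
```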
+ +## periodic_live_view_refresh {#periodic-live-view-refresh} + +Sets the interval in seconds after which a periodically refreshed [live view](../../sql-reference/statements/create/view.md#live-view) is forced to refresh. + +Default value: `60`. + [Original article](https://clickhouse.tech/docs/en/operations/settings/settings/) diff --git a/docs/en/sql-reference/statements/create/view.md b/docs/en/sql-reference/statements/create/view.md index 49463078c544cdc05f0e258845a0832e95da333a..633db355d4a4758c5538cfde7e87ea5b7b01d348 100644 --- a/docs/en/sql-reference/statements/create/view.md +++ b/docs/en/sql-reference/statements/create/view.md @@ -68,7 +68,7 @@ To delete a view, use [DROP VIEW](../../../sql-reference/statements/drop.md#drop !!! important "Important" This is an experimental feature that may change in backwards-incompatible ways in the future releases. - Enable usage of live views and `WATCH` query using `set allow_experimental_live_view = 1`. + Enable usage of live views and the `WATCH` query using the [allow_experimental_live_view](../../../operations/settings/settings.md#allow-experimental-live-view) setting. Input the command `set allow_experimental_live_view = 1`. ```sql @@ -90,7 +90,9 @@ Live views work similarly to how a query in a distributed table works. But inste See [WITH REFRESH](#live-view-with-refresh) to force periodic updates of a live view that in some cases can be used as a workaround. -You can watch for changes in the live view query result using the [WATCH](../../../sql-reference/statements/watch.md) query +### Monitoring Changes {#live-view-monitoring} + +You can monitor changes in the `LIVE VIEW` query result using the [WATCH](../../../sql-reference/statements/watch.md) query. ```sql WATCH [db.]live_view @@ -102,11 +104,10 @@ WATCH [db.]live_view CREATE TABLE mt (x Int8) Engine = MergeTree ORDER BY x; CREATE LIVE VIEW lv AS SELECT sum(x) FROM mt; ``` - Watch a live view while doing a parallel insert into the source table. ```sql -WATCH lv +WATCH lv; ``` ```bash @@ -128,16 +129,16 @@ INSERT INTO mt VALUES (2); INSERT INTO mt VALUES (3); ``` -or add [EVENTS](../../../sql-reference/statements/watch.md#events-clause) clause to just get change events. +Or add the [EVENTS](../../../sql-reference/statements/watch.md#events-clause) clause to just get change events. ```sql -WATCH [db.]live_view EVENTS +WATCH [db.]live_view EVENTS; ``` **Example:** ```sql -WATCH lv EVENTS +WATCH lv EVENTS; ``` ```bash @@ -163,15 +164,15 @@ SELECT * FROM [db.]live_view WHERE ... You can force live view refresh using the `ALTER LIVE VIEW [db.]table_name REFRESH` statement. -### With Timeout {#live-view-with-timeout} +### WITH TIMEOUT Clause {#live-view-with-timeout} -When a live view is create with a `WITH TIMEOUT` clause then the live view will be dropped automatically after the specified number of seconds elapse since the end of the last [WATCH](../../../sql-reference/statements/watch.md) query that was watching the live view. +When a live view is created with a `WITH TIMEOUT` clause then the live view will be dropped automatically after the specified number of seconds elapse since the end of the last [WATCH](../../../sql-reference/statements/watch.md) query that was watching the live view. ```sql CREATE LIVE VIEW [db.]table_name WITH TIMEOUT [value_in_sec] AS SELECT ... ``` -If the timeout value is not specified then the value specified by the `temporary_live_view_timeout` setting is used.
+If the timeout value is not specified then the value specified by the [temporary_live_view_timeout](../../../operations/settings/settings.md#temporary-live-view-timeout) setting is used. **Example:** @@ -180,7 +181,7 @@ CREATE TABLE mt (x Int8) Engine = MergeTree ORDER BY x; CREATE LIVE VIEW lv WITH TIMEOUT 15 AS SELECT sum(x) FROM mt; ``` -### With Refresh {#live-view-with-refresh} +### WITH REFRESH Clause {#live-view-with-refresh} When a live view is created with a `WITH REFRESH` clause then it will be automatically refreshed after the specified number of seconds elapse since the last refresh or trigger. @@ -188,7 +189,7 @@ When a live view is created with a `WITH REFRESH` clause then it will be automat CREATE LIVE VIEW [db.]table_name WITH REFRESH [value_in_sec] AS SELECT ... ``` -If the refresh value is not specified then the value specified by the `periodic_live_view_refresh` setting is used. +If the refresh value is not specified then the value specified by the [periodic_live_view_refresh](../../../operations/settings/settings.md#periodic-live-view-refresh) setting is used. **Example:** @@ -231,7 +232,7 @@ WATCH lv Code: 60. DB::Exception: Received from localhost:9000. DB::Exception: Table default.lv doesn't exist.. ``` -### Usage +### Usage {#live-view-usage} Most common uses of live view tables include: @@ -240,15 +241,4 @@ Most common uses of live view tables include: - Watching for table changes and triggering a follow-up select queries. - Watching metrics from system tables using periodic refresh. -### Settings {#live-view-settings} - -You can use the following settings to control the behaviour of live views. - -- `allow_experimental_live_view` - enable live views. Default is `0`. -- `live_view_heartbeat_interval` - the heartbeat interval in seconds to indicate live query is alive. Default is `15` seconds. -- `max_live_view_insert_blocks_before_refresh` - maximum number of inserted blocks after which - mergeable blocks are dropped and query is re-executed. Default is `64` inserts. -- `temporary_live_view_timeout` - interval after which live view with timeout is deleted. Default is `5` seconds. -- `periodic_live_view_refresh` - interval after which periodically refreshed live view is forced to refresh. Default is `60` seconds. - [Original article](https://clickhouse.tech/docs/en/sql-reference/statements/create/view/) diff --git a/docs/en/sql-reference/statements/watch.md b/docs/en/sql-reference/statements/watch.md index 4da586259d27b19e8ce9e58c4c0df02854c2db33..be793d30f3d088f7125c0cfda66f277bcac5d5bc 100644 --- a/docs/en/sql-reference/statements/watch.md +++ b/docs/en/sql-reference/statements/watch.md @@ -17,19 +17,21 @@ WATCH [db.]live_view [FORMAT format] ``` -The `WATCH` query performs continuous data retrieval from a [live view](./create/view.md#live-view) table. Unless the `LIMIT` clause is specified it provides an infinite stream of query results from a [live view](./create/view.md#live-view). +The `WATCH` query performs continuous data retrieval from a [LIVE VIEW](./create/view.md#live-view) table. Unless the `LIMIT` clause is specified it provides an infinite stream of query results from a [LIVE VIEW](./create/view.md#live-view). ```sql -WATCH [db.]live_view +WATCH [db.]live_view [EVENTS] [LIMIT n] [FORMAT format] ``` +## Virtual columns {#watch-virtual-columns} + The virtual `_version` column in the query result indicates the current result version. 
**Example:** ```sql CREATE LIVE VIEW lv WITH REFRESH 5 AS SELECT now(); -WATCH lv +WATCH lv; ``` ```bash @@ -47,6 +49,8 @@ WATCH lv By default, the requested data is returned to the client, while in conjunction with [INSERT INTO](../../sql-reference/statements/insert-into.md) it can be forwarded to a different table. +**Example:** + ```sql INSERT INTO [db.]table WATCH [db.]live_view ... ``` @@ -56,14 +60,14 @@ INSERT INTO [db.]table WATCH [db.]live_view ... The `EVENTS` clause can be used to obtain a short form of the `WATCH` query where instead of the query result you will just get the latest query result version. ```sql -WATCH [db.]live_view EVENTS +WATCH [db.]live_view EVENTS; ``` **Example:** ```sql CREATE LIVE VIEW lv WITH REFRESH 5 AS SELECT now(); -WATCH lv EVENTS +WATCH lv EVENTS; ``` ```bash @@ -78,17 +82,17 @@ WATCH lv EVENTS ## LIMIT Clause {#limit-clause} -The `LIMIT n` clause species the number of updates the `WATCH` query should wait for before terminating. By default there is no limit on the number of updates and therefore the query will not terminate. The value of `0` indicates that the `WATCH` query should not wait for any new query results and therefore will return immediately once query is evaluated. +The `LIMIT n` clause specifies the number of updates the `WATCH` query should wait for before terminating. By default there is no limit on the number of updates and therefore the query will not terminate. The value of `0` indicates that the `WATCH` query should not wait for any new query results and therefore will return immediately once the query result is evaluated. ```sql -WATCH [db.]live_view LIMIT 1 +WATCH [db.]live_view LIMIT 1; ``` **Example:** ```sql CREATE LIVE VIEW lv WITH REFRESH 5 AS SELECT now(); -WATCH lv EVENTS LIMIT 1 +WATCH lv EVENTS LIMIT 1; ``` ```bash @@ -102,5 +106,4 @@ WATCH lv EVENTS LIMIT 1 The `FORMAT` clause works the same way as for the [SELECT](../../sql-reference/statements/select/format.md#format-clause). !!! info "Note" - The [JSONEachRowWithProgress](../../interfaces/formats/#jsoneachrowwithprogress) format should be used when watching [live view](./create/view.md#live-view) tables over the HTTP interface. The progress messages will be added to the output to keep the long-lived HTTP connection alive until the query result changes. The interval between progress messages is controlled using the [live_view_heartbeat_interval](./create/view.md#live-view-settings) setting. - + The [JSONEachRowWithProgress](../../interfaces/formats.md#jsoneachrowwithprogress) format should be used when watching [LIVE VIEW](./create/view.md#live-view) tables over the HTTP interface. The progress messages will be added to the output to keep the long-lived HTTP connection alive until the query result changes. The interval between progress messages is controlled using the [live_view_heartbeat_interval](../../operations/settings/settings.md#live-view-heartbeat-interval) setting. diff --git a/docs/ru/operations/settings/settings.md b/docs/ru/operations/settings/settings.md index 456602ce7aacf517fceb09430d8ac49927da9b7c..9d3ea4a809a2463051bffcc8b36aeab83af56ba6 100644 --- a/docs/ru/operations/settings/settings.md +++ b/docs/ru/operations/settings/settings.md @@ -2615,14 +2615,69 @@ SELECT * FROM test2; Обратите внимание на то, что эта настройка влияет на поведение [материализованных представлений](../../sql-reference/statements/create/view.md#materialized) и БД [MaterializeMySQL](../../engines/database-engines/materialize-mysql.md).
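The `engine_file_*` settings documented in the English changes above (and in the Russian sections that follow) are easiest to grasp from a session transcript. A sketch, assuming a fresh default database (`file_demo` is an arbitrary name):

```bash
clickhouse-client -n -q "
    CREATE TABLE file_demo (x Int8) ENGINE = File(TSV);
    INSERT INTO file_demo VALUES (1);
    -- Default behaviour (0): a second INSERT appends to the data file.
    -- With the setting enabled, INSERT replaces the file contents instead.
    SET engine_file_truncate_on_insert = 1;
    INSERT INTO file_demo VALUES (2);
    SELECT * FROM file_demo;  -- returns only the row with 2
"
```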
+## engine_file_empty_if_not_exists {#engine-file-empty_if-not-exists} + +Включает или отключает возможность выполнять запрос `SELECT` к таблице на движке [File](../../engines/table-engines/special/file.md), не содержащей файл. + +Возможные значения: +- 0 — запрос `SELECT` генерирует исключение. +- 1 — запрос `SELECT` возвращает пустой результат. + +Значение по умолчанию: `0`. + +## engine_file_truncate_on_insert {#engine-file-truncate-on-insert} + +Включает или выключает удаление данных из таблицы до вставки в таблицу на движке [File](../../engines/table-engines/special/file.md). + +Возможные значения: +- 0 — запрос `INSERT` добавляет данные в конец файла после существующих. +- 1 — `INSERT` удаляет имеющиеся в файле данные и замещает их новыми. + +Значение по умолчанию: `0`. + ## allow_experimental_geo_types {#allow-experimental-geo-types} Разрешает использование экспериментальных типов данных для работы с [географическими структурами](../../sql-reference/data-types/geo.md). Возможные значения: +- 0 — использование типов данных для работы с географическими структурами не поддерживается. +- 1 — использование типов данных для работы с географическими структурами поддерживается. -- 0 — Использование типов данных для работы с географическими структурами не поддерживается. -- 1 — Использование типов данных для работы с географическими структурами поддерживается. +Значение по умолчанию: `0`. + +## allow_experimental_live_view {#allow-experimental-live-view} + +Включает экспериментальную возможность использования [LIVE-представлений](../../sql-reference/statements/create/view.md#live-view). + +Возможные значения: +- 0 — живые представления не поддерживаются. +- 1 — живые представления поддерживаются. Значение по умолчанию: `0`. + +## live_view_heartbeat_interval {#live-view-heartbeat-interval} + +Задает интервал в секундах для периодической проверки существования [LIVE VIEW](../../sql-reference/statements/create/view.md#live-view). + +Значение по умолчанию: `15`. + +## max_live_view_insert_blocks_before_refresh {#max-live-view-insert-blocks-before-refresh} + +Задает наибольшее число вставок, после которых запрос на формирование [LIVE VIEW](../../sql-reference/statements/create/view.md#live-view) исполняется снова. + +Значение по умолчанию: `64`. + +## temporary_live_view_timeout {#temporary-live-view-timeout} + +Задает время в секундах, после которого [LIVE VIEW](../../sql-reference/statements/create/view.md#live-view) удаляется. + +Значение по умолчанию: `5`. + +## periodic_live_view_refresh {#periodic-live-view-refresh} + +Задает время в секундах, по истечении которого [LIVE VIEW](../../sql-reference/statements/create/view.md#live-view) с установленным автообновлением обновляется. + +Значение по умолчанию: `60`. + +[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/settings/settings/) \ No newline at end of file diff --git a/docs/ru/sql-reference/statements/create/view.md b/docs/ru/sql-reference/statements/create/view.md index 51cb14e2e8c65c5dd003a064ed5c23332f76e4b8..4e34b5e3b6ee4e78e6cf40907b9086a1b33e0dae 100644 --- a/docs/ru/sql-reference/statements/create/view.md +++ b/docs/ru/sql-reference/statements/create/view.md @@ -13,7 +13,7 @@ toc_title: "Представление" CREATE [OR REPLACE] VIEW [IF NOT EXISTS] [db.]table_name [ON CLUSTER] AS SELECT ... ``` -Обычные представления не хранят никаких данных, они выполняют чтение данных из другой таблицы при каждом доступе. Другими словами, обычное представление - это не что иное, как сохраненный запрос. 
При чтении данных из представления этот сохраненный запрос используется как подзапрос в секции [FROM](../../../sql-reference/statements/select/from.md). +Обычные представления не хранят никаких данных, они выполняют чтение данных из другой таблицы при каждом доступе. Другими словами, обычное представление — это не что иное, как сохраненный запрос. При чтении данных из представления этот сохраненный запрос используется как подзапрос в секции [FROM](../../../sql-reference/statements/select/from.md). Для примера, пусть вы создали представление: @@ -43,12 +43,12 @@ CREATE MATERIALIZED VIEW [IF NOT EXISTS] [db.]table_name [ON CLUSTER] [TO[db.]na При создании материализованного представления без использования `TO [db].[table]`, нужно обязательно указать `ENGINE` - движок таблицы для хранения данных. -При создании материализованного представления с испольованием `TO [db].[table]`, нельзя указывать `POPULATE` +При создании материализованного представления с использованием `TO [db].[table]`, нельзя указывать `POPULATE`. Материализованное представление устроено следующим образом: при вставке данных в таблицу, указанную в SELECT-е, кусок вставляемых данных преобразуется этим запросом SELECT, и полученный результат вставляется в представление. !!! important "Важно" - Материализованные представлени в ClickHouse больше похожи на `after insert` триггеры. Если в запросе материализованного представления есть агрегирование, оно применяется только к вставляемому блоку записей. Любые изменения существующих данных исходной таблицы (например обновление, удаление, удаление раздела и т.д.) не изменяют материализованное представление. + Материализованные представления в ClickHouse больше похожи на `after insert` триггеры. Если в запросе материализованного представления есть агрегирование, оно применяется только к вставляемому блоку записей. Любые изменения существующих данных исходной таблицы (например обновление, удаление, удаление раздела и т.д.) не изменяют материализованное представление. Если указано `POPULATE`, то при создании представления, в него будут вставлены имеющиеся данные таблицы, как если бы был сделан запрос `CREATE TABLE ... AS SELECT ...` . Иначе, представление будет содержать только данные, вставляемые в таблицу после создания представления. Не рекомендуется использовать POPULATE, так как вставляемые в таблицу данные во время создания представления, не попадут в него. @@ -56,9 +56,177 @@ CREATE MATERIALIZED VIEW [IF NOT EXISTS] [db.]table_name [ON CLUSTER] [TO[db.]na Недоработано выполнение запросов `ALTER` над материализованными представлениями, поэтому они могут быть неудобными для использования. Если материализованное представление использует конструкцию `TO [db.]name`, то можно выполнить `DETACH` представления, `ALTER` для целевой таблицы и последующий `ATTACH` ранее отсоединенного (`DETACH`) представления. -Обратите внимание, что работа материлизованного представления находится под влиянием настройки [optimize_on_insert](../../../operations/settings/settings.md#optimize-on-insert). Перед вставкой данных в таблицу происходит их слияние. +Обратите внимание, что работа материализованного представления находится под влиянием настройки [optimize_on_insert](../../../operations/settings/settings.md#optimize-on-insert). Перед вставкой данных в таблицу происходит их слияние. Представления выглядят так же, как обычные таблицы. Например, они перечисляются в результате запроса `SHOW TABLES`.
Чтобы удалить представление, следует использовать [DROP VIEW](../../../sql-reference/statements/drop.md#drop-view). Впрочем, `DROP TABLE` тоже работает для представлений. +## LIVE-представления {#live-view} + +!!! important "Важно" + Представления `LIVE VIEW` являются экспериментальной возможностью. Их использование может повлечь потерю совместимости в будущих версиях. + Чтобы использовать `LIVE VIEW` и запросы `WATCH`, включите настройку [allow_experimental_live_view](../../../operations/settings/settings.md#allow-experimental-live-view). + +```sql +CREATE LIVE VIEW [IF NOT EXISTS] [db.]table_name [WITH [TIMEOUT [value_in_sec] [AND]] [REFRESH [value_in_sec]]] AS SELECT ... +``` +`LIVE VIEW` хранит результат запроса [SELECT](../../../sql-reference/statements/select/index.md), указанного при создании, и обновляется сразу же при изменении этого результата. Конечный результат запроса и промежуточные данные, из которых формируется результат, хранятся в оперативной памяти, и это обеспечивает высокую скорость обработки для повторяющихся запросов. LIVE-представления могут отправлять push-уведомления при изменении результата исходного запроса `SELECT`. Для этого используйте запрос [WATCH](../../../sql-reference/statements/watch.md). + +Изменение `LIVE VIEW` запускается при вставке данных в таблицу, указанную в исходном запросе `SELECT`. + +LIVE-представления работают по тому же принципу, что и распределенные таблицы. Но вместо объединения отдельных частей данных с разных серверов, LIVE-представления объединяют уже имеющийся результат с новыми данными. Если в исходном запросе LIVE-представления есть вложенный подзапрос, его результаты не кешируются, в кеше хранится только результат основного запроса. + +!!! info "Ограничения" + - [Табличные функции](../../../sql-reference/table-functions/index.md) в основном запросе не поддерживаются. + - Таблицы, не поддерживающие изменение с помощью запроса `INSERT`, такие как [словари](../../../sql-reference/dictionaries/index.md) и [системные таблицы](../../../operations/system-tables/index.md), а также [нормальные представления](#normal) или [материализованные представления](#materialized), не запускают обновление LIVE-представления. + - В LIVE-представлениях могут использоваться только такие запросы, которые объединяют результаты по старым и новым данным. LIVE-представления не работают с запросами, требующими полного пересчета данных или агрегирования с сохранением состояния. + - `LIVE VIEW` не работает для реплицируемых и распределенных таблиц, добавление данных в которые происходит на разных узлах. + - `LIVE VIEW` не обновляется, если в исходном запросе используются несколько таблиц. + + В случаях, когда `LIVE VIEW` не обновляется автоматически, чтобы обновлять его принудительно с заданной периодичностью, используйте [WITH REFRESH](#live-view-with-refresh). + +### Отслеживание изменений {#live-view-monitoring} + +Для отслеживания изменений LIVE-представления используйте запрос [WATCH](../../../sql-reference/statements/watch.md). + + +**Пример:** + +```sql +CREATE TABLE mt (x Int8) Engine = MergeTree ORDER BY x; +CREATE LIVE VIEW lv AS SELECT sum(x) FROM mt; +``` +Отслеживаем изменения LIVE-представления при вставке данных в исходную таблицу. + +```sql +WATCH lv; +``` + +```bash +┌─sum(x)─┬─_version─┐ +│ 1 │ 1 │ +└────────┴──────────┘ +┌─sum(x)─┬─_version─┐ +│ 2 │ 2 │ +└────────┴──────────┘ +┌─sum(x)─┬─_version─┐ +│ 6 │ 3 │ +└────────┴──────────┘ +... 
+``` + +```sql +INSERT INTO mt VALUES (1); +INSERT INTO mt VALUES (2); +INSERT INTO mt VALUES (3); +``` + +Для получения списка изменений используйте ключевое слово [EVENTS](../../../sql-reference/statements/watch.md#events-clause). + + +```sql +WATCH lv EVENTS; +``` + +```bash +┌─version─┐ +│ 1 │ +└─────────┘ +┌─version─┐ +│ 2 │ +└─────────┘ +┌─version─┐ +│ 3 │ +└─────────┘ +... +``` + +Для работы с LIVE-представлениями, как и с любыми другими, можно использовать запросы [SELECT](../../../sql-reference/statements/select/index.md). Если результат запроса кеширован, он будет возвращен немедленно, без обращения к исходным таблицам представления. + +```sql +SELECT * FROM [db.]live_view WHERE ... +``` + +### Принудительное обновление {#live-view-alter-refresh} + +Чтобы принудительно обновить LIVE-представление, используйте запрос `ALTER LIVE VIEW [db.]table_name REFRESH`. + +### Секция WITH TIMEOUT {#live-view-with-timeout} + +LIVE-представление, созданное с параметром `WITH TIMEOUT`, будет автоматически удалено через определенное количество секунд с момента предыдущего запроса [WATCH](../../../sql-reference/statements/watch.md), примененного к данному LIVE-представлению. + +```sql +CREATE LIVE VIEW [db.]table_name WITH TIMEOUT [value_in_sec] AS SELECT ... +``` + +Если временной промежуток не указан, используется значение настройки [temporary_live_view_timeout](../../../operations/settings/settings.md#temporary-live-view-timeout). + +**Пример:** + +```sql +CREATE TABLE mt (x Int8) Engine = MergeTree ORDER BY x; +CREATE LIVE VIEW lv WITH TIMEOUT 15 AS SELECT sum(x) FROM mt; +``` + +### Секция WITH REFRESH {#live-view-with-refresh} + +LIVE-представление, созданное с параметром `WITH REFRESH`, будет автоматически обновляться через указанные промежутки времени, начиная с момента последнего обновления. + +```sql +CREATE LIVE VIEW [db.]table_name WITH REFRESH [value_in_sec] AS SELECT ... +``` + +Если значение временного промежутка не задано, используется значение [periodic_live_view_refresh](../../../operations/settings/settings.md#periodic-live-view-refresh). + +**Пример:** + +```sql +CREATE LIVE VIEW lv WITH REFRESH 5 AS SELECT now(); +WATCH lv; +``` + +```bash +┌───────────────now()─┬─_version─┐ +│ 2021-02-21 08:47:05 │ 1 │ +└─────────────────────┴──────────┘ +┌───────────────now()─┬─_version─┐ +│ 2021-02-21 08:47:10 │ 2 │ +└─────────────────────┴──────────┘ +┌───────────────now()─┬─_version─┐ +│ 2021-02-21 08:47:15 │ 3 │ +└─────────────────────┴──────────┘ +``` + +Параметры `WITH TIMEOUT` и `WITH REFRESH` можно сочетать с помощью `AND`. + +```sql +CREATE LIVE VIEW [db.]table_name WITH TIMEOUT [value_in_sec] AND REFRESH [value_in_sec] AS SELECT ... +``` + +**Пример:** + +```sql +CREATE LIVE VIEW lv WITH TIMEOUT 15 AND REFRESH 5 AS SELECT now(); +``` + +По истечении 15 секунд представление будет автоматически удалено, если нет активного запроса `WATCH`. + +```sql +WATCH lv; +``` + +``` +Code: 60. DB::Exception: Received from localhost:9000. DB::Exception: Table default.lv doesn't exist.. +``` + +### Использование {#live-view-usage} + +Наиболее частые случаи использования `LIVE VIEW`: + +- Получение push-уведомлений об изменениях данных без дополнительных периодических запросов. +- Кеширование результатов часто используемых запросов для получения их без задержки. +- Отслеживание изменений таблицы для запуска других запросов `SELECT`. +- Отслеживание показателей из системных таблиц с помощью периодических обновлений.
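Последний пункт списка легко проверить на практике. Набросок отслеживания метрики из системной таблицы с периодическим обновлением (имя метрики и интервал выбраны произвольно):

```bash
clickhouse-client -n -q "
    SET allow_experimental_live_view = 1;
    -- LIVE-представление над системной таблицей с обновлением каждые 5 секунд.
    CREATE LIVE VIEW lv_mem WITH REFRESH 5 AS
        SELECT metric, value FROM system.metrics WHERE metric = 'MemoryTracking';
    -- WATCH выводит новую версию результата, пока запрос не будет прерван.
    WATCH lv_mem;
"
```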
+ +[Оригинальная статья](https://clickhouse.tech/docs/ru/sql-reference/statements/create/view) diff --git a/docs/ru/sql-reference/statements/watch.md b/docs/ru/sql-reference/statements/watch.md new file mode 100644 index 0000000000000000000000000000000000000000..ef5b2f805841cf8dd5c734f72bff378c803046a8 --- /dev/null +++ b/docs/ru/sql-reference/statements/watch.md @@ -0,0 +1,106 @@ +--- +toc_priority: 53 +toc_title: WATCH +--- + +# Запрос WATCH {#watch} + +!!! important "Важно" + Это экспериментальная функция. Она может повлечь потерю совместимости в будущих версиях. + Чтобы использовать `LIVE VIEW` и запросы `WATCH`, включите настройку `set allow_experimental_live_view = 1`. + +**Синтаксис** + +``` sql +WATCH [db.]live_view [EVENTS] [LIMIT n] [FORMAT format] +``` + +Запрос `WATCH` постоянно возвращает содержимое [LIVE-представления](./create/view.md#live-view). Если параметр `LIMIT` не был задан, запрос `WATCH` будет непрерывно обновлять содержимое [LIVE-представления](./create/view.md#live-view). + +```sql +WATCH [db.]live_view; +``` +## Виртуальные столбцы {#watch-virtual-columns} + +Виртуальный столбец `_version` в результате запроса обозначает версию данного результата. + +**Пример:** + +```sql +CREATE LIVE VIEW lv WITH REFRESH 5 AS SELECT now(); +WATCH lv; +``` + +```bash +┌───────────────now()─┬─_version─┐ +│ 2021-02-21 09:17:21 │ 1 │ +└─────────────────────┴──────────┘ +┌───────────────now()─┬─_version─┐ +│ 2021-02-21 09:17:26 │ 2 │ +└─────────────────────┴──────────┘ +┌───────────────now()─┬─_version─┐ +│ 2021-02-21 09:17:31 │ 3 │ +└─────────────────────┴──────────┘ +... +``` + +По умолчанию запрашиваемые данные возвращаются клиенту, однако в сочетании с запросом [INSERT INTO](../../sql-reference/statements/insert-into.md) они могут быть перенаправлены для вставки в другую таблицу. + +**Пример:** + +```sql +INSERT INTO [db.]table WATCH [db.]live_view ... +``` + +## Секция EVENTS {#events-clause} + +С помощью параметра `EVENTS` можно получить компактную форму результата запроса `WATCH`. Вместо полного результата вы получаете номер последней версии результата. + +```sql +WATCH [db.]live_view EVENTS; +``` + +**Пример:** + +```sql +CREATE LIVE VIEW lv WITH REFRESH 5 AS SELECT now(); +WATCH lv EVENTS; +``` + +```bash +┌─version─┐ +│ 1 │ +└─────────┘ +┌─version─┐ +│ 2 │ +└─────────┘ +... +``` + +## Секция LIMIT {#limit-clause} + +Параметр `LIMIT n` задает количество обновлений запроса `WATCH`, после которого отслеживание прекращается. По умолчанию это число не задано, поэтому запрос будет выполняться постоянно. Значение `LIMIT 0` означает, что запрос `WATCH` вернет единственный актуальный результат запроса и прекратит отслеживание. + +```sql +WATCH [db.]live_view LIMIT 1; +``` + +**Пример:** + +```sql +CREATE LIVE VIEW lv WITH REFRESH 5 AS SELECT now(); +WATCH lv EVENTS LIMIT 1; +``` + +```bash +┌─version─┐ +│ 1 │ +└─────────┘ +``` + +## Секция FORMAT {#format-clause} + +Параметр `FORMAT` работает аналогично одноименному параметру запроса [SELECT](../../sql-reference/statements/select/format.md#format-clause). + +!!! info "Примечание" + При отслеживании [LIVE VIEW](./create/view.md#live-view) через интерфейс HTTP следует использовать формат [JSONEachRowWithProgress](../../interfaces/formats.md#jsoneachrowwithprogress). Постоянные сообщения об изменениях будут добавлены в поток вывода для поддержания активности долговременного HTTP-соединения до тех пор, пока результат запроса изменяется. 
Промежуток времени между сообщениями об изменениях управляется настройкой [live_view_heartbeat_interval](../../operations/settings/settings.md#live-view-heartbeat-interval). diff --git a/docs/zh/getting-started/index.md b/docs/zh/getting-started/index.md index fdffca954f7e191b2d655eb357f810480d863116..c5ec7ded9324a60cc33a72a4e7c698553ccb4736 100644 --- a/docs/zh/getting-started/index.md +++ b/docs/zh/getting-started/index.md @@ -1,7 +1,5 @@ --- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_folder_title: "\u5BFC\u8A00" +toc_folder_title: 快速上手 toc_priority: 2 --- @@ -9,7 +7,7 @@ toc_priority: 2 如果您是ClickHouse的新手,并希望亲身体验它的性能。 -首先需要进行 [环境安装与部署](install.md). +首先需要完成 [安装与部署](install.md). 之后,您可以通过教程与示例数据完成自己的入门第一步: diff --git a/docs/zh/introduction/index.md b/docs/zh/introduction/index.md index c8b5a6426d23283471fe52aee11f222b4fbdddcf..64466809d1830ee1745d0a1d46bc0dbf36dd43d5 100644 --- a/docs/zh/introduction/index.md +++ b/docs/zh/introduction/index.md @@ -1,5 +1,5 @@ --- -toc_folder_title: 引言 +toc_folder_title: 简介 toc_priority: 1 --- diff --git a/docs/zh/operations/tips.md b/docs/zh/operations/tips.md index 511e8a226447f07556ab6d93746976716a139165..6b46dbb5285e4cf03a5f3447ec3ec21dc6cd8d64 100644 --- a/docs/zh/operations/tips.md +++ b/docs/zh/operations/tips.md @@ -1,24 +1,8 @@ # 使用建议 {#usage-recommendations} -## CPU {#cpu} +## CPU频率调节器 {#cpu-scaling-governor} -必须支持SSE4.2指令集。 现代处理器(自2008年以来)支持它。 - -选择处理器时,与较少的内核和较高的时钟速率相比,更喜欢大量内核和稍慢的时钟速率。 -例如,具有2600MHz的16核心比具有3600MHz的8核心更好。 - -## 超线程 {#hyper-threading} - -不要禁用超线程。 它有助于某些查询,但不适用于其他查询。 - -## 超频 {#turbo-boost} - -强烈推荐超频(turbo-boost)。 它显着提高了典型负载的性能。 -您可以使用 `turbostat` 要查看负载下的CPU的实际时钟速率。 - -## CPU缩放调控器 {#cpu-scaling-governor} - -始终使用 `performance` 缩放调控器。 该 `on-demand` 随着需求的不断增加,缩放调节器的工作要糟糕得多。 +始终使用 `performance` 频率调节器。 `on-demand` 频率调节器在持续高需求的情况下,效果更差。 ``` bash echo 'performance' | sudo tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor ``` ## CPU限制 {#cpu-limitations} -处理器可能会过热。 使用 `dmesg` 看看CPU的时钟速率是否由于过热而受到限制。 -此限制也可以在数据中心级别的外部设置。 您可以使用 `turbostat` 在负载下监视它。 +处理器可能会过热。 使用 `dmesg` 查看CPU的时钟速率是否由于过热而受到限制。 +该限制也可以在数据中心级别外部设置。 您可以使用 `turbostat` 在负载下对其进行监控。 ## RAM {#ram} -对于少量数据(高达-200GB压缩),最好使用与数据量一样多的内存。 -对于大量数据和处理交互式(在线)查询时,应使用合理数量的RAM(128GB或更多),以便热数据子集适合页面缓存。 -即使对于每台服务器约50TB的数据量,使用128GB的RAM与64GB相比显着提高了查询性能。 +对于少量数据(压缩后约200GB),最好使用与数据量一样多的内存。 +对于大量数据,以及在处理交互式(在线)查询时,应使用合理数量的RAM(128GB或更多),以便热数据子集适合页面缓存。 +即使对于每台服务器约50TB的数据量,与64GB相比,使用128GB的RAM也可以显着提高查询性能。 -## 交换文件 {#swap-file} +不要禁用 overcommit。`cat /proc/sys/vm/overcommit_memory` 的值应该为0或1。运行 -始终禁用交换文件。 不这样做的唯一原因是,如果您使用的ClickHouse在您的个人笔记本电脑。 +``` bash +$ echo 0 | sudo tee /proc/sys/vm/overcommit_memory +``` ## 大页(Huge Pages) {#huge-pages} -始终禁用透明大页(transparent huge pages)。 它会干扰内存分alloc,从而导致显着的性能下降。 +始终禁用透明大页(transparent huge pages)。 它会干扰内存分配器,从而导致显着的性能下降。 ``` bash echo 'never' | sudo tee /sys/kernel/mm/transparent_hugepage/enabled ``` -使用 `perf top` 观察内核中用于内存管理的时间。 +使用 `perf top` 来查看内核在内存管理上花费的时间。 永久大页(permanent huge pages)也不需要被分配。 -## 存储系统 {#storage-subsystem} +## 存储子系统 {#storage-subsystem} 如果您的预算允许您使用SSD,请使用SSD。 如果没有,请使用硬盘。 SATA硬盘7200转就行了。 -优先选择带有本地硬盘驱动器的大量服务器,而不是带有附加磁盘架的小量服务器。 -但是对于存储具有罕见查询的档案,货架将起作用。 +优先选择许多带有本地硬盘驱动器的服务器,而不是少量带有附加磁盘架的服务器。 +但是对于存储极少查询的档案,架子可以使用。 ## RAID {#raid} 当使用硬盘,你可以结合他们的RAID-10,RAID-5,RAID-6或RAID-50。 -对于Linux,软件RAID更好(与 `mdadm`). 我们不建议使用LVM。 +对于Linux,软件RAID更好(使用 `mdadm`).
我们不建议使用LVM。 当创建RAID-10,选择 `far` 布局。 如果您的预算允许,请选择RAID-10。 -如果您有超过4个磁盘,请使用RAID-6(首选)或RAID-50,而不是RAID-5。 +如果您有4个以上的磁盘,请使用RAID-6(首选)或RAID-50,而不是RAID-5。 当使用RAID-5、RAID-6或RAID-50时,始终增加stripe_cache_size,因为默认值通常不是最佳选择。 ``` bash echo 4096 | sudo tee /sys/block/md2/md/stripe_cache_size ``` -使用以下公式,从设备数量和块大小计算确切数量: `2 * num_devices * chunk_size_in_bytes / 4096`. +使用以下公式从设备数量和块大小中计算出确切的数量: `2 * num_devices * chunk_size_in_bytes / 4096`。 -1025KB的块大小足以满足所有RAID配置。 +1024KB的块大小足以满足所有RAID配置。 切勿将块大小设置得太小或太大。 您可以在SSD上使用RAID-0。 -无论使用何种RAID,始终使用复制来保证数据安全。 +无论使用哪种RAID,始终使用复制来保证数据安全。 -使用长队列启用NCQ。 对于HDD,选择CFQ调度程序,对于SSD,选择noop。 不要减少 ‘readahead’ 设置。 +启用有长队列的NCQ。 对于HDD,选择CFQ调度程序,对于SSD,选择noop。 不要减少 ‘readahead’ 设置。 对于HDD,启用写入缓存。 ## 文件系统 {#file-system} Ext4是最可靠的选择。 设置挂载选项 `noatime, nobarrier`. -XFS也是合适的,但它还没有经过ClickHouse的彻底测试。 -大多数其他文件系统也应该正常工作。 具有延迟分配的文件系统工作得更好。 +XFS也是合适的,但它还没有经过ClickHouse的全面测试。 +大多数其他文件系统也应该可以正常工作。 具有延迟分配的文件系统工作得更好。 ## Linux内核 {#linux-kernel} @@ -95,26 +81,43 @@ XFS也是合适的,但它还没有经过ClickHouse的彻底测试。 ## 网络 {#network} -如果您使用的是IPv6,请增加路由缓存的大小。 -3.2之前的Linux内核在IPv6实现方面遇到了许多问题。 +如果使用的是IPv6,请增加路由缓存的大小。 +3.2之前的Linux内核在IPv6实现方面存在许多问题。 + +如果可能的话,至少使用10GB的网络。1GB也可以工作,但对于使用数十TB的数据修补副本或处理具有大量中间数据的分布式查询,情况会更糟。 + +## 虚拟机监视器(Hypervisor)配置 + +如果您使用的是OpenStack,请在nova.conf中设置 +``` +cpu_mode=host-passthrough +``` +。 + +如果您使用的是libvirt,请在XML配置中设置 +``` +<cpu mode='host-passthrough'/> +``` +。 -如果可能的话,至少使用一个10GB的网络。 1Gb也可以工作,但对于使用数十tb的数据修补副本或处理具有大量中间数据的分布式查询,情况会更糟。 +这对于ClickHouse能够通过 `cpuid` 指令获取正确的信息非常重要。 +否则,当在旧的CPU型号上运行虚拟机监视器时,可能会导致 `Illegal instruction` 崩溃。 ## Zookeeper {#zookeeper} -您可能已经将ZooKeeper用于其他目的。 您可以使用相同的zookeeper安装,如果它还没有超载。 +您可能已经将ZooKeeper用于其他目的。 如果它还没有超载,您可以使用相同的zookeeper。 -最好使用新版本的 Zookeeper – 3.4.9 或之后的版本. 稳定 Liunx 发行版中的 Zookeeper 版本可能是落后的。 +最好使用新版本的Zookeeper – 3.4.9 或更高的版本. 稳定的Linux发行版中的Zookeeper版本可能已过时。 -你永远不该使用自己手写的脚本在不同的 Zookeeper 集群之间转移数据, 这可能会导致序列节点的数据不正确。出于同样的原因,永远不要使用 zkcopy 工具: https://github.com/ksprojects/zkcopy/issues/15 +你永远不要使用手动编写的脚本在不同的Zookeeper集群之间传输数据, 这可能会导致序列节点的数据不正确。出于相同的原因,永远不要使用 zkcopy 工具: https://github.com/ksprojects/zkcopy/issues/15 -如果要将现有ZooKeeper集群分为两个,正确的方法是增加其副本的数量,然后将其重新配置为两个独立的集群。 +如果要将现有的ZooKeeper集群分为两个,正确的方法是增加其副本的数量,然后将其重新配置为两个独立的集群。 -不要在与ClickHouse相同的服务器上运行ZooKeeper。 因为ZooKeeper对延迟非常敏感,而ClickHouse可能会占用所有可用的系统资源。 +不要在ClickHouse所在的服务器上运行ZooKeeper。 因为ZooKeeper对延迟非常敏感,而ClickHouse可能会占用所有可用的系统资源。 默认设置下,ZooKeeper 就像是一个定时炸弹: -当使用默认配置时,ZooKeeper服务不会从旧快照和日志中删除文件(请参阅autopurge),这是操作员的责任。 +当使用默认配置时,ZooKeeper服务器不会从旧的快照和日志中删除文件(请参阅autopurge),这是操作员的责任。 必须拆除炸弹。 @@ -222,7 +225,7 @@ JAVA_OPTS="-Xms{{ '{{' }} cluster.get('xms','128M') {{ '}}' }} \ -XX:+CMSParallelRemarkEnabled" ``` -Salt init: +初始化: description "zookeeper-{{ '{{' }} cluster['name'] {{ '}}' }} centralized coordination service" diff --git a/programs/copier/ClusterCopier.cpp b/programs/copier/ClusterCopier.cpp index 7eea23160b2342ae3219c9bf67c3fd3981475582..2b19a401206162f77a527cb494c8418020e75c54 100644 --- a/programs/copier/ClusterCopier.cpp +++ b/programs/copier/ClusterCopier.cpp @@ -106,7 +106,7 @@ void ClusterCopier::discoverShardPartitions(const ConnectionTimeouts & timeouts, try { - type->deserializeAsTextQuoted(*column_dummy, rb, FormatSettings()); + type->getDefaultSerialization()->deserializeTextQuoted(*column_dummy, rb, FormatSettings()); } catch (Exception & e) { @@ -1719,7 +1719,7 @@ std::set ClusterCopier::getShardPartitions(const ConnectionTimeouts & ti for (size_t i = 0; i < column.column->size(); ++i) { WriteBufferFromOwnString wb; - column.type->serializeAsTextQuoted(*column.column, i, wb, FormatSettings()); +
column.type->getDefaultSerialization()->serializeTextQuoted(*column.column, i, wb, FormatSettings()); res.emplace(wb.str()); } } diff --git a/programs/obfuscator/Obfuscator.cpp b/programs/obfuscator/Obfuscator.cpp index 3ccbfd44357166d8aef7f59274b8974c07f6c661..aea70ba0986daa85375708915dda8f778f620d37 100644 --- a/programs/obfuscator/Obfuscator.cpp +++ b/programs/obfuscator/Obfuscator.cpp @@ -100,16 +100,16 @@ class IModel { public: /// Call train iteratively for each block to train a model. - virtual void train(const IColumn & column); + virtual void train(const IColumn & column) = 0; /// Call finalize one time after training before generating. - virtual void finalize(); + virtual void finalize() = 0; /// Call generate: pass source data column to obtain a column with anonymized data as a result. - virtual ColumnPtr generate(const IColumn & column); + virtual ColumnPtr generate(const IColumn & column) = 0; /// Deterministically change seed to some other value. This can be used to generate more values than were in source. - virtual void updateSeed(); + virtual void updateSeed() = 0; virtual ~IModel() = default; }; diff --git a/src/AggregateFunctions/AggregateFunctionArgMinMax.h b/src/AggregateFunctions/AggregateFunctionArgMinMax.h index b559c1c8a7e5b8a5c1f6ba8395e61f1fe1fe91f5..9efc907aed3fa57f08ebcc9c5dad48d235c64202 100644 --- a/src/AggregateFunctions/AggregateFunctionArgMinMax.h +++ b/src/AggregateFunctions/AggregateFunctionArgMinMax.h @@ -39,6 +39,8 @@ class AggregateFunctionArgMinMax final : public IAggregateFunctionTupleArgHelper private: const DataTypePtr & type_res; const DataTypePtr & type_val; + const SerializationPtr serialization_res; + const SerializationPtr serialization_val; bool tuple_argument; using Base = IAggregateFunctionTupleArgHelper, 2>; @@ -48,6 +50,8 @@ public: : Base({type_res_, type_val_}, {}, tuple_argument_) , type_res(this->argument_types[0]) , type_val(this->argument_types[1]) + , serialization_res(type_res->getDefaultSerialization()) + , serialization_val(type_val->getDefaultSerialization()) { if (!type_val->isComparable()) throw Exception( @@ -84,14 +88,14 @@ public: void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf) const override { - this->data(place).result.write(buf, *type_res); - this->data(place).value.write(buf, *type_val); + this->data(place).result.write(buf, *serialization_res); + this->data(place).value.write(buf, *serialization_val); } void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, Arena * arena) const override { - this->data(place).result.read(buf, *type_res, arena); - this->data(place).value.read(buf, *type_val, arena); + this->data(place).result.read(buf, *serialization_res, arena); + this->data(place).value.read(buf, *serialization_val, arena); } bool allocatesMemoryInArena() const override { return Data::allocatesMemoryInArena(); } diff --git a/src/AggregateFunctions/AggregateFunctionGroupArrayInsertAt.h b/src/AggregateFunctions/AggregateFunctionGroupArrayInsertAt.h index 42005659a36b914c9e8663e3508d58ebf9bf362e..35913f133b1a266752a196b648a4b1beae2985c3 100644 --- a/src/AggregateFunctions/AggregateFunctionGroupArrayInsertAt.h +++ b/src/AggregateFunctions/AggregateFunctionGroupArrayInsertAt.h @@ -55,7 +55,8 @@ class AggregateFunctionGroupArrayInsertAtGeneric final : public IAggregateFunctionDataHelper { private: - DataTypePtr & type; + DataTypePtr type; + SerializationPtr serialization; Field default_value; UInt64 length_to_resize = 0; /// zero means - do not do resizing. 
@@ -63,6 +64,7 @@ public: AggregateFunctionGroupArrayInsertAtGeneric(const DataTypes & arguments, const Array & params) : IAggregateFunctionDataHelper(arguments, params) , type(argument_types[0]) + , serialization(type->getDefaultSerialization()) { if (!params.empty()) { @@ -154,7 +156,7 @@ public: else { writeBinary(UInt8(0), buf); - type->serializeBinary(elem, buf); + serialization->serializeBinary(elem, buf); } } } @@ -175,7 +177,7 @@ public: UInt8 is_null = 0; readBinary(is_null, buf); if (!is_null) - type->deserializeBinary(arr[i], buf); + serialization->deserializeBinary(arr[i], buf); } } diff --git a/src/AggregateFunctions/AggregateFunctionMinMaxAny.h b/src/AggregateFunctions/AggregateFunctionMinMaxAny.h index a39d9af000becb7d9c0a670533b9e4dfbb11fb9b..919026a78c158f5035aa1dd2f9b3322b46fcf84f 100644 --- a/src/AggregateFunctions/AggregateFunctionMinMaxAny.h +++ b/src/AggregateFunctions/AggregateFunctionMinMaxAny.h @@ -50,14 +50,14 @@ public: assert_cast(to).insertDefault(); } - void write(WriteBuffer & buf, const IDataType & /*data_type*/) const + void write(WriteBuffer & buf, const ISerialization & /*serialization*/) const { writeBinary(has(), buf); if (has()) writeBinary(value, buf); } - void read(ReadBuffer & buf, const IDataType & /*data_type*/, Arena *) + void read(ReadBuffer & buf, const ISerialization & /*serialization*/, Arena *) { readBinary(has_value, buf); if (has()) @@ -221,14 +221,14 @@ public: assert_cast(to).insertDefault(); } - void write(WriteBuffer & buf, const IDataType & /*data_type*/) const + void write(WriteBuffer & buf, const ISerialization & /*serialization*/) const { writeBinary(size, buf); if (has()) buf.write(getData(), size); } - void read(ReadBuffer & buf, const IDataType & /*data_type*/, Arena * arena) + void read(ReadBuffer & buf, const ISerialization & /*serialization*/, Arena * arena) { Int32 rhs_size; readBinary(rhs_size, buf); @@ -427,24 +427,24 @@ public: to.insertDefault(); } - void write(WriteBuffer & buf, const IDataType & data_type) const + void write(WriteBuffer & buf, const ISerialization & serialization) const { if (!value.isNull()) { writeBinary(true, buf); - data_type.serializeBinary(value, buf); + serialization.serializeBinary(value, buf); } else writeBinary(false, buf); } - void read(ReadBuffer & buf, const IDataType & data_type, Arena *) + void read(ReadBuffer & buf, const ISerialization & serialization, Arena *) { bool is_not_null; readBinary(is_not_null, buf); if (is_not_null) - data_type.deserializeBinary(value, buf); + serialization.deserializeBinary(value, buf); } void change(const IColumn & column, size_t row_num, Arena *) @@ -678,15 +678,15 @@ struct AggregateFunctionAnyHeavyData : Data return false; } - void write(WriteBuffer & buf, const IDataType & data_type) const + void write(WriteBuffer & buf, const ISerialization & serialization) const { - Data::write(buf, data_type); + Data::write(buf, serialization); writeBinary(counter, buf); } - void read(ReadBuffer & buf, const IDataType & data_type, Arena * arena) + void read(ReadBuffer & buf, const ISerialization & serialization, Arena * arena) { - Data::read(buf, data_type, arena); + Data::read(buf, serialization, arena); readBinary(counter, buf); } @@ -698,12 +698,14 @@ template class AggregateFunctionsSingleValue final : public IAggregateFunctionDataHelper> { private: - DataTypePtr & type; + DataTypePtr type; + SerializationPtr serialization; public: AggregateFunctionsSingleValue(const DataTypePtr & type_) : IAggregateFunctionDataHelper>({type_}, {}) , 
type(this->argument_types[0]) + , serialization(type->getDefaultSerialization()) { if (StringRef(Data::name()) == StringRef("min") || StringRef(Data::name()) == StringRef("max")) @@ -733,12 +735,12 @@ public: void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf) const override { - this->data(place).write(buf, *type.get()); + this->data(place).write(buf, *serialization); } void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, Arena * arena) const override { - this->data(place).read(buf, *type.get(), arena); + this->data(place).read(buf, *serialization, arena); } bool allocatesMemoryInArena() const override diff --git a/src/AggregateFunctions/AggregateFunctionSumMap.h b/src/AggregateFunctions/AggregateFunctionSumMap.h index 9e15ae346b00d5ebb1ce333e42bc2e6f5c414ef9..d8b19bad62f0a14cbb131f8c6fddde7e7eab1ef9 100644 --- a/src/AggregateFunctions/AggregateFunctionSumMap.h +++ b/src/AggregateFunctions/AggregateFunctionSumMap.h @@ -64,7 +64,9 @@ class AggregateFunctionMapBase : public IAggregateFunctionDataHelper< { private: DataTypePtr keys_type; + SerializationPtr keys_serialization; DataTypes values_types; + Serializations values_serializations; public: using Base = IAggregateFunctionDataHelper< @@ -72,9 +74,14 @@ public: AggregateFunctionMapBase(const DataTypePtr & keys_type_, const DataTypes & values_types_, const DataTypes & argument_types_) - : Base(argument_types_, {} /* parameters */), keys_type(keys_type_), - values_types(values_types_) + : Base(argument_types_, {} /* parameters */) + , keys_type(keys_type_) + , keys_serialization(keys_type->getDefaultSerialization()) + , values_types(values_types_) { + values_serializations.reserve(values_types.size()); + for (const auto & type : values_types) + values_serializations.emplace_back(type->getDefaultSerialization()); } DataTypePtr getReturnType() const override @@ -248,9 +255,9 @@ public: for (const auto & elem : merged_maps) { - keys_type->serializeBinary(elem.first, buf); + keys_serialization->serializeBinary(elem.first, buf); for (size_t col = 0; col < values_types.size(); ++col) - values_types[col]->serializeBinary(elem.second[col], buf); + values_serializations[col]->serializeBinary(elem.second[col], buf); } } @@ -263,12 +270,12 @@ public: for (size_t i = 0; i < size; ++i) { Field key; - keys_type->deserializeBinary(key, buf); + keys_serialization->deserializeBinary(key, buf); Array values; values.resize(values_types.size()); for (size_t col = 0; col < values_types.size(); ++col) - values_types[col]->deserializeBinary(values[col], buf); + values_serializations[col]->deserializeBinary(values[col], buf); if constexpr (IsDecimalNumber) merged_maps[key.get>()] = values; diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index b34a64b2d19d9ffbb7bf04f7ca5d1d178ecf6e58..805941f7dccc00f78cc08f4649c4958ec9342e2f 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -158,7 +158,11 @@ macro(add_object_library name common_path) list (APPEND all_modules ${name}) add_headers_and_sources(${name} ${common_path}) add_library(${name} SHARED ${${name}_sources} ${${name}_headers}) - target_link_libraries (${name} PRIVATE -Wl,--unresolved-symbols=ignore-all) + if (OS_DARWIN) + target_link_libraries (${name} PRIVATE -Wl,-undefined,dynamic_lookup) + else() + target_link_libraries (${name} PRIVATE -Wl,--unresolved-symbols=ignore-all) + endif() endif () endmacro() @@ -168,6 +172,7 @@ add_object_library(clickhouse_core_mysql Core/MySQL) add_object_library(clickhouse_compression Compression) 
add_object_library(clickhouse_datastreams DataStreams) add_object_library(clickhouse_datatypes DataTypes) +add_object_library(clickhouse_datatypes_serializations DataTypes/Serializations) add_object_library(clickhouse_databases Databases) add_object_library(clickhouse_databases_mysql Databases/MySQL) add_object_library(clickhouse_disks Disks) @@ -215,7 +220,11 @@ else() target_link_libraries (clickhouse_interpreters PRIVATE clickhouse_parsers_new jemalloc libdivide) list (APPEND all_modules dbms) # force all split libs to be linked - set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,--no-as-needed") + if (OS_DARWIN) + set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,-undefined,error") + else() + set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,--no-as-needed") + endif() endif () macro (dbms_target_include_directories) diff --git a/src/Client/HedgedConnections.cpp b/src/Client/HedgedConnections.cpp index 88c92b9fca0097cde29d5958bccb9468bf2b9479..a163ceba4a289bc4657b81a42e643875d11b322a 100644 --- a/src/Client/HedgedConnections.cpp +++ b/src/Client/HedgedConnections.cpp @@ -362,9 +362,10 @@ int HedgedConnections::getReadyFileDescriptor(AsyncCallback async_callback) epoll_event event; event.data.fd = -1; size_t events_count = 0; + bool blocking = !static_cast<bool>(async_callback); while (events_count == 0) { - events_count = epoll.getManyReady(1, &event, false); + events_count = epoll.getManyReady(1, &event, blocking); if (!events_count && async_callback) async_callback(epoll.getFileDescriptor(), 0, epoll.getDescription()); } diff --git a/src/Columns/ColumnArray.cpp b/src/Columns/ColumnArray.cpp index 5267bc5db5d09ca402d370a1bd6530da781cf267..1b0c9f5162feba287611b0b0afe2641c06aab106 100644 --- a/src/Columns/ColumnArray.cpp +++ b/src/Columns/ColumnArray.cpp @@ -1211,7 +1211,6 @@ ColumnPtr ColumnArray::replicateTuple(const Offsets & replicate_offsets) const assert_cast(*temporary_arrays.front()).getOffsetsPtr()); } - void ColumnArray::gather(ColumnGathererStream & gatherer) { gatherer.gather(*this); diff --git a/src/Columns/ColumnString.cpp b/src/Columns/ColumnString.cpp index c1eddd539c99809d757833b33427061558476022..fcd0516d465bcc55549cf504ab1705027555caff 100644 --- a/src/Columns/ColumnString.cpp +++ b/src/Columns/ColumnString.cpp @@ -536,7 +536,6 @@ void ColumnString::getExtremes(Field & min, Field & max) const get(max_idx, max); } - ColumnPtr ColumnString::compress() const { size_t source_chars_size = chars.size(); diff --git a/src/Columns/ColumnString.h b/src/Columns/ColumnString.h index 5f570afcdbb183c816b7130dbdc01a0e96bacb25..1792491c60d4b40d444bc712011ef7d5d9fa4ce1 100644 --- a/src/Columns/ColumnString.h +++ b/src/Columns/ColumnString.h @@ -279,7 +279,6 @@ public: return typeid(rhs) == typeid(ColumnString); } - Chars & getChars() { return chars; } const Chars & getChars() const { return chars; } diff --git a/src/Columns/IColumn.h b/src/Columns/IColumn.h index 1dedd191e1d60a62ae1306be80892d7ef520a603..23acc81e63d34dfc7cb135c7fd57f68606a2b76b 100644 --- a/src/Columns/IColumn.h +++ b/src/Columns/IColumn.h @@ -26,6 +26,9 @@ class ColumnGathererStream; class Field; class WeakHash32; +class ISerialization; +using SerializationPtr = std::shared_ptr<const ISerialization>; + /* * Represents a set of equal ranges in previous column to perform sorting in current column.
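The two lines added to IColumn.h above follow the standard decoupling pattern: a column only needs to carry a pointer to its serialization, so the header forward-declares ISerialization instead of including the new Serializations headers. A minimal, self-contained sketch of the idea (generic names; only the alias mirrors the actual change):

#include <memory>

/// Header side (analogous to the IColumn.h change above): a forward
/// declaration plus an alias is enough, because the header only stores
/// the pointer and never calls through it.
class ISerialization;
using SerializationPtr = std::shared_ptr<const ISerialization>;

struct ColumnLike
{
    SerializationPtr serialization;   /// opaque at this point
};

/// Source-file side: the full definition is visible before any use.
class ISerialization
{
public:
    virtual ~ISerialization() = default;
};

int main()
{
    ColumnLike col;
    col.serialization = std::make_shared<const ISerialization>();
    return col.serialization ? 0 : 1;
}

Any translation unit that dereferences the pointer must see the complete class; everything else compiles against the one-line declaration, which keeps rebuilds after serialization changes localized.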
diff --git a/src/Columns/IColumnDummy.h b/src/Columns/IColumnDummy.h index ff405184b7ae2f397eb93d4e90065d407174a135..e844b54aaedf7f0d8bd1b1e571ad54ee628ca24b 100644 --- a/src/Columns/IColumnDummy.h +++ b/src/Columns/IColumnDummy.h @@ -4,6 +4,7 @@ #include #include #include +#include namespace DB diff --git a/src/Common/tests/gtest_DateLUTImpl.cpp b/src/Common/tests/gtest_DateLUTImpl.cpp index e18bd16e7313b7be8250b6a85c50a137d782b71c..4be04e44fc14614aa3a05384d392846286ad419f 100644 --- a/src/Common/tests/gtest_DateLUTImpl.cpp +++ b/src/Common/tests/gtest_DateLUTImpl.cpp @@ -130,7 +130,6 @@ TEST(DateLUTTest, TimeValuesInMiddleOfRange) EXPECT_EQ(lut.toRelativeQuarterNum(time), 8078 /*unsigned*/); EXPECT_EQ(lut.toRelativeHourNum(time), 435736 /*time_t*/); EXPECT_EQ(lut.toRelativeMinuteNum(time), 26144180 /*time_t*/); - EXPECT_EQ(lut.toStartOfHourInterval(time, 5), 1568646000 /*time_t*/); EXPECT_EQ(lut.toStartOfMinuteInterval(time, 6), 1568650680 /*time_t*/); EXPECT_EQ(lut.toStartOfSecondInterval(time, 7), 1568650811 /*time_t*/); EXPECT_EQ(lut.toNumYYYYMM(time), 201909 /*UInt32*/); @@ -191,7 +190,6 @@ TEST(DateLUTTest, TimeValuesAtLeftBoderOfRange) EXPECT_EQ(lut.toRelativeQuarterNum(time), 7880 /*unsigned*/); // ? EXPECT_EQ(lut.toRelativeHourNum(time), 0 /*time_t*/); EXPECT_EQ(lut.toRelativeMinuteNum(time), 0 /*time_t*/); - EXPECT_EQ(lut.toStartOfHourInterval(time, 5), 0 /*time_t*/); EXPECT_EQ(lut.toStartOfMinuteInterval(time, 6), 0 /*time_t*/); EXPECT_EQ(lut.toStartOfSecondInterval(time, 7), 0 /*time_t*/); EXPECT_EQ(lut.toNumYYYYMM(time), 197001 /*UInt32*/); @@ -253,7 +251,6 @@ TEST(DateLUTTest, TimeValuesAtRightBoderOfRangeOfOldLUT) EXPECT_EQ(lut.toRelativeQuarterNum(time), 8424 /*unsigned*/); EXPECT_EQ(lut.toRelativeHourNum(time), 1192873 /*time_t*/); EXPECT_EQ(lut.toRelativeMinuteNum(time), 71572397 /*time_t*/); - EXPECT_EQ(lut.toStartOfHourInterval(time, 5), 4294332000 /*time_t*/); EXPECT_EQ(lut.toStartOfMinuteInterval(time, 6), 4294343520 /*time_t*/); EXPECT_EQ(lut.toStartOfSecondInterval(time, 7), 4294343872 /*time_t*/); EXPECT_EQ(lut.toNumYYYYMM(time), 210601 /*UInt32*/); diff --git a/src/Compression/CompressionFactory.cpp b/src/Compression/CompressionFactory.cpp index aacf95b195015e47fc7834636247203f5544cefb..0ff9797aeaf1901d5149ff209c76acaec77ca6b5 100644 --- a/src/Compression/CompressionFactory.cpp +++ b/src/Compression/CompressionFactory.cpp @@ -107,9 +107,9 @@ ASTPtr CompressionCodecFactory::validateCodecAndGetPreprocessedAST(const ASTPtr if (column_type) { CompressionCodecPtr prev_codec; - IDataType::StreamCallback callback = [&](const IDataType::SubstreamPath & substream_path, const IDataType & substream_type) + IDataType::StreamCallbackWithType callback = [&](const ISerialization::SubstreamPath & substream_path, const IDataType & substream_type) { - if (IDataType::isSpecialCompressionAllowed(substream_path)) + if (ISerialization::isSpecialCompressionAllowed(substream_path)) { result_codec = getImpl(codec_family_name, codec_arguments, &substream_type); @@ -121,8 +121,8 @@ ASTPtr CompressionCodecFactory::validateCodecAndGetPreprocessedAST(const ASTPtr } }; - IDataType::SubstreamPath stream_path; - column_type->enumerateStreams(callback, stream_path); + ISerialization::SubstreamPath stream_path; + column_type->enumerateStreams(column_type->getDefaultSerialization(), callback, stream_path); if (!result_codec) throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot find any substream with data type for type {}. 
It's a bug", column_type->getName()); diff --git a/src/Core/MySQL/PacketsProtocolText.cpp b/src/Core/MySQL/PacketsProtocolText.cpp index 62efe549b335858aa2060168d89cf6fee9a5bd02..0494a146c471bdaeaa302f329643cccde69a1c40 100644 --- a/src/Core/MySQL/PacketsProtocolText.cpp +++ b/src/Core/MySQL/PacketsProtocolText.cpp @@ -12,7 +12,7 @@ namespace MySQLProtocol namespace ProtocolText { -ResultSetRow::ResultSetRow(const DataTypes & data_types, const Columns & columns_, int row_num_) +ResultSetRow::ResultSetRow(const Serializations & serializations, const Columns & columns_, int row_num_) : columns(columns_), row_num(row_num_) { for (size_t i = 0; i < columns.size(); i++) @@ -25,7 +25,7 @@ ResultSetRow::ResultSetRow(const DataTypes & data_types, const Columns & columns else { WriteBufferFromOwnString ostr; - data_types[i]->serializeAsText(*columns[i], row_num, ostr, FormatSettings()); + serializations[i]->serializeText(*columns[i], row_num, ostr, FormatSettings()); payload_size += getLengthEncodedStringSize(ostr.str()); serialized.push_back(std::move(ostr.str())); } diff --git a/src/Core/MySQL/PacketsProtocolText.h b/src/Core/MySQL/PacketsProtocolText.h index b54b1c5ca195f0e5e7020195059c11d529086972..aeeedc4dbf8cccc4ae112079d0d66ddf591b3943 100644 --- a/src/Core/MySQL/PacketsProtocolText.h +++ b/src/Core/MySQL/PacketsProtocolText.h @@ -76,7 +76,7 @@ protected: void writePayloadImpl(WriteBuffer & buffer) const override; public: - ResultSetRow(const DataTypes & data_types, const Columns & columns_, int row_num_); + ResultSetRow(const Serializations & serializations, const Columns & columns_, int row_num_); }; class ComFieldList : public LimitedReadPacket diff --git a/src/Core/NamesAndTypes.cpp b/src/Core/NamesAndTypes.cpp index 7b1779d4346f487cb9bca7d6f1d6ffac04a36cfc..57d29c96c535a2973fa713b83b4ebe227f03eb49 100644 --- a/src/Core/NamesAndTypes.cpp +++ b/src/Core/NamesAndTypes.cpp @@ -22,7 +22,9 @@ NameAndTypePair::NameAndTypePair( : name(name_in_storage_ + (subcolumn_name_.empty() ? "" : "." + subcolumn_name_)) , type(subcolumn_type_) , type_in_storage(type_in_storage_) - , subcolumn_delimiter_position(name_in_storage_.size()) {} + , subcolumn_delimiter_position(subcolumn_name_.empty() ? std::nullopt : std::make_optional(name_in_storage_.size())) +{ +} String NameAndTypePair::getNameInStorage() const { diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 896b56b4d0b21ef8044c29597bfff36735d15a75..2986564b3981a88bae8ddb932be28128399ee42d 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -116,6 +116,7 @@ class IColumn; M(UInt64, parallel_distributed_insert_select, 0, "Process distributed INSERT SELECT query in the same cluster on local tables on every shard, if 1 SELECT is executed on each shard, if 2 SELECT and INSERT is executed on each shard", 0) \ M(UInt64, distributed_group_by_no_merge, 0, "If 1, Do not merge aggregation states from different servers for distributed query processing - in case it is for certain that there are different keys on different shards. If 2 - same as 1 but also apply ORDER BY and LIMIT stages", 0) \ M(Bool, optimize_distributed_group_by_sharding_key, false, "Optimize GROUP BY sharding_key queries (by avoiding costly aggregation on the initiator server).", 0) \ + M(UInt64, optimize_skip_unused_shards_limit, 1000, "Limit for number of sharding key values, turns off optimize_skip_unused_shards if the limit is reached", 0) \ M(Bool, optimize_skip_unused_shards, false, "Assumes that data is distributed by sharding_key. 
Optimization to skip unused shards if SELECT query filters by sharding_key.", 0) \ M(Bool, allow_nondeterministic_optimize_skip_unused_shards, false, "Allow non-deterministic functions (includes dictGet) in sharding_key for optimize_skip_unused_shards", 0) \ M(UInt64, force_optimize_skip_unused_shards, 0, "Throw an exception if unused shards cannot be skipped (1 - throw only if the table has the sharding key, 2 - always throw.", 0) \ @@ -215,7 +216,7 @@ class IColumn; \ M(Bool, insert_distributed_sync, false, "If setting is enabled, insert query into distributed waits until data will be sent to all nodes in cluster.", 0) \ M(UInt64, insert_distributed_timeout, 0, "Timeout for insert query into distributed. Setting is used only with insert_distributed_sync enabled. Zero value means no timeout.", 0) \ - M(Int64, distributed_ddl_task_timeout, 180, "Timeout for DDL query responses from all hosts in cluster. If a ddl request has not been performed on all hosts, a response will contain a timeout error and a request will be executed in an async mode. Negative value means infinite.", 0) \ + M(Int64, distributed_ddl_task_timeout, 180, "Timeout for DDL query responses from all hosts in cluster. If a ddl request has not been performed on all hosts, a response will contain a timeout error and a request will be executed in an async mode. Negative value means infinite. Zero means async mode.", 0) \ M(Milliseconds, stream_flush_interval_ms, 7500, "Timeout for flushing data from streaming storages.", 0) \ M(Milliseconds, stream_poll_timeout_ms, 500, "Timeout for polling data from/to streaming storages.", 0) \ \ @@ -437,7 +438,9 @@ class IColumn; M(Bool, engine_file_truncate_on_insert, false, "Enables or disables truncate before insert in file engine tables", 0) \ M(Bool, allow_experimental_database_replicated, false, "Allow to create databases with Replicated engine", 0) \ M(UInt64, database_replicated_initial_query_timeout_sec, 300, "How long initial DDL query should wait for Replicated database to precess previous DDL queue entries", 0) \ - M(Bool, database_replicated_ddl_output, true, "Return table with query execution status as a result of DDL query", 0) \ + M(Bool, database_replicated_always_detach_permanently, false, "Execute DETACH TABLE as DETACH TABLE PERMANENTLY if database engine is Replicated", 0) \ + M(DistributedDDLOutputMode, distributed_ddl_output_mode, DistributedDDLOutputMode::THROW, "Format of distributed DDL query result", 0) \ + M(UInt64, distributed_ddl_entry_format_version, 1, "Version of DDL entry to write into ZooKeeper", 0) \ \ /** Obsolete settings that do nothing but left for compatibility reasons. Remove each one after half a year of obsolescence. */ \ \ @@ -449,6 +452,7 @@ class IColumn; M(Bool, optimize_aggregators_of_group_by_keys, true, "Eliminates min/max/any/anyLast aggregators of GROUP BY keys in SELECT section", 0) \ M(Bool, optimize_group_by_function_keys, true, "Eliminates functions of other keys in GROUP BY section", 0) \ M(UInt64, query_plan_max_optimizations_to_apply, 10000, "Limit the total number of optimizations applied to query plan. If zero, ignored. If limit reached, throw exception", 0) \ + M(Bool, database_replicated_ddl_output, true, "Obsolete setting, does nothing. Will be removed after 2021-09-08", 0) \ // End of COMMON_SETTINGS // Please add settings related to formats into the FORMAT_FACTORY_SETTINGS below. 
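The reworked distributed_ddl_task_timeout semantics above (negative means wait forever, and zero now means fully asynchronous execution) combine with the new distributed_ddl_output_mode enum. A minimal sketch of how a caller might interpret the timeout value under these rules — the struct and helper are illustrative, not the actual DDLWorker code:

#include <chrono>
#include <cstdint>
#include <optional>

struct DDLWaitPolicy
{
    bool wait = true;                              /// false => zero timeout: async mode, return at once
    std::optional<std::chrono::seconds> deadline;  /// empty while wait == true => infinite wait
};

/// Hypothetical helper: translate the setting's value into a wait policy.
DDLWaitPolicy interpretDDLTimeout(int64_t distributed_ddl_task_timeout)
{
    if (distributed_ddl_task_timeout == 0)
        return {false, std::nullopt};                                   /// async mode
    if (distributed_ddl_task_timeout < 0)
        return {true, std::nullopt};                                    /// infinite wait
    return {true, std::chrono::seconds(distributed_ddl_task_timeout)};  /// bounded wait
}

int main()
{
    auto policy = interpretDDLTimeout(180);
    return (policy.wait && policy.deadline) ? 0 : 1;
}

Judging by the enum values added in the next hunk, a bounded wait that expires raises an error under the default throw mode, while null_status_on_timeout and never_throw return the (possibly partial) status table instead.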
diff --git a/src/Core/SettingsEnums.cpp b/src/Core/SettingsEnums.cpp index 2e1cf0252561f0a8b30fe3a7483b20e3cdfe289a..64ba51d1c682ef3dd28795cb8abb0797bc7c6c65 100644 --- a/src/Core/SettingsEnums.cpp +++ b/src/Core/SettingsEnums.cpp @@ -102,4 +102,10 @@ IMPLEMENT_SETTING_ENUM(UnionMode, ErrorCodes::UNKNOWN_UNION, {"ALL", UnionMode::ALL}, {"DISTINCT", UnionMode::DISTINCT}}) +IMPLEMENT_SETTING_ENUM(DistributedDDLOutputMode, ErrorCodes::BAD_ARGUMENTS, + {{"none", DistributedDDLOutputMode::NONE}, + {"throw", DistributedDDLOutputMode::THROW}, + {"null_status_on_timeout", DistributedDDLOutputMode::NULL_STATUS_ON_TIMEOUT}, + {"never_throw", DistributedDDLOutputMode::NEVER_THROW}}) + } diff --git a/src/Core/SettingsEnums.h b/src/Core/SettingsEnums.h index c2ef08135ebed35e84f5de6252546a6ecb4c2ae3..7615b185a6105dc7f62f359e7e3041ac4107805c 100644 --- a/src/Core/SettingsEnums.h +++ b/src/Core/SettingsEnums.h @@ -138,4 +138,15 @@ enum class UnionMode DECLARE_SETTING_ENUM(UnionMode) + +enum class DistributedDDLOutputMode +{ + NONE, + THROW, + NULL_STATUS_ON_TIMEOUT, + NEVER_THROW, +}; + +DECLARE_SETTING_ENUM(DistributedDDLOutputMode) + } diff --git a/src/DataStreams/NativeBlockInputStream.cpp b/src/DataStreams/NativeBlockInputStream.cpp index 377f4451419565bf9dd09dfdc79e802f6b24c976..2f376f5230d297e1fb83c6d88038f2e8da5b063a 100644 --- a/src/DataStreams/NativeBlockInputStream.cpp +++ b/src/DataStreams/NativeBlockInputStream.cpp @@ -73,14 +73,16 @@ void NativeBlockInputStream::resetParser() void NativeBlockInputStream::readData(const IDataType & type, ColumnPtr & column, ReadBuffer & istr, size_t rows, double avg_value_size_hint) { - IDataType::DeserializeBinaryBulkSettings settings; - settings.getter = [&](IDataType::SubstreamPath) -> ReadBuffer * { return &istr; }; + ISerialization::DeserializeBinaryBulkSettings settings; + settings.getter = [&](ISerialization::SubstreamPath) -> ReadBuffer * { return &istr; }; settings.avg_value_size_hint = avg_value_size_hint; settings.position_independent_encoding = false; - IDataType::DeserializeBinaryBulkStatePtr state; - type.deserializeBinaryBulkStatePrefix(settings, state); - type.deserializeBinaryBulkWithMultipleStreams(column, rows, settings, state); + ISerialization::DeserializeBinaryBulkStatePtr state; + auto serialization = type.getDefaultSerialization(); + + serialization->deserializeBinaryBulkStatePrefix(settings, state); + serialization->deserializeBinaryBulkWithMultipleStreams(column, rows, settings, state, nullptr); if (column->size() != rows) throw Exception("Cannot read all data in NativeBlockInputStream. Rows read: " + toString(column->size()) + ". 
Rows expected: " + toString(rows) + ".", diff --git a/src/DataStreams/NativeBlockOutputStream.cpp b/src/DataStreams/NativeBlockOutputStream.cpp index c17d0dacc49f4f10f036c045ebbc20b88c54aba8..da68376201ffa346041c393c7880e1cb8c6f003d 100644 --- a/src/DataStreams/NativeBlockOutputStream.cpp +++ b/src/DataStreams/NativeBlockOutputStream.cpp @@ -48,15 +48,17 @@ void NativeBlockOutputStream::writeData(const IDataType & type, const ColumnPtr */ ColumnPtr full_column = column->convertToFullColumnIfConst(); - IDataType::SerializeBinaryBulkSettings settings; - settings.getter = [&ostr](IDataType::SubstreamPath) -> WriteBuffer * { return &ostr; }; + ISerialization::SerializeBinaryBulkSettings settings; + settings.getter = [&ostr](ISerialization::SubstreamPath) -> WriteBuffer * { return &ostr; }; settings.position_independent_encoding = false; settings.low_cardinality_max_dictionary_size = 0; - IDataType::SerializeBinaryBulkStatePtr state; - type.serializeBinaryBulkStatePrefix(settings, state); - type.serializeBinaryBulkWithMultipleStreams(*full_column, offset, limit, settings, state); - type.serializeBinaryBulkStateSuffix(settings, state); + auto serialization = type.getDefaultSerialization(); + + ISerialization::SerializeBinaryBulkStatePtr state; + serialization->serializeBinaryBulkStatePrefix(settings, state); + serialization->serializeBinaryBulkWithMultipleStreams(*full_column, offset, limit, settings, state); + serialization->serializeBinaryBulkStateSuffix(settings, state); } diff --git a/src/DataStreams/PostgreSQLBlockInputStream.cpp b/src/DataStreams/PostgreSQLBlockInputStream.cpp index abe635207290dda0dc1bed1fa35d84cf6575e761..478df9e2259e8f2912cc236df1d62555366dc708 100644 --- a/src/DataStreams/PostgreSQLBlockInputStream.cpp +++ b/src/DataStreams/PostgreSQLBlockInputStream.cpp @@ -176,7 +176,7 @@ void PostgreSQLBlockInputStream::insertValue(IColumn & column, std::string_view case ValueType::vtDecimal256: { ReadBufferFromString istr(value); - data_type->deserializeAsWholeText(column, istr, FormatSettings{}); + data_type->getDefaultSerialization()->deserializeWholeText(column, istr, FormatSettings{}); break; } case ValueType::vtArray: diff --git a/src/DataStreams/RemoteQueryExecutorReadContext.cpp b/src/DataStreams/RemoteQueryExecutorReadContext.cpp index 11cc2dcd8e43158f51c0f381dbd783f19e9c27ff..46c2b6f10cbec9089d82691957a79c04f53b58fb 100644 --- a/src/DataStreams/RemoteQueryExecutorReadContext.cpp +++ b/src/DataStreams/RemoteQueryExecutorReadContext.cpp @@ -104,7 +104,7 @@ void RemoteQueryExecutorReadContext::setConnectionFD(int fd, const Poco::Timespa connection_fd_description = fd_description; } -bool RemoteQueryExecutorReadContext::checkTimeout(bool blocking) const +bool RemoteQueryExecutorReadContext::checkTimeout(bool blocking) { try { @@ -118,7 +118,7 @@ bool RemoteQueryExecutorReadContext::checkTimeout(bool blocking) const } } -bool RemoteQueryExecutorReadContext::checkTimeoutImpl(bool blocking) const +bool RemoteQueryExecutorReadContext::checkTimeoutImpl(bool blocking) { /// Wait for epoll will not block if it was polled externally. 
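/// (The three-slot event array that follows matches the three descriptors this
/// context multiplexes: the connection socket, the timer fd and the cancel pipe.
/// "Blocking" here presumably maps to an epoll wait with no timeout, while the
/// non-blocking path polls once and returns immediately.)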
epoll_event events[3]; @@ -128,14 +128,13 @@ bool RemoteQueryExecutorReadContext::checkTimeoutImpl(bool blocking) const bool is_socket_ready = false; bool is_pipe_alarmed = false; - bool has_timer_alarm = false; for (int i = 0; i < num_events; ++i) { if (events[i].data.fd == connection_fd) is_socket_ready = true; if (events[i].data.fd == timer.getDescriptor()) - has_timer_alarm = true; + is_timer_alarmed = true; if (events[i].data.fd == pipe_fd[0]) is_pipe_alarmed = true; } @@ -143,7 +142,7 @@ bool RemoteQueryExecutorReadContext::checkTimeoutImpl(bool blocking) const if (is_pipe_alarmed) return false; - if (has_timer_alarm && !is_socket_ready) + if (is_timer_alarmed && !is_socket_ready) { /// Socket receive timeout. Drain it in case of error, or it may be hidden by the timeout exception. timer.drain(); @@ -188,10 +187,18 @@ void RemoteQueryExecutorReadContext::cancel() /// It is safe to just destroy fiber - we are not in the process of reading from socket. boost::context::fiber to_destroy = std::move(fiber); - while (is_read_in_progress.load(std::memory_order_relaxed)) + /// One should not wait for the current packet here in case of + /// timeout, because that would exceed the timeout. + /// Anyway, if the timeout is exceeded, the connection will be shut down + /// (disconnected), so it will not be left in an unsynchronised state. + if (!is_timer_alarmed) { - checkTimeout(/* blocking= */ true); - to_destroy = std::move(to_destroy).resume(); + /// Wait for the current pending packet, to avoid leaving the connection in an unsynchronised state. + while (is_read_in_progress.load(std::memory_order_relaxed)) + { + checkTimeout(/* blocking= */ true); + to_destroy = std::move(to_destroy).resume(); + } } /// Send something to pipe to cancel executor waiting. diff --git a/src/DataStreams/RemoteQueryExecutorReadContext.h b/src/DataStreams/RemoteQueryExecutorReadContext.h index 5fbe52469cda53cd1442cbd733c8d5d462da75e6..4e935bf9c431c559f83635a2e9f422aa52fe837f 100644 --- a/src/DataStreams/RemoteQueryExecutorReadContext.h +++ b/src/DataStreams/RemoteQueryExecutorReadContext.h @@ -44,6 +44,7 @@ public: /// * pipe_fd is a pipe we use to cancel query and socket polling by executor. /// We put those descriptors into our own epoll which is used by external executor.
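/// (is_timer_alarmed, added just below, records that the receive-timeout timer
/// has fired; cancel() consults it so that it does not block waiting for an
/// in-flight packet on a connection that has already timed out and will be
/// disconnected anyway.)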
TimerDescriptor timer{CLOCK_MONOTONIC, 0}; + bool is_timer_alarmed = false; int connection_fd = -1; int pipe_fd[2] = { -1, -1 }; @@ -54,8 +55,8 @@ public: explicit RemoteQueryExecutorReadContext(IConnections & connections_); ~RemoteQueryExecutorReadContext(); - bool checkTimeout(bool blocking = false) const; - bool checkTimeoutImpl(bool blocking) const; + bool checkTimeout(bool blocking = false); + bool checkTimeoutImpl(bool blocking); void setConnectionFD(int fd, const Poco::Timespan & timeout = 0, const std::string & fd_description = ""); void setTimer() const; diff --git a/src/DataTypes/DataTypeAggregateFunction.cpp b/src/DataTypes/DataTypeAggregateFunction.cpp index e92994ae979e8184aedce685cfb24d36722afe35..7f7b01e031b21d913c04152caa4a0d45295e3d53 100644 --- a/src/DataTypes/DataTypeAggregateFunction.cpp +++ b/src/DataTypes/DataTypeAggregateFunction.cpp @@ -11,6 +11,7 @@ #include #include +#include #include #include #include @@ -58,207 +59,6 @@ std::string DataTypeAggregateFunction::doGetName() const return stream.str(); } -void DataTypeAggregateFunction::serializeBinary(const Field & field, WriteBuffer & ostr) const -{ - const String & s = get(field); - writeVarUInt(s.size(), ostr); - writeString(s, ostr); -} - -void DataTypeAggregateFunction::deserializeBinary(Field & field, ReadBuffer & istr) const -{ - UInt64 size; - readVarUInt(size, istr); - field = String(); - String & s = get(field); - s.resize(size); - istr.readStrict(s.data(), size); -} - -void DataTypeAggregateFunction::serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const -{ - function->serialize(assert_cast(column).getData()[row_num], ostr); -} - -void DataTypeAggregateFunction::deserializeBinary(IColumn & column, ReadBuffer & istr) const -{ - ColumnAggregateFunction & column_concrete = assert_cast(column); - - Arena & arena = column_concrete.createOrGetArena(); - size_t size_of_state = function->sizeOfData(); - AggregateDataPtr place = arena.alignedAlloc(size_of_state, function->alignOfData()); - - function->create(place); - try - { - function->deserialize(place, istr, &arena); - } - catch (...) - { - function->destroy(place); - throw; - } - - column_concrete.getData().push_back(place); -} - -void DataTypeAggregateFunction::serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const -{ - const ColumnAggregateFunction & real_column = typeid_cast(column); - const ColumnAggregateFunction::Container & vec = real_column.getData(); - - ColumnAggregateFunction::Container::const_iterator it = vec.begin() + offset; - ColumnAggregateFunction::Container::const_iterator end = limit ? it + limit : vec.end(); - - if (end > vec.end()) - end = vec.end(); - - for (; it != end; ++it) - function->serialize(*it, ostr); -} - -void DataTypeAggregateFunction::deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double /*avg_value_size_hint*/) const -{ - ColumnAggregateFunction & real_column = typeid_cast(column); - ColumnAggregateFunction::Container & vec = real_column.getData(); - - Arena & arena = real_column.createOrGetArena(); - real_column.set(function); - vec.reserve(vec.size() + limit); - - size_t size_of_state = function->sizeOfData(); - size_t align_of_state = function->alignOfData(); - - for (size_t i = 0; i < limit; ++i) - { - if (istr.eof()) - break; - - AggregateDataPtr place = arena.alignedAlloc(size_of_state, align_of_state); - - function->create(place); - - try - { - function->deserialize(place, istr, &arena); - } - catch (...) 
- { - function->destroy(place); - throw; - } - - vec.push_back(place); - } -} - -static String serializeToString(const AggregateFunctionPtr & function, const IColumn & column, size_t row_num) -{ - WriteBufferFromOwnString buffer; - function->serialize(assert_cast(column).getData()[row_num], buffer); - return buffer.str(); -} - -static void deserializeFromString(const AggregateFunctionPtr & function, IColumn & column, const String & s) -{ - ColumnAggregateFunction & column_concrete = assert_cast(column); - - Arena & arena = column_concrete.createOrGetArena(); - size_t size_of_state = function->sizeOfData(); - AggregateDataPtr place = arena.alignedAlloc(size_of_state, function->alignOfData()); - - function->create(place); - - try - { - ReadBufferFromString istr(s); - function->deserialize(place, istr, &arena); - } - catch (...) - { - function->destroy(place); - throw; - } - - column_concrete.getData().push_back(place); -} - -void DataTypeAggregateFunction::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const -{ - writeString(serializeToString(function, column, row_num), ostr); -} - - -void DataTypeAggregateFunction::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const -{ - writeEscapedString(serializeToString(function, column, row_num), ostr); -} - - -void DataTypeAggregateFunction::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const -{ - String s; - readEscapedString(s, istr); - deserializeFromString(function, column, s); -} - - -void DataTypeAggregateFunction::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const -{ - writeQuotedString(serializeToString(function, column, row_num), ostr); -} - - -void DataTypeAggregateFunction::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const -{ - String s; - readQuotedStringWithSQLStyle(s, istr); - deserializeFromString(function, column, s); -} - - -void DataTypeAggregateFunction::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const -{ - String s; - readStringUntilEOF(s, istr); - deserializeFromString(function, column, s); -} - - -void DataTypeAggregateFunction::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - writeJSONString(serializeToString(function, column, row_num), ostr, settings); -} - - -void DataTypeAggregateFunction::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const -{ - String s; - readJSONString(s, istr); - deserializeFromString(function, column, s); -} - - -void DataTypeAggregateFunction::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const -{ - writeXMLStringForTextElement(serializeToString(function, column, row_num), ostr); -} - - -void DataTypeAggregateFunction::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const -{ - writeCSV(serializeToString(function, column, row_num), ostr); -} - - -void DataTypeAggregateFunction::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - String s; - readCSV(s, istr, settings.csv); - deserializeFromString(function, column, s); -} - - MutableColumnPtr DataTypeAggregateFunction::createColumn() const { return ColumnAggregateFunction::create(function); @@ -298,6 +98,11 @@ bool 
DataTypeAggregateFunction::equals(const IDataType & rhs) const return typeid(rhs) == typeid(*this) && getName() == rhs.getName(); } +SerializationPtr DataTypeAggregateFunction::doGetDefaultSerialization() const +{ + return std::make_shared(function); +} + static DataTypePtr create(const ASTPtr & arguments) { diff --git a/src/DataTypes/DataTypeAggregateFunction.h b/src/DataTypes/DataTypeAggregateFunction.h index d07d46fd3ee832b7fc964801a9f9f987453a0ecb..c3fea2ba7272bc71c5db1932b684385fb8407576 100644 --- a/src/DataTypes/DataTypeAggregateFunction.h +++ b/src/DataTypes/DataTypeAggregateFunction.h @@ -39,27 +39,6 @@ public: DataTypePtr getReturnTypeToPredict() const { return function->getReturnTypeToPredict(); } DataTypes getArgumentsDataTypes() const { return argument_types; } - /// NOTE These two functions for serializing single values are incompatible with the functions below. - void serializeBinary(const Field & field, WriteBuffer & ostr) const override; - void deserializeBinary(Field & field, ReadBuffer & istr) const override; - - void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const override; - void deserializeBinary(IColumn & column, ReadBuffer & istr) const override; - void serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const override; - void deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double avg_value_size_hint) const override; - void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - - void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; - MutableColumnPtr createColumn() const override; Field getDefault() const override; @@ -69,6 +48,8 @@ public: bool isParametric() const override { return true; } bool haveSubtypes() const override { return false; } bool shouldAlignRightInPrettyFormats() const override { return false; } + + SerializationPtr doGetDefaultSerialization() const override; }; diff --git a/src/DataTypes/DataTypeArray.cpp b/src/DataTypes/DataTypeArray.cpp index 27088ab822c48939d705027d3e20aad761e58f6e..bcf3a9c1f57db96da90b796f08980a75a9214255 100644 --- a/src/DataTypes/DataTypeArray.cpp +++ b/src/DataTypes/DataTypeArray.cpp @@ -9,7 +9,9 @@ #include #include #include -#include +#include +#include +#include #include @@ -24,10 +26,7 @@ namespace DB namespace ErrorCodes { - extern const int CANNOT_READ_ALL_DATA; - extern 
const int CANNOT_READ_ARRAY_FROM_TEXT; extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; - extern const int LOGICAL_ERROR; } @@ -37,490 +36,6 @@ DataTypeArray::DataTypeArray(const DataTypePtr & nested_) } -void DataTypeArray::serializeBinary(const Field & field, WriteBuffer & ostr) const -{ - const Array & a = get(field); - writeVarUInt(a.size(), ostr); - for (size_t i = 0; i < a.size(); ++i) - { - nested->serializeBinary(a[i], ostr); - } -} - - -void DataTypeArray::deserializeBinary(Field & field, ReadBuffer & istr) const -{ - size_t size; - readVarUInt(size, istr); - field = Array(size); - Array & arr = get(field); - for (size_t i = 0; i < size; ++i) - nested->deserializeBinary(arr[i], istr); -} - - -void DataTypeArray::serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const -{ - const ColumnArray & column_array = assert_cast(column); - const ColumnArray::Offsets & offsets = column_array.getOffsets(); - - size_t offset = offsets[row_num - 1]; - size_t next_offset = offsets[row_num]; - size_t size = next_offset - offset; - - writeVarUInt(size, ostr); - - const IColumn & nested_column = column_array.getData(); - for (size_t i = offset; i < next_offset; ++i) - nested->serializeBinary(nested_column, i, ostr); -} - - -void DataTypeArray::deserializeBinary(IColumn & column, ReadBuffer & istr) const -{ - ColumnArray & column_array = assert_cast(column); - ColumnArray::Offsets & offsets = column_array.getOffsets(); - - size_t size; - readVarUInt(size, istr); - - IColumn & nested_column = column_array.getData(); - - size_t i = 0; - try - { - for (; i < size; ++i) - nested->deserializeBinary(nested_column, istr); - } - catch (...) - { - if (i) - nested_column.popBack(i); - throw; - } - - offsets.push_back(offsets.back() + size); -} - - -namespace -{ - void serializeArraySizesPositionIndependent(const IColumn & column, WriteBuffer & ostr, UInt64 offset, UInt64 limit) - { - const ColumnArray & column_array = typeid_cast(column); - const ColumnArray::Offsets & offset_values = column_array.getOffsets(); - size_t size = offset_values.size(); - - if (!size) - return; - - size_t end = limit && (offset + limit < size) - ? offset + limit - : size; - - ColumnArray::Offset prev_offset = offset_values[offset - 1]; - for (size_t i = offset; i < end; ++i) - { - ColumnArray::Offset current_offset = offset_values[i]; - writeIntBinary(current_offset - prev_offset, ostr); - prev_offset = current_offset; - } - } - - void deserializeArraySizesPositionIndependent(IColumn & column, ReadBuffer & istr, UInt64 limit) - { - ColumnArray & column_array = typeid_cast(column); - ColumnArray::Offsets & offset_values = column_array.getOffsets(); - size_t initial_size = offset_values.size(); - offset_values.resize(initial_size + limit); - - size_t i = initial_size; - ColumnArray::Offset current_offset = initial_size ? 
offset_values[initial_size - 1] : 0; - while (i < initial_size + limit && !istr.eof()) - { - ColumnArray::Offset current_size = 0; - readIntBinary(current_size, istr); - current_offset += current_size; - offset_values[i] = current_offset; - ++i; - } - - offset_values.resize(i); - } - - ColumnPtr arrayOffsetsToSizes(const IColumn & column) - { - const auto & column_offsets = assert_cast(column); - MutableColumnPtr column_sizes = column_offsets.cloneEmpty(); - - if (column_offsets.empty()) - return column_sizes; - - const auto & offsets_data = column_offsets.getData(); - auto & sizes_data = assert_cast(*column_sizes).getData(); - - sizes_data.resize(offsets_data.size()); - - IColumn::Offset prev_offset = 0; - for (size_t i = 0, size = offsets_data.size(); i < size; ++i) - { - auto current_offset = offsets_data[i]; - sizes_data[i] = current_offset - prev_offset; - prev_offset = current_offset; - } - - return column_sizes; - } - - ColumnPtr arraySizesToOffsets(const IColumn & column) - { - const auto & column_sizes = assert_cast(column); - MutableColumnPtr column_offsets = column_sizes.cloneEmpty(); - - if (column_sizes.empty()) - return column_offsets; - - const auto & sizes_data = column_sizes.getData(); - auto & offsets_data = assert_cast(*column_offsets).getData(); - - offsets_data.resize(sizes_data.size()); - - IColumn::Offset prev_offset = 0; - for (size_t i = 0, size = sizes_data.size(); i < size; ++i) - { - prev_offset += sizes_data[i]; - offsets_data[i] = prev_offset; - } - - return column_offsets; - } -} - - -void DataTypeArray::enumerateStreamsImpl(const StreamCallback & callback, SubstreamPath & path) const -{ - path.push_back(Substream::ArraySizes); - callback(path, *this); - path.back() = Substream::ArrayElements; - nested->enumerateStreams(callback, path); - path.pop_back(); -} - - -void DataTypeArray::serializeBinaryBulkStatePrefixImpl( - SerializeBinaryBulkSettings & settings, - SerializeBinaryBulkStatePtr & state) const -{ - settings.path.push_back(Substream::ArrayElements); - nested->serializeBinaryBulkStatePrefix(settings, state); - settings.path.pop_back(); -} - - -void DataTypeArray::serializeBinaryBulkStateSuffixImpl( - SerializeBinaryBulkSettings & settings, - SerializeBinaryBulkStatePtr & state) const -{ - settings.path.push_back(Substream::ArrayElements); - nested->serializeBinaryBulkStateSuffix(settings, state); - settings.path.pop_back(); -} - - -void DataTypeArray::deserializeBinaryBulkStatePrefixImpl( - DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state) const -{ - settings.path.push_back(Substream::ArrayElements); - nested->deserializeBinaryBulkStatePrefix(settings, state); - settings.path.pop_back(); -} - - -void DataTypeArray::serializeBinaryBulkWithMultipleStreamsImpl( - const IColumn & column, - size_t offset, - size_t limit, - SerializeBinaryBulkSettings & settings, - SerializeBinaryBulkStatePtr & state) const -{ - const ColumnArray & column_array = typeid_cast(column); - - /// First serialize array sizes. - settings.path.push_back(Substream::ArraySizes); - if (auto * stream = settings.getter(settings.path)) - { - if (settings.position_independent_encoding) - serializeArraySizesPositionIndependent(column, *stream, offset, limit); - else - DataTypeNumber().serializeBinaryBulk(*column_array.getOffsetsPtr(), *stream, offset, limit); - } - - /// Then serialize contents of arrays. 
- settings.path.back() = Substream::ArrayElements; - const ColumnArray::Offsets & offset_values = column_array.getOffsets(); - - if (offset > offset_values.size()) - return; - - /** offset - from which array to write. - * limit - how many arrays should be written, or 0, if you write everything that is. - * end - up to which array the recorded piece ends. - * - * nested_offset - from which element of the innards to write. - * nested_limit - how many elements of the innards to write, or 0, if you write everything that is. - */ - - size_t end = std::min(offset + limit, offset_values.size()); - - size_t nested_offset = offset ? offset_values[offset - 1] : 0; - size_t nested_limit = limit - ? offset_values[end - 1] - nested_offset - : 0; - - if (limit == 0 || nested_limit) - nested->serializeBinaryBulkWithMultipleStreams(column_array.getData(), nested_offset, nested_limit, settings, state); - settings.path.pop_back(); -} - - -void DataTypeArray::deserializeBinaryBulkWithMultipleStreamsImpl( - IColumn & column, - size_t limit, - DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state, - SubstreamsCache * cache) const -{ - ColumnArray & column_array = typeid_cast(column); - settings.path.push_back(Substream::ArraySizes); - - if (auto cached_column = getFromSubstreamsCache(cache, settings.path)) - { - column_array.getOffsetsPtr() = arraySizesToOffsets(*cached_column); - } - else if (auto * stream = settings.getter(settings.path)) - { - if (settings.position_independent_encoding) - deserializeArraySizesPositionIndependent(column, *stream, limit); - else - DataTypeNumber().deserializeBinaryBulk(column_array.getOffsetsColumn(), *stream, limit, 0); - - addToSubstreamsCache(cache, settings.path, arrayOffsetsToSizes(column_array.getOffsetsColumn())); - } - - settings.path.back() = Substream::ArrayElements; - - ColumnArray::Offsets & offset_values = column_array.getOffsets(); - ColumnPtr & nested_column = column_array.getDataPtr(); - - /// Number of values corresponding with `offset_values` must be read. - size_t last_offset = offset_values.back(); - if (last_offset < nested_column->size()) - throw Exception("Nested column is longer than last offset", ErrorCodes::LOGICAL_ERROR); - size_t nested_limit = last_offset - nested_column->size(); - - /// Adjust value size hint. Divide it to the average array size. - settings.avg_value_size_hint = nested_limit ? settings.avg_value_size_hint / nested_limit * offset_values.size() : 0; - - nested->deserializeBinaryBulkWithMultipleStreams(nested_column, nested_limit, settings, state, cache); - - settings.path.pop_back(); - - /// Check consistency between offsets and elements subcolumns. - /// But if elements column is empty - it's ok for columns of Nested types that was added by ALTER. 
- if (!nested_column->empty() && nested_column->size() != last_offset) - throw ParsingException("Cannot read all array values: read just " + toString(nested_column->size()) + " of " + toString(last_offset), - ErrorCodes::CANNOT_READ_ALL_DATA); -} - - -template -static void serializeTextImpl(const IColumn & column, size_t row_num, WriteBuffer & ostr, Writer && write_nested) -{ - const ColumnArray & column_array = assert_cast(column); - const ColumnArray::Offsets & offsets = column_array.getOffsets(); - - size_t offset = offsets[row_num - 1]; - size_t next_offset = offsets[row_num]; - - const IColumn & nested_column = column_array.getData(); - - writeChar('[', ostr); - for (size_t i = offset; i < next_offset; ++i) - { - if (i != offset) - writeChar(',', ostr); - write_nested(nested_column, i); - } - writeChar(']', ostr); -} - - -template -static void deserializeTextImpl(IColumn & column, ReadBuffer & istr, Reader && read_nested, bool allow_unenclosed) -{ - ColumnArray & column_array = assert_cast(column); - ColumnArray::Offsets & offsets = column_array.getOffsets(); - - IColumn & nested_column = column_array.getData(); - - size_t size = 0; - - bool has_braces = false; - if (checkChar('[', istr)) - has_braces = true; - else if (!allow_unenclosed) - throw Exception(ErrorCodes::CANNOT_READ_ARRAY_FROM_TEXT, "Array does not start with '[' character"); - - try - { - bool first = true; - while (!istr.eof() && *istr.position() != ']') - { - if (!first) - { - if (*istr.position() == ',') - ++istr.position(); - else - throw ParsingException(ErrorCodes::CANNOT_READ_ARRAY_FROM_TEXT, - "Cannot read array from text, expected comma or end of array, found '{}'", - *istr.position()); - } - - first = false; - - skipWhitespaceIfAny(istr); - - if (*istr.position() == ']') - break; - - read_nested(nested_column); - ++size; - - skipWhitespaceIfAny(istr); - } - - if (has_braces) - assertChar(']', istr); - else /// If array is not enclosed in braces, we read until EOF. - assertEOF(istr); - } - catch (...) 
- { - if (size) - nested_column.popBack(size); - throw; - } - - offsets.push_back(offsets.back() + size); -} - - -void DataTypeArray::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - serializeTextImpl(column, row_num, ostr, - [&](const IColumn & nested_column, size_t i) - { - nested->serializeAsTextQuoted(nested_column, i, ostr, settings); - }); -} - - -void DataTypeArray::deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - deserializeTextImpl(column, istr, - [&](IColumn & nested_column) - { - nested->deserializeAsTextQuoted(nested_column, istr, settings); - }, false); -} - -void DataTypeArray::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - const ColumnArray & column_array = assert_cast(column); - const ColumnArray::Offsets & offsets = column_array.getOffsets(); - - size_t offset = offsets[row_num - 1]; - size_t next_offset = offsets[row_num]; - - const IColumn & nested_column = column_array.getData(); - - writeChar('[', ostr); - for (size_t i = offset; i < next_offset; ++i) - { - if (i != offset) - writeChar(',', ostr); - nested->serializeAsTextJSON(nested_column, i, ostr, settings); - } - writeChar(']', ostr); -} - - -void DataTypeArray::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - deserializeTextImpl(column, istr, - [&](IColumn & nested_column) - { - nested->deserializeAsTextJSON(nested_column, istr, settings); - }, false); -} - - -void DataTypeArray::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - const ColumnArray & column_array = assert_cast(column); - const ColumnArray::Offsets & offsets = column_array.getOffsets(); - - size_t offset = offsets[row_num - 1]; - size_t next_offset = offsets[row_num]; - - const IColumn & nested_column = column_array.getData(); - - writeCString("", ostr); - for (size_t i = offset; i < next_offset; ++i) - { - writeCString("", ostr); - nested->serializeAsTextXML(nested_column, i, ostr, settings); - writeCString("", ostr); - } - writeCString("", ostr); -} - - -void DataTypeArray::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - /// There is no good way to serialize an array in CSV. Therefore, we serialize it into a string, and then write the resulting string in CSV. 
- WriteBufferFromOwnString wb; - serializeText(column, row_num, wb, settings); - writeCSV(wb.str(), ostr); -} - - -void DataTypeArray::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - String s; - readCSV(s, istr, settings.csv); - ReadBufferFromString rb(s); - - if (settings.csv.input_format_arrays_as_nested_csv) - { - deserializeTextImpl(column, rb, - [&](IColumn & nested_column) - { - nested->deserializeAsTextCSV(nested_column, rb, settings); - }, true); - } - else - { - deserializeTextImpl(column, rb, - [&](IColumn & nested_column) - { - nested->deserializeAsTextQuoted(nested_column, rb, settings); - }, true); - } -} - - MutableColumnPtr DataTypeArray::createColumn() const { return ColumnArray::create(nested->createColumn(), ColumnArray::ColumnOffsets::create()); @@ -546,7 +61,7 @@ DataTypePtr DataTypeArray::tryGetSubcolumnType(const String & subcolumn_name) co DataTypePtr DataTypeArray::tryGetSubcolumnTypeImpl(const String & subcolumn_name, size_t level) const { if (subcolumn_name == "size" + std::to_string(level)) - return createOneElementTuple(std::make_shared(), subcolumn_name, false); + return std::make_shared(); DataTypePtr subcolumn; if (const auto * nested_array = typeid_cast(nested.get())) @@ -554,7 +69,10 @@ DataTypePtr DataTypeArray::tryGetSubcolumnTypeImpl(const String & subcolumn_name else subcolumn = nested->tryGetSubcolumnType(subcolumn_name); - return (subcolumn ? std::make_shared(std::move(subcolumn)) : subcolumn); + if (subcolumn && subcolumn_name != MAIN_SUBCOLUMN_NAME) + subcolumn = std::make_shared(std::move(subcolumn)); + + return subcolumn; } ColumnPtr DataTypeArray::getSubcolumn(const String & subcolumn_name, const IColumn & column) const @@ -577,6 +95,32 @@ ColumnPtr DataTypeArray::getSubcolumnImpl(const String & subcolumn_name, const I return ColumnArray::create(subcolumn, column_array.getOffsetsPtr()); } +SerializationPtr DataTypeArray::getSubcolumnSerialization( + const String & subcolumn_name, const BaseSerializationGetter & base_serialization_getter) const +{ + return getSubcolumnSerializationImpl(subcolumn_name, base_serialization_getter, 0); +} + +SerializationPtr DataTypeArray::getSubcolumnSerializationImpl( + const String & subcolumn_name, const BaseSerializationGetter & base_serialization_getter, size_t level) const +{ + if (subcolumn_name == "size" + std::to_string(level)) + return std::make_shared(base_serialization_getter(DataTypeUInt64()), subcolumn_name, false); + + SerializationPtr subcolumn; + if (const auto * nested_array = typeid_cast(nested.get())) + subcolumn = nested_array->getSubcolumnSerializationImpl(subcolumn_name, base_serialization_getter, level + 1); + else + subcolumn = nested->getSubcolumnSerialization(subcolumn_name, base_serialization_getter); + + return std::make_shared(subcolumn); +} + +SerializationPtr DataTypeArray::doGetDefaultSerialization() const +{ + return std::make_shared(nested->getDefaultSerialization()); +} + size_t DataTypeArray::getNumberOfDimensions() const { const DataTypeArray * nested_array = typeid_cast(nested.get()); diff --git a/src/DataTypes/DataTypeArray.h b/src/DataTypes/DataTypeArray.h index 4185163e2e756965452ee8171f79a2b25ad9ca77..c720a15d798e3e9f2ffdaa137588f5ec9994977d 100644 --- a/src/DataTypes/DataTypeArray.h +++ b/src/DataTypes/DataTypeArray.h @@ -1,13 +1,14 @@ #pragma once -#include +#include +#include namespace DB { -class DataTypeArray final : public DataTypeWithSimpleSerialization +class DataTypeArray final : public IDataType { private: /// The 
type of array elements. @@ -35,56 +36,6 @@ public: return false; } - void serializeBinary(const Field & field, WriteBuffer & ostr) const override; - void deserializeBinary(Field & field, ReadBuffer & istr) const override; - void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const override; - void deserializeBinary(IColumn & column, ReadBuffer & istr) const override; - - void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - - void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - - void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - - void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - - /** Streaming serialization of arrays is arranged in a special way: - * - elements placed in a row are written/read without array sizes; - * - the sizes are written/read in a separate stream, - * This is necessary, because when implementing nested structures, several arrays can have common sizes. - */ - - void enumerateStreamsImpl(const StreamCallback & callback, SubstreamPath & path) const override; - - void serializeBinaryBulkStatePrefixImpl( - SerializeBinaryBulkSettings & settings, - SerializeBinaryBulkStatePtr & state) const override; - - void serializeBinaryBulkStateSuffixImpl( - SerializeBinaryBulkSettings & settings, - SerializeBinaryBulkStatePtr & state) const override; - - void deserializeBinaryBulkStatePrefixImpl( - DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state) const override; - - void serializeBinaryBulkWithMultipleStreamsImpl( - const IColumn & column, - size_t offset, - size_t limit, - SerializeBinaryBulkSettings & settings, - SerializeBinaryBulkStatePtr & state) const override; - - void deserializeBinaryBulkWithMultipleStreamsImpl( - IColumn & column, - size_t limit, - DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state, - SubstreamsCache * cache) const override; - MutableColumnPtr createColumn() const override; Field getDefault() const override; @@ -105,6 +56,10 @@ public: DataTypePtr tryGetSubcolumnType(const String & subcolumn_name) const override; ColumnPtr getSubcolumn(const String & subcolumn_name, const IColumn & column) const override; + SerializationPtr getSubcolumnSerialization( + const String & subcolumn_name, const BaseSerializationGetter & base_serialization_getter) const override; + + SerializationPtr doGetDefaultSerialization() const override; const DataTypePtr & getNestedType() const { return nested; } @@ -114,6 +69,8 @@ public: private: ColumnPtr getSubcolumnImpl(const String & subcolumn_name, const IColumn & column, size_t level) const; DataTypePtr tryGetSubcolumnTypeImpl(const String & subcolumn_name, size_t level) const; + SerializationPtr getSubcolumnSerializationImpl( + const String & subcolumn_name, const BaseSerializationGetter & base_serialization_getter, size_t level) const; }; } diff --git a/src/DataTypes/DataTypeCustom.h b/src/DataTypes/DataTypeCustom.h index 
0fa2e365990de0bf5e540ac4fa213c45509ea759..55796e3cc7a45293761f5f6f9e4bbef49bb80914 100644 --- a/src/DataTypes/DataTypeCustom.h +++ b/src/DataTypes/DataTypeCustom.h @@ -3,7 +3,7 @@ #include #include #include -#include +#include namespace DB { @@ -24,106 +24,20 @@ public: virtual String getName() const = 0; }; -class IDataTypeCustomTextSerialization -{ -public: - virtual ~IDataTypeCustomTextSerialization() {} - - /** Text serialization for displaying on a terminal or saving into a text file, and the like. - * Without escaping or quoting. - */ - virtual void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const = 0; - - /** Text deserialization without quoting or escaping. - */ - virtual void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const = 0; - - /** Text serialization with escaping but without quoting. - */ - virtual void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const = 0; - virtual void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const = 0; - - /** Text serialization as a literal that may be inserted into a query. - */ - virtual void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const = 0; - virtual void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const = 0; - - /** Text serialization for the CSV format. - */ - virtual void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const = 0; - virtual void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const = 0; - - /** Text serialization intended for using in JSON format. - */ - virtual void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const = 0; - virtual void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const = 0; - - /** Text serialization for putting into the XML format. - */ - virtual void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const = 0; -}; - -/** Allows to customize an existing data type by representation with custom substreams. - * Customized data type will be serialized/deserialized to files with different names than base type, - * but binary and text representation will be unchanged. - * E.g it can be used for reading single subcolumns of complex types. 
- */ -class IDataTypeCustomStreams -{ -public: - virtual ~IDataTypeCustomStreams() = default; - - virtual void enumerateStreams( - const IDataType::StreamCallback & callback, - IDataType::SubstreamPath & path) const = 0; - - virtual void serializeBinaryBulkStatePrefix( - IDataType::SerializeBinaryBulkSettings & settings, - IDataType::SerializeBinaryBulkStatePtr & state) const = 0; - - virtual void serializeBinaryBulkStateSuffix( - IDataType::SerializeBinaryBulkSettings & settings, - IDataType::SerializeBinaryBulkStatePtr & state) const = 0; - - virtual void deserializeBinaryBulkStatePrefix( - IDataType::DeserializeBinaryBulkSettings & settings, - IDataType::DeserializeBinaryBulkStatePtr & state) const = 0; - - virtual void serializeBinaryBulkWithMultipleStreams( - const IColumn & column, - size_t offset, - size_t limit, - IDataType::SerializeBinaryBulkSettings & settings, - IDataType::SerializeBinaryBulkStatePtr & state) const = 0; - - virtual void deserializeBinaryBulkWithMultipleStreams( - ColumnPtr & column, - size_t limit, - IDataType::DeserializeBinaryBulkSettings & settings, - IDataType::DeserializeBinaryBulkStatePtr & state, - IDataType::SubstreamsCache * cache) const = 0; -}; - using DataTypeCustomNamePtr = std::unique_ptr; -using DataTypeCustomTextSerializationPtr = std::unique_ptr; -using DataTypeCustomStreamsPtr = std::unique_ptr; - /** Describe a data type customization */ struct DataTypeCustomDesc { DataTypeCustomNamePtr name; - DataTypeCustomTextSerializationPtr text_serialization; - DataTypeCustomStreamsPtr streams; + SerializationPtr serialization; DataTypeCustomDesc( DataTypeCustomNamePtr name_, - DataTypeCustomTextSerializationPtr text_serialization_ = nullptr, - DataTypeCustomStreamsPtr streams_ = nullptr) + SerializationPtr serialization_ = nullptr) : name(std::move(name_)) - , text_serialization(std::move(text_serialization_)) - , streams(std::move(streams_)) {} + , serialization(std::move(serialization_)) {} }; using DataTypeCustomDescPtr = std::unique_ptr; diff --git a/src/DataTypes/DataTypeCustomGeo.cpp b/src/DataTypes/DataTypeCustomGeo.cpp index dd29ed21061a177d26f875a580631ed8ef5ab348..f7d05fa3be6bc33beeace173686d6b3afe424a8c 100644 --- a/src/DataTypes/DataTypeCustomGeo.cpp +++ b/src/DataTypes/DataTypeCustomGeo.cpp @@ -1,10 +1,6 @@ #include - -#include -#include #include #include -#include #include #include #include @@ -12,102 +8,20 @@ namespace DB { -namespace -{ - const auto point_data_type = std::make_shared( - DataTypes{std::make_shared(), std::make_shared()} - ); - - const auto ring_data_type = std::make_shared(DataTypeCustomPointSerialization::nestedDataType()); - - const auto polygon_data_type = std::make_shared(DataTypeCustomRingSerialization::nestedDataType()); - - const auto multipolygon_data_type = std::make_shared(DataTypeCustomPolygonSerialization::nestedDataType()); -} - - -void DataTypeCustomPointSerialization::serializeText( - const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - nestedDataType()->serializeAsText(column, row_num, ostr, settings); -} - -void DataTypeCustomPointSerialization::deserializeText( - IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - nestedDataType()->deserializeAsWholeText(column, istr, settings); -} - -DataTypePtr DataTypeCustomPointSerialization::nestedDataType() -{ - return point_data_type; -} - -void DataTypeCustomRingSerialization::serializeText( - const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & 
settings) const -{ - nestedDataType()->serializeAsText(column, row_num, ostr, settings); -} - -void DataTypeCustomRingSerialization::deserializeText( - IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - nestedDataType()->deserializeAsWholeText(column, istr, settings); -} - -DataTypePtr DataTypeCustomRingSerialization::nestedDataType() -{ - return ring_data_type; -} - -void DataTypeCustomPolygonSerialization::serializeText( - const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - nestedDataType()->serializeAsText(column, row_num, ostr, settings); -} - -void DataTypeCustomPolygonSerialization::deserializeText( - IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - nestedDataType()->deserializeAsWholeText(column, istr, settings); -} - -DataTypePtr DataTypeCustomPolygonSerialization::nestedDataType() -{ - return polygon_data_type; -} - -void DataTypeCustomMultiPolygonSerialization::serializeText( - const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - nestedDataType()->serializeAsText(column, row_num, ostr, settings); -} - -void DataTypeCustomMultiPolygonSerialization::deserializeText( - IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - nestedDataType()->deserializeAsWholeText(column, istr, settings); -} - -DataTypePtr DataTypeCustomMultiPolygonSerialization::nestedDataType() -{ - return multipolygon_data_type; -} - void registerDataTypeDomainGeo(DataTypeFactory & factory) { // Custom type for point represented as its coordinates stored as Tuple(Float64, Float64) factory.registerSimpleDataTypeCustom("Point", [] { return std::make_pair(DataTypeFactory::instance().get("Tuple(Float64, Float64)"), - std::make_unique(std::make_unique("Point"), std::make_unique())); + std::make_unique(std::make_unique())); }); // Custom type for simple polygon without holes stored as Array(Point) factory.registerSimpleDataTypeCustom("Ring", [] { return std::make_pair(DataTypeFactory::instance().get("Array(Point)"), - std::make_unique(std::make_unique("Ring"), std::make_unique())); + std::make_unique(std::make_unique())); }); // Custom type for polygon with holes stored as Array(Ring) @@ -115,14 +29,14 @@ void registerDataTypeDomainGeo(DataTypeFactory & factory) factory.registerSimpleDataTypeCustom("Polygon", [] { return std::make_pair(DataTypeFactory::instance().get("Array(Ring)"), - std::make_unique(std::make_unique("Polygon"), std::make_unique())); + std::make_unique(std::make_unique())); }); // Custom type for multiple polygons with holes stored as Array(Polygon) factory.registerSimpleDataTypeCustom("MultiPolygon", [] { return std::make_pair(DataTypeFactory::instance().get("Array(Polygon)"), - std::make_unique(std::make_unique("MultiPolygon"), std::make_unique())); + std::make_unique(std::make_unique())); }); } diff --git a/src/DataTypes/DataTypeCustomGeo.h b/src/DataTypes/DataTypeCustomGeo.h index 8f549812b8bbe25faa6d6ccc609fc955e5c45af5..c2a83b3e577c8528ba3e9bb4f29294b1d2c581bb 100644 --- a/src/DataTypes/DataTypeCustomGeo.h +++ b/src/DataTypes/DataTypeCustomGeo.h @@ -1,56 +1,32 @@ #pragma once -#include -#include -#include #include -#include -#include -#include -#include namespace DB { -class DataTypeCustomPointSerialization : public DataTypeCustomSimpleTextSerialization +class DataTypePointName : public DataTypeCustomFixedName { public: - void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const 
FormatSettings & settings) const override; - - void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; - - static DataTypePtr nestedDataType(); + DataTypePointName() : DataTypeCustomFixedName("Point") {} }; - -class DataTypeCustomRingSerialization : public DataTypeCustomSimpleTextSerialization +class DataTypeRingName : public DataTypeCustomFixedName { public: - void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; - - void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; - - static DataTypePtr nestedDataType(); + DataTypeRingName() : DataTypeCustomFixedName("Ring") {} }; -class DataTypeCustomPolygonSerialization : public DataTypeCustomSimpleTextSerialization +class DataTypePolygonName : public DataTypeCustomFixedName { public: - void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; - - void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; - - static DataTypePtr nestedDataType(); + DataTypePolygonName() : DataTypeCustomFixedName("Polygon") {} }; -class DataTypeCustomMultiPolygonSerialization : public DataTypeCustomSimpleTextSerialization +class DataTypeMultiPolygonName : public DataTypeCustomFixedName { public: - void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; - - void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; - - static DataTypePtr nestedDataType(); + DataTypeMultiPolygonName() : DataTypeCustomFixedName("MultiPolygon") {} }; } diff --git a/src/DataTypes/DataTypeCustomIPv4AndIPv6.cpp b/src/DataTypes/DataTypeCustomIPv4AndIPv6.cpp index 78a1e18679d4c8383a1f9f9500e188ff8f72fcf6..808aa43528ef6b3b8bb7e8c1544c5685774b6ad5 100644 --- a/src/DataTypes/DataTypeCustomIPv4AndIPv6.cpp +++ b/src/DataTypes/DataTypeCustomIPv4AndIPv6.cpp @@ -1,115 +1,24 @@ -#include -#include -#include -#include +#include #include #include -#include namespace DB { -namespace ErrorCodes -{ - extern const int ILLEGAL_COLUMN; - extern const int CANNOT_PARSE_DOMAIN_VALUE_FROM_STRING; -} - -namespace -{ - -class DataTypeCustomIPv4Serialization : public DataTypeCustomSimpleTextSerialization -{ -public: - void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override - { - const auto * col = checkAndGetColumn(&column); - if (!col) - { - throw Exception("IPv4 type can only serialize columns of type UInt32." + column.getName(), ErrorCodes::ILLEGAL_COLUMN); - } - - char buffer[IPV4_MAX_TEXT_LENGTH + 1] = {'\0'}; - char * ptr = buffer; - formatIPv4(reinterpret_cast(&col->getData()[row_num]), ptr); - - ostr.write(buffer, strlen(buffer)); - } - - void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override - { - ColumnUInt32 * col = typeid_cast(&column); - if (!col) - { - throw Exception("IPv4 type can only deserialize columns of type UInt32." 
+ column.getName(), ErrorCodes::ILLEGAL_COLUMN); - } - - char buffer[IPV4_MAX_TEXT_LENGTH + 1] = {'\0'}; - istr.read(buffer, sizeof(buffer) - 1); - UInt32 ipv4_value = 0; - if (!parseIPv4(buffer, reinterpret_cast(&ipv4_value))) - { - throw Exception("Invalid IPv4 value.", ErrorCodes::CANNOT_PARSE_DOMAIN_VALUE_FROM_STRING); - } - - col->insert(ipv4_value); - } -}; - -class DataTypeCustomIPv6Serialization : public DataTypeCustomSimpleTextSerialization -{ -public: - - void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override - { - const auto * col = checkAndGetColumn(&column); - if (!col) - { - throw Exception("IPv6 type domain can only serialize columns of type FixedString(16)." + column.getName(), ErrorCodes::ILLEGAL_COLUMN); - } - - char buffer[IPV6_MAX_TEXT_LENGTH + 1] = {'\0'}; - char * ptr = buffer; - formatIPv6(reinterpret_cast(col->getDataAt(row_num).data), ptr); - - ostr.write(buffer, strlen(buffer)); - } - - void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override - { - ColumnFixedString * col = typeid_cast(&column); - if (!col) - { - throw Exception("IPv6 type domain can only deserialize columns of type FixedString(16)." + column.getName(), ErrorCodes::ILLEGAL_COLUMN); - } - - char buffer[IPV6_MAX_TEXT_LENGTH + 1] = {'\0'}; - istr.read(buffer, sizeof(buffer) - 1); - - std::string ipv6_value(IPV6_BINARY_LENGTH, '\0'); - if (!parseIPv6(buffer, reinterpret_cast(ipv6_value.data()))) - { - throw Exception("Invalid IPv6 value.", ErrorCodes::CANNOT_PARSE_DOMAIN_VALUE_FROM_STRING); - } - - col->insertString(ipv6_value); - } -}; - -} - void registerDataTypeDomainIPv4AndIPv6(DataTypeFactory & factory) { factory.registerSimpleDataTypeCustom("IPv4", [] { - return std::make_pair(DataTypeFactory::instance().get("UInt32"), - std::make_unique(std::make_unique("IPv4"), std::make_unique())); + auto type = DataTypeFactory::instance().get("UInt32"); + return std::make_pair(type, std::make_unique( + std::make_unique("IPv4"), std::make_unique(type->getDefaultSerialization()))); }); factory.registerSimpleDataTypeCustom("IPv6", [] { - return std::make_pair(DataTypeFactory::instance().get("FixedString(16)"), - std::make_unique(std::make_unique("IPv6"), std::make_unique())); + auto type = DataTypeFactory::instance().get("FixedString(16)"); + return std::make_pair(type, std::make_unique( + std::make_unique("IPv6"), std::make_unique(type->getDefaultSerialization()))); }); /// MySQL, MariaDB diff --git a/src/DataTypes/DataTypeCustomSimpleTextSerialization.cpp b/src/DataTypes/DataTypeCustomSimpleTextSerialization.cpp deleted file mode 100644 index 5bb963de6671202f441204a26f39398fefcc152b..0000000000000000000000000000000000000000 --- a/src/DataTypes/DataTypeCustomSimpleTextSerialization.cpp +++ /dev/null @@ -1,91 +0,0 @@ -#include - -#include -#include -#include -#include - -namespace -{ -using namespace DB; - -String serializeToString(const DataTypeCustomSimpleTextSerialization & domain, const IColumn & column, size_t row_num, const FormatSettings & settings) -{ - WriteBufferFromOwnString buffer; - domain.serializeText(column, row_num, buffer, settings); - - return buffer.str(); -} - -void deserializeFromString(const DataTypeCustomSimpleTextSerialization & domain, IColumn & column, const String & s, const FormatSettings & settings) -{ - ReadBufferFromString istr(s); - domain.deserializeText(column, istr, settings); -} - -} - -namespace DB -{ - -void 
DataTypeCustomSimpleTextSerialization::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - String str; - readString(str, istr); - deserializeFromString(*this, column, str, settings); -} - -void DataTypeCustomSimpleTextSerialization::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - writeEscapedString(serializeToString(*this, column, row_num, settings), ostr); -} - -void DataTypeCustomSimpleTextSerialization::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - String str; - readEscapedString(str, istr); - deserializeFromString(*this, column, str, settings); -} - -void DataTypeCustomSimpleTextSerialization::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - writeQuotedString(serializeToString(*this, column, row_num, settings), ostr); -} - -void DataTypeCustomSimpleTextSerialization::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - String str; - readQuotedString(str, istr); - deserializeFromString(*this, column, str, settings); -} - -void DataTypeCustomSimpleTextSerialization::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - writeCSVString(serializeToString(*this, column, row_num, settings), ostr); -} - -void DataTypeCustomSimpleTextSerialization::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - String str; - readCSVString(str, istr, settings.csv); - deserializeFromString(*this, column, str, settings); -} - -void DataTypeCustomSimpleTextSerialization::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - writeJSONString(serializeToString(*this, column, row_num, settings), ostr, settings); -} - -void DataTypeCustomSimpleTextSerialization::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - String str; - readJSONString(str, istr); - deserializeFromString(*this, column, str, settings); -} - -void DataTypeCustomSimpleTextSerialization::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - writeXMLStringForTextElement(serializeToString(*this, column, row_num, settings), ostr); -} - -} diff --git a/src/DataTypes/DataTypeCustom_fwd.h b/src/DataTypes/DataTypeCustom_fwd.h deleted file mode 100644 index 99c8eee9748b2c2c412d6d7c4c60ca4d67ccf709..0000000000000000000000000000000000000000 --- a/src/DataTypes/DataTypeCustom_fwd.h +++ /dev/null @@ -1,18 +0,0 @@ -#pragma once - -#include - -namespace DB -{ - -class IDataTypeCustomName; -class IDataTypeCustomTextSerialization; -class IDataTypeCustomStreams; -struct DataTypeCustomDesc; - -using DataTypeCustomNamePtr = std::unique_ptr; -using DataTypeCustomTextSerializationPtr = std::unique_ptr; -using DataTypeCustomStreamsPtr = std::unique_ptr; -using DataTypeCustomDescPtr = std::unique_ptr; - -} diff --git a/src/DataTypes/DataTypeDate.cpp b/src/DataTypes/DataTypeDate.cpp index 192a89cc454ad76f0b4739469fab1647b5a8317b..0df2e329702437e8c15060a2ffc39d494212a57a 100644 --- a/src/DataTypes/DataTypeDate.cpp +++ b/src/DataTypes/DataTypeDate.cpp @@ -3,6 +3,7 @@ #include #include +#include #include #include @@ -11,79 +12,15 @@ namespace DB { -void 
DataTypeDate::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const -{ - writeDateText(DayNum(assert_cast(column).getData()[row_num]), ostr); -} - -void DataTypeDate::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - deserializeTextEscaped(column, istr, settings); -} - -void DataTypeDate::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const -{ - DayNum x; - readDateText(x, istr); - assert_cast(column).getData().push_back(x); -} - -void DataTypeDate::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - serializeText(column, row_num, ostr, settings); -} - -void DataTypeDate::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - writeChar('\'', ostr); - serializeText(column, row_num, ostr, settings); - writeChar('\'', ostr); -} - -void DataTypeDate::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const -{ - DayNum x; - assertChar('\'', istr); - readDateText(x, istr); - assertChar('\'', istr); - assert_cast(column).getData().push_back(x); /// It's important to do this at the end - for exception safety. -} - -void DataTypeDate::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - writeChar('"', ostr); - serializeText(column, row_num, ostr, settings); - writeChar('"', ostr); -} - -void DataTypeDate::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const -{ - DayNum x; - assertChar('"', istr); - readDateText(x, istr); - assertChar('"', istr); - assert_cast(column).getData().push_back(x); -} - -void DataTypeDate::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - writeChar('"', ostr); - serializeText(column, row_num, ostr, settings); - writeChar('"', ostr); -} - -void DataTypeDate::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const -{ - LocalDate value; - readCSV(value, istr); - assert_cast(column).getData().push_back(value.getDayNum()); -} - bool DataTypeDate::equals(const IDataType & rhs) const { return typeid(rhs) == typeid(*this); } +SerializationPtr DataTypeDate::doGetDefaultSerialization() const +{ + return std::make_shared(); +} void registerDataTypeDate(DataTypeFactory & factory) { diff --git a/src/DataTypes/DataTypeDate.h b/src/DataTypes/DataTypeDate.h index 496d7fe0b22f422960c0b519256c204de5c2bc6c..2f17207cc07677a02efc48dab2c0d93cbde93eaf 100644 --- a/src/DataTypes/DataTypeDate.h +++ b/src/DataTypes/DataTypeDate.h @@ -14,21 +14,13 @@ public: TypeIndex getTypeId() const override { return TypeIndex::Date; } const char * getFamilyName() const override { return family_name; } - void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void 
deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; - bool canBeUsedAsVersion() const override { return true; } bool canBeInsideNullable() const override { return true; } bool equals(const IDataType & rhs) const override; + +protected: + SerializationPtr doGetDefaultSerialization() const override; }; } diff --git a/src/DataTypes/DataTypeDateTime.cpp b/src/DataTypes/DataTypeDateTime.cpp index 510747f6ef976bb30f760048b11b0197358640ba..820bfd70766831e7b3ac7c2b0208da88a246f29a 100644 --- a/src/DataTypes/DataTypeDateTime.cpp +++ b/src/DataTypes/DataTypeDateTime.cpp @@ -1,4 +1,5 @@ #include +#include #include #include @@ -15,26 +16,6 @@ namespace DB { -namespace -{ - -inline void readTextHelper( - time_t & x, ReadBuffer & istr, const FormatSettings & settings, const DateLUTImpl & time_zone, const DateLUTImpl & utc_time_zone) -{ - switch (settings.date_time_input_format) - { - case FormatSettings::DateTimeInputFormat::Basic: - readDateTimeText(x, istr, time_zone); - return; - case FormatSettings::DateTimeInputFormat::BestEffort: - parseDateTimeBestEffort(x, istr, time_zone, utc_time_zone); - return; - } -} - -} - - TimezoneMixin::TimezoneMixin(const String & time_zone_name) : has_explicit_time_zone(!time_zone_name.empty()), time_zone(DateLUT::instance(time_zone_name)), @@ -62,124 +43,6 @@ String DataTypeDateTime::doGetName() const return out.str(); } -void DataTypeDateTime::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - auto value = assert_cast(column).getData()[row_num]; - switch (settings.date_time_output_format) - { - case FormatSettings::DateTimeOutputFormat::Simple: - writeDateTimeText(value, ostr, time_zone); - return; - case FormatSettings::DateTimeOutputFormat::UnixTimestamp: - writeIntText(value, ostr); - return; - case FormatSettings::DateTimeOutputFormat::ISO: - writeDateTimeTextISO(value, ostr, utc_time_zone); - return; - } -} - -void DataTypeDateTime::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - serializeText(column, row_num, ostr, settings); -} - -void DataTypeDateTime::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - deserializeTextEscaped(column, istr, settings); -} - -void DataTypeDateTime::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - time_t x = 0; - readTextHelper(x, istr, settings, time_zone, utc_time_zone); - if (x < 0) - x = 0; - assert_cast(column).getData().push_back(x); -} - -void DataTypeDateTime::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - writeChar('\'', ostr); - serializeText(column, row_num, ostr, settings); - writeChar('\'', ostr); -} - -void DataTypeDateTime::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - time_t x = 0; - - if (checkChar('\'', istr)) /// Cases: '2017-08-31 
18:36:48' or '1504193808' - { - readTextHelper(x, istr, settings, time_zone, utc_time_zone); - assertChar('\'', istr); - } - else /// Just 1504193808 or 01504193808 - { - readIntText(x, istr); - } - if (x < 0) - x = 0; - assert_cast(column).getData().push_back(x); /// It's important to do this at the end - for exception safety. -} - -void DataTypeDateTime::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - writeChar('"', ostr); - serializeText(column, row_num, ostr, settings); - writeChar('"', ostr); -} - -void DataTypeDateTime::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - time_t x = 0; - - if (checkChar('"', istr)) - { - readTextHelper(x, istr, settings, time_zone, utc_time_zone); - assertChar('"', istr); - } - else - { - readIntText(x, istr); - } - - if (x < 0) - x = 0; - - assert_cast(column).getData().push_back(x); -} - -void DataTypeDateTime::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - writeChar('"', ostr); - serializeText(column, row_num, ostr, settings); - writeChar('"', ostr); -} - -void DataTypeDateTime::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - time_t x = 0; - - if (istr.eof()) - throwReadAfterEOF(); - - char maybe_quote = *istr.position(); - - if (maybe_quote == '\'' || maybe_quote == '\"') - ++istr.position(); - - readTextHelper(x, istr, settings, time_zone, utc_time_zone); - - if (maybe_quote == '\'' || maybe_quote == '\"') - assertChar(maybe_quote, istr); - - if (x < 0) - x = 0; - - assert_cast(column).getData().push_back(x); -} - bool DataTypeDateTime::equals(const IDataType & rhs) const { /// DateTime with different timezones are equal, because: @@ -187,4 +50,9 @@ bool DataTypeDateTime::equals(const IDataType & rhs) const return typeid(rhs) == typeid(*this); } +SerializationPtr DataTypeDateTime::doGetDefaultSerialization() const +{ + return std::make_shared(time_zone, utc_time_zone); +} + } diff --git a/src/DataTypes/DataTypeDateTime.h b/src/DataTypes/DataTypeDateTime.h index edec889309b3a97446fcf623eb8c276ef90d7572..84df8b218130c2455212ea79b1b21b64a8a8ba42 100644 --- a/src/DataTypes/DataTypeDateTime.h +++ b/src/DataTypes/DataTypeDateTime.h @@ -58,21 +58,12 @@ public: String doGetName() const override; TypeIndex getTypeId() const override { return TypeIndex::DateTime; } - void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; - void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void 
deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; - bool canBeUsedAsVersion() const override { return true; } bool canBeInsideNullable() const override { return true; } bool equals(const IDataType & rhs) const override; + + SerializationPtr doGetDefaultSerialization() const override; }; } diff --git a/src/DataTypes/DataTypeDateTime64.cpp b/src/DataTypes/DataTypeDateTime64.cpp index 17b94e871bf2637d8087bcedf46d11845929eaa2..eaec585b6b4ec941687934c63ee03334d00664b2 100644 --- a/src/DataTypes/DataTypeDateTime64.cpp +++ b/src/DataTypes/DataTypeDateTime64.cpp @@ -1,4 +1,5 @@ #include +#include #include #include @@ -55,131 +56,6 @@ std::string DataTypeDateTime64::doGetName() const return out.str(); } -void DataTypeDateTime64::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - auto value = assert_cast(column).getData()[row_num]; - switch (settings.date_time_output_format) - { - case FormatSettings::DateTimeOutputFormat::Simple: - writeDateTimeText(value, scale, ostr, time_zone); - return; - case FormatSettings::DateTimeOutputFormat::UnixTimestamp: - writeDateTimeUnixTimestamp(value, scale, ostr); - return; - case FormatSettings::DateTimeOutputFormat::ISO: - writeDateTimeTextISO(value, scale, ostr, utc_time_zone); - return; - } -} - -void DataTypeDateTime64::deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const -{ - DateTime64 result = 0; - readDateTime64Text(result, this->getScale(), istr, time_zone); - assert_cast(column).getData().push_back(result); -} - -void DataTypeDateTime64::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - deserializeTextEscaped(column, istr, settings); -} - -void DataTypeDateTime64::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - serializeText(column, row_num, ostr, settings); -} - -static inline void readText(DateTime64 & x, UInt32 scale, ReadBuffer & istr, const FormatSettings & settings, const DateLUTImpl & time_zone, const DateLUTImpl & utc_time_zone) -{ - switch (settings.date_time_input_format) - { - case FormatSettings::DateTimeInputFormat::Basic: - readDateTime64Text(x, scale, istr, time_zone); - return; - case FormatSettings::DateTimeInputFormat::BestEffort: - parseDateTime64BestEffort(x, scale, istr, time_zone, utc_time_zone); - return; - } -} - -void DataTypeDateTime64::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - DateTime64 x = 0; - readText(x, scale, istr, settings, time_zone, utc_time_zone); - assert_cast(column).getData().push_back(x); -} - -void DataTypeDateTime64::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - writeChar('\'', ostr); - serializeText(column, row_num, ostr, settings); - writeChar('\'', ostr); -} - -void DataTypeDateTime64::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - DateTime64 x = 0; - if (checkChar('\'', istr)) /// Cases: '2017-08-31 18:36:48' or '1504193808' - { - readText(x, scale, istr, settings, time_zone, utc_time_zone); - assertChar('\'', istr); - } - else /// Just 1504193808 or 01504193808 - { - readIntText(x, istr); - } - assert_cast(column).getData().push_back(x); /// It's important to do this at the end - for exception safety. 
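The comment on the deleted line above states the invariant all of these parsers follow: a token is fully parsed before the column is mutated, so a mid-parse exception leaves the column unchanged. A standalone illustration of the same pattern, independent of ClickHouse types:

    #include <stdexcept>
    #include <string>
    #include <vector>

    // Parse first, push_back last: if parsing throws, `out` is untouched,
    // which is exactly the exception-safety guarantee noted above.
    void appendParsed(std::vector<long> & out, const std::string & token)
    {
        size_t pos = 0;
        long value = std::stol(token, &pos); // throws std::invalid_argument on garbage
        if (pos != token.size())
            throw std::runtime_error("trailing characters after number: " + token);
        out.push_back(value);                // the only mutation, done at the end
    }
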
-} - -void DataTypeDateTime64::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - writeChar('"', ostr); - serializeText(column, row_num, ostr, settings); - writeChar('"', ostr); -} - -void DataTypeDateTime64::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - DateTime64 x = 0; - if (checkChar('"', istr)) - { - readText(x, scale, istr, settings, time_zone, utc_time_zone); - assertChar('"', istr); - } - else - { - readIntText(x, istr); - } - assert_cast(column).getData().push_back(x); -} - -void DataTypeDateTime64::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - writeChar('"', ostr); - serializeText(column, row_num, ostr, settings); - writeChar('"', ostr); -} - -void DataTypeDateTime64::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - DateTime64 x = 0; - - if (istr.eof()) - throwReadAfterEOF(); - - char maybe_quote = *istr.position(); - - if (maybe_quote == '\'' || maybe_quote == '\"') - ++istr.position(); - - readText(x, scale, istr, settings, time_zone, utc_time_zone); - - if (maybe_quote == '\'' || maybe_quote == '\"') - assertChar(maybe_quote, istr); - - assert_cast(column).getData().push_back(x); -} - bool DataTypeDateTime64::equals(const IDataType & rhs) const { if (const auto * ptype = typeid_cast(&rhs)) @@ -187,4 +63,9 @@ bool DataTypeDateTime64::equals(const IDataType & rhs) const return false; } +SerializationPtr DataTypeDateTime64::doGetDefaultSerialization() const +{ + return std::make_shared(time_zone, utc_time_zone, scale); +} + } diff --git a/src/DataTypes/DataTypeDateTime64.h b/src/DataTypes/DataTypeDateTime64.h index ec3f2fde8898f4ae5a72e6b6a64b7546e1b79b1b..f51e0f5d047221c496a7f874ff75ce43035f2a27 100644 --- a/src/DataTypes/DataTypeDateTime64.h +++ b/src/DataTypes/DataTypeDateTime64.h @@ -31,21 +31,12 @@ public: std::string doGetName() const override; TypeIndex getTypeId() const override { return type_id; } - void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; - void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; - bool equals(const IDataType & rhs) const override; bool canBePromoted() const override { return false; } + +protected: + SerializationPtr doGetDefaultSerialization() const override; }; 
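With the per-format virtuals gone from these headers, formatting goes through the type's serialization object instead, and doGetDefaultSerialization is the per-type factory behind it. A caller-side sketch (getDefaultSerialization appears elsewhere in this patch; the exact forwarding to doGetDefaultSerialization is implied rather than shown):

    // Formatting one cell no longer dispatches through IDataType virtuals.
    void writeCellAsText(const IDataType & type, const IColumn & column, size_t row,
                         WriteBuffer & out, const FormatSettings & settings)
    {
        SerializationPtr serialization = type.getDefaultSerialization();
        serialization->serializeText(column, row, out, settings);
    }
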
} diff --git a/src/DataTypes/DataTypeDecimalBase.cpp b/src/DataTypes/DataTypeDecimalBase.cpp index ab17996167c8a275656d215fab4e2a5f91e12aac..5f64fca6704949e7b977ce8f45efbf611013faf7 100644 --- a/src/DataTypes/DataTypeDecimalBase.cpp +++ b/src/DataTypes/DataTypeDecimalBase.cpp @@ -35,59 +35,6 @@ MutableColumnPtr DataTypeDecimalBase::createColumn() const return ColumnType::create(0, scale); } -template -void DataTypeDecimalBase::serializeBinary(const Field & field, WriteBuffer & ostr) const -{ - FieldType x = get>(field); - writeBinary(x, ostr); -} - -template -void DataTypeDecimalBase::serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const -{ - const FieldType & x = assert_cast(column).getElement(row_num); - writeBinary(x, ostr); -} - -template -void DataTypeDecimalBase::serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const -{ - const typename ColumnType::Container & x = typeid_cast(column).getData(); - - size_t size = x.size(); - - if (limit == 0 || offset + limit > size) - limit = size - offset; - - ostr.write(reinterpret_cast(&x[offset]), sizeof(FieldType) * limit); -} - -template -void DataTypeDecimalBase::deserializeBinary(Field & field, ReadBuffer & istr) const -{ - typename FieldType::NativeType x; - readBinary(x, istr); - field = DecimalField(T(x), this->scale); -} - -template -void DataTypeDecimalBase::deserializeBinary(IColumn & column, ReadBuffer & istr) const -{ - typename FieldType::NativeType x; - readBinary(x, istr); - assert_cast(column).getData().push_back(FieldType(x)); -} - -template -void DataTypeDecimalBase::deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double) const -{ - typename ColumnType::Container & x = typeid_cast(column).getData(); - size_t initial_size = x.size(); - x.resize(initial_size + limit); - size_t size = istr.readBig(reinterpret_cast(&x[initial_size]), sizeof(FieldType) * limit); - x.resize(initial_size + size / sizeof(FieldType)); -} - template T DataTypeDecimalBase::getScaleMultiplier(UInt32 scale_) { diff --git a/src/DataTypes/DataTypeDecimalBase.h b/src/DataTypes/DataTypeDecimalBase.h index d9079166fa7ef778cb5a80cc5712dc64298a4945..85cb9fa83638dc10ef4913d84c226ec567126849 100644 --- a/src/DataTypes/DataTypeDecimalBase.h +++ b/src/DataTypes/DataTypeDecimalBase.h @@ -5,7 +5,6 @@ #include #include #include -#include #include @@ -55,7 +54,7 @@ inline UInt32 leastDecimalPrecisionFor(TypeIndex int_type) /// P is one of (9, 18, 38, 76); equals to the maximum precision for the biggest underlying type of operands. /// S is maximum scale of operands. 
The allowed values are [0, precision] template -class DataTypeDecimalBase : public DataTypeWithSimpleSerialization +class DataTypeDecimalBase : public IDataType { static_assert(IsDecimalNumber); @@ -96,14 +95,6 @@ public: bool canBeUsedInBooleanContext() const override { return true; } bool canBeInsideNullable() const override { return true; } - void serializeBinary(const Field & field, WriteBuffer & ostr) const override; - void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const override; - void serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const override; - - void deserializeBinary(Field & field, ReadBuffer & istr) const override; - void deserializeBinary(IColumn & column, ReadBuffer & istr) const override; - void deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double avg_value_size_hint) const override; - /// Decimal specific UInt32 getPrecision() const { return precision; } diff --git a/src/DataTypes/DataTypeEnum.cpp b/src/DataTypes/DataTypeEnum.cpp index a0669092ba7677def96353076353af40591cae89..b8b0b906cc49cf6645370d4cb57e31c10bad5242 100644 --- a/src/DataTypes/DataTypeEnum.cpp +++ b/src/DataTypes/DataTypeEnum.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -19,7 +20,6 @@ namespace DB namespace ErrorCodes { extern const int BAD_TYPE_OF_FIELD; - extern const int SYNTAX_ERROR; extern const int EMPTY_DATA_PASSED; extern const int UNEXPECTED_AST_STRUCTURE; extern const int ARGUMENT_OUT_OF_BOUND; @@ -65,203 +65,22 @@ std::string DataTypeEnum::generateName(const Values & values) } template -void DataTypeEnum::fillMaps() +DataTypeEnum::DataTypeEnum(const Values & values_) + : EnumValues(values_) + , type_name(generateName(this->getValues())) { - for (const auto & name_and_value : values) - { - const auto inserted_value = name_to_value_map.insert( - { StringRef{name_and_value.first}, name_and_value.second }); - - if (!inserted_value.second) - throw Exception{"Duplicate names in enum: '" + name_and_value.first + "' = " + toString(name_and_value.second) - + " and " + toString(inserted_value.first->getMapped()), - ErrorCodes::SYNTAX_ERROR}; - - const auto inserted_name = value_to_name_map.insert( - { name_and_value.second, StringRef{name_and_value.first} }); - - if (!inserted_name.second) - throw Exception{"Duplicate values in enum: '" + name_and_value.first + "' = " + toString(name_and_value.second) - + " and '" + toString((*inserted_name.first).first) + "'", - ErrorCodes::SYNTAX_ERROR}; - } -} - -template -DataTypeEnum::DataTypeEnum(const Values & values_) : values{values_} -{ - if (values.empty()) - throw Exception{"DataTypeEnum enumeration cannot be empty", ErrorCodes::EMPTY_DATA_PASSED}; - - std::sort(std::begin(values), std::end(values), [] (auto & left, auto & right) - { - return left.second < right.second; - }); - - fillMaps(); - type_name = generateName(values); -} - -template -void DataTypeEnum::serializeBinary(const Field & field, WriteBuffer & ostr) const -{ - const FieldType x = get(field); - writeBinary(x, ostr); -} - -template -void DataTypeEnum::deserializeBinary(Field & field, ReadBuffer & istr) const -{ - FieldType x; - readBinary(x, istr); - field = castToNearestFieldType(x); -} - -template -void DataTypeEnum::serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const -{ - writeBinary(assert_cast(column).getData()[row_num], ostr); -} - -template -void DataTypeEnum::deserializeBinary(IColumn & column, ReadBuffer &
istr) const -{ - typename ColumnType::ValueType x; - readBinary(x, istr); - assert_cast(column).getData().push_back(x); -} - -template -void DataTypeEnum::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const -{ - writeString(getNameForValue(assert_cast(column).getData()[row_num]), ostr); -} - -template -void DataTypeEnum::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const -{ - writeEscapedString(getNameForValue(assert_cast(column).getData()[row_num]), ostr); -} - -template -void DataTypeEnum::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - if (settings.tsv.input_format_enum_as_number) - assert_cast(column).getData().push_back(readValue(istr)); - else - { - /// NOTE It would be nice to do without creating a temporary object - at least extract std::string out. - std::string field_name; - readEscapedString(field_name, istr); - assert_cast(column).getData().push_back(getValue(StringRef(field_name), true)); - } -} - -template -void DataTypeEnum::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const -{ - writeQuotedString(getNameForValue(assert_cast(column).getData()[row_num]), ostr); -} - -template -void DataTypeEnum::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const -{ - std::string field_name; - readQuotedStringWithSQLStyle(field_name, istr); - assert_cast(column).getData().push_back(getValue(StringRef(field_name))); -} - -template -void DataTypeEnum::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - if (settings.tsv.input_format_enum_as_number) - assert_cast(column).getData().push_back(readValue(istr)); - else - { - std::string field_name; - readString(field_name, istr); - assert_cast(column).getData().push_back(getValue(StringRef(field_name), true)); - } -} - -template -void DataTypeEnum::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - writeJSONString(getNameForValue(assert_cast(column).getData()[row_num]), ostr, settings); -} - -template -void DataTypeEnum::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const -{ - writeXMLStringForTextElement(getNameForValue(assert_cast(column).getData()[row_num]), ostr); -} - -template -void DataTypeEnum::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const -{ - if (!istr.eof() && *istr.position() != '"') - assert_cast(column).getData().push_back(readValue(istr)); - else - { - std::string field_name; - readJSONString(field_name, istr); - assert_cast(column).getData().push_back(getValue(StringRef(field_name))); - } -} - -template -void DataTypeEnum::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const -{ - writeCSVString(getNameForValue(assert_cast(column).getData()[row_num]), ostr); -} - -template -void DataTypeEnum::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - if (settings.csv.input_format_enum_as_number) - assert_cast(column).getData().push_back(readValue(istr)); - else - { - std::string field_name; - readCSVString(field_name, istr, settings.csv); - assert_cast(column).getData().push_back(getValue(StringRef(field_name), true)); - } -} - -template -void 
DataTypeEnum::serializeBinaryBulk( - const IColumn & column, WriteBuffer & ostr, const size_t offset, size_t limit) const -{ - const auto & x = typeid_cast(column).getData(); - const auto size = x.size(); - - if (limit == 0 || offset + limit > size) - limit = size - offset; - - ostr.write(reinterpret_cast(&x[offset]), sizeof(FieldType) * limit); -} - -template -void DataTypeEnum::deserializeBinaryBulk( - IColumn & column, ReadBuffer & istr, const size_t limit, const double /*avg_value_size_hint*/) const -{ - auto & x = typeid_cast(column).getData(); - const auto initial_size = x.size(); - x.resize(initial_size + limit); - const auto size = istr.readBig(reinterpret_cast(&x[initial_size]), sizeof(FieldType) * limit); - x.resize(initial_size + size / sizeof(FieldType)); } template Field DataTypeEnum::getDefault() const { - return values.front().second; + return this->getValues().front().second; } template void DataTypeEnum::insertDefaultInto(IColumn & column) const { - assert_cast(column).getData().push_back(values.front().second); + assert_cast(column).getData().push_back(this->getValues().front().second); } template @@ -274,7 +93,7 @@ bool DataTypeEnum::equals(const IDataType & rhs) const template bool DataTypeEnum::textCanContainOnlyValidUTF8() const { - for (const auto & elem : values) + for (const auto & elem : this->getValues()) { const char * pos = elem.first.data(); const char * end = pos + elem.first.size(); @@ -305,14 +124,14 @@ Field DataTypeEnum::castToName(const Field & value_or_name) const { if (value_or_name.getType() == Field::Types::String) { - getValue(value_or_name.get()); /// Check correctness + this->getValue(value_or_name.get()); /// Check correctness return value_or_name.get(); } else if (value_or_name.getType() == Field::Types::Int64) { Int64 value = value_or_name.get(); checkOverflow(value); - return getNameForValue(static_cast(value)).toString(); + return this->getNameForValue(static_cast(value)).toString(); } else throw Exception(String("DataTypeEnum: Unsupported type of field ") + value_or_name.getTypeName(), ErrorCodes::BAD_TYPE_OF_FIELD); @@ -323,14 +142,14 @@ Field DataTypeEnum::castToValue(const Field & value_or_name) const { if (value_or_name.getType() == Field::Types::String) { - return getValue(value_or_name.get()); + return this->getValue(value_or_name.get()); } else if (value_or_name.getType() == Field::Types::Int64 || value_or_name.getType() == Field::Types::UInt64) { Int64 value = value_or_name.get(); checkOverflow(value); - getNameForValue(static_cast(value)); /// Check correctness + this->getNameForValue(static_cast(value)); /// Check correctness return value; } else @@ -341,25 +160,19 @@ Field DataTypeEnum::castToValue(const Field & value_or_name) const template bool DataTypeEnum::contains(const IDataType & rhs) const { - auto check = [&](const auto & value) - { - auto it = name_to_value_map.find(value.first); - /// If we don't have this name, than we have to be sure, - /// that this value exists in enum - if (it == name_to_value_map.end()) - return value_to_name_map.count(value.second) > 0; - - /// If we have this name, than it should have the same value - return it->value.second == value.second; - }; - if (const auto * rhs_enum8 = typeid_cast(&rhs)) - return std::all_of(rhs_enum8->getValues().begin(), rhs_enum8->getValues().end(), check); + return this->containsAll(rhs_enum8->getValues()); if (const auto * rhs_enum16 = typeid_cast(&rhs)) - return std::all_of(rhs_enum16->getValues().begin(), rhs_enum16->getValues().end(), check); + return 
this->containsAll(rhs_enum16->getValues()); return false; } +template +SerializationPtr DataTypeEnum::doGetDefaultSerialization() const +{ + return std::make_shared>(this->getValues()); +} + /// Explicit instantiations. template class DataTypeEnum; diff --git a/src/DataTypes/DataTypeEnum.h b/src/DataTypes/DataTypeEnum.h index 003613edb98e54af5c5b4d636c0e17a2b9c576f1..57657d1d11046fa0411ddbbb2e287f8a6fc896bb 100644 --- a/src/DataTypes/DataTypeEnum.h +++ b/src/DataTypes/DataTypeEnum.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include #include @@ -11,12 +12,6 @@ namespace DB { -namespace ErrorCodes -{ - extern const int BAD_ARGUMENTS; -} - - class IDataTypeEnum : public IDataType { public: @@ -36,102 +31,37 @@ public: template -class DataTypeEnum final : public IDataTypeEnum +class DataTypeEnum final : public IDataTypeEnum, public EnumValues { public: using FieldType = Type; using ColumnType = ColumnVector; - using Value = std::pair; - using Values = std::vector; - using NameToValueMap = HashMap; - using ValueToNameMap = std::unordered_map; + using typename EnumValues::Values; static constexpr bool is_parametric = true; private: - Values values; - NameToValueMap name_to_value_map; - ValueToNameMap value_to_name_map; std::string type_name; - static std::string generateName(const Values & values); - void fillMaps(); public: explicit DataTypeEnum(const Values & values_); - const Values & getValues() const { return values; } std::string doGetName() const override { return type_name; } const char * getFamilyName() const override; TypeIndex getTypeId() const override { return sizeof(FieldType) == 1 ? TypeIndex::Enum8 : TypeIndex::Enum16; } - auto findByValue(const FieldType & value) const - { - const auto it = value_to_name_map.find(value); - if (it == std::end(value_to_name_map)) - throw Exception{"Unexpected value " + toString(value) + " for type " + getName(), ErrorCodes::BAD_ARGUMENTS}; - - return it; - } - - const StringRef & getNameForValue(const FieldType & value) const - { - return findByValue(value)->second; - } - - FieldType getValue(StringRef field_name, bool try_treat_as_id = false) const - { - const auto it = name_to_value_map.find(field_name); - if (!it) - { - /// It is used in CSV and TSV input formats. If we fail to find given string in - /// enum names, we will try to treat it as enum id. 
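The getValue being deleted here (continued just below) tries the token as an enum name first and, for CSV/TSV input, falls back to treating it as a numeric id that must itself exist in the enum. The same logic in a self-contained form (std:: containers in place of ClickHouse's HashMap; the function name is illustrative):

    #include <charconv>
    #include <cstdint>
    #include <stdexcept>
    #include <string>
    #include <unordered_map>

    // Names win; a purely numeric token is accepted only when enabled and
    // only if that value is a declared enum id.
    int16_t lookupEnum(const std::unordered_map<std::string, int16_t> & name_to_value,
                       const std::unordered_map<int16_t, std::string> & value_to_name,
                       const std::string & field, bool try_treat_as_id)
    {
        if (auto it = name_to_value.find(field); it != name_to_value.end())
            return it->second;

        if (try_treat_as_id)
        {
            int16_t id = 0;
            auto [ptr, ec] = std::from_chars(field.data(), field.data() + field.size(), id);
            // The whole token must parse as a number, and the id must exist in the enum.
            if (ec == std::errc() && ptr == field.data() + field.size() && value_to_name.count(id))
                return id;
        }
        throw std::runtime_error("Unknown element '" + field + "' for enum");
    }
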
- if (try_treat_as_id) - { - FieldType x; - ReadBufferFromMemory tmp_buf(field_name.data, field_name.size); - readText(x, tmp_buf); - /// Check if we reached end of the tmp_buf (otherwise field_name is not a number) - /// and try to find it in enum ids - if (tmp_buf.eof() && value_to_name_map.find(x) != value_to_name_map.end()) - return x; - } - throw Exception{"Unknown element '" + field_name.toString() + "' for type " + getName(), ErrorCodes::BAD_ARGUMENTS}; - } - return it->getMapped(); - } - FieldType readValue(ReadBuffer & istr) const { FieldType x; readText(x, istr); - return findByValue(x)->first; + return this->findByValue(x)->first; } Field castToName(const Field & value_or_name) const override; Field castToValue(const Field & value_or_name) const override; - void serializeBinary(const Field & field, WriteBuffer & ostr) const override; - void deserializeBinary(Field & field, ReadBuffer & istr) const override; - void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const override; - void deserializeBinary(IColumn & column, ReadBuffer & istr) const override; - void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - - void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; - - void serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, const size_t offset, size_t limit) const override; - void deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, const size_t limit, const double avg_value_size_hint) const override; - MutableColumnPtr createColumn() const override { return ColumnType::create(); } Field getDefault() const override; @@ -147,6 +77,8 @@ public: /// Enum('a' = 1, 'b' = 2) -> Enum('c' = 1, 'b' = 2, 'd' = 3) OK /// Enum('a' = 1, 'b' = 2) -> Enum('a' = 2, 'b' = 1) NOT OK bool contains(const IDataType & rhs) const; + + SerializationPtr doGetDefaultSerialization() const override; }; diff --git a/src/DataTypes/DataTypeFactory.h b/src/DataTypes/DataTypeFactory.h index 618c1f510679f28180f8fbf9b7886f3f3df14cfb..9fa3e30297b27f1dcd7cae97351b4403943c15e7 100644 --- a/src/DataTypes/DataTypeFactory.h +++ b/src/DataTypes/DataTypeFactory.h @@ -3,7 +3,7 @@ #include #include #include -#include +#include #include @@ -86,6 +86,5 @@ void registerDataTypeLowCardinality(DataTypeFactory & factory); void registerDataTypeDomainIPv4AndIPv6(DataTypeFactory & factory); void 
registerDataTypeDomainSimpleAggregateFunction(DataTypeFactory & factory); void registerDataTypeDomainGeo(DataTypeFactory & factory); -void registerDataTypeOneElementTuple(DataTypeFactory & factory); } diff --git a/src/DataTypes/DataTypeFixedString.cpp b/src/DataTypes/DataTypeFixedString.cpp index 87e989d1dd2b7229b2e0ca3fc9ecd18497651717..a40592ba023e3c0b5c9c78d8c235e28a01281e1a 100644 --- a/src/DataTypes/DataTypeFixedString.cpp +++ b/src/DataTypes/DataTypeFixedString.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -22,10 +23,8 @@ namespace DB namespace ErrorCodes { - extern const int CANNOT_READ_ALL_DATA; extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; extern const int UNEXPECTED_AST_STRUCTURE; - extern const int TOO_LARGE_STRING_SIZE; } @@ -34,184 +33,6 @@ std::string DataTypeFixedString::doGetName() const return "FixedString(" + toString(n) + ")"; } - -void DataTypeFixedString::serializeBinary(const Field & field, WriteBuffer & ostr) const -{ - const String & s = get(field); - ostr.write(s.data(), std::min(s.size(), n)); - if (s.size() < n) - for (size_t i = s.size(); i < n; ++i) - ostr.write(0); -} - - -void DataTypeFixedString::deserializeBinary(Field & field, ReadBuffer & istr) const -{ - field = String(); - String & s = get(field); - s.resize(n); - istr.readStrict(s.data(), n); -} - - -void DataTypeFixedString::serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const -{ - ostr.write(reinterpret_cast(&assert_cast(column).getChars()[n * row_num]), n); -} - - -void DataTypeFixedString::deserializeBinary(IColumn & column, ReadBuffer & istr) const -{ - ColumnFixedString::Chars & data = assert_cast(column).getChars(); - size_t old_size = data.size(); - data.resize(old_size + n); - try - { - istr.readStrict(reinterpret_cast(data.data() + old_size), n); - } - catch (...) - { - data.resize_assume_reserved(old_size); - throw; - } -} - - -void DataTypeFixedString::serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const -{ - const ColumnFixedString::Chars & data = typeid_cast(column).getChars(); - - size_t size = data.size() / n; - - if (limit == 0 || offset + limit > size) - limit = size - offset; - - if (limit) - ostr.write(reinterpret_cast(&data[n * offset]), n * limit); -} - - -void DataTypeFixedString::deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double /*avg_value_size_hint*/) const -{ - ColumnFixedString::Chars & data = typeid_cast(column).getChars(); - - size_t initial_size = data.size(); - size_t max_bytes = limit * n; - data.resize(initial_size + max_bytes); - size_t read_bytes = istr.readBig(reinterpret_cast(&data[initial_size]), max_bytes); - - if (read_bytes % n != 0) - throw Exception("Cannot read all data of type FixedString. Bytes read:" + toString(read_bytes) + ". 
String size:" + toString(n) + ".", - ErrorCodes::CANNOT_READ_ALL_DATA); - - data.resize(initial_size + read_bytes); -} - - -void DataTypeFixedString::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const -{ - writeString(reinterpret_cast(&assert_cast(column).getChars()[n * row_num]), n, ostr); -} - - -void DataTypeFixedString::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const -{ - const char * pos = reinterpret_cast(&assert_cast(column).getChars()[n * row_num]); - writeAnyEscapedString<'\''>(pos, pos + n, ostr); -} - - -void DataTypeFixedString::alignStringLength(PaddedPODArray & chars, size_t old_size) const -{ - size_t length = chars.size() - old_size; - if (length < n) - { - chars.resize_fill(old_size + n); - } - else if (length > n) - { - chars.resize_assume_reserved(old_size); - throw Exception("Too large value for FixedString(" + std::to_string(n) + ")", ErrorCodes::TOO_LARGE_STRING_SIZE); - } -} - - -template -static inline void read(const DataTypeFixedString & self, IColumn & column, Reader && reader) -{ - ColumnFixedString::Chars & data = typeid_cast(column).getChars(); - size_t prev_size = data.size(); - try - { - reader(data); - self.alignStringLength(data, prev_size); - } - catch (...) - { - data.resize_assume_reserved(prev_size); - throw; - } -} - - -void DataTypeFixedString::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const -{ - read(*this, column, [&istr](ColumnFixedString::Chars & data) { readEscapedStringInto(data, istr); }); -} - - -void DataTypeFixedString::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const -{ - const char * pos = reinterpret_cast(&assert_cast(column).getChars()[n * row_num]); - writeAnyQuotedString<'\''>(pos, pos + n, ostr); -} - - -void DataTypeFixedString::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const -{ - read(*this, column, [&istr](ColumnFixedString::Chars & data) { readQuotedStringInto(data, istr); }); -} - - -void DataTypeFixedString::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const -{ - read(*this, column, [&istr](ColumnFixedString::Chars & data) { readStringInto(data, istr); }); -} - - -void DataTypeFixedString::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - const char * pos = reinterpret_cast(&assert_cast(column).getChars()[n * row_num]); - writeJSONString(pos, pos + n, ostr, settings); -} - - -void DataTypeFixedString::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const -{ - read(*this, column, [&istr](ColumnFixedString::Chars & data) { readJSONStringInto(data, istr); }); -} - - -void DataTypeFixedString::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const -{ - const char * pos = reinterpret_cast(&assert_cast(column).getChars()[n * row_num]); - writeXMLStringForTextElement(pos, pos + n, ostr); -} - - -void DataTypeFixedString::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const -{ - const char * pos = reinterpret_cast(&assert_cast(column).getChars()[n * row_num]); - writeCSVString(pos, pos + n, ostr); -} - - -void DataTypeFixedString::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - 
read(*this, column, [&istr, &csv = settings.csv](ColumnFixedString::Chars & data) { readCSVStringInto(data, istr, csv); }); -} - - MutableColumnPtr DataTypeFixedString::createColumn() const { return ColumnFixedString::create(n); @@ -227,6 +48,11 @@ bool DataTypeFixedString::equals(const IDataType & rhs) const return typeid(rhs) == typeid(*this) && n == static_cast(rhs).n; } +SerializationPtr DataTypeFixedString::doGetDefaultSerialization() const +{ + return std::make_shared(n); +} + static DataTypePtr create(const ASTPtr & arguments) { diff --git a/src/DataTypes/DataTypeFixedString.h b/src/DataTypes/DataTypeFixedString.h index 5c80a0e346ac09c4c93291fd4459b1e3cf80f666..d82ea9824f359258cee669265b44a1fdd4d2179e 100644 --- a/src/DataTypes/DataTypeFixedString.h +++ b/src/DataTypes/DataTypeFixedString.h @@ -41,38 +41,14 @@ public: return n; } - void serializeBinary(const Field & field, WriteBuffer & ostr) const override; - void deserializeBinary(Field & field, ReadBuffer & istr) const override; - void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const override; - void deserializeBinary(IColumn & column, ReadBuffer & istr) const override; - - void serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const override; - void deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double avg_value_size_hint) const override; - - void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - - void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - - void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - - void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - - void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - - void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - - void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - MutableColumnPtr createColumn() const override; Field getDefault() const override; bool equals(const IDataType & rhs) const override; + SerializationPtr doGetDefaultSerialization() const override; + bool isParametric() const override { return true; } bool haveSubtypes() const override { return false; } bool isComparable() const override { return true; } diff --git a/src/DataTypes/DataTypeLowCardinality.cpp b/src/DataTypes/DataTypeLowCardinality.cpp index 1b21b7de4bcc18f2af53a03c2499b229a8957b83..485083d67ee3c65b2d4765b700b280b7dfc706cd 100644 --- a/src/DataTypes/DataTypeLowCardinality.cpp +++ b/src/DataTypes/DataTypeLowCardinality.cpp @@ -13,6 +13,7 @@ #include #include #include +#include #include namespace DB @@ -25,19 +26,6 @@ namespace ErrorCodes extern const int ILLEGAL_TYPE_OF_ARGUMENT; } -namespace -{ - const 
ColumnLowCardinality & getColumnLowCardinality(const IColumn & column)
-    {
-        return typeid_cast<const ColumnLowCardinality &>(column);
-    }
-
-    ColumnLowCardinality & getColumnLowCardinality(IColumn & column)
-    {
-        return typeid_cast<ColumnLowCardinality &>(column);
-    }
-}
-
 DataTypeLowCardinality::DataTypeLowCardinality(DataTypePtr dictionary_type_)
     : dictionary_type(std::move(dictionary_type_))
 {
@@ -50,785 +38,6 @@ DataTypeLowCardinality::DataTypeLowCardinality(DataTypePtr dictionary_type_)
             + dictionary_type->getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
 }
 
-void DataTypeLowCardinality::enumerateStreamsImpl(const StreamCallback & callback, SubstreamPath & path) const
-{
-    path.push_back(Substream::DictionaryKeys);
-    dictionary_type->enumerateStreams(callback, path);
-    path.back() = Substream::DictionaryIndexes;
-    callback(path, *this);
-    path.pop_back();
-}
-
-struct KeysSerializationVersion
-{
-    enum Value
-    {
-        /// Version is written at the start of <name.dict.bin>.
-        /// Dictionary is written as number N and N keys after them.
-        /// Dictionary can be shared for continuous range of granules, so some marks may point to the same position.
-        /// Shared dictionary is stored in state and is read once.
-        SharedDictionariesWithAdditionalKeys = 1,
-    };
-
-    Value value;
-
-    static void checkVersion(UInt64 version)
-    {
-        if (version != SharedDictionariesWithAdditionalKeys)
-            throw Exception("Invalid version for DataTypeLowCardinality key column.", ErrorCodes::LOGICAL_ERROR);
-    }
-
-    explicit KeysSerializationVersion(UInt64 version) : value(static_cast<Value>(version)) { checkVersion(version); }
-};
-
-/// Version is stored at the start of each granule. It's used to store indexes type and flags.
-struct IndexesSerializationType
-{
-    using SerializationType = UInt64;
-    /// Need to read dictionary if it wasn't.
-    static constexpr SerializationType NeedGlobalDictionaryBit = 1u << 8u;
-    /// Need to read additional keys. Additional keys are stored before indexes as value N and N keys after them.
-    static constexpr SerializationType HasAdditionalKeysBit = 1u << 9u;
-    /// Need to update dictionary. It means that previous granule has different dictionary.
-    static constexpr SerializationType NeedUpdateDictionary = 1u << 10u;
-
-    enum Type
-    {
-        TUInt8 = 0,
-        TUInt16,
-        TUInt32,
-        TUInt64,
-    };
-
-    Type type;
-    bool has_additional_keys;
-    bool need_global_dictionary;
-    bool need_update_dictionary;
-
-    static constexpr SerializationType resetFlags(SerializationType type)
-    {
-        return type & (~(HasAdditionalKeysBit | NeedGlobalDictionaryBit | NeedUpdateDictionary));
-    }
-
-    static void checkType(SerializationType type)
-    {
-        UInt64 value = resetFlags(type);
-        if (value <= TUInt64)
-            return;
-
-        throw Exception("Invalid type for DataTypeLowCardinality index column.", ErrorCodes::LOGICAL_ERROR);
-    }
-
-    void serialize(WriteBuffer & buffer) const
-    {
-        SerializationType val = type;
-        if (has_additional_keys)
-            val |= HasAdditionalKeysBit;
-        if (need_global_dictionary)
-            val |= NeedGlobalDictionaryBit;
-        if (need_update_dictionary)
-            val |= NeedUpdateDictionary;
-        writeIntBinary(val, buffer);
-    }
-
-    void deserialize(ReadBuffer & buffer)
-    {
-        SerializationType val;
-        readIntBinary(val, buffer);
-        checkType(val);
-        has_additional_keys = (val & HasAdditionalKeysBit) != 0;
-        need_global_dictionary = (val & NeedGlobalDictionaryBit) != 0;
-        need_update_dictionary = (val & NeedUpdateDictionary) != 0;
-        type = static_cast<Type>(resetFlags(val));
-    }
-
-    IndexesSerializationType(const IColumn & column,
-                             bool has_additional_keys_,
-                             bool need_global_dictionary_,
-                             bool enumerate_dictionaries)
-        : has_additional_keys(has_additional_keys_)
-        , need_global_dictionary(need_global_dictionary_)
-        , need_update_dictionary(enumerate_dictionaries)
-    {
-        if (typeid_cast<const ColumnUInt8 *>(&column))
-            type = TUInt8;
-        else if (typeid_cast<const ColumnUInt16 *>(&column))
-            type = TUInt16;
-        else if (typeid_cast<const ColumnUInt32 *>(&column))
-            type = TUInt32;
-        else if (typeid_cast<const ColumnUInt64 *>(&column))
-            type = TUInt64;
-        else
-            throw Exception("Invalid Indexes column for IndexesSerializationType. Expected ColumnUInt*, got "
-                            + column.getName(), ErrorCodes::LOGICAL_ERROR);
-    }
-
-    DataTypePtr getDataType() const
-    {
-        if (type == TUInt8)
-            return std::make_shared<DataTypeUInt8>();
-        if (type == TUInt16)
-            return std::make_shared<DataTypeUInt16>();
-        if (type == TUInt32)
-            return std::make_shared<DataTypeUInt32>();
-        if (type == TUInt64)
-            return std::make_shared<DataTypeUInt64>();
-
-        throw Exception("Can't create DataType from IndexesSerializationType.", ErrorCodes::LOGICAL_ERROR);
-    }
-
-    IndexesSerializationType() = default;
-};
-
-struct SerializeStateLowCardinality : public IDataType::SerializeBinaryBulkState
-{
-    KeysSerializationVersion key_version;
-    MutableColumnUniquePtr shared_dictionary;
-
-    explicit SerializeStateLowCardinality(UInt64 key_version_) : key_version(key_version_) {}
-};
-
-struct DeserializeStateLowCardinality : public IDataType::DeserializeBinaryBulkState
-{
-    KeysSerializationVersion key_version;
-    ColumnUniquePtr global_dictionary;
-
-    IndexesSerializationType index_type;
-    ColumnPtr additional_keys;
-    ColumnPtr null_map;
-    UInt64 num_pending_rows = 0;
-
-    /// If dictionary should be updated.
-    /// Can happen if some granules were skipped while reading from MergeTree.
-    /// We should store this flag in State because
-    /// in case of a long block of empty arrays we may not need to read the dictionary at first reading.
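-    /// (A long run of granules that need no dictionary must not lose the flag:
-    /// the re-read is deferred until a granule actually requests the dictionary.)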
- bool need_update_dictionary = false; - - explicit DeserializeStateLowCardinality(UInt64 key_version_) : key_version(key_version_) {} -}; - -static SerializeStateLowCardinality * checkAndGetLowCardinalitySerializeState( - IDataType::SerializeBinaryBulkStatePtr & state) -{ - if (!state) - throw Exception("Got empty state for DataTypeLowCardinality.", ErrorCodes::LOGICAL_ERROR); - - auto * low_cardinality_state = typeid_cast(state.get()); - if (!low_cardinality_state) - { - auto & state_ref = *state; - throw Exception("Invalid SerializeBinaryBulkState for DataTypeLowCardinality. Expected: " - + demangle(typeid(SerializeStateLowCardinality).name()) + ", got " - + demangle(typeid(state_ref).name()), ErrorCodes::LOGICAL_ERROR); - } - - return low_cardinality_state; -} - -static DeserializeStateLowCardinality * checkAndGetLowCardinalityDeserializeState( - IDataType::DeserializeBinaryBulkStatePtr & state) -{ - if (!state) - throw Exception("Got empty state for DataTypeLowCardinality.", ErrorCodes::LOGICAL_ERROR); - - auto * low_cardinality_state = typeid_cast(state.get()); - if (!low_cardinality_state) - { - auto & state_ref = *state; - throw Exception("Invalid DeserializeBinaryBulkState for DataTypeLowCardinality. Expected: " - + demangle(typeid(DeserializeStateLowCardinality).name()) + ", got " - + demangle(typeid(state_ref).name()), ErrorCodes::LOGICAL_ERROR); - } - - return low_cardinality_state; -} - -void DataTypeLowCardinality::serializeBinaryBulkStatePrefixImpl( - SerializeBinaryBulkSettings & settings, - SerializeBinaryBulkStatePtr & state) const -{ - settings.path.push_back(Substream::DictionaryKeys); - auto * stream = settings.getter(settings.path); - settings.path.pop_back(); - - if (!stream) - throw Exception("Got empty stream in DataTypeLowCardinality::serializeBinaryBulkStatePrefix", - ErrorCodes::LOGICAL_ERROR); - - /// Write version and create SerializeBinaryBulkState. 
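-    /// The version is written once per serialization state, from this prefix call;
-    /// individual granules carry only IndexesSerializationType headers.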
- UInt64 key_version = KeysSerializationVersion::SharedDictionariesWithAdditionalKeys; - - writeIntBinary(key_version, *stream); - - state = std::make_shared(key_version); -} - -void DataTypeLowCardinality::serializeBinaryBulkStateSuffixImpl( - SerializeBinaryBulkSettings & settings, - SerializeBinaryBulkStatePtr & state) const -{ - auto * low_cardinality_state = checkAndGetLowCardinalitySerializeState(state); - KeysSerializationVersion::checkVersion(low_cardinality_state->key_version.value); - - if (low_cardinality_state->shared_dictionary && settings.low_cardinality_max_dictionary_size) - { - auto nested_column = low_cardinality_state->shared_dictionary->getNestedNotNullableColumn(); - - settings.path.push_back(Substream::DictionaryKeys); - auto * stream = settings.getter(settings.path); - settings.path.pop_back(); - - if (!stream) - throw Exception("Got empty stream in DataTypeLowCardinality::serializeBinaryBulkStateSuffix", - ErrorCodes::LOGICAL_ERROR); - - UInt64 num_keys = nested_column->size(); - writeIntBinary(num_keys, *stream); - removeNullable(dictionary_type)->serializeBinaryBulk(*nested_column, *stream, 0, num_keys); - low_cardinality_state->shared_dictionary = nullptr; - } -} - -void DataTypeLowCardinality::deserializeBinaryBulkStatePrefixImpl( - DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state) const -{ - settings.path.push_back(Substream::DictionaryKeys); - auto * stream = settings.getter(settings.path); - settings.path.pop_back(); - - if (!stream) - return; - - UInt64 keys_version; - readIntBinary(keys_version, *stream); - - state = std::make_shared(keys_version); -} - -namespace -{ - template - PaddedPODArray * getIndexesData(IColumn & indexes) - { - auto * column = typeid_cast *>(&indexes); - if (column) - return &column->getData(); - - return nullptr; - } - - struct IndexMapsWithAdditionalKeys - { - MutableColumnPtr dictionary_map; - MutableColumnPtr additional_keys_map; - }; - - template - IndexMapsWithAdditionalKeys mapIndexWithAdditionalKeysRef(PaddedPODArray & index, size_t dict_size) - { - PaddedPODArray copy(index.cbegin(), index.cend()); - - HashMap dict_map; - HashMap add_keys_map; - - for (auto val : index) - { - if (val < dict_size) - dict_map.insert({val, dict_map.size()}); - else - add_keys_map.insert({val, add_keys_map.size()}); - } - - auto dictionary_map = ColumnVector::create(dict_map.size()); - auto additional_keys_map = ColumnVector::create(add_keys_map.size()); - auto & dict_data = dictionary_map->getData(); - auto & add_keys_data = additional_keys_map->getData(); - - for (auto val : dict_map) - dict_data[val.second] = val.first; - - for (auto val : add_keys_map) - add_keys_data[val.second] = val.first - dict_size; - - for (auto & val : index) - val = val < dict_size ? dict_map[val] - : add_keys_map[val] + dict_map.size(); - - for (size_t i = 0; i < index.size(); ++i) - { - T expected = index[i] < dict_data.size() ? 
dict_data[index[i]] - : add_keys_data[index[i] - dict_data.size()] + dict_size; - if (expected != copy[i]) - throw Exception("Expected " + toString(expected) + ", but got " + toString(copy[i]), ErrorCodes::LOGICAL_ERROR); - - } - - return {std::move(dictionary_map), std::move(additional_keys_map)}; - } - - template - IndexMapsWithAdditionalKeys mapIndexWithAdditionalKeys(PaddedPODArray & index, size_t dict_size) - { - T max_less_dict_size = 0; - T max_value = 0; - - auto size = index.size(); - if (size == 0) - return {ColumnVector::create(), ColumnVector::create()}; - - for (size_t i = 0; i < size; ++i) - { - auto val = index[i]; - if (val < dict_size) - max_less_dict_size = std::max(max_less_dict_size, val); - - max_value = std::max(max_value, val); - } - - auto map_size = UInt64(max_less_dict_size) + 1; - auto overflow_map_size = max_value >= dict_size ? (UInt64(max_value - dict_size) + 1) : 0; - PaddedPODArray map(map_size, 0); - PaddedPODArray overflow_map(overflow_map_size, 0); - - T zero_pos_value = 0; - T zero_pos_overflowed_value = 0; - UInt64 cur_pos = 0; - UInt64 cur_overflowed_pos = 0; - - for (size_t i = 0; i < size; ++i) - { - T val = index[i]; - if (val < dict_size) - { - if (cur_pos == 0) - { - zero_pos_value = val; - ++cur_pos; - } - else if (map[val] == 0 && val != zero_pos_value) - { - map[val] = cur_pos; - ++cur_pos; - } - } - else - { - T shifted_val = val - dict_size; - if (cur_overflowed_pos == 0) - { - zero_pos_overflowed_value = shifted_val; - ++cur_overflowed_pos; - } - else if (overflow_map[shifted_val] == 0 && shifted_val != zero_pos_overflowed_value) - { - overflow_map[shifted_val] = cur_overflowed_pos; - ++cur_overflowed_pos; - } - } - } - - auto dictionary_map = ColumnVector::create(cur_pos); - auto additional_keys_map = ColumnVector::create(cur_overflowed_pos); - auto & dict_data = dictionary_map->getData(); - auto & add_keys_data = additional_keys_map->getData(); - - for (size_t i = 0; i < map_size; ++i) - if (map[i]) - dict_data[map[i]] = static_cast(i); - - for (size_t i = 0; i < overflow_map_size; ++i) - if (overflow_map[i]) - add_keys_data[overflow_map[i]] = static_cast(i); - - if (cur_pos) - dict_data[0] = zero_pos_value; - if (cur_overflowed_pos) - add_keys_data[0] = zero_pos_overflowed_value; - - for (size_t i = 0; i < size; ++i) - { - T & val = index[i]; - if (val < dict_size) - val = map[val]; - else - val = overflow_map[val - dict_size] + cur_pos; - } - - return {std::move(dictionary_map), std::move(additional_keys_map)}; - } - - /// Update column and return map with old indexes. 
-    /// Let N be the number of distinct values which are less than max_size;
-    ///     old_column - column before function call;
-    ///     new_column - column after function call:
-    ///     * if old_column[i] < max_size, then
-    ///         dictionary_map[new_column[i]] = old_column[i]
-    ///     * else
-    ///         additional_keys_map[new_column[i]] = old_column[i] - dict_size + N
-    IndexMapsWithAdditionalKeys mapIndexWithAdditionalKeys(IColumn & column, size_t dict_size)
-    {
-        if (auto * data_uint8 = getIndexesData<UInt8>(column))
-            return mapIndexWithAdditionalKeys(*data_uint8, dict_size);
-        else if (auto * data_uint16 = getIndexesData<UInt16>(column))
-            return mapIndexWithAdditionalKeys(*data_uint16, dict_size);
-        else if (auto * data_uint32 = getIndexesData<UInt32>(column))
-            return mapIndexWithAdditionalKeys(*data_uint32, dict_size);
-        else if (auto * data_uint64 = getIndexesData<UInt64>(column))
-            return mapIndexWithAdditionalKeys(*data_uint64, dict_size);
-        else
-            throw Exception("Indexes column for mapIndexWithAdditionalKeys must be UInt, got " + column.getName(),
-                            ErrorCodes::LOGICAL_ERROR);
-    }
-}
-
-void DataTypeLowCardinality::serializeBinaryBulkWithMultipleStreamsImpl(
-    const IColumn & column,
-    size_t offset,
-    size_t limit,
-    SerializeBinaryBulkSettings & settings,
-    SerializeBinaryBulkStatePtr & state) const
-{
-    settings.path.push_back(Substream::DictionaryKeys);
-    auto * keys_stream = settings.getter(settings.path);
-    settings.path.back() = Substream::DictionaryIndexes;
-    auto * indexes_stream = settings.getter(settings.path);
-    settings.path.pop_back();
-
-    if (!keys_stream && !indexes_stream)
-        return;
-
-    if (!keys_stream)
-        throw Exception("Got empty stream for DataTypeLowCardinality keys.", ErrorCodes::LOGICAL_ERROR);
-
-    if (!indexes_stream)
-        throw Exception("Got empty stream for DataTypeLowCardinality indexes.", ErrorCodes::LOGICAL_ERROR);
-
-    const ColumnLowCardinality & low_cardinality_column = typeid_cast<const ColumnLowCardinality &>(column);
-
-    auto * low_cardinality_state = checkAndGetLowCardinalitySerializeState(state);
-    auto & global_dictionary = low_cardinality_state->shared_dictionary;
-    KeysSerializationVersion::checkVersion(low_cardinality_state->key_version.value);
-
-    bool need_update_dictionary = global_dictionary == nullptr;
-    if (need_update_dictionary)
-        global_dictionary = createColumnUnique(*dictionary_type);
-
-    size_t max_limit = column.size() - offset;
-    limit = limit ? std::min(limit, max_limit) : max_limit;
-
-    /// Do not write anything for empty column. (May happen while writing empty arrays.)
-    if (limit == 0)
-        return;
-
-    auto sub_column = low_cardinality_column.cutAndCompact(offset, limit);
-    ColumnPtr positions = sub_column->getIndexesPtr();
-    ColumnPtr keys = sub_column->getDictionary().getNestedColumn();
-
-    if (settings.low_cardinality_max_dictionary_size)
-    {
-        /// Insert used_keys into global dictionary and update sub_index.
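-        /// Keys that do not fit under low_cardinality_max_dictionary_size spill into
-        /// overflowed_keys and are written below as per-granule additional keys.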
- auto indexes_with_overflow = global_dictionary->uniqueInsertRangeWithOverflow(*keys, 0, keys->size(), - settings.low_cardinality_max_dictionary_size); - size_t max_size = settings.low_cardinality_max_dictionary_size + indexes_with_overflow.overflowed_keys->size(); - ColumnLowCardinality::Index(indexes_with_overflow.indexes->getPtr()).check(max_size); - - if (global_dictionary->size() > settings.low_cardinality_max_dictionary_size) - throw Exception("Got dictionary with size " + toString(global_dictionary->size()) + - " but max dictionary size is " + toString(settings.low_cardinality_max_dictionary_size), - ErrorCodes::LOGICAL_ERROR); - - positions = indexes_with_overflow.indexes->index(*positions, 0); - keys = std::move(indexes_with_overflow.overflowed_keys); - - if (global_dictionary->size() < settings.low_cardinality_max_dictionary_size && !keys->empty()) - throw Exception("Has additional keys, but dict size is " + toString(global_dictionary->size()) + - " which is less then max dictionary size (" + toString(settings.low_cardinality_max_dictionary_size) + ")", - ErrorCodes::LOGICAL_ERROR); - } - - if (const auto * nullable_keys = checkAndGetColumn(*keys)) - keys = nullable_keys->getNestedColumnPtr(); - - bool need_additional_keys = !keys->empty(); - bool need_dictionary = settings.low_cardinality_max_dictionary_size != 0; - bool need_write_dictionary = !settings.low_cardinality_use_single_dictionary_for_part - && global_dictionary->size() >= settings.low_cardinality_max_dictionary_size; - - IndexesSerializationType index_version(*positions, need_additional_keys, need_dictionary, need_update_dictionary); - index_version.serialize(*indexes_stream); - - if (need_write_dictionary) - { - const auto & nested_column = global_dictionary->getNestedNotNullableColumn(); - UInt64 num_keys = nested_column->size(); - writeIntBinary(num_keys, *keys_stream); - removeNullable(dictionary_type)->serializeBinaryBulk(*nested_column, *keys_stream, 0, num_keys); - low_cardinality_state->shared_dictionary = nullptr; - } - - if (need_additional_keys) - { - UInt64 num_keys = keys->size(); - writeIntBinary(num_keys, *indexes_stream); - removeNullable(dictionary_type)->serializeBinaryBulk(*keys, *indexes_stream, 0, num_keys); - } - - UInt64 num_rows = positions->size(); - writeIntBinary(num_rows, *indexes_stream); - index_version.getDataType()->serializeBinaryBulk(*positions, *indexes_stream, 0, num_rows); -} - -void DataTypeLowCardinality::deserializeBinaryBulkWithMultipleStreamsImpl( - IColumn & column, - size_t limit, - DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state, - SubstreamsCache * /* cache */) const -{ - ColumnLowCardinality & low_cardinality_column = typeid_cast(column); - - settings.path.push_back(Substream::DictionaryKeys); - auto * keys_stream = settings.getter(settings.path); - settings.path.back() = Substream::DictionaryIndexes; - auto * indexes_stream = settings.getter(settings.path); - settings.path.pop_back(); - - if (!keys_stream && !indexes_stream) - return; - - if (!keys_stream) - throw Exception("Got empty stream for DataTypeLowCardinality keys.", ErrorCodes::LOGICAL_ERROR); - - if (!indexes_stream) - throw Exception("Got empty stream for DataTypeLowCardinality indexes.", ErrorCodes::LOGICAL_ERROR); - - auto * low_cardinality_state = checkAndGetLowCardinalityDeserializeState(state); - KeysSerializationVersion::checkVersion(low_cardinality_state->key_version.value); - - auto read_dictionary = [this, low_cardinality_state, keys_stream]() - { - UInt64 
num_keys; - readIntBinary(num_keys, *keys_stream); - - auto keys_type = removeNullable(dictionary_type); - auto global_dict_keys = keys_type->createColumn(); - keys_type->deserializeBinaryBulk(*global_dict_keys, *keys_stream, num_keys, 0); - - auto column_unique = createColumnUnique(*dictionary_type, std::move(global_dict_keys)); - low_cardinality_state->global_dictionary = std::move(column_unique); - }; - - auto read_additional_keys = [this, low_cardinality_state, indexes_stream]() - { - UInt64 num_keys; - readIntBinary(num_keys, *indexes_stream); - auto keys_type = removeNullable(dictionary_type); - auto additional_keys = keys_type->createColumn(); - keys_type->deserializeBinaryBulk(*additional_keys, *indexes_stream, num_keys, 0); - low_cardinality_state->additional_keys = std::move(additional_keys); - - if (!low_cardinality_state->index_type.need_global_dictionary && dictionary_type->isNullable()) - { - auto null_map = ColumnUInt8::create(num_keys, 0); - if (num_keys) - null_map->getElement(0) = 1; - - low_cardinality_state->null_map = std::move(null_map); - } - }; - - auto read_indexes = [this, low_cardinality_state, indexes_stream, &low_cardinality_column](UInt64 num_rows) - { - auto indexes_type = low_cardinality_state->index_type.getDataType(); - MutableColumnPtr indexes_column = indexes_type->createColumn(); - indexes_type->deserializeBinaryBulk(*indexes_column, *indexes_stream, num_rows, 0); - - auto & global_dictionary = low_cardinality_state->global_dictionary; - const auto & additional_keys = low_cardinality_state->additional_keys; - - bool has_additional_keys = low_cardinality_state->index_type.has_additional_keys; - bool column_is_empty = low_cardinality_column.empty(); - - if (!low_cardinality_state->index_type.need_global_dictionary) - { - ColumnPtr keys_column = additional_keys; - if (low_cardinality_state->null_map) - keys_column = ColumnNullable::create(additional_keys, low_cardinality_state->null_map); - low_cardinality_column.insertRangeFromDictionaryEncodedColumn(*keys_column, *indexes_column); - } - else if (!has_additional_keys) - { - if (column_is_empty) - low_cardinality_column.setSharedDictionary(global_dictionary); - - auto local_column = ColumnLowCardinality::create(global_dictionary, std::move(indexes_column)); - low_cardinality_column.insertRangeFrom(*local_column, 0, num_rows); - } - else - { - auto maps = mapIndexWithAdditionalKeys(*indexes_column, global_dictionary->size()); - - ColumnLowCardinality::Index(maps.additional_keys_map->getPtr()).check(additional_keys->size()); - - ColumnLowCardinality::Index(indexes_column->getPtr()).check( - maps.dictionary_map->size() + maps.additional_keys_map->size()); - - auto used_keys = IColumn::mutate(global_dictionary->getNestedColumn()->index(*maps.dictionary_map, 0)); - - if (!maps.additional_keys_map->empty()) - { - auto used_add_keys = additional_keys->index(*maps.additional_keys_map, 0); - - if (dictionary_type->isNullable()) - { - ColumnPtr null_map = ColumnUInt8::create(used_add_keys->size(), 0); - used_add_keys = ColumnNullable::create(used_add_keys, null_map); - } - - used_keys->insertRangeFrom(*used_add_keys, 0, used_add_keys->size()); - } - - low_cardinality_column.insertRangeFromDictionaryEncodedColumn(*used_keys, *indexes_column); - } - }; - - if (!settings.continuous_reading) - { - low_cardinality_state->num_pending_rows = 0; - - /// Remember in state that some granules were skipped and we need to update dictionary. 
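-        /// The re-read itself happens lazily in the loop below, only when a granule
-        /// that uses the global dictionary is actually decoded.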
- low_cardinality_state->need_update_dictionary = true; - } - - while (limit) - { - if (low_cardinality_state->num_pending_rows == 0) - { - if (indexes_stream->eof()) - break; - - auto & index_type = low_cardinality_state->index_type; - auto & global_dictionary = low_cardinality_state->global_dictionary; - - index_type.deserialize(*indexes_stream); - - bool need_update_dictionary = - !global_dictionary || index_type.need_update_dictionary || low_cardinality_state->need_update_dictionary; - if (index_type.need_global_dictionary && need_update_dictionary) - { - read_dictionary(); - low_cardinality_state->need_update_dictionary = false; - } - - if (low_cardinality_state->index_type.has_additional_keys) - read_additional_keys(); - else - low_cardinality_state->additional_keys = nullptr; - - readIntBinary(low_cardinality_state->num_pending_rows, *indexes_stream); - } - - size_t num_rows_to_read = std::min(limit, low_cardinality_state->num_pending_rows); - read_indexes(num_rows_to_read); - limit -= num_rows_to_read; - low_cardinality_state->num_pending_rows -= num_rows_to_read; - } -} - -void DataTypeLowCardinality::serializeBinary(const Field & field, WriteBuffer & ostr) const -{ - dictionary_type->serializeBinary(field, ostr); -} -void DataTypeLowCardinality::deserializeBinary(Field & field, ReadBuffer & istr) const -{ - dictionary_type->deserializeBinary(field, istr); -} - -void DataTypeLowCardinality::serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const -{ - serializeImpl(column, row_num, &IDataType::serializeBinary, ostr); -} -void DataTypeLowCardinality::deserializeBinary(IColumn & column, ReadBuffer & istr) const -{ - deserializeImpl(column, &IDataType::deserializeBinary, istr); -} - -void DataTypeLowCardinality::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - serializeImpl(column, row_num, &IDataType::serializeAsTextEscaped, ostr, settings); -} - -void DataTypeLowCardinality::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - deserializeImpl(column, &IDataType::deserializeAsTextEscaped, istr, settings); -} - -void DataTypeLowCardinality::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - serializeImpl(column, row_num, &IDataType::serializeAsTextQuoted, ostr, settings); -} - -void DataTypeLowCardinality::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - deserializeImpl(column, &IDataType::deserializeAsTextQuoted, istr, settings); -} - -void DataTypeLowCardinality::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - deserializeImpl(column, &IDataType::deserializeAsWholeText, istr, settings); -} - -void DataTypeLowCardinality::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - serializeImpl(column, row_num, &IDataType::serializeAsTextCSV, ostr, settings); -} - -void DataTypeLowCardinality::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - deserializeImpl(column, &IDataType::deserializeAsTextCSV, istr, settings); -} - -void DataTypeLowCardinality::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - serializeImpl(column, row_num, &IDataType::serializeAsText, ostr, settings); -} - 
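All of the per-row text paths in this hunk funnel through the `serializeImpl`/`deserializeImpl` helpers removed further down. A minimal sketch of that dispatch, mirroring the deleted helper; `serializeOneRow` is an illustrative name, not code from this patch:

```cpp
#include <Columns/ColumnLowCardinality.h>
#include <DataTypes/IDataType.h>
#include <Formats/FormatSettings.h>
#include <IO/WriteBuffer.h>

using namespace DB;

/// Illustrative only: a LowCardinality row stores just a position into the shared
/// dictionary, so per-row formatting is delegated to the dictionary (nested) type.
static void serializeOneRow(const ColumnLowCardinality & col, size_t row_num,
                            const IDataType & dictionary_type,
                            WriteBuffer & ostr, const FormatSettings & settings)
{
    size_t unique_row = col.getIndexes().getUInt(row_num);
    dictionary_type.serializeAsText(*col.getDictionary().getNestedColumn(), unique_row, ostr, settings);
}
```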
-void DataTypeLowCardinality::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - serializeImpl(column, row_num, &IDataType::serializeAsTextJSON, ostr, settings); -} -void DataTypeLowCardinality::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - deserializeImpl(column, &IDataType::deserializeAsTextJSON, istr, settings); -} - -void DataTypeLowCardinality::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - serializeImpl(column, row_num, &IDataType::serializeAsTextXML, ostr, settings); -} - -template -void DataTypeLowCardinality::serializeImpl( - const IColumn & column, size_t row_num, DataTypeLowCardinality::SerializeFunctionPtr func, Args &&... args) const -{ - const auto & low_cardinality_column = getColumnLowCardinality(column); - size_t unique_row_number = low_cardinality_column.getIndexes().getUInt(row_num); - (dictionary_type.get()->*func)(*low_cardinality_column.getDictionary().getNestedColumn(), unique_row_number, std::forward(args)...); -} - -template -void DataTypeLowCardinality::deserializeImpl( - IColumn & column, DataTypeLowCardinality::DeserializeFunctionPtr func, Args &&... args) const -{ - auto & low_cardinality_column= getColumnLowCardinality(column); - auto temp_column = low_cardinality_column.getDictionary().getNestedColumn()->cloneEmpty(); - - (dictionary_type.get()->*func)(*temp_column, std::forward(args)...); - - low_cardinality_column.insertFromFullColumn(*temp_column, 0); -} - namespace { template @@ -927,6 +136,11 @@ bool DataTypeLowCardinality::equals(const IDataType & rhs) const return dictionary_type->equals(*low_cardinality_rhs.dictionary_type); } +SerializationPtr DataTypeLowCardinality::doGetDefaultSerialization() const +{ + return std::make_shared(dictionary_type); +} + static DataTypePtr create(const ASTPtr & arguments) { diff --git a/src/DataTypes/DataTypeLowCardinality.h b/src/DataTypes/DataTypeLowCardinality.h index f5b6b5711873d301e0946607b018c8b33a1e587b..1266174c6d6afe24761c03a82b675d26bbefd5c7 100644 --- a/src/DataTypes/DataTypeLowCardinality.h +++ b/src/DataTypes/DataTypeLowCardinality.h @@ -24,50 +24,6 @@ public: const char * getFamilyName() const override { return "LowCardinality"; } TypeIndex getTypeId() const override { return TypeIndex::LowCardinality; } - void enumerateStreamsImpl(const StreamCallback & callback, SubstreamPath & path) const override; - - void serializeBinaryBulkStatePrefixImpl( - SerializeBinaryBulkSettings & settings, - SerializeBinaryBulkStatePtr & state) const override; - - void serializeBinaryBulkStateSuffixImpl( - SerializeBinaryBulkSettings & settings, - SerializeBinaryBulkStatePtr & state) const override; - - void deserializeBinaryBulkStatePrefixImpl( - DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state) const override; - - void serializeBinaryBulkWithMultipleStreamsImpl( - const IColumn & column, - size_t offset, - size_t limit, - SerializeBinaryBulkSettings & settings, - SerializeBinaryBulkStatePtr & state) const override; - - void deserializeBinaryBulkWithMultipleStreamsImpl( - IColumn & column, - size_t limit, - DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state, - SubstreamsCache * cache) const override; - - void serializeBinary(const Field & field, WriteBuffer & ostr) const override; - void deserializeBinary(Field & field, ReadBuffer & istr) const override; - void 
serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const override; - void deserializeBinary(IColumn & column, ReadBuffer & istr) const override; - void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; - void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; - void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; - void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; - void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; - void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; - void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; - void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; - void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; - void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; - void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; - MutableColumnPtr createColumn() const override; Field getDefault() const override; @@ -100,6 +56,7 @@ public: static MutableColumnUniquePtr createColumnUnique(const IDataType & keys_type, MutableColumnPtr && keys); private: + SerializationPtr doGetDefaultSerialization() const override; template using SerializeFunctionPtr = void (IDataType::*)(const IColumn &, size_t, Params ...) 
const; diff --git a/src/DataTypes/DataTypeMap.cpp b/src/DataTypes/DataTypeMap.cpp index 0fe479ae373487fc4be1fb7e187b7b2225b2fc20..1d58076136257f8b9889d4b580ab8c267c231ead 100644 --- a/src/DataTypes/DataTypeMap.cpp +++ b/src/DataTypes/DataTypeMap.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -28,7 +29,6 @@ namespace DB namespace ErrorCodes { extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; - extern const int CANNOT_READ_MAP_FROM_TEXT; extern const int BAD_ARGUMENTS; } @@ -75,11 +75,6 @@ static const IColumn & extractNestedColumn(const IColumn & column) return assert_cast(column).getNestedColumn(); } -static IColumn & extractNestedColumn(IColumn & column) -{ - return assert_cast(column).getNestedColumn(); -} - DataTypePtr DataTypeMap::tryGetSubcolumnType(const String & subcolumn_name) const { return nested->tryGetSubcolumnType(subcolumn_name); @@ -90,265 +85,10 @@ ColumnPtr DataTypeMap::getSubcolumn(const String & subcolumn_name, const IColumn return nested->getSubcolumn(subcolumn_name, extractNestedColumn(column)); } -void DataTypeMap::serializeBinary(const Field & field, WriteBuffer & ostr) const -{ - const auto & map = get(field); - writeVarUInt(map.size(), ostr); - for (const auto & elem : map) - { - const auto & tuple = elem.safeGet(); - assert(tuple.size() == 2); - key_type->serializeBinary(tuple[0], ostr); - value_type->serializeBinary(tuple[1], ostr); - } -} - -void DataTypeMap::deserializeBinary(Field & field, ReadBuffer & istr) const -{ - size_t size; - readVarUInt(size, istr); - field = Map(size); - for (auto & elem : field.get()) - { - Tuple tuple(2); - key_type->deserializeBinary(tuple[0], istr); - value_type->deserializeBinary(tuple[1], istr); - elem = std::move(tuple); - } -} - -void DataTypeMap::serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const -{ - nested->serializeBinary(extractNestedColumn(column), row_num, ostr); -} - -void DataTypeMap::deserializeBinary(IColumn & column, ReadBuffer & istr) const -{ - nested->deserializeBinary(extractNestedColumn(column), istr); -} - - -template -void DataTypeMap::serializeTextImpl(const IColumn & column, size_t row_num, WriteBuffer & ostr, Writer && writer) const -{ - const auto & column_map = assert_cast(column); - - const auto & nested_array = column_map.getNestedColumn(); - const auto & nested_tuple = column_map.getNestedData(); - const auto & offsets = nested_array.getOffsets(); - - size_t offset = offsets[row_num - 1]; - size_t next_offset = offsets[row_num]; - - writeChar('{', ostr); - for (size_t i = offset; i < next_offset; ++i) - { - if (i != offset) - writeChar(',', ostr); - writer(key_type, nested_tuple.getColumn(0), i); - writeChar(':', ostr); - writer(value_type, nested_tuple.getColumn(1), i); - } - writeChar('}', ostr); -} - -template -void DataTypeMap::deserializeTextImpl(IColumn & column, ReadBuffer & istr, bool need_safe_get_int_key, Reader && reader) const -{ - auto & column_map = assert_cast(column); - - auto & nested_array = column_map.getNestedColumn(); - auto & nested_tuple = column_map.getNestedData(); - auto & offsets = nested_array.getOffsets(); - - auto & key_column = nested_tuple.getColumn(0); - auto & value_column = nested_tuple.getColumn(1); - - size_t size = 0; - assertChar('{', istr); - - try - { - bool first = true; - while (!istr.eof() && *istr.position() != '}') - { - if (!first) - { - if (*istr.position() == ',') - ++istr.position(); - else - throw Exception("Cannot read Map from text", ErrorCodes::CANNOT_READ_MAP_FROM_TEXT); - } 
- - first = false; - - skipWhitespaceIfAny(istr); - - if (*istr.position() == '}') - break; - - if (need_safe_get_int_key) - { - ReadBuffer::Position tmp = istr.position(); - while (*tmp != ':' && *tmp != '}') - ++tmp; - *tmp = ' '; - reader(key_type, key_column); - } - else - { - reader(key_type, key_column); - skipWhitespaceIfAny(istr); - assertChar(':', istr); - } - - ++size; - skipWhitespaceIfAny(istr); - reader(value_type, value_column); - - skipWhitespaceIfAny(istr); - } - - offsets.push_back(offsets.back() + size); - assertChar('}', istr); - } - catch (...) - { - throw; - } -} - -void DataTypeMap::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - serializeTextImpl(column, row_num, ostr, - [&](const DataTypePtr & subcolumn_type, const IColumn & subcolumn, size_t pos) - { - subcolumn_type->serializeAsTextQuoted(subcolumn, pos, ostr, settings); - }); -} - -void DataTypeMap::deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - // need_safe_get_int_key is set for Integer to prevent to readIntTextUnsafe - bool need_safe_get_int_key = isInteger(key_type); - - deserializeTextImpl(column, istr, need_safe_get_int_key, - [&](const DataTypePtr & subcolumn_type, IColumn & subcolumn) - { - subcolumn_type->deserializeAsTextQuoted(subcolumn, istr, settings); - }); -} - - -void DataTypeMap::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - serializeTextImpl(column, row_num, ostr, - [&](const DataTypePtr & subcolumn_type, const IColumn & subcolumn, size_t pos) - { - subcolumn_type->serializeAsTextJSON(subcolumn, pos, ostr, settings); - }); -} - -void DataTypeMap::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - // need_safe_get_int_key is set for Integer to prevent to readIntTextUnsafe - bool need_safe_get_int_key = isInteger(key_type); - - deserializeTextImpl(column, istr, need_safe_get_int_key, - [&](const DataTypePtr & subcolumn_type, IColumn & subcolumn) - { - subcolumn_type->deserializeAsTextJSON(subcolumn, istr, settings); - }); -} - -void DataTypeMap::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - const auto & column_map = assert_cast(column); - const auto & offsets = column_map.getNestedColumn().getOffsets(); - - size_t offset = offsets[row_num - 1]; - size_t next_offset = offsets[row_num]; - - const auto & nested_data = column_map.getNestedData(); - - writeCString("", ostr); - for (size_t i = offset; i < next_offset; ++i) - { - writeCString("", ostr); - writeCString("", ostr); - key_type->serializeAsTextXML(nested_data.getColumn(0), i, ostr, settings); - writeCString("", ostr); - - writeCString("", ostr); - value_type->serializeAsTextXML(nested_data.getColumn(1), i, ostr, settings); - writeCString("", ostr); - writeCString("", ostr); - } - writeCString("", ostr); -} - -void DataTypeMap::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - WriteBufferFromOwnString wb; - serializeText(column, row_num, wb, settings); - writeCSV(wb.str(), ostr); -} - -void DataTypeMap::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - String s; - readCSV(s, istr, settings.csv); - ReadBufferFromString rb(s); - deserializeText(column, rb, settings); -} - - -void 
DataTypeMap::enumerateStreamsImpl(const StreamCallback & callback, SubstreamPath & path) const
-{
-    nested->enumerateStreams(callback, path);
-}
-
-void DataTypeMap::serializeBinaryBulkStatePrefixImpl(
-    SerializeBinaryBulkSettings & settings,
-    SerializeBinaryBulkStatePtr & state) const
-{
-    nested->serializeBinaryBulkStatePrefix(settings, state);
-}
-
-void DataTypeMap::serializeBinaryBulkStateSuffixImpl(
-    SerializeBinaryBulkSettings & settings,
-    SerializeBinaryBulkStatePtr & state) const
-{
-    nested->serializeBinaryBulkStateSuffix(settings, state);
-}
-
-void DataTypeMap::deserializeBinaryBulkStatePrefixImpl(
-    DeserializeBinaryBulkSettings & settings,
-    DeserializeBinaryBulkStatePtr & state) const
+SerializationPtr DataTypeMap::getSubcolumnSerialization(
+    const String & subcolumn_name, const BaseSerializationGetter & base_serialization_getter) const
 {
-    nested->deserializeBinaryBulkStatePrefix(settings, state);
-}
-
-
-void DataTypeMap::serializeBinaryBulkWithMultipleStreamsImpl(
-    const IColumn & column,
-    size_t offset,
-    size_t limit,
-    SerializeBinaryBulkSettings & settings,
-    SerializeBinaryBulkStatePtr & state) const
-{
-    nested->serializeBinaryBulkWithMultipleStreams(extractNestedColumn(column), offset, limit, settings, state);
-}
-
-void DataTypeMap::deserializeBinaryBulkWithMultipleStreamsImpl(
-    IColumn & column,
-    size_t limit,
-    DeserializeBinaryBulkSettings & settings,
-    DeserializeBinaryBulkStatePtr & state,
-    SubstreamsCache * cache) const
-{
-    auto & column_map = assert_cast<ColumnMap &>(column);
-    nested->deserializeBinaryBulkWithMultipleStreams(column_map.getNestedColumnPtr(), limit, settings, state, cache);
+    return nested->getSubcolumnSerialization(subcolumn_name, base_serialization_getter);
 }
 
 MutableColumnPtr DataTypeMap::createColumn() const
@@ -361,6 +101,14 @@ Field DataTypeMap::getDefault() const
     return Map();
 }
 
+SerializationPtr DataTypeMap::doGetDefaultSerialization() const
+{
+    return std::make_shared<SerializationMap>(
+        key_type->getDefaultSerialization(),
+        value_type->getDefaultSerialization(),
+        nested->getDefaultSerialization());
+}
+
 bool DataTypeMap::equals(const IDataType & rhs) const
 {
     if (typeid(rhs) != typeid(*this))
diff --git a/src/DataTypes/DataTypeMap.h b/src/DataTypes/DataTypeMap.h
index 6bce79e6d0c225fdd2fe292469a36ecdde77e5c3..09b8448885a79ad8ab45cb17d432759bc07eb62a 100644
--- a/src/DataTypes/DataTypeMap.h
+++ b/src/DataTypes/DataTypeMap.h
@@ -1,6 +1,6 @@
 #pragma once
 
-#include <DataTypes/DataTypeWithSimpleSerialization.h>
+#include <DataTypes/IDataType.h>
 
 namespace DB
@@ -11,7 +11,7 @@ namespace DB
  * Serialization of type 'Map(K, V)' is similar to serialization
  * of 'Array(Tuple(keys K, values V))' or in other words of 'Nested(keys K, values V)'.
*/ -class DataTypeMap final : public DataTypeWithSimpleSerialization +class DataTypeMap final : public IDataType { private: DataTypePtr key_type; @@ -34,47 +34,8 @@ public: DataTypePtr tryGetSubcolumnType(const String & subcolumn_name) const override; ColumnPtr getSubcolumn(const String & subcolumn_name, const IColumn & column) const override; - - void serializeBinary(const Field & field, WriteBuffer & ostr) const override; - void deserializeBinary(Field & field, ReadBuffer & istr) const override; - void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const override; - void deserializeBinary(IColumn & column, ReadBuffer & istr) const override; - void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - - void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - - void enumerateStreamsImpl(const StreamCallback & callback, SubstreamPath & path) const override; - - void serializeBinaryBulkStatePrefixImpl( - SerializeBinaryBulkSettings & settings, - SerializeBinaryBulkStatePtr & state) const override; - - void serializeBinaryBulkStateSuffixImpl( - SerializeBinaryBulkSettings & settings, - SerializeBinaryBulkStatePtr & state) const override; - - void deserializeBinaryBulkStatePrefixImpl( - DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state) const override; - - void serializeBinaryBulkWithMultipleStreamsImpl( - const IColumn & column, - size_t offset, - size_t limit, - SerializeBinaryBulkSettings & settings, - SerializeBinaryBulkStatePtr & state) const override; - - void deserializeBinaryBulkWithMultipleStreamsImpl( - IColumn & column, - size_t limit, - DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state, - SubstreamsCache * cache) const override; + SerializationPtr getSubcolumnSerialization( + const String & subcolumn_name, const BaseSerializationGetter & base_serialization_getter) const override; MutableColumnPtr createColumn() const override; @@ -88,16 +49,11 @@ public: const DataTypePtr & getKeyType() const { return key_type; } const DataTypePtr & getValueType() const { return value_type; } DataTypes getKeyValueTypes() const { return {key_type, value_type}; } - const DataTypePtr & getNestedType() const { return nested; } -private: - template - void serializeTextImpl(const IColumn & column, size_t row_num, WriteBuffer & ostr, Writer && writer) const; - - template - void deserializeTextImpl(IColumn & column, ReadBuffer & istr, bool need_safe_get_int_key, Reader && reader) const; + SerializationPtr doGetDefaultSerialization() const override; +private: void assertKeyType() const; }; diff --git a/src/DataTypes/DataTypeNested.cpp b/src/DataTypes/DataTypeNested.cpp index cfbfb4c17504e957070d9e658deeb494235a72fd..eba1bba5dfe1ac7e64b3205934f6597caaef99e2 100644 --- a/src/DataTypes/DataTypeNested.cpp +++ b/src/DataTypes/DataTypeNested.cpp @@ 
-57,7 +57,7 @@ static std::pair create(const ASTPtr & argum
     auto data_type = std::make_shared<DataTypeArray>(std::make_shared<DataTypeTuple>(nested_types, nested_names));
     auto custom_name = std::make_unique<DataTypeNestedCustomName>(nested_types, nested_names);
 
-    return std::make_pair(std::move(data_type), std::make_unique<DataTypeCustomDesc>(std::move(custom_name), nullptr));
+    return std::make_pair(std::move(data_type), std::make_unique<DataTypeCustomDesc>(std::move(custom_name)));
 }
 
 void registerDataTypeNested(DataTypeFactory & factory)
diff --git a/src/DataTypes/DataTypeNested.h b/src/DataTypes/DataTypeNested.h
index 9fb12ad492470751c28e9813cab51690521e5400..1ad06477a6e957241c5ca586114be041ad1a4bd4 100644
--- a/src/DataTypes/DataTypeNested.h
+++ b/src/DataTypes/DataTypeNested.h
@@ -1,7 +1,6 @@
 #pragma once
 
-#include
-#include
+#include
 
 namespace DB
diff --git a/src/DataTypes/DataTypeNothing.cpp b/src/DataTypes/DataTypeNothing.cpp
index 94a7fd750714a2af1d3817b11f73df44813b7d6a..388a65754b558caae12d667d28feee879ec56de8 100644
--- a/src/DataTypes/DataTypeNothing.cpp
+++ b/src/DataTypes/DataTypeNothing.cpp
@@ -1,5 +1,6 @@
 #include
 #include
+#include <DataTypes/Serializations/SerializationNothing.h>
 #include
 #include
 #include
@@ -14,25 +15,14 @@ MutableColumnPtr DataTypeNothing::createColumn() const
     return ColumnNothing::create(0);
 }
 
-void DataTypeNothing::serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const
-{
-    size_t size = column.size();
-
-    if (limit == 0 || offset + limit > size)
-        limit = size - offset;
-
-    for (size_t i = 0; i < limit; ++i)
-        ostr.write('0');
-}
-
-void DataTypeNothing::deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double /*avg_value_size_hint*/) const
+bool DataTypeNothing::equals(const IDataType & rhs) const
 {
-    typeid_cast<ColumnNothing &>(column).addSize(istr.tryIgnore(limit));
+    return typeid(rhs) == typeid(*this);
 }
 
-bool DataTypeNothing::equals(const IDataType & rhs) const
+SerializationPtr DataTypeNothing::doGetDefaultSerialization() const
 {
-    return typeid(rhs) == typeid(*this);
+    return std::make_shared<SerializationNothing>();
 }
diff --git a/src/DataTypes/DataTypeNothing.h b/src/DataTypes/DataTypeNothing.h
index e9421fb15e85e929226536a0796a24807b23603e..c7d12388de9577ecf3837fa3275c134fd3751b26 100644
--- a/src/DataTypes/DataTypeNothing.h
+++ b/src/DataTypes/DataTypeNothing.h
@@ -20,10 +20,6 @@ public:
 
     MutableColumnPtr createColumn() const override;
 
-    /// These methods read and write zero bytes just to allow to figure out size of column.
- void serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const override; - void deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double avg_value_size_hint) const override; - bool equals(const IDataType & rhs) const override; bool isParametric() const override { return false; } @@ -31,6 +27,8 @@ public: bool haveMaximumSizeOfValue() const override { return true; } size_t getSizeOfValueInMemory() const override { return 0; } bool canBeInsideNullable() const override { return true; } + + SerializationPtr doGetDefaultSerialization() const override; }; } diff --git a/src/DataTypes/DataTypeNullable.cpp b/src/DataTypes/DataTypeNullable.cpp index 903ebeb3ddcdd68d0cdb27a31ab0b46afd034c4a..3820a320c6d08314033b4ac2f69b0494619f8bfc 100644 --- a/src/DataTypes/DataTypeNullable.cpp +++ b/src/DataTypes/DataTypeNullable.cpp @@ -2,7 +2,8 @@ #include #include #include -#include +#include +#include #include #include #include @@ -22,7 +23,6 @@ namespace DB namespace ErrorCodes { extern const int LOGICAL_ERROR; - extern const int CANNOT_READ_ALL_DATA; extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; extern const int ILLEGAL_TYPE_OF_ARGUMENT; } @@ -42,450 +42,6 @@ bool DataTypeNullable::onlyNull() const } -void DataTypeNullable::enumerateStreamsImpl(const StreamCallback & callback, SubstreamPath & path) const -{ - path.push_back(Substream::NullMap); - callback(path, *this); - path.back() = Substream::NullableElements; - nested_data_type->enumerateStreams(callback, path); - path.pop_back(); -} - - -void DataTypeNullable::serializeBinaryBulkStatePrefixImpl( - SerializeBinaryBulkSettings & settings, - SerializeBinaryBulkStatePtr & state) const -{ - settings.path.push_back(Substream::NullableElements); - nested_data_type->serializeBinaryBulkStatePrefix(settings, state); - settings.path.pop_back(); -} - - -void DataTypeNullable::serializeBinaryBulkStateSuffixImpl( - SerializeBinaryBulkSettings & settings, - SerializeBinaryBulkStatePtr & state) const -{ - settings.path.push_back(Substream::NullableElements); - nested_data_type->serializeBinaryBulkStateSuffix(settings, state); - settings.path.pop_back(); -} - - -void DataTypeNullable::deserializeBinaryBulkStatePrefixImpl( - DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state) const -{ - settings.path.push_back(Substream::NullableElements); - nested_data_type->deserializeBinaryBulkStatePrefix(settings, state); - settings.path.pop_back(); -} - - -void DataTypeNullable::serializeBinaryBulkWithMultipleStreamsImpl( - const IColumn & column, - size_t offset, - size_t limit, - SerializeBinaryBulkSettings & settings, - SerializeBinaryBulkStatePtr & state) const -{ - const ColumnNullable & col = assert_cast(column); - col.checkConsistency(); - - /// First serialize null map. - settings.path.push_back(Substream::NullMap); - if (auto * stream = settings.getter(settings.path)) - DataTypeUInt8().serializeBinaryBulk(col.getNullMapColumn(), *stream, offset, limit); - - /// Then serialize contents of arrays. 
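-    /// (Despite the wording, what is written here is the nested values column of the
-    /// Nullable; it stores a default value at every NULL position.)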
- settings.path.back() = Substream::NullableElements; - nested_data_type->serializeBinaryBulkWithMultipleStreams(col.getNestedColumn(), offset, limit, settings, state); - settings.path.pop_back(); -} - - -void DataTypeNullable::deserializeBinaryBulkWithMultipleStreamsImpl( - IColumn & column, - size_t limit, - DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state, - SubstreamsCache * cache) const -{ - ColumnNullable & col = assert_cast(column); - - settings.path.push_back(Substream::NullMap); - if (auto cached_column = getFromSubstreamsCache(cache, settings.path)) - { - col.getNullMapColumnPtr() = cached_column; - } - else if (auto * stream = settings.getter(settings.path)) - { - DataTypeUInt8().deserializeBinaryBulk(col.getNullMapColumn(), *stream, limit, 0); - addToSubstreamsCache(cache, settings.path, col.getNullMapColumnPtr()); - } - - settings.path.back() = Substream::NullableElements; - nested_data_type->deserializeBinaryBulkWithMultipleStreams(col.getNestedColumnPtr(), limit, settings, state, cache); - settings.path.pop_back(); -} - - -void DataTypeNullable::serializeBinary(const Field & field, WriteBuffer & ostr) const -{ - if (field.isNull()) - { - writeBinary(true, ostr); - } - else - { - writeBinary(false, ostr); - nested_data_type->serializeBinary(field, ostr); - } -} - -void DataTypeNullable::deserializeBinary(Field & field, ReadBuffer & istr) const -{ - bool is_null = false; - readBinary(is_null, istr); - if (!is_null) - { - nested_data_type->deserializeBinary(field, istr); - } - else - { - field = Null(); - } -} - -void DataTypeNullable::serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const -{ - const ColumnNullable & col = assert_cast(column); - - bool is_null = col.isNullAt(row_num); - writeBinary(is_null, ostr); - if (!is_null) - nested_data_type->serializeBinary(col.getNestedColumn(), row_num, ostr); -} - -/// Deserialize value into ColumnNullable. -/// We need to insert both to nested column and to null byte map, or, in case of exception, to not insert at all. -template , ReturnType>* = nullptr> -static ReturnType safeDeserialize( - IColumn & column, const IDataType & /*nested_data_type*/, - CheckForNull && check_for_null, DeserializeNested && deserialize_nested) -{ - ColumnNullable & col = assert_cast(column); - - if (check_for_null()) - { - col.insertDefault(); - } - else - { - deserialize_nested(col.getNestedColumn()); - - try - { - col.getNullMapData().push_back(0); - } - catch (...) - { - col.getNestedColumn().popBack(1); - throw; - } - } -} - -/// Deserialize value into non-nullable column. In case of NULL, insert default value and return false. 
-template , ReturnType>* = nullptr> -static ReturnType safeDeserialize( - IColumn & column, const IDataType & nested_data_type, - CheckForNull && check_for_null, DeserializeNested && deserialize_nested) -{ - assert(!dynamic_cast(&column)); - assert(!dynamic_cast(&nested_data_type)); - bool insert_default = check_for_null(); - if (insert_default) - nested_data_type.insertDefaultInto(column); - else - deserialize_nested(column); - return !insert_default; -} - - -void DataTypeNullable::deserializeBinary(IColumn & column, ReadBuffer & istr) const -{ - safeDeserialize(column, *nested_data_type, - [&istr] { bool is_null = false; readBinary(is_null, istr); return is_null; }, - [this, &istr] (IColumn & nested) { nested_data_type->deserializeBinary(nested, istr); }); -} - - -void DataTypeNullable::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - const ColumnNullable & col = assert_cast(column); - - if (col.isNullAt(row_num)) - writeString(settings.tsv.null_representation, ostr); - else - nested_data_type->serializeAsTextEscaped(col.getNestedColumn(), row_num, ostr, settings); -} - - -void DataTypeNullable::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - deserializeTextEscaped(column, istr, settings, nested_data_type); -} - -template -ReturnType DataTypeNullable::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, - const DataTypePtr & nested_data_type) -{ - /// Little tricky, because we cannot discriminate null from first character. - - if (istr.eof()) - throw ParsingException("Unexpected end of stream, while parsing value of Nullable type", ErrorCodes::CANNOT_READ_ALL_DATA); - - /// This is not null, surely. - if (*istr.position() != '\\') - { - return safeDeserialize(column, *nested_data_type, - [] { return false; }, - [&nested_data_type, &istr, &settings] (IColumn & nested) { nested_data_type->deserializeAsTextEscaped(nested, istr, settings); }); - } - else - { - /// Now we know, that data in buffer starts with backslash. - ++istr.position(); - - if (istr.eof()) - throw ParsingException("Unexpected end of stream, while parsing value of Nullable type, after backslash", ErrorCodes::CANNOT_READ_ALL_DATA); - - return safeDeserialize(column, *nested_data_type, - [&istr] - { - if (*istr.position() == 'N') - { - ++istr.position(); - return true; - } - return false; - }, - [&nested_data_type, &istr, &settings] (IColumn & nested) - { - if (istr.position() != istr.buffer().begin()) - { - /// We could step back to consume backslash again. - --istr.position(); - nested_data_type->deserializeAsTextEscaped(nested, istr, settings); - } - else - { - /// Otherwise, we need to place backslash back in front of istr. - ReadBufferFromMemory prefix("\\", 1); - ConcatReadBuffer prepended_istr(prefix, istr); - - nested_data_type->deserializeAsTextEscaped(nested, prepended_istr, settings); - - /// Synchronise cursor position in original buffer. 
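-                    /// count() == 1 would mean only the prepended backslash was consumed,
-                    /// so istr has not advanced and its position must stay unchanged.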
-
-
-void DataTypeNullable::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
-{
-    const ColumnNullable & col = assert_cast<const ColumnNullable &>(column);
-
-    if (col.isNullAt(row_num))
-        writeString(settings.tsv.null_representation, ostr);
-    else
-        nested_data_type->serializeAsTextEscaped(col.getNestedColumn(), row_num, ostr, settings);
-}
-
-
-void DataTypeNullable::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
-{
-    deserializeTextEscaped(column, istr, settings, nested_data_type);
-}
-
-template <typename ReturnType>
-ReturnType DataTypeNullable::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings,
-    const DataTypePtr & nested_data_type)
-{
-    /// Little tricky, because we cannot discriminate null from first character.
-
-    if (istr.eof())
-        throw ParsingException("Unexpected end of stream, while parsing value of Nullable type", ErrorCodes::CANNOT_READ_ALL_DATA);
-
-    /// This is not null, surely.
-    if (*istr.position() != '\\')
-    {
-        return safeDeserialize<ReturnType>(column, *nested_data_type,
-            [] { return false; },
-            [&nested_data_type, &istr, &settings] (IColumn & nested) { nested_data_type->deserializeAsTextEscaped(nested, istr, settings); });
-    }
-    else
-    {
-        /// Now we know, that data in buffer starts with backslash.
-        ++istr.position();
-
-        if (istr.eof())
-            throw ParsingException("Unexpected end of stream, while parsing value of Nullable type, after backslash", ErrorCodes::CANNOT_READ_ALL_DATA);
-
-        return safeDeserialize<ReturnType>(column, *nested_data_type,
-            [&istr]
-            {
-                if (*istr.position() == 'N')
-                {
-                    ++istr.position();
-                    return true;
-                }
-                return false;
-            },
-            [&nested_data_type, &istr, &settings] (IColumn & nested)
-            {
-                if (istr.position() != istr.buffer().begin())
-                {
-                    /// We could step back to consume backslash again.
-                    --istr.position();
-                    nested_data_type->deserializeAsTextEscaped(nested, istr, settings);
-                }
-                else
-                {
-                    /// Otherwise, we need to place backslash back in front of istr.
-                    ReadBufferFromMemory prefix("\\", 1);
-                    ConcatReadBuffer prepended_istr(prefix, istr);
-
-                    nested_data_type->deserializeAsTextEscaped(nested, prepended_istr, settings);
-
-                    /// Synchronise cursor position in original buffer.
-                    if (prepended_istr.count() > 1)
-                        istr.position() = prepended_istr.position();
-                }
-            });
-    }
-}
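When the escaped-text parser has already consumed the backslash but the buffer cannot step back (the backslash was the very first byte of the buffer), the code splices a one-byte in-memory buffer in front of the stream. ReadBufferFromMemory and ConcatReadBuffer are the real classes; the following toy standalone reader only mirrors the splice-two-sources concept:

#include <cassert>
#include <string>
#include <string_view>

/// Toy reader that drains `first`, then `second` - the same idea as
/// prepending a one-byte buffer with ConcatReadBuffer above.
struct ConcatReader
{
    std::string_view first, second;

    bool read(char & c)
    {
        if (!first.empty()) { c = first.front(); first.remove_prefix(1); return true; }
        if (!second.empty()) { c = second.front(); second.remove_prefix(1); return true; }
        return false;
    }
};

int main()
{
    /// A '\' was already consumed, but the value turned out to be an escaped
    /// literal rather than \N, so the backslash is put back "in front".
    ConcatReader reader{"\\", "tworld"};
    std::string out;
    char c;
    while (reader.read(c))
        out += c;
    assert(out == "\\tworld");
}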
-
-void DataTypeNullable::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
-{
-    const ColumnNullable & col = assert_cast<const ColumnNullable &>(column);
-
-    if (col.isNullAt(row_num))
-        writeCString("NULL", ostr);
-    else
-        nested_data_type->serializeAsTextQuoted(col.getNestedColumn(), row_num, ostr, settings);
-}
-
-
-void DataTypeNullable::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
-{
-    deserializeTextQuoted(column, istr, settings, nested_data_type);
-}
-
-template <typename ReturnType>
-ReturnType DataTypeNullable::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings,
-    const DataTypePtr & nested_data_type)
-{
-    return safeDeserialize<ReturnType>(column, *nested_data_type,
-        [&istr]
-        {
-            return checkStringByFirstCharacterAndAssertTheRestCaseInsensitive("NULL", istr);
-        },
-        [&nested_data_type, &istr, &settings] (IColumn & nested) { nested_data_type->deserializeAsTextQuoted(nested, istr, settings); });
-}
-
-
-void DataTypeNullable::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
-{
-    deserializeWholeText(column, istr, settings, nested_data_type);
-}
-
-template <typename ReturnType>
-ReturnType DataTypeNullable::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings,
-    const DataTypePtr & nested_data_type)
-{
-    return safeDeserialize<ReturnType>(column, *nested_data_type,
-        [&istr]
-        {
-            return checkStringByFirstCharacterAndAssertTheRestCaseInsensitive("NULL", istr)
-                || checkStringByFirstCharacterAndAssertTheRest("ᴺᵁᴸᴸ", istr);
-        },
-        [&nested_data_type, &istr, &settings] (IColumn & nested) { nested_data_type->deserializeAsWholeText(nested, istr, settings); });
-}
-
-
-void DataTypeNullable::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
-{
-    const ColumnNullable & col = assert_cast<const ColumnNullable &>(column);
-
-    if (col.isNullAt(row_num))
-        writeCString("\\N", ostr);
-    else
-        nested_data_type->serializeAsTextCSV(col.getNestedColumn(), row_num, ostr, settings);
-}
-
-void DataTypeNullable::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
-{
-    deserializeTextCSV(column, istr, settings, nested_data_type);
-}
-
-template <typename ReturnType>
-ReturnType DataTypeNullable::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings,
-    const DataTypePtr & nested_data_type)
-{
-    constexpr char const * null_literal = "NULL";
-    constexpr size_t len = 4;
-    size_t null_prefix_len = 0;
-
-    auto check_for_null = [&istr, &settings, &null_prefix_len]
-    {
-        if (checkStringByFirstCharacterAndAssertTheRest("\\N", istr))
-            return true;
-        if (!settings.csv.unquoted_null_literal_as_null)
-            return false;
-
-        /// Check for unquoted NULL
-        while (!istr.eof() && null_prefix_len < len && null_literal[null_prefix_len] == *istr.position())
-        {
-            ++null_prefix_len;
-            ++istr.position();
-        }
-        if (null_prefix_len == len)
-            return true;
-
-        /// Value and "NULL" have common prefix, but value is not "NULL".
-        /// Restore previous buffer position if possible.
-        if (null_prefix_len <= istr.offset())
-        {
-            istr.position() -= null_prefix_len;
-            null_prefix_len = 0;
-        }
-        return false;
-    };
-
-    auto deserialize_nested = [&nested_data_type, &settings, &istr, &null_prefix_len] (IColumn & nested)
-    {
-        if (likely(!null_prefix_len))
-            nested_data_type->deserializeAsTextCSV(nested, istr, settings);
-        else
-        {
-            /// Previous buffer position was not restored,
-            /// so we need to prepend extracted characters (rare case)
-            ReadBufferFromMemory prepend(null_literal, null_prefix_len);
-            ConcatReadBuffer buf(prepend, istr);
-            nested_data_type->deserializeAsTextCSV(nested, buf, settings);
-
-            /// Check if all extracted characters were read by nested parser and update buffer position
-            if (null_prefix_len < buf.count())
-                istr.position() = buf.position();
-            else if (null_prefix_len > buf.count())
-            {
-                /// It can happen only if there is an unquoted string instead of a number
-                /// or if someone uses 'U' or 'L' as delimiter in CSV.
-                /// In the first case we cannot continue reading anyway. The second case seems to be unlikely.
-                if (settings.csv.delimiter == 'U' || settings.csv.delimiter == 'L')
-                    throw DB::ParsingException("Enabled setting input_format_csv_unquoted_null_literal_as_null may not work correctly "
-                        "with format_csv_delimiter = 'U' or 'L' for large input.", ErrorCodes::CANNOT_READ_ALL_DATA);
-                WriteBufferFromOwnString parsed_value;
-                nested_data_type->serializeAsTextCSV(nested, nested.size() - 1, parsed_value, settings);
-                throw DB::ParsingException("Error while parsing \"" + std::string(null_literal, null_prefix_len)
-                    + std::string(istr.position(), std::min(size_t{10}, istr.available())) + "\" as Nullable(" + nested_data_type->getName()
-                    + ") at position " + std::to_string(istr.count()) + ": expected \"NULL\" or " + nested_data_type->getName()
-                    + ", got \"" + std::string(null_literal, buf.count()) + "\", which was deserialized as \""
-                    + parsed_value.str() + "\". It seems that input data is ill-formatted.",
-                    ErrorCodes::CANNOT_READ_ALL_DATA);
-            }
-        }
-    };
-
-    return safeDeserialize<ReturnType>(column, *nested_data_type, check_for_null, deserialize_nested);
-}
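The CSV branch consumes up to four characters while matching the unquoted NULL literal, and on a partial match it rolls the buffer back; only when rollback is impossible does it fall through to the prepend trick shown above. A standalone sketch of the match-then-rewind step (simplified: rewind is always possible here):

#include <cassert>
#include <cstddef>
#include <string_view>

/// Try to consume the literal "NULL" from `pos`. On a partial match,
/// roll `pos` back so the caller can parse the value normally instead.
static bool consumeNullLiteral(const char *& pos, const char * end)
{
    static constexpr std::string_view null_literal = "NULL";
    size_t matched = 0;
    while (pos != end && matched < null_literal.size() && *pos == null_literal[matched])
    {
        ++pos;
        ++matched;
    }
    if (matched == null_literal.size())
        return true;
    pos -= matched;   /// restore the previous position
    return false;
}

int main()
{
    std::string_view value = "NUM42";   /// shares the prefix "NU" with "NULL"
    const char * pos = value.data();
    assert(!consumeNullLiteral(pos, value.data() + value.size()));
    assert(pos == value.data());        /// fully rewound, "NUM42" parses as-is

    std::string_view null_value = "NULL";
    pos = null_value.data();
    assert(consumeNullLiteral(pos, null_value.data() + null_value.size()));
}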
-
-void DataTypeNullable::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
-{
-    const ColumnNullable & col = assert_cast<const ColumnNullable &>(column);
-
-    /// In simple text format (like 'Pretty' format) (these formats are suitable only for output and cannot be parsed back),
-    /// data is printed without escaping.
-    /// It makes theoretically impossible to distinguish between NULL and some string value, regardless on how do we print NULL.
-    /// For this reason, we output NULL in a bit strange way.
-    /// This assumes UTF-8 and proper font support. This is Ok, because Pretty formats are "presentational", not for data exchange.
-
-    if (col.isNullAt(row_num))
-    {
-        if (settings.pretty.charset == FormatSettings::Pretty::Charset::UTF8)
-            writeCString("ᴺᵁᴸᴸ", ostr);
-        else
-            writeCString("NULL", ostr);
-    }
-    else
-        nested_data_type->serializeAsText(col.getNestedColumn(), row_num, ostr, settings);
-}
-
-void DataTypeNullable::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
-{
-    const ColumnNullable & col = assert_cast<const ColumnNullable &>(column);
-
-    if (col.isNullAt(row_num))
-        writeCString("null", ostr);
-    else
-        nested_data_type->serializeAsTextJSON(col.getNestedColumn(), row_num, ostr, settings);
-}
-
-void DataTypeNullable::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
-{
-    deserializeTextJSON(column, istr, settings, nested_data_type);
-}
-
-template <typename ReturnType>
-ReturnType DataTypeNullable::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings,
-    const DataTypePtr & nested_data_type)
-{
-    return safeDeserialize<ReturnType>(column, *nested_data_type,
-        [&istr] { return checkStringByFirstCharacterAndAssertTheRest("null", istr); },
-        [&nested_data_type, &istr, &settings] (IColumn & nested) { nested_data_type->deserializeAsTextJSON(nested, istr, settings); });
-}
-
-void DataTypeNullable::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
-{
-    const ColumnNullable & col = assert_cast<const ColumnNullable &>(column);
-
-    if (col.isNullAt(row_num))
-        writeCString("\\N", ostr);
-    else
-        nested_data_type->serializeAsTextXML(col.getNestedColumn(), row_num, ostr, settings);
-}
-
 MutableColumnPtr DataTypeNullable::createColumn() const
 {
     return ColumnNullable::create(nested_data_type->createColumn(), ColumnUInt8::create());
@@ -510,7 +66,7 @@ bool DataTypeNullable::equals(const IDataType & rhs) const
 DataTypePtr DataTypeNullable::tryGetSubcolumnType(const String & subcolumn_name) const
 {
     if (subcolumn_name == "null")
-        return createOneElementTuple(std::make_shared<DataTypeUInt8>(), subcolumn_name, false);
+        return std::make_shared<DataTypeUInt8>();
 
     return nested_data_type->tryGetSubcolumnType(subcolumn_name);
 }
@@ -524,6 +80,20 @@ ColumnPtr DataTypeNullable::getSubcolumn(const String & subcolumn_name, const IC
     return nested_data_type->getSubcolumn(subcolumn_name, column_nullable.getNestedColumn());
 }
 
+SerializationPtr DataTypeNullable::getSubcolumnSerialization(
+    const String & subcolumn_name, const BaseSerializationGetter & base_serialization_getter) const
+{
+    if (subcolumn_name == "null")
+        return std::make_shared<SerializationTupleElement>(base_serialization_getter(DataTypeUInt8()), subcolumn_name, false);
+
+    return nested_data_type->getSubcolumnSerialization(subcolumn_name, base_serialization_getter);
+}
+
+SerializationPtr DataTypeNullable::doGetDefaultSerialization() const
+{
+    return std::make_shared<SerializationNullable>(nested_data_type->getDefaultSerialization());
+}
+
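After this change the `null` subcolumn of a Nullable column is typed as plain UInt8; the tuple-element naming that createOneElementTuple used to attach now lives only in the serialization returned by getSubcolumnSerialization. Conceptually a Nullable column is a nested column plus a byte mask, as in this simplified standalone model (not ClickHouse's actual ColumnNullable):

#include <cassert>
#include <cstdint>
#include <optional>
#include <string>
#include <vector>

/// A Nullable column is a nested column plus a UInt8 null map;
/// the "null" subcolumn is exactly that mask, typed as plain UInt8.
struct NullableStringColumn
{
    std::vector<std::string> nested;
    std::vector<uint8_t> null_map;   /// 1 = NULL, 0 = value present

    std::optional<std::string> at(size_t i) const
    {
        if (null_map[i])
            return std::nullopt;
        return nested[i];
    }
};

int main()
{
    NullableStringColumn col{{"a", "", "c"}, {0, 1, 0}};
    assert(!col.at(1).has_value());
    assert(col.at(2) == "c");    /// reading the "null" subcolumn would yield {0, 1, 0}
}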
 
 static DataTypePtr create(const ASTPtr & arguments)
 {
@@ -556,11 +126,4 @@ DataTypePtr removeNullable(const DataTypePtr & type)
     return type;
 }
 
-
-template bool DataTypeNullable::deserializeWholeText<bool>(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const DataTypePtr & nested);
-template bool DataTypeNullable::deserializeTextEscaped<bool>(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const DataTypePtr & nested);
-template bool DataTypeNullable::deserializeTextQuoted<bool>(IColumn & column, ReadBuffer & istr, const FormatSettings &, const DataTypePtr & nested);
-template bool DataTypeNullable::deserializeTextCSV<bool>(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const DataTypePtr & nested);
-template bool DataTypeNullable::deserializeTextJSON<bool>(IColumn & column, ReadBuffer & istr, const FormatSettings &, const DataTypePtr & nested);
-
 }
diff --git a/src/DataTypes/DataTypeNullable.h b/src/DataTypes/DataTypeNullable.h
index 5e71a1bee4def711627dc1b6a2057b51feb9a1b1..1557179d072212d4b08bab0d97d09c8dc4f9cdfe 100644
--- a/src/DataTypes/DataTypeNullable.h
+++ b/src/DataTypes/DataTypeNullable.h
@@ -18,61 +18,6 @@ public:
     const char * getFamilyName() const override { return "Nullable"; }
     TypeIndex getTypeId() const override { return TypeIndex::Nullable; }
 
-    void enumerateStreamsImpl(const StreamCallback & callback, SubstreamPath & path) const override;
-
-    void serializeBinaryBulkStatePrefixImpl(
-        SerializeBinaryBulkSettings & settings,
-        SerializeBinaryBulkStatePtr & state) const override;
-
-    void serializeBinaryBulkStateSuffixImpl(
-        SerializeBinaryBulkSettings & settings,
-        SerializeBinaryBulkStatePtr & state) const override;
-
-    void deserializeBinaryBulkStatePrefixImpl(
-        DeserializeBinaryBulkSettings & settings,
-        DeserializeBinaryBulkStatePtr & state) const override;
-
-    void serializeBinaryBulkWithMultipleStreamsImpl(
-        const IColumn & column,
-        size_t offset,
-        size_t limit,
-        SerializeBinaryBulkSettings & settings,
-        SerializeBinaryBulkStatePtr & state) const override;
-
-    void deserializeBinaryBulkWithMultipleStreamsImpl(
-        IColumn & column,
-        size_t limit,
-        DeserializeBinaryBulkSettings & settings,
-        DeserializeBinaryBulkStatePtr & state,
-        SubstreamsCache * cache) const override;
-
-    void serializeBinary(const Field & field, WriteBuffer & ostr) const override;
-    void deserializeBinary(Field & field, ReadBuffer & istr) const override;
-    void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const override;
-    void deserializeBinary(IColumn & column, ReadBuffer & istr) const override;
-    void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override;
-    void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override;
-    void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override;
-    void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override;
-    void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override;
-
-    void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override;
-
-    /** It is questionable, how NULL values could be represented in CSV. There are three variants:
-      * 1. \N
-      * 2. empty string (without quotes)
-      * 3. NULL
-      * We support all of them (however, second variant is supported by CSVRowInputStream, not by deserializeTextCSV).
-      * (see also input_format_defaults_for_omitted_fields and input_format_csv_unquoted_null_literal_as_null settings)
-      * In CSV, non-NULL string value, starting with \N characters, must be placed in quotes, to avoid ambiguity.
-      */
-    void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
-
-    void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override;
-    void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override;
-    void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override;
-    void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override;
-
     MutableColumnPtr createColumn() const override;
 
     Field getDefault() const override;
@@ -95,25 +40,16 @@ public:
     size_t getSizeOfValueInMemory() const override;
     bool onlyNull() const override;
     bool canBeInsideLowCardinality() const override { return nested_data_type->canBeInsideLowCardinality(); }
+
     DataTypePtr tryGetSubcolumnType(const String & subcolumn_name) const override;
     ColumnPtr getSubcolumn(const String & subcolumn_name, const IColumn & column) const override;
+    SerializationPtr getSubcolumnSerialization(
+        const String & subcolumn_name, const BaseSerializationGetter & base_serialization_getter) const override;
 
     const DataTypePtr & getNestedType() const { return nested_data_type; }
-
-    /// If ReturnType is bool, check for NULL and deserialize value into non-nullable column (and return true) or insert default value of nested type (and return false)
-    /// If ReturnType is void, deserialize Nullable(T)
-    template <typename ReturnType = bool>
-    static ReturnType deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const DataTypePtr & nested);
-    template <typename ReturnType = bool>
-    static ReturnType deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const DataTypePtr & nested);
-    template <typename ReturnType = bool>
-    static ReturnType deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &, const DataTypePtr & nested);
-    template <typename ReturnType = bool>
-    static ReturnType deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const DataTypePtr & nested);
-    template <typename ReturnType = bool>
-    static ReturnType deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &, const DataTypePtr & nested);
-
 private:
+    SerializationPtr doGetDefaultSerialization() const override;
+
     DataTypePtr nested_data_type;
 };
diff --git a/src/DataTypes/DataTypeNumberBase.cpp b/src/DataTypes/DataTypeNumberBase.cpp
index da603b7155a8e2e9c5e8f9236adaf3720f7e3c99..a9df7db73349ba012681539cb1f01146c2a0d803 100644
--- a/src/DataTypes/DataTypeNumberBase.cpp
+++ b/src/DataTypes/DataTypeNumberBase.cpp
@@ -13,196 +13,12 @@
 namespace DB
 {
 
-template <typename T>
-void DataTypeNumberBase<T>::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const
-{
-    writeText(assert_cast<const ColumnVector<T> &>(column).getData()[row_num], ostr);
-}
-
-template <typename T>
-void DataTypeNumberBase<T>::deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const
-{
-    T x;
-
-    if constexpr (is_integer_v<T> && is_arithmetic_v<T>)
-        readIntTextUnsafe(x, istr);
-    else
-        readText(x, istr);
-
-    assert_cast<ColumnVector<T> &>(column).getData().push_back(x);
-}
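The JSON path deleted further below quotes 64-bit integers when json.quote_64bit_integers is enabled, because consumers that map JSON numbers to IEEE-754 doubles silently lose precision above 2^53. A quick standalone demonstration of that loss:

#include <cstdint>
#include <iostream>

int main()
{
    /// Integers above 2^53 are not exactly representable as doubles,
    /// which is why large integers may be emitted as quoted strings in JSON.
    uint64_t x = (1ULL << 53) + 1;
    double d = static_cast<double>(x);
    std::cout << x << "\n";                          /// 9007199254740993
    std::cout << static_cast<uint64_t>(d) << "\n";   /// 9007199254740992
}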
-
-template <typename T>
-static inline void writeDenormalNumber(T x, WriteBuffer & ostr)
-{
-    if constexpr (std::is_floating_point_v<T>)
-    {
-        if (std::signbit(x))
-        {
-            if (isNaN(x))
-                writeCString("-nan", ostr);
-            else
-                writeCString("-inf", ostr);
-        }
-        else
-        {
-            if (isNaN(x))
-                writeCString("nan", ostr);
-            else
-                writeCString("inf", ostr);
-        }
-    }
-    else
-    {
-        /// This function is not called for non floating point numbers.
-        (void)x;
-    }
-}
-
-
-template <typename T>
-void DataTypeNumberBase<T>::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
-{
-    auto x = assert_cast<const ColumnVector<T> &>(column).getData()[row_num];
-    bool is_finite = isFinite(x);
-
-    const bool need_quote = (is_integer_v<T> && (sizeof(T) >= 8) && settings.json.quote_64bit_integers)
-        || (settings.json.quote_denormals && !is_finite);
-
-    if (need_quote)
-        writeChar('"', ostr);
-
-    if (is_finite)
-        writeText(x, ostr);
-    else if (!settings.json.quote_denormals)
-        writeCString("null", ostr);
-    else
-        writeDenormalNumber(x, ostr);
-
-    if (need_quote)
-        writeChar('"', ostr);
-}
-
-template <typename T>
-void DataTypeNumberBase<T>::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const
-{
-    bool has_quote = false;
-    if (!istr.eof() && *istr.position() == '"') /// We understand the number both in quotes and without.
-    {
-        has_quote = true;
-        ++istr.position();
-    }
-
-    FieldType x;
-
-    /// null
-    if (!has_quote && !istr.eof() && *istr.position() == 'n')
-    {
-        ++istr.position();
-        assertString("ull", istr);
-
-        x = NaNOrZero<FieldType>();
-    }
-    else
-    {
-        static constexpr bool is_uint8 = std::is_same_v<T, UInt8>;
-        static constexpr bool is_int8 = std::is_same_v<T, Int8>;
-
-        if (is_uint8 || is_int8)
-        {
-            // extra conditions to parse true/false strings into 1/0
-            if (istr.eof())
-                throwReadAfterEOF();
-            if (*istr.position() == 't' || *istr.position() == 'f')
-            {
-                bool tmp = false;
-                readBoolTextWord(tmp, istr);
-                x = tmp;
-            }
-            else
-                readText(x, istr);
-        }
-        else
-        {
-            readText(x, istr);
-        }
-
-        if (has_quote)
-            assertChar('"', istr);
-    }
-
-    assert_cast<ColumnVector<T> &>(column).getData().push_back(x);
-}
-
-template <typename T>
-void DataTypeNumberBase<T>::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const
-{
-    FieldType x;
-    readCSV(x, istr);
-    assert_cast<ColumnVector<T> &>(column).getData().push_back(x);
-}
-
 template <typename T>
 Field DataTypeNumberBase<T>::getDefault() const
 {
     return NearestFieldType<FieldType>();
 }
 
-template <typename T>
-void DataTypeNumberBase<T>::serializeBinary(const Field & field, WriteBuffer & ostr) const
-{
-    /// ColumnVector<T>::ValueType is a narrower type. For example, UInt8, when the Field type is UInt64
-    typename ColumnVector<T>::ValueType x = get<FieldType>(field);
-    writeBinary(x, ostr);
-}
-
-template <typename T>
-void DataTypeNumberBase<T>::deserializeBinary(Field & field, ReadBuffer & istr) const
-{
-    typename ColumnVector<T>::ValueType x;
-    readBinary(x, istr);
-    field = NearestFieldType<FieldType>(x);
-}
-
-template <typename T>
-void DataTypeNumberBase<T>::serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const
-{
-    writeBinary(assert_cast<const ColumnVector<T> &>(column).getData()[row_num], ostr);
-}
-
-template <typename T>
-void DataTypeNumberBase<T>::deserializeBinary(IColumn & column, ReadBuffer & istr) const
-{
-    typename ColumnVector<T>::ValueType x;
-    readBinary(x, istr);
-    assert_cast<ColumnVector<T> &>(column).getData().push_back(x);
-}
-
-template <typename T>
-void DataTypeNumberBase<T>::serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const
-{
-    const typename ColumnVector<T>::Container & x = typeid_cast<const ColumnVector<T> &>(column).getData();
-
-    size_t size = x.size();
-
-    if (limit == 0 || offset + limit > size)
-        limit = size - offset;
-
-    if (limit)
-        ostr.write(reinterpret_cast<const char *>(&x[offset]), sizeof(typename ColumnVector<T>::ValueType) * limit);
-}
-
-template <typename T>
-void DataTypeNumberBase<T>::deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double /*avg_value_size_hint*/) const
-{
-    typename ColumnVector<T>::Container & x = typeid_cast<ColumnVector<T> &>(column).getData();
-    size_t initial_size = x.size();
-    x.resize(initial_size + limit);
-    size_t size = istr.readBig(reinterpret_cast<char *>(&x[initial_size]), sizeof(typename ColumnVector<T>::ValueType) * limit);
-    x.resize(initial_size + size / sizeof(typename ColumnVector<T>::ValueType));
-}
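The binary bulk format for numbers, as the deleted serializeBinaryBulk/deserializeBinaryBulk show, is simply the contiguous ColumnVector storage: `limit` fixed-width values in native byte order, with no per-value framing. A standalone sketch of the same round trip:

#include <cassert>
#include <cstdint>
#include <cstring>
#include <vector>

int main()
{
    /// "Serialize": the column's contiguous buffer is written as raw bytes.
    std::vector<uint32_t> column = {1, 2, 3, 42};
    std::vector<char> stream(column.size() * sizeof(uint32_t));
    std::memcpy(stream.data(), column.data(), stream.size());

    /// "Deserialize": resize first, read into the tail, keep what was read.
    std::vector<uint32_t> restored;
    size_t initial_size = restored.size();
    restored.resize(initial_size + 4);
    std::memcpy(restored.data() + initial_size, stream.data(), stream.size());

    assert(restored == column);   /// format is platform-dependent (native endianness)
}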
-
-
 template <typename T>
 MutableColumnPtr DataTypeNumberBase<T>::createColumn() const
 {
diff --git a/src/DataTypes/DataTypeNumberBase.h b/src/DataTypes/DataTypeNumberBase.h
index 22a70ac727785c3856cf4235ef64912b0d48301b..97c3563b032cd97f770db147a5de2878637c3bac 100644
--- a/src/DataTypes/DataTypeNumberBase.h
+++ b/src/DataTypes/DataTypeNumberBase.h
@@ -2,7 +2,7 @@
 
 #include
 #include
-#include <DataTypes/DataTypeWithSimpleSerialization.h>
+#include <DataTypes/Serializations/SerializationNumber.h>
 
 
 namespace DB
@@ -14,7 +14,7 @@ class ColumnVector;
 
 /** Implements part of the IDataType interface, common to all numbers and for Date and DateTime.
   */
 template <typename T>
-class DataTypeNumberBase : public DataTypeWithSimpleSerialization
+class DataTypeNumberBase : public IDataType
 {
     static_assert(IsNumber<T>);
 
@@ -30,21 +30,8 @@ public:
     const char * getFamilyName() const override { return family_name; }
     TypeIndex getTypeId() const override { return type_id; }
 
-    void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override;
-    void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
-    void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
-    void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override;
-    void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
 
     Field getDefault() const override;
 
-    /** Format is platform-dependent. */
-    void serializeBinary(const Field & field, WriteBuffer & ostr) const override;
-    void deserializeBinary(Field & field, ReadBuffer & istr) const override;
-    void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const override;
-    void deserializeBinary(IColumn & column, ReadBuffer & istr) const override;
-    void serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const override;
-    void deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double avg_value_size_hint) const override;
-
     MutableColumnPtr createColumn() const override;
 
     bool isParametric() const override { return false; }
@@ -53,7 +40,7 @@ public:
     bool shouldAlignRightInPrettyFormats() const override
     {
         /// Just a number, without customizations. Counterexample: IPv4.
-        return !custom_text_serialization;
+        return !custom_serialization;
     }
 
     bool textCanContainOnlyValidUTF8() const override { return true; }
@@ -66,6 +53,8 @@ public:
     size_t getSizeOfValueInMemory() const override { return sizeof(T); }
     bool isCategorial() const override { return isValueRepresentedByInteger(); }
     bool canBeInsideLowCardinality() const override { return true; }
+
+    SerializationPtr doGetDefaultSerialization() const override { return std::make_shared<SerializationNumber<T>>(); }
 };
 
 /// Prevent implicit template instantiation of DataTypeNumberBase for common numeric types
diff --git a/src/DataTypes/DataTypeOneElementTuple.cpp b/src/DataTypes/DataTypeOneElementTuple.cpp
deleted file mode 100644
index a41692203623d2986ac128f2b63667a4f78e9fcc..0000000000000000000000000000000000000000
--- a/src/DataTypes/DataTypeOneElementTuple.cpp
+++ /dev/null
@@ -1,112 +0,0 @@
-#include <DataTypes/DataTypeOneElementTuple.h>
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-
-namespace DB
-{
-
-namespace
-{
-
-/** Custom substreams representation for single subcolumn.
-  * It serializes/deserializes column as a nested type, but in that way
-  * if it was a named tuple with one element and a given name.
-  */
-class DataTypeOneElementTupleStreams : public IDataTypeCustomStreams
-{
-private:
-    DataTypePtr nested;
-    String name;
-    bool escape_delimiter;
-
-public:
-    DataTypeOneElementTupleStreams(const DataTypePtr & nested_, const String & name_, bool escape_delimiter_)
-        : nested(nested_), name(name_), escape_delimiter(escape_delimiter_) {}
-
-    void enumerateStreams(
-        const IDataType::StreamCallback & callback,
-        IDataType::SubstreamPath & path) const override
-    {
-        addToPath(path);
-        nested->enumerateStreams(callback, path);
-        path.pop_back();
-    }
-
-    void serializeBinaryBulkStatePrefix(
-        IDataType::SerializeBinaryBulkSettings & settings,
-        IDataType::SerializeBinaryBulkStatePtr & state) const override
-    {
-        addToPath(settings.path);
-        nested->serializeBinaryBulkStatePrefix(settings, state);
-        settings.path.pop_back();
-    }
-
-    void serializeBinaryBulkStateSuffix(
-        IDataType::SerializeBinaryBulkSettings & settings,
-        IDataType::SerializeBinaryBulkStatePtr & state) const override
-    {
-        addToPath(settings.path);
-        nested->serializeBinaryBulkStateSuffix(settings, state);
-        settings.path.pop_back();
-    }
-
-    void deserializeBinaryBulkStatePrefix(
-        IDataType::DeserializeBinaryBulkSettings & settings,
-        IDataType::DeserializeBinaryBulkStatePtr & state) const override
-    {
-        addToPath(settings.path);
-        nested->deserializeBinaryBulkStatePrefix(settings, state);
-        settings.path.pop_back();
-    }
-
-    void serializeBinaryBulkWithMultipleStreams(
-        const IColumn & column,
-        size_t offset,
-        size_t limit,
-        IDataType::SerializeBinaryBulkSettings & settings,
-        IDataType::SerializeBinaryBulkStatePtr & state) const override
-    {
-        addToPath(settings.path);
-        nested->serializeBinaryBulkWithMultipleStreams(column, offset, limit, settings, state);
-        settings.path.pop_back();
-    }
-
-    void deserializeBinaryBulkWithMultipleStreams(
-        ColumnPtr & column,
-        size_t limit,
-        IDataType::DeserializeBinaryBulkSettings & settings,
-        IDataType::DeserializeBinaryBulkStatePtr & state,
-        IDataType::SubstreamsCache * cache) const override
-    {
-        addToPath(settings.path);
-        nested->deserializeBinaryBulkWithMultipleStreams(column, limit, settings, state, cache);
-        settings.path.pop_back();
-    }
-
-private:
-    void addToPath(IDataType::SubstreamPath & path) const
-    {
-        path.push_back(IDataType::Substream::TupleElement);
-        path.back().tuple_element_name = name;
-        path.back().escape_tuple_delimiter = escape_delimiter;
-    }
-};
-
-}
-
-DataTypePtr createOneElementTuple(const DataTypePtr & type, const String & name, bool escape_delimiter)
-{
-    auto custom_desc = std::make_unique<DataTypeCustomDesc>(
-        std::make_unique<DataTypeCustomFixedName>(type->getName()), nullptr,
-        std::make_unique<DataTypeOneElementTupleStreams>(type, name, escape_delimiter));
-
-    return DataTypeFactory::instance().getCustom(std::move(custom_desc));
-}
-
-}
diff --git a/src/DataTypes/DataTypeOneElementTuple.h b/src/DataTypes/DataTypeOneElementTuple.h
deleted file mode 100644
index 03b0511ef4abc1f6ae49d11c5604b5e8e7dcf0e5..0000000000000000000000000000000000000000
--- a/src/DataTypes/DataTypeOneElementTuple.h
+++ /dev/null
@@ -1,10 +0,0 @@
-#pragma once
-
-#include <DataTypes/IDataType.h>
-
-namespace DB
-{
-
-DataTypePtr createOneElementTuple(const DataTypePtr & type, const String & name, bool escape_delimiter = true);
-
-}
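Every method of the deleted DataTypeOneElementTupleStreams follows one shape: push a named TupleElement onto the substream path, delegate to the nested type, pop. That is what made a subcolumn read its on-disk streams as if it were a one-element named tuple. A standalone RAII sketch of the push/delegate/pop pattern (names hypothetical):

#include <cassert>
#include <string>
#include <vector>

using SubstreamPath = std::vector<std::string>;

/// RAII helper: extend the substream path for the duration of a delegated
/// call, mirroring the push_back/pop_back pairs in every method above.
struct PathGuard
{
    SubstreamPath & path;
    PathGuard(SubstreamPath & path_, std::string element) : path(path_) { path.push_back(std::move(element)); }
    ~PathGuard() { path.pop_back(); }
};

static std::string streamName(const SubstreamPath & path)
{
    std::string name;
    for (const auto & elem : path)
        name += (name.empty() ? "" : ".") + elem;
    return name;
}

int main()
{
    SubstreamPath path;
    {
        PathGuard guard(path, "point");          /// tuple element name
        assert(streamName(path) == "point");     /// the nested type would enumerate its streams here
    }
    assert(path.empty());                        /// path restored on scope exit
}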
diff --git a/src/DataTypes/DataTypeString.cpp b/src/DataTypes/DataTypeString.cpp
index d760df5075d683924d509f9294779c8a7df70931..41ae578a70f472bace33f7e1d79df3c1f1f11a77 100644
--- a/src/DataTypes/DataTypeString.cpp
+++ b/src/DataTypes/DataTypeString.cpp
@@ -11,6 +11,7 @@
 #include
 #include
 #include
+#include <DataTypes/Serializations/SerializationString.h>
 #include
 #include
@@ -34,281 +35,6 @@ namespace ErrorCodes
     extern const int UNEXPECTED_AST_STRUCTURE;
 }
 
-
-void DataTypeString::serializeBinary(const Field & field, WriteBuffer & ostr) const
-{
-    const String & s = get<const String &>(field);
-    writeVarUInt(s.size(), ostr);
-    writeString(s, ostr);
-}
-
-
-void DataTypeString::deserializeBinary(Field & field, ReadBuffer & istr) const
-{
-    UInt64 size;
-    readVarUInt(size, istr);
-    field = String();
-    String & s = get<String &>(field);
-    s.resize(size);
-    istr.readStrict(s.data(), size);
-}
-
-
-void DataTypeString::serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const
-{
-    const StringRef & s = assert_cast<const ColumnString &>(column).getDataAt(row_num);
-    writeVarUInt(s.size, ostr);
-    writeString(s, ostr);
-}
-
-
-void DataTypeString::deserializeBinary(IColumn & column, ReadBuffer & istr) const
-{
-    ColumnString & column_string = assert_cast<ColumnString &>(column);
-    ColumnString::Chars & data = column_string.getChars();
-    ColumnString::Offsets & offsets = column_string.getOffsets();
-
-    UInt64 size;
-    readVarUInt(size, istr);
-
-    size_t old_chars_size = data.size();
-    size_t offset = old_chars_size + size + 1;
-    offsets.push_back(offset);
-
-    try
-    {
-        data.resize(offset);
-        istr.readStrict(reinterpret_cast<char *>(&data[offset - size - 1]), size);
-        data.back() = 0;
-    }
-    catch (...)
-    {
-        offsets.pop_back();
-        data.resize_assume_reserved(old_chars_size);
-        throw;
-    }
-}
-
-
-void DataTypeString::serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const
-{
-    const ColumnString & column_string = typeid_cast<const ColumnString &>(column);
-    const ColumnString::Chars & data = column_string.getChars();
-    const ColumnString::Offsets & offsets = column_string.getOffsets();
-
-    size_t size = column.size();
-    if (!size)
-        return;
-
-    size_t end = limit && offset + limit < size
-        ? offset + limit
-        : size;
-
-    if (offset == 0)
-    {
-        UInt64 str_size = offsets[0] - 1;
-        writeVarUInt(str_size, ostr);
-        ostr.write(reinterpret_cast<const char *>(data.data()), str_size);
-
-        ++offset;
-    }
-
-    for (size_t i = offset; i < end; ++i)
-    {
-        UInt64 str_size = offsets[i] - offsets[i - 1] - 1;
-        writeVarUInt(str_size, ostr);
-        ostr.write(reinterpret_cast<const char *>(&data[offsets[i - 1]]), str_size);
-    }
-}
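As the deleted serializeBinaryBulk shows, each string is framed on disk as a varint length followed by its bytes; the terminating zero byte that ColumnString keeps in memory is not written. A standalone sketch of the framing (assuming lengths under 128 so a one-byte varint suffices):

#include <cassert>
#include <cstddef>
#include <string>
#include <vector>

/// Each value is framed as <length><bytes>; the real code uses a LEB128-style
/// varint, this sketch assumes lengths below 128 so one byte is enough.
static void writeFramed(std::vector<char> & out, const std::string & s)
{
    assert(s.size() < 128);
    out.push_back(static_cast<char>(s.size()));
    out.insert(out.end(), s.begin(), s.end());
}

static std::string readFramed(const std::vector<char> & in, size_t & pos)
{
    size_t size = static_cast<unsigned char>(in[pos++]);
    std::string s(in.begin() + pos, in.begin() + pos + size);
    pos += size;
    return s;
}

int main()
{
    std::vector<char> stream;
    writeFramed(stream, "hello");
    writeFramed(stream, "");
    size_t pos = 0;
    assert(readFramed(stream, pos) == "hello");
    assert(readFramed(stream, pos).empty());
    assert(pos == stream.size());
}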
-
-
-template <int UNROLL_TIMES>
-static NO_INLINE void deserializeBinarySSE2(ColumnString::Chars & data, ColumnString::Offsets & offsets, ReadBuffer & istr, size_t limit)
-{
-    size_t offset = data.size();
-    for (size_t i = 0; i < limit; ++i)
-    {
-        if (istr.eof())
-            break;
-
-        UInt64 size;
-        readVarUInt(size, istr);
-
-        offset += size + 1;
-        offsets.push_back(offset);
-
-        data.resize(offset);
-
-        if (size)
-        {
-#ifdef __SSE2__
-            /// An optimistic branch in which more efficient copying is possible.
-            if (offset + 16 * UNROLL_TIMES <= data.capacity() && istr.position() + size + 16 * UNROLL_TIMES <= istr.buffer().end())
-            {
-                const __m128i * sse_src_pos = reinterpret_cast<const __m128i *>(istr.position());
-                const __m128i * sse_src_end = sse_src_pos + (size + (16 * UNROLL_TIMES - 1)) / 16 / UNROLL_TIMES * UNROLL_TIMES;
-                __m128i * sse_dst_pos = reinterpret_cast<__m128i *>(&data[offset - size - 1]);
-
-                while (sse_src_pos < sse_src_end)
-                {
-                    for (size_t j = 0; j < UNROLL_TIMES; ++j)
-                        _mm_storeu_si128(sse_dst_pos + j, _mm_loadu_si128(sse_src_pos + j));
-
-                    sse_src_pos += UNROLL_TIMES;
-                    sse_dst_pos += UNROLL_TIMES;
-                }
-
-                istr.position() += size;
-            }
-            else
-#endif
-            {
-                istr.readStrict(reinterpret_cast<char *>(&data[offset - size - 1]), size);
-            }
-        }
-
-        data[offset - 1] = 0;
-    }
-}
-
-
-void DataTypeString::deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double avg_value_size_hint) const
-{
-    ColumnString & column_string = typeid_cast<ColumnString &>(column);
-    ColumnString::Chars & data = column_string.getChars();
-    ColumnString::Offsets & offsets = column_string.getOffsets();
-
-    double avg_chars_size = 1; /// By default reserve only for empty strings.
-
-    if (avg_value_size_hint && avg_value_size_hint > sizeof(offsets[0]))
-    {
-        /// Randomly selected.
-        constexpr auto avg_value_size_hint_reserve_multiplier = 1.2;
-
-        avg_chars_size = (avg_value_size_hint - sizeof(offsets[0])) * avg_value_size_hint_reserve_multiplier;
-    }
-
-    size_t size_to_reserve = data.size() + std::ceil(limit * avg_chars_size);
-
-    /// Never reserve for too big size.
-    if (size_to_reserve < 256 * 1024 * 1024)
-    {
-        try
-        {
-            data.reserve(size_to_reserve);
-        }
-        catch (Exception & e)
-        {
-            e.addMessage(
-                "(avg_value_size_hint = " + toString(avg_value_size_hint)
-                + ", avg_chars_size = " + toString(avg_chars_size)
-                + ", limit = " + toString(limit) + ")");
-            throw;
-        }
-    }
-
-    offsets.reserve(offsets.size() + limit);
-
-    if (avg_chars_size >= 64)
-        deserializeBinarySSE2<4>(data, offsets, istr, limit);
-    else if (avg_chars_size >= 48)
-        deserializeBinarySSE2<3>(data, offsets, istr, limit);
-    else if (avg_chars_size >= 32)
-        deserializeBinarySSE2<2>(data, offsets, istr, limit);
-    else
-        deserializeBinarySSE2<1>(data, offsets, istr, limit);
-}
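The reservation above derives a per-string character estimate from avg_value_size_hint: subtract the 8-byte offset overhead, multiply by the 1.2 headroom factor, and cap the total reserve at 256 MiB. The same arithmetic, worked through standalone:

#include <cassert>
#include <cmath>
#include <cstdint>

int main()
{
    /// Same arithmetic as the reservation above: the hint covers both the
    /// offset (8 bytes per row) and the characters, with 20% headroom.
    double avg_value_size_hint = 24.0;
    double avg_chars_size = (avg_value_size_hint - sizeof(uint64_t)) * 1.2;   /// ~19.2 chars per string

    size_t limit = 1000;
    size_t size_to_reserve = static_cast<size_t>(std::ceil(limit * avg_chars_size));
    assert(size_to_reserve == 19200);
    assert(size_to_reserve < 256 * 1024 * 1024);   /// below the reserve cap
}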
-
-
-void DataTypeString::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const
-{
-    writeString(assert_cast<const ColumnString &>(column).getDataAt(row_num), ostr);
-}
-
-
-void DataTypeString::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const
-{
-    writeEscapedString(assert_cast<const ColumnString &>(column).getDataAt(row_num), ostr);
-}
-
-
-template <typename Reader>
-static inline void read(IColumn & column, Reader && reader)
-{
-    ColumnString & column_string = assert_cast<ColumnString &>(column);
-    ColumnString::Chars & data = column_string.getChars();
-    ColumnString::Offsets & offsets = column_string.getOffsets();
-    size_t old_chars_size = data.size();
-    size_t old_offsets_size = offsets.size();
-    try
-    {
-        reader(data);
-        data.push_back(0);
-        offsets.push_back(data.size());
-    }
-    catch (...)
-    {
-        offsets.resize_assume_reserved(old_offsets_size);
-        data.resize_assume_reserved(old_chars_size);
-        throw;
-    }
-}
-
-
-void DataTypeString::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const
-{
-    read(column, [&](ColumnString::Chars & data) { readStringInto(data, istr); });
-}
-
-
-void DataTypeString::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const
-{
-    read(column, [&](ColumnString::Chars & data) { readEscapedStringInto(data, istr); });
-}
-
-
-void DataTypeString::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const
-{
-    writeQuotedString(assert_cast<const ColumnString &>(column).getDataAt(row_num), ostr);
-}
-
-
-void DataTypeString::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const
-{
-    read(column, [&](ColumnString::Chars & data) { readQuotedStringInto<true>(data, istr); });
-}
-
-
-void DataTypeString::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
-{
-    writeJSONString(assert_cast<const ColumnString &>(column).getDataAt(row_num), ostr, settings);
-}
-
-
-void DataTypeString::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const
-{
-    read(column, [&](ColumnString::Chars & data) { readJSONStringInto(data, istr); });
-}
-
-
-void DataTypeString::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const
-{
-    writeXMLStringForTextElement(assert_cast<const ColumnString &>(column).getDataAt(row_num), ostr);
-}
-
-
-void DataTypeString::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const
-{
-    writeCSVString<>(assert_cast<const ColumnString &>(column).getDataAt(row_num), ostr);
-}
-
-
-void DataTypeString::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
-{
-    read(column, [&](ColumnString::Chars & data) { readCSVStringInto(data, istr, settings.csv); });
-}
-
-
 Field DataTypeString::getDefault() const
 {
     return String();
@@ -325,6 +51,11 @@ bool DataTypeString::equals(const IDataType & rhs) const
     return typeid(rhs) == typeid(*this);
 }
 
+SerializationPtr DataTypeString::doGetDefaultSerialization() const
+{
+    return std::make_shared<SerializationString>();
+}
+
 static DataTypePtr create(const ASTPtr & arguments)
 {
     if (arguments && !arguments->children.empty())
diff --git a/src/DataTypes/DataTypeString.h b/src/DataTypes/DataTypeString.h
index 7f8aa1fd0cf9fdd56fcc212051b12ca9f7b52a70..0fc38e9c6f0a9487ded783f6387d0bbb5cb63cbb 100644
--- a/src/DataTypes/DataTypeString.h
+++ b/src/DataTypes/DataTypeString.h
@@ -22,31 +22,6 @@ public:
 
     TypeIndex getTypeId() const override { return type_id; }
 
-    void serializeBinary(const Field & field, WriteBuffer & ostr) const override;
-    void deserializeBinary(Field & field, ReadBuffer & istr) const override;
-    void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const override;
-    void deserializeBinary(IColumn & column, ReadBuffer & istr) const override;
-
-    void serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const override;
-    void deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double avg_value_size_hint) const override;
-
-    void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override;
-    void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override;
-
-    void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override;
-    void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override;
-
-    void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override;
-    void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override;
-
-    void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override;
-    void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override;
-
-    void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override;
-
-    void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override;
-    void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override;
-
     MutableColumnPtr createColumn() const override;
 
     Field getDefault() const override;
@@ -61,6 +36,8 @@ public:
     bool isCategorial() const override { return true; }
     bool canBeInsideNullable() const override { return true; }
     bool canBeInsideLowCardinality() const override { return true; }
+
+    SerializationPtr doGetDefaultSerialization() const override;
 };
 
 }
diff --git a/src/DataTypes/DataTypeTuple.cpp b/src/DataTypes/DataTypeTuple.cpp
index 5d2050c09e98fd8d2fd9875d8c1f9e315a2f6a2b..b30efb163ab8b311712fc0c170633527669aa95c 100644
--- a/src/DataTypes/DataTypeTuple.cpp
+++ b/src/DataTypes/DataTypeTuple.cpp
@@ -5,7 +5,9 @@
 #include
 #include
 #include
-#include <DataTypes/DataTypeOneElementTuple.h>
+#include <DataTypes/Serializations/SerializationTuple.h>
+#include <DataTypes/Serializations/SerializationTupleElement.h>
+#include <DataTypes/NestedUtils.h>
 #include
 #include
 #include
@@ -29,7 +31,6 @@ namespace ErrorCodes
     extern const int BAD_ARGUMENTS;
     extern const int DUPLICATE_COLUMN;
     extern const int EMPTY_DATA_PASSED;
-    extern const int LOGICAL_ERROR;
     extern const int NOT_FOUND_COLUMN_IN_BLOCK;
     extern const int ILLEGAL_COLUMN;
     extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
@@ -113,32 +114,6 @@ static inline const IColumn & extractElementColumn(const IColumn & column, size_
     return assert_cast<const ColumnTuple &>(column).getColumn(idx);
 }
 
-
-void DataTypeTuple::serializeBinary(const Field & field, WriteBuffer & ostr) const
-{
-    const auto & tuple = get<const Tuple &>(field);
-    for (const auto idx_elem : ext::enumerate(elems))
-        idx_elem.second->serializeBinary(tuple[idx_elem.first], ostr);
-}
-
-void DataTypeTuple::deserializeBinary(Field & field, ReadBuffer & istr) const
-{
-    const size_t size = elems.size();
-
-    Tuple tuple(size);
-    for (const auto i : ext::range(0, size))
-        elems[i]->deserializeBinary(tuple[i], istr);
-
-    field = tuple;
-}
-
-void DataTypeTuple::serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const
-{
-    for (const auto idx_elem : ext::enumerate(elems))
-        idx_elem.second->serializeBinary(extractElementColumn(column, idx_elem.first), row_num, ostr);
-}
-
-/// Function must atomically insert values into tuple column
 template <typename F>
 static void addElementSafe(const DataTypes & elems, IColumn & column, F && impl)
 {
@@ -178,335 +153,6 @@ static void addElementSafe(const DataTypes & elems, IColumn & column, F && impl)
 }
 
-
-void DataTypeTuple::deserializeBinary(IColumn & column, ReadBuffer & istr) const
-{
-    addElementSafe(elems, column, [&]
-    {
-        for (const auto & i : ext::range(0, ext::size(elems)))
-            elems[i]->deserializeBinary(extractElementColumn(column, i), istr);
-    });
-}
-
-void DataTypeTuple::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
-{
-    writeChar('(', ostr);
-    for (const auto i : ext::range(0, ext::size(elems)))
-    {
-        if (i != 0)
-            writeChar(',', ostr);
-        elems[i]->serializeAsTextQuoted(extractElementColumn(column, i), row_num, ostr, settings);
-    }
-    writeChar(')', ostr);
-}
-
-void DataTypeTuple::deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
-{
-    const size_t size = elems.size();
-    assertChar('(', istr);
-
-    addElementSafe(elems, column, [&]
-    {
-        for (const auto i : ext::range(0, size))
-        {
-            skipWhitespaceIfAny(istr);
-            if (i != 0)
-            {
-                assertChar(',', istr);
-                skipWhitespaceIfAny(istr);
-            }
-            elems[i]->deserializeAsTextQuoted(extractElementColumn(column, i), istr, settings);
-        }
-
-        // Special format for one element tuple (1,)
-        if (1 == elems.size())
-        {
-            skipWhitespaceIfAny(istr);
-            // Allow both (1) and (1,)
-            checkChar(',', istr);
-        }
-
-        skipWhitespaceIfAny(istr);
-        assertChar(')', istr);
-    });
-}
-
-void DataTypeTuple::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
-{
-    if (settings.json.named_tuples_as_objects
-        && have_explicit_names)
-    {
-        writeChar('{', ostr);
-        for (const auto i : ext::range(0, ext::size(elems)))
-        {
-            if (i != 0)
-            {
-                writeChar(',', ostr);
-            }
-            writeJSONString(names[i], ostr, settings);
-            writeChar(':', ostr);
-            elems[i]->serializeAsTextJSON(extractElementColumn(column, i), row_num, ostr, settings);
-        }
-        writeChar('}', ostr);
-    }
-    else
-    {
-        writeChar('[', ostr);
-        for (const auto i : ext::range(0, ext::size(elems)))
-        {
-            if (i != 0)
-                writeChar(',', ostr);
-            elems[i]->serializeAsTextJSON(extractElementColumn(column, i), row_num, ostr, settings);
-        }
-        writeChar(']', ostr);
-    }
-}
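With json.named_tuples_as_objects set and explicit element names, a tuple row serializes as an object keyed by names rather than a positional array. A standalone sketch of the two output shapes produced by the deleted serializeTextJSON:

#include <cassert>
#include <string>
#include <vector>

struct Element { std::string name; std::string json_value; };

/// Render one tuple row either as {"name":value,...} (named tuples as
/// objects) or as the positional [value,...] form used otherwise.
static std::string renderTuple(const std::vector<Element> & elems, bool named_as_objects)
{
    std::string out;
    out += named_as_objects ? '{' : '[';
    for (size_t i = 0; i < elems.size(); ++i)
    {
        if (i != 0)
            out += ',';
        if (named_as_objects)
            out += "\"" + elems[i].name + "\":";
        out += elems[i].json_value;
    }
    out += named_as_objects ? '}' : ']';
    return out;
}

int main()
{
    std::vector<Element> row = {{"id", "1"}, {"name", "\"abc\""}};
    assert(renderTuple(row, true) == "{\"id\":1,\"name\":\"abc\"}");
    assert(renderTuple(row, false) == "[1,\"abc\"]");
}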
-
-void DataTypeTuple::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
-{
-    if (settings.json.named_tuples_as_objects
-        && have_explicit_names)
-    {
-        skipWhitespaceIfAny(istr);
-        assertChar('{', istr);
-        skipWhitespaceIfAny(istr);
-
-        addElementSafe(elems, column, [&]
-        {
-            // Require all elements but in arbitrary order.
-            for (auto i : ext::range(0, ext::size(elems)))
-            {
-                if (i > 0)
-                {
-                    skipWhitespaceIfAny(istr);
-                    assertChar(',', istr);
-                    skipWhitespaceIfAny(istr);
-                }
-
-                std::string name;
-                readDoubleQuotedString(name, istr);
-                skipWhitespaceIfAny(istr);
-                assertChar(':', istr);
-                skipWhitespaceIfAny(istr);
-
-                const size_t element_pos = getPositionByName(name);
-                auto & element_column = extractElementColumn(column, element_pos);
-                elems[element_pos]->deserializeAsTextJSON(element_column, istr, settings);
-            }
-
-            skipWhitespaceIfAny(istr);
-            assertChar('}', istr);
-        });
-    }
-    else
-    {
-        const size_t size = elems.size();
-        assertChar('[', istr);
-
-        addElementSafe(elems, column, [&]
-        {
-            for (const auto i : ext::range(0, size))
-            {
-                skipWhitespaceIfAny(istr);
-                if (i != 0)
-                {
-                    assertChar(',', istr);
-                    skipWhitespaceIfAny(istr);
-                }
-                elems[i]->deserializeAsTextJSON(extractElementColumn(column, i), istr, settings);
-            }
-
-            skipWhitespaceIfAny(istr);
-            assertChar(']', istr);
-        });
-    }
-}
-
-void DataTypeTuple::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
-{
-    writeCString("<tuple>", ostr);
-    for (const auto i : ext::range(0, ext::size(elems)))
-    {
-        writeCString("<elem>", ostr);
-        elems[i]->serializeAsTextXML(extractElementColumn(column, i), row_num, ostr, settings);
-        writeCString("</elem>", ostr);
-    }
-    writeCString("</tuple>", ostr);
-}
-
-void DataTypeTuple::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
-{
-    for (const auto i : ext::range(0, ext::size(elems)))
-    {
-        if (i != 0)
-            writeChar(',', ostr);
-        elems[i]->serializeAsTextCSV(extractElementColumn(column, i), row_num, ostr, settings);
-    }
-}
-
-void DataTypeTuple::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
-{
-    addElementSafe(elems, column, [&]
-    {
-        const size_t size = elems.size();
-        for (const auto i : ext::range(0, size))
-        {
-            if (i != 0)
-            {
-                skipWhitespaceIfAny(istr);
-                assertChar(settings.csv.delimiter, istr);
-                skipWhitespaceIfAny(istr);
-            }
-            elems[i]->deserializeAsTextCSV(extractElementColumn(column, i), istr, settings);
-        }
-    });
-}
-
-void DataTypeTuple::enumerateStreamsImpl(const StreamCallback & callback, SubstreamPath & path) const
-{
-    path.push_back(Substream::TupleElement);
-    for (const auto i : ext::range(0, ext::size(elems)))
-    {
-        path.back().tuple_element_name = names[i];
-        elems[i]->enumerateStreams(callback, path);
-    }
-    path.pop_back();
-}
-
-struct SerializeBinaryBulkStateTuple : public IDataType::SerializeBinaryBulkState
-{
-    std::vector<IDataType::SerializeBinaryBulkStatePtr> states;
-};
-
-struct DeserializeBinaryBulkStateTuple : public IDataType::DeserializeBinaryBulkState
-{
-    std::vector<IDataType::DeserializeBinaryBulkStatePtr> states;
-};
-
-static SerializeBinaryBulkStateTuple * checkAndGetTupleSerializeState(IDataType::SerializeBinaryBulkStatePtr & state)
-{
-    if (!state)
-        throw Exception("Got empty state for DataTypeTuple.", ErrorCodes::LOGICAL_ERROR);
-
-    auto * tuple_state = typeid_cast<SerializeBinaryBulkStateTuple *>(state.get());
-    if (!tuple_state)
-    {
-        auto & state_ref = *state;
-        throw Exception("Invalid SerializeBinaryBulkState for DataTypeTuple. Expected: "
-            + demangle(typeid(SerializeBinaryBulkStateTuple).name()) + ", got "
-            + demangle(typeid(state_ref).name()), ErrorCodes::LOGICAL_ERROR);
-    }
-
-    return tuple_state;
-}
-
-static DeserializeBinaryBulkStateTuple * checkAndGetTupleDeserializeState(IDataType::DeserializeBinaryBulkStatePtr & state)
-{
-    if (!state)
-        throw Exception("Got empty state for DataTypeTuple.", ErrorCodes::LOGICAL_ERROR);
-
-    auto * tuple_state = typeid_cast<DeserializeBinaryBulkStateTuple *>(state.get());
-    if (!tuple_state)
-    {
-        auto & state_ref = *state;
-        throw Exception("Invalid DeserializeBinaryBulkState for DataTypeTuple. Expected: "
-            + demangle(typeid(DeserializeBinaryBulkStateTuple).name()) + ", got "
-            + demangle(typeid(state_ref).name()), ErrorCodes::LOGICAL_ERROR);
-    }
-
-    return tuple_state;
-}
-
-void DataTypeTuple::serializeBinaryBulkStatePrefixImpl(
-    SerializeBinaryBulkSettings & settings,
-    SerializeBinaryBulkStatePtr & state) const
-{
-    auto tuple_state = std::make_shared<SerializeBinaryBulkStateTuple>();
-    tuple_state->states.resize(elems.size());
-
-    settings.path.push_back(Substream::TupleElement);
-    for (size_t i = 0; i < elems.size(); ++i)
-    {
-        settings.path.back().tuple_element_name = names[i];
-        elems[i]->serializeBinaryBulkStatePrefix(settings, tuple_state->states[i]);
-    }
-    settings.path.pop_back();
-
-    state = std::move(tuple_state);
-}
-
-void DataTypeTuple::serializeBinaryBulkStateSuffixImpl(
-    SerializeBinaryBulkSettings & settings,
-    SerializeBinaryBulkStatePtr & state) const
-{
-    auto * tuple_state = checkAndGetTupleSerializeState(state);
-
-    settings.path.push_back(Substream::TupleElement);
-    for (size_t i = 0; i < elems.size(); ++i)
-    {
-        settings.path.back().tuple_element_name = names[i];
-        elems[i]->serializeBinaryBulkStateSuffix(settings, tuple_state->states[i]);
-    }
-    settings.path.pop_back();
-}
-
-void DataTypeTuple::deserializeBinaryBulkStatePrefixImpl(
-    DeserializeBinaryBulkSettings & settings,
-    DeserializeBinaryBulkStatePtr & state) const
-{
-    auto tuple_state = std::make_shared<DeserializeBinaryBulkStateTuple>();
-    tuple_state->states.resize(elems.size());
-
-    settings.path.push_back(Substream::TupleElement);
-    for (size_t i = 0; i < elems.size(); ++i)
-    {
-        settings.path.back().tuple_element_name = names[i];
-        elems[i]->deserializeBinaryBulkStatePrefix(settings, tuple_state->states[i]);
-    }
-    settings.path.pop_back();
-
-    state = std::move(tuple_state);
-}
-
-void DataTypeTuple::serializeBinaryBulkWithMultipleStreamsImpl(
-    const IColumn & column,
-    size_t offset,
-    size_t limit,
-    SerializeBinaryBulkSettings & settings,
-    SerializeBinaryBulkStatePtr & state) const
-{
-    auto * tuple_state = checkAndGetTupleSerializeState(state);
-
-    settings.path.push_back(Substream::TupleElement);
-    for (const auto i : ext::range(0, ext::size(elems)))
-    {
-        settings.path.back().tuple_element_name = names[i];
-        const auto & element_col = extractElementColumn(column, i);
-        elems[i]->serializeBinaryBulkWithMultipleStreams(element_col, offset, limit, settings, tuple_state->states[i]);
-    }
-    settings.path.pop_back();
-}
-
-void DataTypeTuple::deserializeBinaryBulkWithMultipleStreamsImpl(
-    IColumn & column,
-    size_t limit,
-    DeserializeBinaryBulkSettings & settings,
-    DeserializeBinaryBulkStatePtr & state,
-    SubstreamsCache * cache) const
-{
-    auto * tuple_state = checkAndGetTupleDeserializeState(state);
-    auto & column_tuple = assert_cast<ColumnTuple &>(column);
-
-    settings.path.push_back(Substream::TupleElement);
-    settings.avg_value_size_hint = 0;
-    for (const auto i : ext::range(0, ext::size(elems)))
-    {
-        settings.path.back().tuple_element_name = names[i];
-        elems[i]->deserializeBinaryBulkWithMultipleStreams(column_tuple.getColumnPtr(i), limit, settings, tuple_state->states[i], cache);
-    }
-    settings.path.pop_back();
-}
-
 MutableColumnPtr DataTypeTuple::createColumn() const
 {
     size_t size = elems.size();
@@ -590,48 +236,99 @@ size_t DataTypeTuple::getSizeOfValueInMemory() const
     return res;
 }
 
-DataTypePtr DataTypeTuple::tryGetSubcolumnType(const String & subcolumn_name) const
+template <typename OnSuccess, typename OnContinue>
+auto DataTypeTuple::getSubcolumnEntity(const String & subcolumn_name,
+    const OnSuccess & on_success, const OnContinue & on_continue) const
 {
+    using ReturnType = decltype(on_success(0));
     for (size_t i = 0; i < names.size(); ++i)
     {
         if (startsWith(subcolumn_name, names[i]))
         {
             size_t name_length = names[i].size();
-            DataTypePtr subcolumn_type;
+
             if (subcolumn_name.size() == name_length)
-                subcolumn_type = elems[i];
-            else if (subcolumn_name[name_length] == '.')
-                subcolumn_type = elems[i]->tryGetSubcolumnType(subcolumn_name.substr(name_length + 1));
+                return on_success(i);
 
-            if (subcolumn_type)
-                return createOneElementTuple(std::move(subcolumn_type), names[i]);
+            if (subcolumn_name[name_length] == '.')
+                return on_continue(i, subcolumn_name.substr(name_length + 1));
         }
     }
 
-    return nullptr;
+    return ReturnType{};
+}
+
+DataTypePtr DataTypeTuple::tryGetSubcolumnType(const String & subcolumn_name) const
+{
+    if (subcolumn_name == MAIN_SUBCOLUMN_NAME)
+        return shared_from_this();
+
+    auto on_success = [&](size_t pos) { return elems[pos]; };
+    auto on_continue = [&](size_t pos, const String & next_subcolumn) { return elems[pos]->tryGetSubcolumnType(next_subcolumn); };
+
+    return getSubcolumnEntity(subcolumn_name, on_success, on_continue);
 }
 
 ColumnPtr DataTypeTuple::getSubcolumn(const String & subcolumn_name, const IColumn & column) const
 {
-    for (size_t i = 0; i < names.size(); ++i)
+    auto on_success = [&](size_t pos) { return extractElementColumn(column, pos).getPtr(); };
+    auto on_continue = [&](size_t pos, const String & next_subcolumn)
     {
-        if (startsWith(subcolumn_name, names[i]))
-        {
-            size_t name_length = names[i].size();
-            const auto & subcolumn = extractElementColumn(column, i);
+        return elems[pos]->getSubcolumn(next_subcolumn, extractElementColumn(column, pos));
+    };
 
-            if (subcolumn_name.size() == name_length)
-                return subcolumn.assumeMutable();
+    if (auto subcolumn = getSubcolumnEntity(subcolumn_name, on_success, on_continue))
+        return subcolumn;
 
-            if (subcolumn_name[name_length] == '.')
-                return elems[i]->getSubcolumn(subcolumn_name.substr(name_length + 1), subcolumn);
-        }
-    }
+    throw Exception(ErrorCodes::ILLEGAL_COLUMN, "There is no subcolumn {} in type {}", subcolumn_name, getName());
+}
+
+SerializationPtr DataTypeTuple::getSubcolumnSerialization(
+    const String & subcolumn_name, const BaseSerializationGetter & base_serialization_getter) const
+{
+    auto on_success = [&](size_t pos)
+    {
+        return std::make_shared<SerializationTupleElement>(base_serialization_getter(*elems[pos]), names[pos]);
+    };
+
+    auto on_continue = [&](size_t pos, const String & next_subcolumn)
+    {
+        auto next_serialization = elems[pos]->getSubcolumnSerialization(next_subcolumn, base_serialization_getter);
+        return std::make_shared<SerializationTupleElement>(next_serialization, names[pos]);
+    };
+
+    if (auto serialization = getSubcolumnEntity(subcolumn_name, on_success, on_continue))
+        return serialization;
 
     throw Exception(ErrorCodes::ILLEGAL_COLUMN, "There is no subcolumn {} in type {}", subcolumn_name, getName());
 }
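getSubcolumnEntity factors out the name-resolution walk shared by the three functions above: match an element name as a prefix of the requested subcolumn, then either stop there (on_success) or recurse past the dot (on_continue). A standalone sketch of that resolution for nested paths (simplified; the real code also handles escaping and ambiguity):

#include <cassert>
#include <string>
#include <vector>

struct Node
{
    std::string name;
    std::vector<Node> children;
};

/// Walk "elem" or "elem.rest": on a full match return the node, on a dot
/// continue resolution inside the matched element - the same
/// on_success / on_continue split as getSubcolumnEntity above.
static const Node * resolve(const std::vector<Node> & elems, const std::string & subcolumn)
{
    for (const auto & elem : elems)
    {
        if (subcolumn.compare(0, elem.name.size(), elem.name) == 0)
        {
            if (subcolumn.size() == elem.name.size())
                return &elem;                                                            /// on_success
            if (subcolumn[elem.name.size()] == '.')
                return resolve(elem.children, subcolumn.substr(elem.name.size() + 1));   /// on_continue
        }
    }
    return nullptr;
}

int main()
{
    std::vector<Node> tuple = {{"a", {{"b", {}}}}, {"c", {}}};
    assert(resolve(tuple, "a.b") != nullptr);
    assert(resolve(tuple, "c") != nullptr);
    assert(resolve(tuple, "x") == nullptr);
}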
serializations(elems.size()); + for (size_t i = 0; i < elems.size(); ++i) + { + auto serialization = elems[i]->getDefaultSerialization(); + serializations[i] = std::make_shared(serialization, names[i]); + } + + return std::make_shared(std::move(serializations), have_explicit_names); +} + +SerializationPtr DataTypeTuple::getSerialization(const String & column_name, const StreamExistenceCallback & callback) const +{ + SerializationTuple::ElementSerializations serializations(elems.size()); + for (size_t i = 0; i < elems.size(); ++i) + { + auto subcolumn_name = Nested::concatenateName(column_name, names[i]); + auto serializaion = elems[i]->getSerialization(subcolumn_name, callback); + serializations[i] = std::make_shared(serializaion, names[i]); + } + + return std::make_shared(std::move(serializations), have_explicit_names); +} + static DataTypePtr create(const ASTPtr & arguments) { if (!arguments || arguments->children.empty()) diff --git a/src/DataTypes/DataTypeTuple.h b/src/DataTypes/DataTypeTuple.h index 12ccf574c0e10d0ad89549dc851efcd9d91484e4..e572b23f987d37d20c8a9dbba4cf678b139840da 100644 --- a/src/DataTypes/DataTypeTuple.h +++ b/src/DataTypes/DataTypeTuple.h @@ -1,6 +1,6 @@ #pragma once -#include +#include namespace DB @@ -16,7 +16,7 @@ namespace DB * * All tuples with same size and types of elements are equivalent for expressions, regardless to names of elements. */ -class DataTypeTuple final : public DataTypeWithSimpleSerialization +class DataTypeTuple final : public IDataType { private: DataTypes elems; @@ -37,50 +37,6 @@ public: bool canBeInsideNullable() const override { return false; } - void serializeBinary(const Field & field, WriteBuffer & ostr) const override; - void deserializeBinary(Field & field, ReadBuffer & istr) const override; - void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const override; - void deserializeBinary(IColumn & column, ReadBuffer & istr) const override; - void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - - /// Tuples in CSV format will be serialized as separate columns (that is, losing their nesting in the tuple). - void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - - /** Each sub-column in a tuple is serialized in separate stream. 
- */ - void enumerateStreamsImpl(const StreamCallback & callback, SubstreamPath & path) const override; - - void serializeBinaryBulkStatePrefixImpl( - SerializeBinaryBulkSettings & settings, - SerializeBinaryBulkStatePtr & state) const override; - - void serializeBinaryBulkStateSuffixImpl( - SerializeBinaryBulkSettings & settings, - SerializeBinaryBulkStatePtr & state) const override; - - void deserializeBinaryBulkStatePrefixImpl( - DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state) const override; - - void serializeBinaryBulkWithMultipleStreamsImpl( - const IColumn & column, - size_t offset, - size_t limit, - SerializeBinaryBulkSettings & settings, - SerializeBinaryBulkStatePtr & state) const override; - - void deserializeBinaryBulkWithMultipleStreamsImpl( - IColumn & column, - size_t limit, - DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state, - SubstreamsCache * cache) const override; - MutableColumnPtr createColumn() const override; Field getDefault() const override; @@ -99,6 +55,13 @@ public: DataTypePtr tryGetSubcolumnType(const String & subcolumn_name) const override; ColumnPtr getSubcolumn(const String & subcolumn_name, const IColumn & column) const override; + SerializationPtr getSerialization(const String & column_name, const StreamExistenceCallback & callback) const override; + + SerializationPtr getSubcolumnSerialization( + const String & subcolumn_name, const BaseSerializationGetter & base_serialization_getter) const override; + + SerializationPtr doGetDefaultSerialization() const override; + const DataTypes & getElements() const { return elems; } const Strings & getElementNames() const { return names; } @@ -106,6 +69,11 @@ public: bool haveExplicitNames() const { return have_explicit_names; } bool serializeNames() const { return serialize_names; } + +private: + template + auto getSubcolumnEntity(const String & subcolumn_name, + const OnSuccess & on_success, const OnContinue & on_continue) const; }; } diff --git a/src/DataTypes/DataTypeUUID.cpp b/src/DataTypes/DataTypeUUID.cpp index b66cbadaef00833bc3c7e9e3bd8fbbf19b544886..387ccc56a71259e28bd5215543d3dd3881be45e7 100644 --- a/src/DataTypes/DataTypeUUID.cpp +++ b/src/DataTypes/DataTypeUUID.cpp @@ -1,87 +1,20 @@ #include #include -#include -#include -#include -#include +#include namespace DB { -void DataTypeUUID::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const -{ - writeText(UUID(assert_cast(column).getData()[row_num]), ostr); -} - -void DataTypeUUID::deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const -{ - UUID x; - readText(x, istr); - assert_cast(column).getData().push_back(x); -} - -void DataTypeUUID::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - deserializeText(column, istr, settings); -} - -void DataTypeUUID::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - serializeText(column, row_num, ostr, settings); -} - -void DataTypeUUID::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - writeChar('\'', ostr); - serializeText(column, row_num, ostr, settings); - writeChar('\'', ostr); -} - -void DataTypeUUID::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const -{ - UUID x; - assertChar('\'', istr); - readText(x, istr); - assertChar('\'', istr); 
-    assert_cast(column).getData().push_back(x);    /// It's important to do this at the end - for exception safety.
-}
-
-void DataTypeUUID::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
-{
-    writeChar('"', ostr);
-    serializeText(column, row_num, ostr, settings);
-    writeChar('"', ostr);
-}
-
-void DataTypeUUID::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const
-{
-    UUID x;
-    assertChar('"', istr);
-    readText(x, istr);
-    assertChar('"', istr);
-    assert_cast(column).getData().push_back(x);
-}
-
-void DataTypeUUID::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
-{
-    writeChar('"', ostr);
-    serializeText(column, row_num, ostr, settings);
-    writeChar('"', ostr);
-}
-
-void DataTypeUUID::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const
-{
-    UUID value;
-    readCSV(value, istr);
-    assert_cast(column).getData().push_back(value);
-}
-
 bool DataTypeUUID::equals(const IDataType & rhs) const
 {
     return typeid(rhs) == typeid(*this);
 }

+SerializationPtr DataTypeUUID::doGetDefaultSerialization() const
+{
+    return std::make_shared<SerializationUUID>();
+}

 void registerDataTypeUUID(DataTypeFactory & factory)
 {
diff --git a/src/DataTypes/DataTypeUUID.h b/src/DataTypes/DataTypeUUID.h
index de0c7c7d8cf7444c1e4c02a1dcdbd33ec7e4a504..1546ca385a44b3a836635fdff2dc94c1ecc3e2cd 100644
--- a/src/DataTypes/DataTypeUUID.h
+++ b/src/DataTypes/DataTypeUUID.h
@@ -16,22 +16,13 @@ public:

     bool equals(const IDataType & rhs) const override;

-    void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override;
-    void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override;
-    void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override;
-    void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override;
-    void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override;
-    void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override;
-    void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override;
-    void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override;
-    void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override;
-    void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
-
     bool canBeUsedInBitOperations() const override { return true; }
     bool canBeInsideNullable() const override { return true; }
     bool canBeInsideLowCardinality() const override { return false; }
     bool canBePromoted() const override { return false; }
+
+    SerializationPtr doGetDefaultSerialization() const override;
 };

 }
diff --git a/src/DataTypes/DataTypesDecimal.cpp b/src/DataTypes/DataTypesDecimal.cpp
index 160e09d92d8a982127a7bd333566685477046650..cecfcea8dac48ee2242ffd490961a51e60dc1bf7 100644
--- a/src/DataTypes/DataTypesDecimal.cpp
+++ b/src/DataTypes/DataTypesDecimal.cpp
@@ -1,4 +1,5 @@
 #include
+#include

 #include
 #include
@@ -46,55 +47,6 @@ DataTypePtr DataTypeDecimal<T>::promoteNumericType() const
     return std::make_shared<PromotedType>(PromotedType::maxPrecision(), this->scale);
 }

-template <typename T>
-void DataTypeDecimal<T>::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const
-{
-    T value = assert_cast(column).getData()[row_num];
-    writeText(value, this->scale, ostr);
-}
-
-template <typename T>
-bool DataTypeDecimal<T>::tryReadText(T & x, ReadBuffer & istr, UInt32 precision, UInt32 scale)
-{
-    UInt32 unread_scale = scale;
-    if (!tryReadDecimalText(istr, x, precision, unread_scale))
-        return false;
-
-    if (common::mulOverflow(x.value, DecimalUtils::scaleMultiplier(unread_scale), x.value))
-        return false;
-
-    return true;
-}
-
-template <typename T>
-void DataTypeDecimal<T>::readText(T & x, ReadBuffer & istr, UInt32 precision, UInt32 scale, bool csv)
-{
-    UInt32 unread_scale = scale;
-    if (csv)
-        readCSVDecimalText(istr, x, precision, unread_scale);
-    else
-        readDecimalText(istr, x, precision, unread_scale);
-
-    if (common::mulOverflow(x.value, DecimalUtils::scaleMultiplier(unread_scale), x.value))
-        throw Exception("Decimal math overflow", ErrorCodes::DECIMAL_OVERFLOW);
-}
-
-template <typename T>
-void DataTypeDecimal<T>::deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const
-{
-    T x;
-    readText(x, istr);
-    assert_cast(column).getData().push_back(x);
-}
-
-template <typename T>
-void DataTypeDecimal<T>::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const
-{
-    T x;
-    readText(x, istr, true);
-    assert_cast(column).getData().push_back(x);
-}
-
 template <typename T>
 T DataTypeDecimal<T>::parseFromString(const String & str) const
 {
@@ -109,6 +61,12 @@ T DataTypeDecimal<T>::parseFromString(const String & str) const
     return x;
 }

+template <typename T>
+SerializationPtr DataTypeDecimal<T>::doGetDefaultSerialization() const
+{
+    return std::make_shared<SerializationDecimal<T>>(this->precision, this->scale);
+}
+

 static DataTypePtr create(const ASTPtr & arguments)
 {
diff --git a/src/DataTypes/DataTypesDecimal.h b/src/DataTypes/DataTypesDecimal.h
index 2b708b53be0f48dbaa770d755b6224a4cfe4a314..5aeac78b2efacfdb39bbffb97da7588d1ef62a04 100644
--- a/src/DataTypes/DataTypesDecimal.h
+++ b/src/DataTypes/DataTypesDecimal.h
@@ -42,17 +42,9 @@ public:
     bool canBePromoted() const override { return true; }
     DataTypePtr promoteNumericType() const override;

-    void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override;
-    void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override;
-    void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override;
-
     bool equals(const IDataType & rhs) const override;
-
     T parseFromString(const String & str) const;
-    void readText(T & x, ReadBuffer & istr, bool csv = false) const { readText(x, istr, this->precision, this->scale, csv); }
-
-    static void readText(T & x, ReadBuffer & istr, UInt32 precision_, UInt32 scale_, bool csv = false);
-    static bool tryReadText(T & x, ReadBuffer & istr, UInt32 precision_, UInt32 scale_);
+
+    SerializationPtr doGetDefaultSerialization() const override;
 };

 template <typename T>
diff --git a/src/DataTypes/DataTypesNumber.h b/src/DataTypes/DataTypesNumber.h
index 0ec655bde507e9348445cf62c1390cc750a90e9c..79272756465022d6b99db5b94ce87adb5c857e92 100644
--- a/src/DataTypes/DataTypesNumber.h
+++ b/src/DataTypes/DataTypesNumber.h
@@ -3,6 +3,7 @@
 #include
 #include
 #include
+#include

 namespace DB
@@ -25,6 +26,11 @@ class DataTypeNumber final : public DataTypeNumberBase<T>
         using PromotedType = DataTypeNumber<NearestFieldType<T>>;
         return std::make_shared<PromotedType>();
     }
+
+    SerializationPtr doGetDefaultSerialization() const override
+    {
+        return std::make_shared<SerializationNumber<T>>();
+    }
 };

 using DataTypeUInt8 = DataTypeNumber<UInt8>;
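With this change a type no longer serializes values itself: callers first ask it for a serialization object. A minimal sketch of the new call pattern, assuming it is compiled inside the ClickHouse source tree (the sketch itself is not part of this patch):

    #include <DataTypes/DataTypesNumber.h>
    #include <IO/WriteBufferFromString.h>
    #include <Formats/FormatSettings.h>

    using namespace DB;

    int main()
    {
        DataTypeUInt32 type;

        /// The type hands out its default serialization instead of serializing directly.
        SerializationPtr serialization = type.getDefaultSerialization();

        auto column = type.createColumn();
        column->insert(Field(UInt64(42)));

        WriteBufferFromOwnString out;
        serialization->serializeText(*column, 0, out, FormatSettings{});
        /// out.str() is expected to be "42".
    }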
diff --git a/src/DataTypes/EnumValues.cpp b/src/DataTypes/EnumValues.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d4ca7e4dfddb3ae2aa28637c74bb46d639d90054
--- /dev/null
+++ b/src/DataTypes/EnumValues.cpp
@@ -0,0 +1,77 @@
+#include
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int SYNTAX_ERROR;
+    extern const int EMPTY_DATA_PASSED;
+    extern const int BAD_ARGUMENTS;
+}
+
+template <typename T>
+EnumValues<T>::EnumValues(const Values & values_)
+    : values(values_)
+{
+    if (values.empty())
+        throw Exception{"DataTypeEnum enumeration cannot be empty", ErrorCodes::EMPTY_DATA_PASSED};
+
+    std::sort(std::begin(values), std::end(values), [] (auto & left, auto & right)
+    {
+        return left.second < right.second;
+    });
+
+    fillMaps();
+}
+
+template <typename T>
+void EnumValues<T>::fillMaps()
+{
+    for (const auto & name_and_value : values)
+    {
+        const auto inserted_value = name_to_value_map.insert(
+            { StringRef{name_and_value.first}, name_and_value.second });
+
+        if (!inserted_value.second)
+            throw Exception{"Duplicate names in enum: '" + name_and_value.first + "' = " + toString(name_and_value.second)
+                    + " and " + toString(inserted_value.first->getMapped()),
+                ErrorCodes::SYNTAX_ERROR};
+
+        const auto inserted_name = value_to_name_map.insert(
+            { name_and_value.second, StringRef{name_and_value.first} });
+
+        if (!inserted_name.second)
+            throw Exception{"Duplicate values in enum: '" + name_and_value.first + "' = " + toString(name_and_value.second)
+                    + " and '" + toString((*inserted_name.first).first) + "'",
+                ErrorCodes::SYNTAX_ERROR};
+    }
+}
+
+template <typename T>
+T EnumValues<T>::getValue(StringRef field_name, bool try_treat_as_id) const
+{
+    const auto it = name_to_value_map.find(field_name);
+    if (!it)
+    {
+        /// This is used in CSV and TSV input formats. If we fail to find the given string
+        /// among the enum names, we try to treat it as an enum id.
+        if (try_treat_as_id)
+        {
+            T x;
+            ReadBufferFromMemory tmp_buf(field_name.data, field_name.size);
+            readText(x, tmp_buf);
+            /// Check that we reached the end of tmp_buf (otherwise field_name is not a number)
+            /// and try to find the parsed number among the enum ids.
+            if (tmp_buf.eof() && value_to_name_map.find(x) != value_to_name_map.end())
+                return x;
+        }
+        throw Exception{"Unknown element '" + field_name.toString() + "' for enum", ErrorCodes::BAD_ARGUMENTS};
+    }
+    return it->getMapped();
+}
+
+template class EnumValues<Int8>;
+template class EnumValues<Int16>;
+
+}
diff --git a/src/DataTypes/EnumValues.h b/src/DataTypes/EnumValues.h
new file mode 100644
index 0000000000000000000000000000000000000000..45ac30f9cd74237a31ac73c4d63882c1594a3869
--- /dev/null
+++ b/src/DataTypes/EnumValues.h
@@ -0,0 +1,71 @@
+#pragma once
+
+#include
+#include
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int BAD_ARGUMENTS;
+}
+
+template <typename T>
+class EnumValues
+{
+public:
+    using Value = std::pair<String, T>;
+    using Values = std::vector<Value>;
+    using NameToValueMap = HashMap<StringRef, T, StringRefHash>;
+    using ValueToNameMap = std::unordered_map<T, StringRef>;
+
+private:
+    Values values;
+    NameToValueMap name_to_value_map;
+    ValueToNameMap value_to_name_map;
+
+    void fillMaps();
+
+public:
+    EnumValues(const Values & values_);
+
+    const Values & getValues() const { return values; }
+
+    auto findByValue(const T & value) const
+    {
+        const auto it = value_to_name_map.find(value);
+        if (it == std::end(value_to_name_map))
+            throw Exception{"Unexpected value " + toString(value) + " in enum", ErrorCodes::BAD_ARGUMENTS};
+
+        return it;
+    }
+
+    const StringRef & getNameForValue(const T & value) const
+    {
+        return findByValue(value)->second;
+    }
+
+    T getValue(StringRef field_name, bool try_treat_as_id = false) const;
+
+    template <typename TValues>
+    bool containsAll(const TValues & rhs_values) const
+    {
+        auto check = [&](const auto & value)
+        {
+            auto it = name_to_value_map.find(value.first);
+            /// If we don't have this name, then we have to be sure
+            /// that this value exists in the enum.
+            if (it == name_to_value_map.end())
+                return value_to_name_map.count(value.second) > 0;
+
+            /// If we have this name, then it should have the same value.
+            return it->value.second == value.second;
+        };
+
+        return std::all_of(rhs_values.begin(), rhs_values.end(), check);
+    }
+};
+
+}
+
diff --git a/src/DataTypes/IDataType.cpp b/src/DataTypes/IDataType.cpp
index 5582a8698e08727ad40c7fadf786795fcaccb37a..c0679557ec9cfdee89d09778e447c0bbfd882d15 100644
--- a/src/DataTypes/IDataType.cpp
+++ b/src/DataTypes/IDataType.cpp
@@ -11,6 +11,7 @@
 #include
 #include
 #include
+#include

 namespace DB
@@ -18,54 +19,11 @@ namespace DB

 namespace ErrorCodes
 {
-    extern const int MULTIPLE_STREAMS_REQUIRED;
     extern const int LOGICAL_ERROR;
     extern const int DATA_TYPE_CANNOT_BE_PROMOTED;
     extern const int ILLEGAL_COLUMN;
 }

-String IDataType::Substream::toString() const
-{
-    switch (type)
-    {
-        case ArrayElements:
-            return "ArrayElements";
-        case ArraySizes:
-            return "ArraySizes";
-        case NullableElements:
-            return "NullableElements";
-        case NullMap:
-            return "NullMap";
-        case TupleElement:
-            return "TupleElement(" + tuple_element_name + ", "
-                + std::to_string(escape_tuple_delimiter) + ")";
-        case DictionaryKeys:
-            return "DictionaryKeys";
-        case DictionaryIndexes:
-            return "DictionaryIndexes";
-    }
-
-    __builtin_unreachable();
-}
-
-String IDataType::SubstreamPath::toString() const
-{
-    WriteBufferFromOwnString wb;
-    wb << "{";
-    for (size_t i = 0; i < size(); ++i)
-    {
-        if (i != 0)
-            wb << ", ";
-        wb << at(i).toString();
-    }
-    wb << "}";
-    return wb.str();
-}
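The getValue fallback in EnumValues above is what lets CSV and TSV inputs carry either an enum name or its numeric id. A small illustration of the intended behaviour, assuming the EnumValues header introduced above is available:

    #include <DataTypes/EnumValues.h>

    using namespace DB;

    int main()
    {
        EnumValues<Int8> values({{"hello", 1}, {"world", 2}});

        Int8 by_name = values.getValue(StringRef("world"));     /// 2, found in name_to_value_map.
        Int8 by_id = values.getValue(StringRef("1"), true);     /// 1, "1" is not a name but parses as an existing id.

        /// values.getValue(StringRef("1")) without try_treat_as_id would throw BAD_ARGUMENTS.
        (void) by_name;
        (void) by_id;
    }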
-
-IDataType::IDataType() : custom_name(nullptr), custom_text_serialization(nullptr), custom_streams(nullptr)
-{
-}
-
 IDataType::~IDataType() = default;

 String IDataType::getName() const
@@ -119,19 +77,17 @@ DataTypePtr IDataType::promoteNumericType() const
     throw Exception("Data type " + getName() + " can't be promoted.", ErrorCodes::DATA_TYPE_CANNOT_BE_PROMOTED);
 }

-void IDataType::serializeBinaryBulk(const IColumn &, WriteBuffer &, size_t, size_t) const
+size_t IDataType::getSizeOfValueInMemory() const
 {
-    throw Exception("Data type " + getName() + " must be serialized with multiple streams", ErrorCodes::MULTIPLE_STREAMS_REQUIRED);
+    throw Exception("Value of type " + getName() + " in memory is not of fixed size.", ErrorCodes::LOGICAL_ERROR);
 }

-void IDataType::deserializeBinaryBulk(IColumn &, ReadBuffer &, size_t, double) const
+DataTypePtr IDataType::tryGetSubcolumnType(const String & subcolumn_name) const
 {
-    throw Exception("Data type " + getName() + " must be deserialized with multiple streams", ErrorCodes::MULTIPLE_STREAMS_REQUIRED);
-}
+    if (subcolumn_name == MAIN_SUBCOLUMN_NAME)
+        return shared_from_this();

-size_t IDataType::getSizeOfValueInMemory() const
-{
-    throw Exception("Value of type " + getName() + " in memory is not of fixed size.", ErrorCodes::LOGICAL_ERROR);
+    return nullptr;
 }

 DataTypePtr IDataType::getSubcolumnType(const String & subcolumn_name) const
@@ -151,14 +107,14 @@ ColumnPtr IDataType::getSubcolumn(const String & subcolumn_name, const IColumn &
 Names IDataType::getSubcolumnNames() const
 {
     NameSet res;
-    enumerateStreams([&res, this](const SubstreamPath & substream_path, const IDataType & /* substream_type */)
+    getDefaultSerialization()->enumerateStreams([&res, this](const ISerialization::SubstreamPath & substream_path)
     {
-        SubstreamPath new_path;
+        ISerialization::SubstreamPath new_path;
         /// Iterate over path to try to get intermediate subcolumns for complex nested types.
         for (const auto & elem : substream_path)
         {
             new_path.push_back(elem);
-            auto subcolumn_name = getSubcolumnNameForStream(new_path);
+            auto subcolumn_name = ISerialization::getSubcolumnNameForStream(new_path);
             if (!subcolumn_name.empty() && tryGetSubcolumnType(subcolumn_name))
                 res.insert(subcolumn_name);
         }
@@ -167,287 +123,72 @@ Names IDataType::getSubcolumnNames() const
     return Names(std::make_move_iterator(res.begin()), std::make_move_iterator(res.end()));
 }

-static String getNameForSubstreamPath(
-    String stream_name,
-    const IDataType::SubstreamPath & path,
-    bool escape_tuple_delimiter)
-{
-    size_t array_level = 0;
-    for (const auto & elem : path)
-    {
-        if (elem.type == IDataType::Substream::NullMap)
-            stream_name += ".null";
-        else if (elem.type == IDataType::Substream::ArraySizes)
-            stream_name += ".size" + toString(array_level);
-        else if (elem.type == IDataType::Substream::ArrayElements)
-            ++array_level;
-        else if (elem.type == IDataType::Substream::DictionaryKeys)
-            stream_name += ".dict";
-        else if (elem.type == IDataType::Substream::TupleElement)
-        {
-            /// For compatibility reasons, we use %2E (escaped dot) instead of dot.
-            /// Because nested data may be represented not by Array of Tuple,
-            /// but by separate Array columns with names in a form of a.b,
-            /// and name is encoded as a whole.
-            stream_name += (escape_tuple_delimiter && elem.escape_tuple_delimiter ?
- escapeForFileName(".") : ".") + escapeForFileName(elem.tuple_element_name); - } - } - - return stream_name; -} - -String IDataType::getFileNameForStream(const NameAndTypePair & column, const SubstreamPath & path) -{ - auto name_in_storage = column.getNameInStorage(); - auto nested_storage_name = Nested::extractTableName(name_in_storage); - - if (name_in_storage != nested_storage_name && (path.size() == 1 && path[0].type == IDataType::Substream::ArraySizes)) - name_in_storage = nested_storage_name; - - auto stream_name = escapeForFileName(name_in_storage); - return getNameForSubstreamPath(std::move(stream_name), path, true); -} - -String IDataType::getSubcolumnNameForStream(const SubstreamPath & path) -{ - auto subcolumn_name = getNameForSubstreamPath("", path, false); - if (!subcolumn_name.empty()) - subcolumn_name = subcolumn_name.substr(1); // It starts with a dot. - - return subcolumn_name; -} - -bool IDataType::isSpecialCompressionAllowed(const SubstreamPath & path) -{ - for (const Substream & elem : path) - { - if (elem.type == Substream::NullMap - || elem.type == Substream::ArraySizes - || elem.type == Substream::DictionaryIndexes) - return false; - } - return true; -} - void IDataType::insertDefaultInto(IColumn & column) const { column.insertDefault(); } -void IDataType::enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const -{ - if (custom_streams) - custom_streams->enumerateStreams(callback, path); - else - enumerateStreamsImpl(callback, path); -} - -void IDataType::serializeBinaryBulkStatePrefix( - SerializeBinaryBulkSettings & settings, - SerializeBinaryBulkStatePtr & state) const +void IDataType::setCustomization(DataTypeCustomDescPtr custom_desc_) const { - if (custom_streams) - custom_streams->serializeBinaryBulkStatePrefix(settings, state); - else - serializeBinaryBulkStatePrefixImpl(settings, state); -} + /// replace only if not null + if (custom_desc_->name) + custom_name = std::move(custom_desc_->name); -void IDataType::serializeBinaryBulkStateSuffix( - SerializeBinaryBulkSettings & settings, - SerializeBinaryBulkStatePtr & state) const -{ - if (custom_streams) - custom_streams->serializeBinaryBulkStateSuffix(settings, state); - else - serializeBinaryBulkStateSuffixImpl(settings, state); + if (custom_desc_->serialization) + custom_serialization = std::move(custom_desc_->serialization); } -void IDataType::deserializeBinaryBulkStatePrefix( - DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state) const +SerializationPtr IDataType::getDefaultSerialization() const { - if (custom_streams) - custom_streams->deserializeBinaryBulkStatePrefix(settings, state); - else - deserializeBinaryBulkStatePrefixImpl(settings, state); -} + if (custom_serialization) + return custom_serialization; -void IDataType::serializeBinaryBulkWithMultipleStreams( - const IColumn & column, - size_t offset, - size_t limit, - SerializeBinaryBulkSettings & settings, - SerializeBinaryBulkStatePtr & state) const -{ - if (custom_streams) - custom_streams->serializeBinaryBulkWithMultipleStreams(column, offset, limit, settings, state); - else - serializeBinaryBulkWithMultipleStreamsImpl(column, offset, limit, settings, state); + return doGetDefaultSerialization(); } -void IDataType::deserializeBinaryBulkWithMultipleStreamsImpl( - IColumn & column, - size_t limit, - DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & /* state */, - SubstreamsCache * /* cache */) const +SerializationPtr IDataType::getSubcolumnSerialization(const String & 
subcolumn_name, const BaseSerializationGetter &) const
 {
-    if (ReadBuffer * stream = settings.getter(settings.path))
-        deserializeBinaryBulk(column, *stream, limit, settings.avg_value_size_hint);
+    throw Exception(ErrorCodes::ILLEGAL_COLUMN, "There is no subcolumn {} in type {}", subcolumn_name, getName());
 }
-
-void IDataType::deserializeBinaryBulkWithMultipleStreams(
-    ColumnPtr & column,
-    size_t limit,
-    DeserializeBinaryBulkSettings & settings,
-    DeserializeBinaryBulkStatePtr & state,
-    SubstreamsCache * cache) const
+// static
+SerializationPtr IDataType::getSerialization(const NameAndTypePair & column, const IDataType::StreamExistenceCallback & callback)
 {
-    if (custom_streams)
-    {
-        custom_streams->deserializeBinaryBulkWithMultipleStreams(column, limit, settings, state, cache);
-        return;
-    }
-
-    /// Do not cache complex type, because they can be constructed
-    /// from their subcolumns, which are in cache.
-    if (!haveSubtypes())
+    if (column.isSubcolumn())
     {
-        auto cached_column = getFromSubstreamsCache(cache, settings.path);
-        if (cached_column)
+        /// Wrap the serialization of the deepest subcolumn, which is represented by a non-complex type,
+        /// into the custom serialization for this subcolumn.
+        auto base_serialization_getter = [&](const IDataType & subcolumn_type)
        {
-            column = cached_column;
-            return;
-        }
-    }
-
-    auto mutable_column = column->assumeMutable();
-    deserializeBinaryBulkWithMultipleStreamsImpl(*mutable_column, limit, settings, state, cache);
-    column = std::move(mutable_column);
-
-    if (!haveSubtypes())
-        addToSubstreamsCache(cache, settings.path, column);
-}
-
-void IDataType::serializeAsTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
-{
-    if (custom_text_serialization)
-        custom_text_serialization->serializeTextEscaped(column, row_num, ostr, settings);
-    else
-        serializeTextEscaped(column, row_num, ostr, settings);
-}
-
-void IDataType::deserializeAsTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
-{
-    if (custom_text_serialization)
-        custom_text_serialization->deserializeTextEscaped(column, istr, settings);
-    else
-        deserializeTextEscaped(column, istr, settings);
-}
-
-void IDataType::serializeAsTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
-{
-    if (custom_text_serialization)
-        custom_text_serialization->serializeTextQuoted(column, row_num, ostr, settings);
-    else
-        serializeTextQuoted(column, row_num, ostr, settings);
-}
+            return subcolumn_type.getSerialization(column.name, callback);
+        };

-void IDataType::deserializeAsTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
-{
-    if (custom_text_serialization)
-        custom_text_serialization->deserializeTextQuoted(column, istr, settings);
-    else
-        deserializeTextQuoted(column, istr, settings);
-}
-
-void IDataType::serializeAsTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
-{
-    if (custom_text_serialization)
-        custom_text_serialization->serializeTextCSV(column, row_num, ostr, settings);
-    else
-        serializeTextCSV(column, row_num, ostr, settings);
-}
-
-void IDataType::deserializeAsTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
-{
-    if (custom_text_serialization)
-        custom_text_serialization->deserializeTextCSV(column, istr, settings);
-    else
-        deserializeTextCSV(column, istr, settings);
-}
+        auto type_in_storage = column.getTypeInStorage();
+        return type_in_storage->getSubcolumnSerialization(column.getSubcolumnName(), base_serialization_getter);
+    }

-void IDataType::serializeAsText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
-{
-    if (custom_text_serialization)
-        custom_text_serialization->serializeText(column, row_num, ostr, settings);
-    else
-        serializeText(column, row_num, ostr, settings);
+    return column.type->getSerialization(column.name, callback);
 }

-void IDataType::deserializeAsWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
+SerializationPtr IDataType::getSerialization(const String &, const StreamExistenceCallback &) const
 {
-    if (custom_text_serialization)
-        custom_text_serialization->deserializeWholeText(column, istr, settings);
-    else
-        deserializeWholeText(column, istr, settings);
+    return getDefaultSerialization();
 }

-void IDataType::serializeAsTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
+DataTypePtr IDataType::getTypeForSubstream(const ISerialization::SubstreamPath & substream_path) const
 {
-    if (custom_text_serialization)
-        custom_text_serialization->serializeTextJSON(column, row_num, ostr, settings);
-    else
-        serializeTextJSON(column, row_num, ostr, settings);
-}
+    auto type = tryGetSubcolumnType(ISerialization::getSubcolumnNameForStream(substream_path));
+    if (type)
+        return type->getSubcolumnType(MAIN_SUBCOLUMN_NAME);

-void IDataType::deserializeAsTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
-{
-    if (custom_text_serialization)
-        custom_text_serialization->deserializeTextJSON(column, istr, settings);
-    else
-        deserializeTextJSON(column, istr, settings);
+    return getSubcolumnType(MAIN_SUBCOLUMN_NAME);
 }

-void IDataType::serializeAsTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
-{
-    if (custom_text_serialization)
-        custom_text_serialization->serializeTextXML(column, row_num, ostr, settings);
-    else
-        serializeTextXML(column, row_num, ostr, settings);
-}
-
-void IDataType::setCustomization(DataTypeCustomDescPtr custom_desc_) const
+void IDataType::enumerateStreams(const SerializationPtr & serialization, const StreamCallbackWithType & callback, ISerialization::SubstreamPath & path) const
 {
-    /// replace only if not null
-    if (custom_desc_->name)
-        custom_name = std::move(custom_desc_->name);
-
-    if (custom_desc_->text_serialization)
-        custom_text_serialization = std::move(custom_desc_->text_serialization);
-
-    if (custom_desc_->streams)
-        custom_streams = std::move(custom_desc_->streams);
-}
-
-void IDataType::addToSubstreamsCache(SubstreamsCache * cache, const SubstreamPath & path, ColumnPtr column)
-{
-    if (cache && !path.empty())
-        cache->emplace(getSubcolumnNameForStream(path), column);
-}
-
-ColumnPtr IDataType::getFromSubstreamsCache(SubstreamsCache * cache, const SubstreamPath & path)
-{
-    if (!cache || path.empty())
-        return nullptr;
-
-    auto it = cache->find(getSubcolumnNameForStream(path));
-    if (it == cache->end())
-        return nullptr;
-
-    return it->second;
+    serialization->enumerateStreams([&](const ISerialization::SubstreamPath & substream_path)
+    {
+        callback(substream_path, *getTypeForSubstream(substream_path));
+    }, path);
 }

 }
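The static getSerialization above is the entry point storages use: a plain column consults the StreamExistenceCallback, while a subcolumn is routed through getSubcolumnSerialization of its type in storage. A hedged sketch of both calls; the four-argument NameAndTypePair constructor used for the subcolumn is assumed from the surrounding codebase rather than this diff:

    #include <DataTypes/DataTypeNullable.h>
    #include <DataTypes/DataTypesNumber.h>
    #include <Core/NamesAndTypes.h>

    using namespace DB;

    int main()
    {
        auto type = std::make_shared<DataTypeNullable>(std::make_shared<DataTypeUInt32>());

        /// Ordinary column: with a callback that reports no extra streams on disk,
        /// the default serialization is chosen.
        NameAndTypePair column("x", type);
        auto serialization = IDataType::getSerialization(column, [](const String &) { return false; });

        /// Subcolumn: the null map of "x" is described by its name in storage plus
        /// a subcolumn name, and the deepest subcolumn's serialization gets wrapped.
        NameAndTypePair null_map("x", "null", type, std::make_shared<DataTypeUInt8>());
        auto null_map_serialization = IDataType::getSerialization(null_map);

        (void) serialization;
        (void) null_map_serialization;
    }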
diff --git a/src/DataTypes/IDataType.h b/src/DataTypes/IDataType.h
index c9c848a8037353681e28d3ee2875942401bcdf4d..9b762cfa4c5ab2cb75678a6706e595ee1e3952fa 100644
--- a/src/DataTypes/IDataType.h
+++ b/src/DataTypes/IDataType.h
@@ -5,7 +5,7 @@
 #include
 #include
 #include
-#include
+#include

 namespace DB
@@ -27,19 +27,25 @@
 using DataTypePtr = std::shared_ptr<const IDataType>;
 using DataTypes = std::vector<DataTypePtr>;

 struct NameAndTypePair;
+class SerializationInfo;

 /** Properties of data type.
- * Contains methods for serialization/deserialization.
+ *
+ * Contains methods for getting serialization instances.
+ * One data type may have different serializations, which can be chosen
+ * dynamically before reading or writing, according to information about
+ * the column content (see `getSerialization` methods).
+ *
 * Implementations of this interface represent a data type (example: UInt8)
 * or parametric family of data types (example: Array(...)).
 *
 * DataType is totally immutable object. You can always share them.
 */
-class IDataType : private boost::noncopyable
+class IDataType : private boost::noncopyable, public std::enable_shared_from_this<const IDataType>
 {
 public:
-    IDataType();
+    IDataType() = default;
     virtual ~IDataType();

     /// Compile time flag. If false, then if C++ types are the same, then SQL types are also the same.
@@ -57,275 +63,47 @@ public:
     /// Data type id. It's used for runtime type checks.
     virtual TypeIndex getTypeId() const = 0;

-    /** Binary serialization for range of values in column - for writing to disk/network, etc.
-     *
-     * Some data types are represented in multiple streams while being serialized.
-     * Example:
-     * - Arrays are represented as stream of all elements and stream of array sizes.
-     * - Nullable types are represented as stream of values (with unspecified values in place of NULLs) and stream of NULL flags.
-     *
-     * Different streams are identified by "path".
-     * If the data type require single stream (it's true for most of data types), the stream will have empty path.
-     * Otherwise, the path can have components like "array elements", "array sizes", etc.
-     *
-     * For multidimensional arrays, path can have arbitrary length.
-     * As an example, for 2-dimensional arrays of numbers we have at least three streams:
-     * - array sizes; (sizes of top level arrays)
-     * - array elements / array sizes; (sizes of second level (nested) arrays)
-     * - array elements / array elements; (the most deep elements, placed contiguously)
-     *
-     * Descendants must override either serializeBinaryBulk, deserializeBinaryBulk methods (for simple cases with single stream)
-     * or serializeBinaryBulkWithMultipleStreams, deserializeBinaryBulkWithMultipleStreams, enumerateStreams methods (for cases with multiple streams).
-     *
-     * Default implementations of ...WithMultipleStreams methods will call serializeBinaryBulk, deserializeBinaryBulk for single stream.
-     */
-
-    struct Substream
-    {
-        enum Type
-        {
-            ArrayElements,
-            ArraySizes,
-
-            NullableElements,
-            NullMap,
-
-            TupleElement,
-
-            DictionaryKeys,
-            DictionaryIndexes,
-        };
-        Type type;
-
-        /// Index of tuple element, starting at 1 or name.
-        String tuple_element_name;
-
-        /// Do we need to escape a dot in filenames for tuple elements.
-        bool escape_tuple_delimiter = true;
-
-        Substream(Type type_) : type(type_) {}
-
-        String toString() const;
-    };
-
-    struct SubstreamPath : public std::vector<Substream>
-    {
-        String toString() const;
-    };
-
-    /// Cache for common substreams of one type, but possible different its subcolumns.
-    /// E.g. sizes of arrays of Nested data type.
-    using SubstreamsCache = std::unordered_map<String, ColumnPtr>;
-
-    using StreamCallback = std::function<void(const SubstreamPath &, const IDataType &)>;
-
-    void enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const;
-    void enumerateStreams(const StreamCallback & callback, SubstreamPath && path) const { enumerateStreams(callback, path); }
-    void enumerateStreams(const StreamCallback & callback) const { enumerateStreams(callback, {}); }
-
-    virtual DataTypePtr tryGetSubcolumnType(const String & /* subcolumn_name */) const { return nullptr; }
+    static constexpr auto MAIN_SUBCOLUMN_NAME = "__main";
+    virtual DataTypePtr tryGetSubcolumnType(const String & subcolumn_name) const;
     DataTypePtr getSubcolumnType(const String & subcolumn_name) const;
     virtual ColumnPtr getSubcolumn(const String & subcolumn_name, const IColumn & column) const;
     Names getSubcolumnNames() const;

-    using OutputStreamGetter = std::function<WriteBuffer*(const SubstreamPath &)>;
-    using InputStreamGetter = std::function<ReadBuffer*(const SubstreamPath &)>;
+    /// Returns the default serialization of the data type.
+    SerializationPtr getDefaultSerialization() const;

-    struct SerializeBinaryBulkState
-    {
-        virtual ~SerializeBinaryBulkState() = default;
-    };
-    struct DeserializeBinaryBulkState
-    {
-        virtual ~DeserializeBinaryBulkState() = default;
-    };
+    /// Asks whether a stream with the given name exists in the table.
+    /// If the callback returns true for all streams required by one of the
+    /// serialization types, that serialization will be chosen for reading.
+    /// If the callback always returns false, the default serialization is chosen.
+    using StreamExistenceCallback = std::function<bool(const String &)>;
+    using BaseSerializationGetter = std::function<SerializationPtr(const IDataType &)>;

-    using SerializeBinaryBulkStatePtr = std::shared_ptr<SerializeBinaryBulkState>;
-    using DeserializeBinaryBulkStatePtr = std::shared_ptr<DeserializeBinaryBulkState>;
+    /// Chooses the serialization for reading a column or subcolumn by
+    /// checking the existence of its substreams using the callback.
+    static SerializationPtr getSerialization(
+        const NameAndTypePair & column,
+        const StreamExistenceCallback & callback = [](const String &) { return false; });

-    struct SerializeBinaryBulkSettings
-    {
-        OutputStreamGetter getter;
-        SubstreamPath path;
+    virtual SerializationPtr getSerialization(const String & column_name, const StreamExistenceCallback & callback) const;

-        size_t low_cardinality_max_dictionary_size = 0;
-        bool low_cardinality_use_single_dictionary_for_part = true;
+    /// Returns a serialization wrapper for reading one particular subcolumn of the data type.
+    virtual SerializationPtr getSubcolumnSerialization(
+        const String & subcolumn_name, const BaseSerializationGetter & base_serialization_getter) const;

-        bool position_independent_encoding = true;
-    };
+    using StreamCallbackWithType = std::function<void(const ISerialization::SubstreamPath &, const IDataType &)>;

-    struct DeserializeBinaryBulkSettings
-    {
-        InputStreamGetter getter;
-        SubstreamPath path;
-
-        /// True if continue reading from previous positions in file. False if made fseek to the start of new granule.
-        bool continuous_reading = true;
-
-        bool position_independent_encoding = true;
-        /// If not zero, may be used to avoid reallocations while reading column of String type.
-        double avg_value_size_hint = 0;
-    };
-
-    /// Call before serializeBinaryBulkWithMultipleStreams chain to write something before first mark.
-    void serializeBinaryBulkStatePrefix(
-        SerializeBinaryBulkSettings & settings,
-        SerializeBinaryBulkStatePtr & state) const;
-
-    /// Call after serializeBinaryBulkWithMultipleStreams chain to finish serialization.
- void serializeBinaryBulkStateSuffix( - SerializeBinaryBulkSettings & settings, - SerializeBinaryBulkStatePtr & state) const; - - /// Call before before deserializeBinaryBulkWithMultipleStreams chain to get DeserializeBinaryBulkStatePtr. - void deserializeBinaryBulkStatePrefix( - DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state) const; - - /** 'offset' and 'limit' are used to specify range. - * limit = 0 - means no limit. - * offset must be not greater than size of column. - * offset + limit could be greater than size of column - * - in that case, column is serialized till the end. - */ - void serializeBinaryBulkWithMultipleStreams( - const IColumn & column, - size_t offset, - size_t limit, - SerializeBinaryBulkSettings & settings, - SerializeBinaryBulkStatePtr & state) const; - - /// Read no more than limit values and append them into column. - void deserializeBinaryBulkWithMultipleStreams( - ColumnPtr & column, - size_t limit, - DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state, - SubstreamsCache * cache = nullptr) const; - - /** Override these methods for data types that require just single stream (most of data types). - */ - virtual void serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const; - virtual void deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double avg_value_size_hint) const; - - /** Serialization/deserialization of individual values. - * - * These are helper methods for implementation of various formats to input/output for user (like CSV, JSON, etc.). - * There is no one-to-one correspondence between formats and these methods. - * For example, TabSeparated and Pretty formats could use same helper method serializeTextEscaped. - * - * For complex data types (like arrays) binary serde for individual values may differ from bulk serde. - * For example, if you serialize single array, it will be represented as its size and elements in single contiguous stream, - * but if you bulk serialize column with arrays, then sizes and elements will be written to separate streams. - */ - - /// There is two variants for binary serde. First variant work with Field. - virtual void serializeBinary(const Field & field, WriteBuffer & ostr) const = 0; - virtual void deserializeBinary(Field & field, ReadBuffer & istr) const = 0; - - /// Other variants takes a column, to avoid creating temporary Field object. - /// Column must be non-constant. - - /// Serialize one value of a column at specified row number. - virtual void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const = 0; - /// Deserialize one value and insert into a column. - /// If method will throw an exception, then column will be in same state as before call to method. - virtual void deserializeBinary(IColumn & column, ReadBuffer & istr) const = 0; - - /** Text serialization with escaping but without quoting. - */ - void serializeAsTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const; - - void deserializeAsTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const; - - /** Text serialization as a literal that may be inserted into a query. - */ - void serializeAsTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const; - - void deserializeAsTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const; - - /** Text serialization for the CSV format. 
- */ - void serializeAsTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const; - void deserializeAsTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const; - - /** Text serialization for displaying on a terminal or saving into a text file, and the like. - * Without escaping or quoting. - */ - void serializeAsText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const; - - /** Text deserialization in case when buffer contains only one value, without any escaping and delimiters. - */ - void deserializeAsWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const; - - /** Text serialization intended for using in JSON format. - */ - void serializeAsTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const; - void deserializeAsTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const; - - /** Text serialization for putting into the XML format. - */ - void serializeAsTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const; + void enumerateStreams(const SerializationPtr & serialization, const StreamCallbackWithType & callback, ISerialization::SubstreamPath & path) const; + void enumerateStreams(const SerializationPtr & serialization, const StreamCallbackWithType & callback, ISerialization::SubstreamPath && path) const { enumerateStreams(serialization, callback, path); } + void enumerateStreams(const SerializationPtr & serialization, const StreamCallbackWithType & callback) const { enumerateStreams(serialization, callback, {}); } protected: virtual String doGetName() const; + virtual SerializationPtr doGetDefaultSerialization() const = 0; - virtual void enumerateStreamsImpl(const StreamCallback & callback, SubstreamPath & path) const - { - callback(path, *this); - } - - virtual void serializeBinaryBulkStatePrefixImpl( - SerializeBinaryBulkSettings & /*settings*/, - SerializeBinaryBulkStatePtr & /*state*/) const {} - - virtual void serializeBinaryBulkStateSuffixImpl( - SerializeBinaryBulkSettings & /*settings*/, - SerializeBinaryBulkStatePtr & /*state*/) const {} - - virtual void deserializeBinaryBulkStatePrefixImpl( - DeserializeBinaryBulkSettings & /*settings*/, - DeserializeBinaryBulkStatePtr & /*state*/) const {} - - virtual void serializeBinaryBulkWithMultipleStreamsImpl( - const IColumn & column, - size_t offset, - size_t limit, - SerializeBinaryBulkSettings & settings, - SerializeBinaryBulkStatePtr & /*state*/) const - { - if (WriteBuffer * stream = settings.getter(settings.path)) - serializeBinaryBulk(column, *stream, offset, limit); - } - - virtual void deserializeBinaryBulkWithMultipleStreamsImpl( - IColumn & column, - size_t limit, - DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state, - SubstreamsCache * cache) const; - - /// Default implementations of text serialization in case of 'custom_text_serialization' is not set. 
- - virtual void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const = 0; - virtual void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const = 0; - virtual void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const = 0; - virtual void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const = 0; - virtual void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const = 0; - virtual void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const = 0; - virtual void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const = 0; - virtual void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const = 0; - virtual void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const = 0; - virtual void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const = 0; - virtual void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const - { - serializeText(column, row_num, ostr, settings); - } + DataTypePtr getTypeForSubstream(const ISerialization::SubstreamPath & substream_path) const; public: - static void addToSubstreamsCache(SubstreamsCache * cache, const SubstreamPath & path, ColumnPtr column); - static ColumnPtr getFromSubstreamsCache(SubstreamsCache * cache, const SubstreamPath & path); - /** Create empty column for corresponding type. */ virtual MutableColumnPtr createColumn() const = 0; @@ -357,7 +135,6 @@ public: /// Checks that two instances belong to the same type virtual bool equals(const IDataType & rhs) const = 0; - /// Various properties on behaviour of data type. /** The data type is dependent on parameters and types with different parameters are different. @@ -483,27 +260,20 @@ public: /// Updates avg_value_size_hint for newly read column. Uses to optimize deserialization. Zero expected for first column. static void updateAvgValueSizeHint(const IColumn & column, double & avg_value_size_hint); - static String getFileNameForStream(const NameAndTypePair & column, const SubstreamPath & path); - static String getSubcolumnNameForStream(const SubstreamPath & path); - - /// Substream path supports special compression methods like codec Delta. - /// For all other substreams (like ArraySizes, NullMasks, etc.) we use only - /// generic compression codecs like LZ4. - static bool isSpecialCompressionAllowed(const SubstreamPath & path); protected: friend class DataTypeFactory; friend class AggregateFunctionSimpleState; + /// Customize this DataType void setCustomization(DataTypeCustomDescPtr custom_desc_) const; /// This is mutable to allow setting custom name and serialization on `const IDataType` post construction. 
mutable DataTypeCustomNamePtr custom_name; - mutable DataTypeCustomTextSerializationPtr custom_text_serialization; - mutable DataTypeCustomStreamsPtr custom_streams; + mutable SerializationPtr custom_serialization; public: const IDataTypeCustomName * getCustomName() const { return custom_name.get(); } - const IDataTypeCustomStreams * getCustomStreams() const { return custom_streams.get(); } + const ISerialization * getCustomSerialization() const { return custom_serialization.get(); } }; diff --git a/src/DataTypes/IDataTypeDummy.h b/src/DataTypes/IDataTypeDummy.h index 08cc0778a6e0157aeeae2d6c88e20eb4523ff52b..ca522d1c9b47a86675e4b35079699c108fc7f164 100644 --- a/src/DataTypes/IDataTypeDummy.h +++ b/src/DataTypes/IDataTypeDummy.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include @@ -17,7 +17,7 @@ namespace ErrorCodes * * That is, this class is used just to distinguish the corresponding data type from the others. */ -class IDataTypeDummy : public DataTypeWithSimpleSerialization +class IDataTypeDummy : public IDataType { private: [[noreturn]] void throwNoSerialization() const @@ -26,15 +26,6 @@ private: } public: - void serializeBinary(const Field &, WriteBuffer &) const override { throwNoSerialization(); } - void deserializeBinary(Field &, ReadBuffer &) const override { throwNoSerialization(); } - void serializeBinary(const IColumn &, size_t, WriteBuffer &) const override { throwNoSerialization(); } - void deserializeBinary(IColumn &, ReadBuffer &) const override { throwNoSerialization(); } - void serializeBinaryBulk(const IColumn &, WriteBuffer &, size_t, size_t) const override { throwNoSerialization(); } - void deserializeBinaryBulk(IColumn &, ReadBuffer &, size_t, double) const override { throwNoSerialization(); } - void serializeText(const IColumn &, size_t, WriteBuffer &, const FormatSettings &) const override { throwNoSerialization(); } - void deserializeText(IColumn &, ReadBuffer &, const FormatSettings &) const override { throwNoSerialization(); } - MutableColumnPtr createColumn() const override { throw Exception("Method createColumn() is not implemented for data type " + getName(), ErrorCodes::NOT_IMPLEMENTED); @@ -52,6 +43,8 @@ public: bool haveSubtypes() const override { return false; } bool cannotBeStoredInTables() const override { return true; } + + SerializationPtr doGetDefaultSerialization() const override { throwNoSerialization(); } }; } diff --git a/src/DataTypes/Serializations/ISerialization.cpp b/src/DataTypes/Serializations/ISerialization.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ab2e8e1958bac02f9812562e9065ae78cabb5264 --- /dev/null +++ b/src/DataTypes/Serializations/ISerialization.cpp @@ -0,0 +1,197 @@ +#include +#include +#include +#include +#include +#include + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int MULTIPLE_STREAMS_REQUIRED; +} + +String ISerialization::Substream::toString() const +{ + switch (type) + { + case ArrayElements: + return "ArrayElements"; + case ArraySizes: + return "ArraySizes"; + case NullableElements: + return "NullableElements"; + case NullMap: + return "NullMap"; + case TupleElement: + return "TupleElement(" + tuple_element_name + ", " + + std::to_string(escape_tuple_delimiter) + ")"; + case DictionaryKeys: + return "DictionaryKeys"; + case DictionaryIndexes: + return "DictionaryIndexes"; + case SparseElements: + return "SparseElements"; + case SparseOffsets: + return "SparseOffsets"; + } + + __builtin_unreachable(); +} + +String ISerialization::SubstreamPath::toString() const +{ + 
WriteBufferFromOwnString wb; + wb << "{"; + for (size_t i = 0; i < size(); ++i) + { + if (i != 0) + wb << ", "; + wb << at(i).toString(); + } + wb << "}"; + return wb.str(); +} + +void ISerialization::enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const +{ + callback(path); +} + +void ISerialization::serializeBinaryBulk(const IColumn & column, WriteBuffer &, size_t, size_t) const +{ + throw Exception(ErrorCodes::MULTIPLE_STREAMS_REQUIRED, "Column {} must be serialized with multiple streams", column.getName()); +} + +void ISerialization::deserializeBinaryBulk(IColumn & column, ReadBuffer &, size_t, double) const +{ + throw Exception(ErrorCodes::MULTIPLE_STREAMS_REQUIRED, "Column {} must be deserialized with multiple streams", column.getName()); +} + +void ISerialization::serializeBinaryBulkWithMultipleStreams( + const IColumn & column, + size_t offset, + size_t limit, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & /* state */) const +{ + if (WriteBuffer * stream = settings.getter(settings.path)) + serializeBinaryBulk(column, *stream, offset, limit); +} + +void ISerialization::deserializeBinaryBulkWithMultipleStreams( + ColumnPtr & column, + size_t limit, + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & /* state */, + SubstreamsCache * cache) const +{ + auto cached_column = getFromSubstreamsCache(cache, settings.path); + if (cached_column) + { + column = cached_column; + } + else if (ReadBuffer * stream = settings.getter(settings.path)) + { + auto mutable_column = column->assumeMutable(); + deserializeBinaryBulk(*mutable_column, *stream, limit, settings.avg_value_size_hint); + column = std::move(mutable_column); + addToSubstreamsCache(cache, settings.path, column); + } +} + +static String getNameForSubstreamPath( + String stream_name, + const ISerialization::SubstreamPath & path, + bool escape_tuple_delimiter) +{ + using Substream = ISerialization::Substream; + + size_t array_level = 0; + for (const auto & elem : path) + { + if (elem.type == Substream::NullMap) + stream_name += ".null"; + else if (elem.type == Substream::ArraySizes) + stream_name += ".size" + toString(array_level); + else if (elem.type == Substream::ArrayElements) + ++array_level; + else if (elem.type == Substream::DictionaryKeys) + stream_name += ".dict"; + else if (elem.type == Substream::SparseOffsets) + stream_name += ".sparse.idx"; + else if (elem.type == Substream::TupleElement) + { + /// For compatibility reasons, we use %2E (escaped dot) instead of dot. + /// Because nested data may be represented not by Array of Tuple, + /// but by separate Array columns with names in a form of a.b, + /// and name is encoded as a whole. + stream_name += (escape_tuple_delimiter && elem.escape_tuple_delimiter ? 
escapeForFileName(".") : ".") + escapeForFileName(elem.tuple_element_name);
+        }
+    }
+
+    return stream_name;
+}
+
+String ISerialization::getFileNameForStream(const NameAndTypePair & column, const SubstreamPath & path)
+{
+    return getFileNameForStream(column.getNameInStorage(), path);
+}
+
+String ISerialization::getFileNameForStream(const String & name_in_storage, const SubstreamPath & path)
+{
+    String stream_name;
+    auto nested_storage_name = Nested::extractTableName(name_in_storage);
+    if (name_in_storage != nested_storage_name && (path.size() == 1 && path[0].type == ISerialization::Substream::ArraySizes))
+        stream_name = escapeForFileName(nested_storage_name);
+    else
+        stream_name = escapeForFileName(name_in_storage);
+
+    return getNameForSubstreamPath(std::move(stream_name), path, true);
+}
+
+String ISerialization::getSubcolumnNameForStream(const SubstreamPath & path)
+{
+    auto subcolumn_name = getNameForSubstreamPath("", path, false);
+    if (!subcolumn_name.empty())
+        subcolumn_name = subcolumn_name.substr(1); // It starts with a dot.
+
+    return subcolumn_name;
+}
+
+void ISerialization::addToSubstreamsCache(SubstreamsCache * cache, const SubstreamPath & path, ColumnPtr column)
+{
+    if (cache && !path.empty())
+        cache->emplace(getSubcolumnNameForStream(path), column);
+}
+
+ColumnPtr ISerialization::getFromSubstreamsCache(SubstreamsCache * cache, const SubstreamPath & path)
+{
+    if (!cache || path.empty())
+        return nullptr;
+
+    auto it = cache->find(getSubcolumnNameForStream(path));
+    if (it == cache->end())
+        return nullptr;
+
+    return it->second;
+}
+
+bool ISerialization::isSpecialCompressionAllowed(const SubstreamPath & path)
+{
+    for (const auto & elem : path)
+    {
+        if (elem.type == Substream::NullMap
+            || elem.type == Substream::ArraySizes
+            || elem.type == Substream::DictionaryIndexes
+            || elem.type == Substream::SparseOffsets)
+            return false;
+    }
+    return true;
+}
+
+}
diff --git a/src/DataTypes/Serializations/ISerialization.h b/src/DataTypes/Serializations/ISerialization.h
new file mode 100644
index 0000000000000000000000000000000000000000..03785fc07f4e01833d34289ed8e65c3d8d8362f0
--- /dev/null
+++ b/src/DataTypes/Serializations/ISerialization.h
@@ -0,0 +1,259 @@
+#pragma once
+
+#include
+#include
+
+#include
+#include
+
+namespace DB
+{
+
+class IDataType;
+
+class ReadBuffer;
+class WriteBuffer;
+class ProtobufReader;
+class ProtobufWriter;
+
+class IColumn;
+using ColumnPtr = COW<IColumn>::Ptr;
+using MutableColumnPtr = COW<IColumn>::MutablePtr;
+
+class Field;
+
+struct FormatSettings;
+struct NameAndTypePair;
+
+class ISerialization
+{
+public:
+    ISerialization() = default;
+    virtual ~ISerialization() = default;
+
+    /** Binary serialization for a range of values in a column - for writing to disk/network, etc.
+     *
+     * Some data types are represented in multiple streams while being serialized.
+     * Example:
+     * - Arrays are represented as a stream of all elements and a stream of array sizes.
+     * - Nullable types are represented as a stream of values (with unspecified values in place of NULLs) and a stream of NULL flags.
+     *
+     * Different streams are identified by "path".
+     * If the data type requires a single stream (true for most data types), the stream will have an empty path.
+     * Otherwise, the path can have components like "array elements", "array sizes", etc.
+     *
+     * For multidimensional arrays, the path can have arbitrary length.
+     * As an example, for 2-dimensional arrays of numbers we have at least three streams:
+     * - array sizes; (sizes of top level arrays)
+     * - array elements / array sizes; (sizes of second level (nested) arrays)
+     * - array elements / array elements; (the deepest elements, placed contiguously)
+     *
+     * Descendants must override either the serializeBinaryBulk and deserializeBinaryBulk methods (for simple cases with a single stream)
+     * or the serializeBinaryBulkWithMultipleStreams, deserializeBinaryBulkWithMultipleStreams and enumerateStreams methods (for cases with multiple streams).
+     *
+     * Default implementations of the ...WithMultipleStreams methods will call serializeBinaryBulk, deserializeBinaryBulk for the single stream.
+     */
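To make the path notation above concrete, here is a sketch that prints every substream path of a nested array type through enumerateStreams, using only APIs declared in this header (assumed to be compiled inside the ClickHouse tree):

    #include <DataTypes/DataTypeArray.h>
    #include <DataTypes/DataTypesNumber.h>
    #include <iostream>

    using namespace DB;

    int main()
    {
        /// Array(Array(UInt8)) is the 2-dimensional example from the comment above.
        auto type = std::make_shared<DataTypeArray>(
            std::make_shared<DataTypeArray>(std::make_shared<DataTypeUInt8>()));

        type->getDefaultSerialization()->enumerateStreams([](const ISerialization::SubstreamPath & path)
        {
            std::cout << path.toString() << '\n';
        });

        /// Expected paths:
        ///   {ArraySizes}
        ///   {ArrayElements, ArraySizes}
        ///   {ArrayElements, ArrayElements}
    }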
+
+    struct Substream
+    {
+        enum Type
+        {
+            ArrayElements,
+            ArraySizes,
+
+            NullableElements,
+            NullMap,
+
+            TupleElement,
+
+            DictionaryKeys,
+            DictionaryIndexes,
+
+            SparseElements,
+            SparseOffsets,
+        };
+        Type type;
+
+        /// Index of tuple element, starting at 1, or its name.
+        String tuple_element_name;
+
+        /// Do we need to escape a dot in filenames for tuple elements.
+        bool escape_tuple_delimiter = true;
+
+        Substream(Type type_) : type(type_) {}
+
+        String toString() const;
+    };
+
+    struct SubstreamPath : public std::vector<Substream>
+    {
+        String toString() const;
+    };
+
+    /// Cache for substreams shared by one type but possibly different subcolumns of it.
+    /// E.g. sizes of arrays of the Nested data type.
+    using SubstreamsCache = std::unordered_map<String, ColumnPtr>;
+
+    using StreamCallback = std::function<void(const SubstreamPath &)>;
+
+    virtual void enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const;
+    void enumerateStreams(const StreamCallback & callback, SubstreamPath && path) const { enumerateStreams(callback, path); }
+    void enumerateStreams(const StreamCallback & callback) const { enumerateStreams(callback, {}); }
+
+    using OutputStreamGetter = std::function<WriteBuffer*(const SubstreamPath &)>;
+    using InputStreamGetter = std::function<ReadBuffer*(const SubstreamPath &)>;
+
+    struct SerializeBinaryBulkState
+    {
+        virtual ~SerializeBinaryBulkState() = default;
+    };
+
+    struct DeserializeBinaryBulkState
+    {
+        virtual ~DeserializeBinaryBulkState() = default;
+    };
+
+    using SerializeBinaryBulkStatePtr = std::shared_ptr<SerializeBinaryBulkState>;
+    using DeserializeBinaryBulkStatePtr = std::shared_ptr<DeserializeBinaryBulkState>;
+
+    struct SerializeBinaryBulkSettings
+    {
+        OutputStreamGetter getter;
+        SubstreamPath path;
+
+        size_t low_cardinality_max_dictionary_size = 0;
+        bool low_cardinality_use_single_dictionary_for_part = true;
+
+        bool position_independent_encoding = true;
+    };
+
+    struct DeserializeBinaryBulkSettings
+    {
+        InputStreamGetter getter;
+        SubstreamPath path;
+
+        /// True if we continue reading from previous positions in the file. False if we seek to the start of a new granule.
+        bool continuous_reading = true;
+
+        bool position_independent_encoding = true;
+        /// If not zero, may be used to avoid reallocations while reading a column of String type.
+        double avg_value_size_hint = 0;
+    };
+
+    /// Call before the serializeBinaryBulkWithMultipleStreams chain to write something before the first mark.
+    virtual void serializeBinaryBulkStatePrefix(
+        SerializeBinaryBulkSettings & /*settings*/,
+        SerializeBinaryBulkStatePtr & /*state*/) const {}
+
+    /// Call after the serializeBinaryBulkWithMultipleStreams chain to finish serialization.
+    virtual void serializeBinaryBulkStateSuffix(
+        SerializeBinaryBulkSettings & /*settings*/,
+        SerializeBinaryBulkStatePtr & /*state*/) const {}
+
+    /// Call before the deserializeBinaryBulkWithMultipleStreams chain to get a DeserializeBinaryBulkStatePtr.
+    virtual void deserializeBinaryBulkStatePrefix(
+        DeserializeBinaryBulkSettings & /*settings*/,
+        DeserializeBinaryBulkStatePtr & /*state*/) const {}
+
+    /** 'offset' and 'limit' are used to specify a range.
+     * limit = 0 means no limit.
+     * offset must not be greater than the size of the column.
+     * offset + limit could be greater than the size of the column
+     * - in that case, the column is serialized till the end.
+     */
+    virtual void serializeBinaryBulkWithMultipleStreams(
+        const IColumn & column,
+        size_t offset,
+        size_t limit,
+        SerializeBinaryBulkSettings & settings,
+        SerializeBinaryBulkStatePtr & state) const;
+
+    /// Read no more than limit values and append them into column.
+    virtual void deserializeBinaryBulkWithMultipleStreams(
+        ColumnPtr & column,
+        size_t limit,
+        DeserializeBinaryBulkSettings & settings,
+        DeserializeBinaryBulkStatePtr & state,
+        SubstreamsCache * cache) const;
+
+    /** Override these methods for data types that require just a single stream (most data types).
+     */
+    virtual void serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const;
+    virtual void deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double avg_value_size_hint) const;
+
+    /** Serialization/deserialization of individual values.
+     *
+     * These are helper methods for implementation of various formats to input/output for the user (like CSV, JSON, etc.).
+     * There is no one-to-one correspondence between formats and these methods.
+     * For example, the TabSeparated and Pretty formats could use the same helper method serializeTextEscaped.
+     *
+     * For complex data types (like arrays) binary serde for individual values may differ from bulk serde.
+     * For example, if you serialize a single array, it will be represented as its size and elements in a single contiguous stream,
+     * but if you bulk serialize a column with arrays, then sizes and elements will be written to separate streams.
+     */
+
+    /// There are two variants of binary serde. The first variant works with Field.
+    virtual void serializeBinary(const Field & field, WriteBuffer & ostr) const = 0;
+    virtual void deserializeBinary(Field & field, ReadBuffer & istr) const = 0;
+
+    /// The other variant takes a column, to avoid creating a temporary Field object.
+    /// The column must be non-constant.
+
+    /// Serialize one value of a column at the specified row number.
+    virtual void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const = 0;
+    /// Deserialize one value and insert it into a column.
+    /// If the method throws an exception, the column will be left in the same state as before the call.
+    virtual void deserializeBinary(IColumn & column, ReadBuffer & istr) const = 0;
+
+    /** Text serialization with escaping but without quoting.
+     */
+    virtual void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const = 0;
+
+    virtual void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const = 0;
+
+    /** Text serialization as a literal that may be inserted into a query.
+     */
+    virtual void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const = 0;
+
+    virtual void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const = 0;
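A sketch of driving the bulk-serialization chain declared above: one buffer per substream, keyed by the file name that ISerialization derives for a hypothetical column "x". All names come from this header and the IO layer; the snippet is an illustration, not code from this patch:

    #include <DataTypes/DataTypeArray.h>
    #include <DataTypes/DataTypesNumber.h>
    #include <IO/WriteBufferFromString.h>
    #include <Core/Field.h>
    #include <map>

    using namespace DB;

    int main()
    {
        auto type = std::make_shared<DataTypeArray>(std::make_shared<DataTypeUInt8>());
        auto serialization = type->getDefaultSerialization();

        auto column = type->createColumn();
        column->insert(Array{UInt64(1), UInt64(2)});

        /// One buffer per substream path; column "x" gets "x" and "x.size0" streams.
        std::map<String, WriteBufferFromOwnString> streams;

        ISerialization::SerializeBinaryBulkSettings settings;
        settings.getter = [&](const ISerialization::SubstreamPath & path) -> WriteBuffer *
        {
            return &streams[ISerialization::getFileNameForStream("x", path)];
        };

        ISerialization::SerializeBinaryBulkStatePtr state;
        serialization->serializeBinaryBulkStatePrefix(settings, state);
        serialization->serializeBinaryBulkWithMultipleStreams(*column, 0, 0, settings, state);
        serialization->serializeBinaryBulkStateSuffix(settings, state);
    }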
+ */ + virtual void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const = 0; + virtual void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const = 0; + + /** Text serialization for displaying on a terminal or saving into a text file, and the like. + * Without escaping or quoting. + */ + virtual void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const = 0; + + /** Text deserialization in case when buffer contains only one value, without any escaping and delimiters. + */ + virtual void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const = 0; + + /** Text serialization intended for using in JSON format. + */ + virtual void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const = 0; + virtual void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const = 0; + + /** Text serialization for putting into the XML format. + */ + virtual void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const + { + serializeText(column, row_num, ostr, settings); + } + + static String getFileNameForStream(const NameAndTypePair & column, const SubstreamPath & path); + static String getFileNameForStream(const String & name_in_storage, const SubstreamPath & path); + static String getSubcolumnNameForStream(const SubstreamPath & path); + + static void addToSubstreamsCache(SubstreamsCache * cache, const SubstreamPath & path, ColumnPtr column); + static ColumnPtr getFromSubstreamsCache(SubstreamsCache * cache, const SubstreamPath & path); + + static bool isSpecialCompressionAllowed(const SubstreamPath & path); +}; + +using SerializationPtr = std::shared_ptr; +using Serializations = std::vector; + +} diff --git a/src/DataTypes/Serializations/SerializationAggregateFunction.cpp b/src/DataTypes/Serializations/SerializationAggregateFunction.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e0bcb65d895a9faec1f55dd4c916e82805af1348 --- /dev/null +++ b/src/DataTypes/Serializations/SerializationAggregateFunction.cpp @@ -0,0 +1,221 @@ +#include + +#include +#include + +#include + +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace DB +{ + +void SerializationAggregateFunction::serializeBinary(const Field & field, WriteBuffer & ostr) const +{ + const String & s = get(field); + writeVarUInt(s.size(), ostr); + writeString(s, ostr); +} + +void SerializationAggregateFunction::deserializeBinary(Field & field, ReadBuffer & istr) const +{ + UInt64 size; + readVarUInt(size, istr); + field = String(); + String & s = get(field); + s.resize(size); + istr.readStrict(s.data(), size); +} + +void SerializationAggregateFunction::serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const +{ + function->serialize(assert_cast(column).getData()[row_num], ostr); +} + +void SerializationAggregateFunction::deserializeBinary(IColumn & column, ReadBuffer & istr) const +{ + ColumnAggregateFunction & column_concrete = assert_cast(column); + + Arena & arena = column_concrete.createOrGetArena(); + size_t size_of_state = function->sizeOfData(); + AggregateDataPtr place = arena.alignedAlloc(size_of_state, function->alignOfData()); + + function->create(place); + try + { + function->deserialize(place, istr, &arena); + } + catch (...) 
+ { + function->destroy(place); + throw; + } + + column_concrete.getData().push_back(place); +} + +void SerializationAggregateFunction::serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const +{ + const ColumnAggregateFunction & real_column = typeid_cast(column); + const ColumnAggregateFunction::Container & vec = real_column.getData(); + + ColumnAggregateFunction::Container::const_iterator it = vec.begin() + offset; + ColumnAggregateFunction::Container::const_iterator end = limit ? it + limit : vec.end(); + + if (end > vec.end()) + end = vec.end(); + + for (; it != end; ++it) + function->serialize(*it, ostr); +} + +void SerializationAggregateFunction::deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double /*avg_value_size_hint*/) const +{ + ColumnAggregateFunction & real_column = typeid_cast(column); + ColumnAggregateFunction::Container & vec = real_column.getData(); + + Arena & arena = real_column.createOrGetArena(); + real_column.set(function); + vec.reserve(vec.size() + limit); + + size_t size_of_state = function->sizeOfData(); + size_t align_of_state = function->alignOfData(); + + for (size_t i = 0; i < limit; ++i) + { + if (istr.eof()) + break; + + AggregateDataPtr place = arena.alignedAlloc(size_of_state, align_of_state); + + function->create(place); + + try + { + function->deserialize(place, istr, &arena); + } + catch (...) + { + function->destroy(place); + throw; + } + + vec.push_back(place); + } +} + +static String serializeToString(const AggregateFunctionPtr & function, const IColumn & column, size_t row_num) +{ + WriteBufferFromOwnString buffer; + function->serialize(assert_cast(column).getData()[row_num], buffer); + return buffer.str(); +} + +static void deserializeFromString(const AggregateFunctionPtr & function, IColumn & column, const String & s) +{ + ColumnAggregateFunction & column_concrete = assert_cast(column); + + Arena & arena = column_concrete.createOrGetArena(); + size_t size_of_state = function->sizeOfData(); + AggregateDataPtr place = arena.alignedAlloc(size_of_state, function->alignOfData()); + + function->create(place); + + try + { + ReadBufferFromString istr(s); + function->deserialize(place, istr, &arena); + } + catch (...) 
+ { + function->destroy(place); + throw; + } + + column_concrete.getData().push_back(place); +} + +void SerializationAggregateFunction::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const +{ + writeString(serializeToString(function, column, row_num), ostr); +} + + +void SerializationAggregateFunction::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const +{ + writeEscapedString(serializeToString(function, column, row_num), ostr); +} + + +void SerializationAggregateFunction::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + String s; + readEscapedString(s, istr); + deserializeFromString(function, column, s); +} + + +void SerializationAggregateFunction::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const +{ + writeQuotedString(serializeToString(function, column, row_num), ostr); +} + + +void SerializationAggregateFunction::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + String s; + readQuotedStringWithSQLStyle(s, istr); + deserializeFromString(function, column, s); +} + + +void SerializationAggregateFunction::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + String s; + readStringUntilEOF(s, istr); + deserializeFromString(function, column, s); +} + + +void SerializationAggregateFunction::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + writeJSONString(serializeToString(function, column, row_num), ostr, settings); +} + + +void SerializationAggregateFunction::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + String s; + readJSONString(s, istr); + deserializeFromString(function, column, s); +} + + +void SerializationAggregateFunction::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const +{ + writeXMLStringForTextElement(serializeToString(function, column, row_num), ostr); +} + + +void SerializationAggregateFunction::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const +{ + writeCSV(serializeToString(function, column, row_num), ostr); +} + + +void SerializationAggregateFunction::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String s; + readCSV(s, istr, settings.csv); + deserializeFromString(function, column, s); +} + +} diff --git a/src/DataTypes/Serializations/SerializationAggregateFunction.h b/src/DataTypes/Serializations/SerializationAggregateFunction.h new file mode 100644 index 0000000000000000000000000000000000000000..58a7d52ffe703ec92a0d2237704656c12c83d3b6 --- /dev/null +++ b/src/DataTypes/Serializations/SerializationAggregateFunction.h @@ -0,0 +1,43 @@ +#pragma once + +#include + +#include + + +namespace DB +{ + +class SerializationAggregateFunction final : public ISerialization +{ +private: + AggregateFunctionPtr function; + +public: + static constexpr bool is_parametric = true; + + SerializationAggregateFunction(const AggregateFunctionPtr & function_): function(function_) {} + + /// NOTE These two functions for serializing single values are incompatible with the functions below. 
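+    /// Editor's note (based on the implementation in SerializationAggregateFunction.cpp): the
+    /// Field-based variant writes a length-prefixed blob of the serialized state
+    /// (writeVarUInt(s.size(), ostr) followed by writeString(s, ostr)), while the column-based
+    /// variant writes the raw state via function->serialize() with no length prefix, so the two
+    /// on-disk representations cannot be mixed.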
+ void serializeBinary(const Field & field, WriteBuffer & ostr) const override; + void deserializeBinary(Field & field, ReadBuffer & istr) const override; + + void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const override; + void deserializeBinary(IColumn & column, ReadBuffer & istr) const override; + void serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const override; + void deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double avg_value_size_hint) const override; + void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + + void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; +}; + +} diff --git a/src/DataTypes/Serializations/SerializationArray.cpp b/src/DataTypes/Serializations/SerializationArray.cpp new file mode 100644 index 0000000000000000000000000000000000000000..70a72c51e78c53f5b8e2a09a6b138ef931abca9e --- /dev/null +++ b/src/DataTypes/Serializations/SerializationArray.cpp @@ -0,0 +1,507 @@ +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int CANNOT_READ_ALL_DATA; + extern const int CANNOT_READ_ARRAY_FROM_TEXT; + extern const int LOGICAL_ERROR; +} + +void SerializationArray::serializeBinary(const Field & field, WriteBuffer & ostr) const +{ + const Array & a = get(field); + writeVarUInt(a.size(), ostr); + for (size_t i = 0; i < a.size(); ++i) + { + nested->serializeBinary(a[i], ostr); + } +} + + +void SerializationArray::deserializeBinary(Field & field, ReadBuffer & istr) const +{ + size_t size; + readVarUInt(size, istr); + field = Array(size); + Array & arr = get(field); + for (size_t i = 0; i < size; ++i) + nested->deserializeBinary(arr[i], istr); +} + + +void SerializationArray::serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const +{ + const ColumnArray & column_array = assert_cast(column); + const ColumnArray::Offsets & offsets = column_array.getOffsets(); + + size_t offset = offsets[row_num - 1]; + size_t next_offset = offsets[row_num]; + size_t size = next_offset - offset; + + writeVarUInt(size, ostr); + + const IColumn & nested_column = column_array.getData(); + for (size_t i = offset; i < next_offset; ++i) + nested->serializeBinary(nested_column, i, ostr); +} + + +void 
SerializationArray::deserializeBinary(IColumn & column, ReadBuffer & istr) const +{ + ColumnArray & column_array = assert_cast(column); + ColumnArray::Offsets & offsets = column_array.getOffsets(); + + size_t size; + readVarUInt(size, istr); + + IColumn & nested_column = column_array.getData(); + + size_t i = 0; + try + { + for (; i < size; ++i) + nested->deserializeBinary(nested_column, istr); + } + catch (...) + { + if (i) + nested_column.popBack(i); + throw; + } + + offsets.push_back(offsets.back() + size); +} + + +namespace +{ + void serializeArraySizesPositionIndependent(const IColumn & column, WriteBuffer & ostr, UInt64 offset, UInt64 limit) + { + const ColumnArray & column_array = typeid_cast(column); + const ColumnArray::Offsets & offset_values = column_array.getOffsets(); + size_t size = offset_values.size(); + + if (!size) + return; + + size_t end = limit && (offset + limit < size) + ? offset + limit + : size; + + ColumnArray::Offset prev_offset = offset_values[offset - 1]; + for (size_t i = offset; i < end; ++i) + { + ColumnArray::Offset current_offset = offset_values[i]; + writeIntBinary(current_offset - prev_offset, ostr); + prev_offset = current_offset; + } + } + + void deserializeArraySizesPositionIndependent(ColumnArray & column_array, ReadBuffer & istr, UInt64 limit) + { + ColumnArray::Offsets & offset_values = column_array.getOffsets(); + size_t initial_size = offset_values.size(); + offset_values.resize(initial_size + limit); + + size_t i = initial_size; + ColumnArray::Offset current_offset = initial_size ? offset_values[initial_size - 1] : 0; + while (i < initial_size + limit && !istr.eof()) + { + ColumnArray::Offset current_size = 0; + readIntBinary(current_size, istr); + current_offset += current_size; + offset_values[i] = current_offset; + ++i; + } + + offset_values.resize(i); + } + + ColumnPtr arraySizesToOffsets(const IColumn & column) + { + const auto & column_sizes = assert_cast(column); + MutableColumnPtr column_offsets = column_sizes.cloneEmpty(); + + if (column_sizes.empty()) + return column_offsets; + + const auto & sizes_data = column_sizes.getData(); + auto & offsets_data = assert_cast(*column_offsets).getData(); + + offsets_data.resize(sizes_data.size()); + + IColumn::Offset prev_offset = 0; + for (size_t i = 0, size = sizes_data.size(); i < size; ++i) + { + prev_offset += sizes_data[i]; + offsets_data[i] = prev_offset; + } + + return column_offsets; + } +} + +ColumnPtr arrayOffsetsToSizes(const IColumn & column) +{ + const auto & column_offsets = assert_cast(column); + MutableColumnPtr column_sizes = column_offsets.cloneEmpty(); + + if (column_offsets.empty()) + return column_sizes; + + const auto & offsets_data = column_offsets.getData(); + auto & sizes_data = assert_cast(*column_sizes).getData(); + + sizes_data.resize(offsets_data.size()); + + IColumn::Offset prev_offset = 0; + for (size_t i = 0, size = offsets_data.size(); i < size; ++i) + { + auto current_offset = offsets_data[i]; + sizes_data[i] = current_offset - prev_offset; + prev_offset = current_offset; + } + + return column_sizes; +} + + +void SerializationArray::enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const +{ + path.push_back(Substream::ArraySizes); + callback(path); + path.back() = Substream::ArrayElements; + nested->enumerateStreams(callback, path); + path.pop_back(); +} + + +void SerializationArray::serializeBinaryBulkStatePrefix( + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + 
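+    /// Editor's note: the array itself carries no per-part serialization state of its own here;
+    /// we only descend into the nested serialization under the ArrayElements substream.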
+    settings.path.push_back(Substream::ArrayElements);
+    nested->serializeBinaryBulkStatePrefix(settings, state);
+    settings.path.pop_back();
+}
+
+
+void SerializationArray::serializeBinaryBulkStateSuffix(
+    SerializeBinaryBulkSettings & settings,
+    SerializeBinaryBulkStatePtr & state) const
+{
+    settings.path.push_back(Substream::ArrayElements);
+    nested->serializeBinaryBulkStateSuffix(settings, state);
+    settings.path.pop_back();
+}
+
+
+void SerializationArray::deserializeBinaryBulkStatePrefix(
+    DeserializeBinaryBulkSettings & settings,
+    DeserializeBinaryBulkStatePtr & state) const
+{
+    settings.path.push_back(Substream::ArrayElements);
+    nested->deserializeBinaryBulkStatePrefix(settings, state);
+    settings.path.pop_back();
+}
+
+
+void SerializationArray::serializeBinaryBulkWithMultipleStreams(
+    const IColumn & column,
+    size_t offset,
+    size_t limit,
+    SerializeBinaryBulkSettings & settings,
+    SerializeBinaryBulkStatePtr & state) const
+{
+    const ColumnArray & column_array = typeid_cast<const ColumnArray &>(column);
+
+    /// First serialize the array sizes.
+    settings.path.push_back(Substream::ArraySizes);
+    if (auto * stream = settings.getter(settings.path))
+    {
+        if (settings.position_independent_encoding)
+            serializeArraySizesPositionIndependent(column, *stream, offset, limit);
+        else
+            SerializationNumber<UInt64>().serializeBinaryBulk(*column_array.getOffsetsPtr(), *stream, offset, limit);
+    }
+
+    /// Then serialize the contents of the arrays.
+    settings.path.back() = Substream::ArrayElements;
+    const ColumnArray::Offsets & offset_values = column_array.getOffsets();
+
+    if (offset > offset_values.size())
+        return;
+
+    /** offset - the first array to write.
+      * limit - how many arrays to write, or 0 to write everything there is.
+      * end - the array at which the written piece ends.
+      *
+      * nested_offset - the first element of the nested data to write.
+      * nested_limit - how many elements of the nested data to write, or 0 to write everything there is.
+      */
+
+    size_t end = std::min(offset + limit, offset_values.size());
+
+    size_t nested_offset = offset ? offset_values[offset - 1] : 0;
+    size_t nested_limit = limit
+        ? offset_values[end - 1] - nested_offset
+        : 0;
+
+    if (limit == 0 || nested_limit)
+        nested->serializeBinaryBulkWithMultipleStreams(column_array.getData(), nested_offset, nested_limit, settings, state);
+    settings.path.pop_back();
+}
+
+
+void SerializationArray::deserializeBinaryBulkWithMultipleStreams(
+    ColumnPtr & column,
+    size_t limit,
+    DeserializeBinaryBulkSettings & settings,
+    DeserializeBinaryBulkStatePtr & state,
+    SubstreamsCache * cache) const
+{
+    auto mutable_column = column->assumeMutable();
+    ColumnArray & column_array = typeid_cast<ColumnArray &>(*mutable_column);
+    settings.path.push_back(Substream::ArraySizes);
+
+    if (auto cached_column = getFromSubstreamsCache(cache, settings.path))
+    {
+        column_array.getOffsetsPtr() = arraySizesToOffsets(*cached_column);
+    }
+    else if (auto * stream = settings.getter(settings.path))
+    {
+        if (settings.position_independent_encoding)
+            deserializeArraySizesPositionIndependent(column_array, *stream, limit);
+        else
+            SerializationNumber<UInt64>().deserializeBinaryBulk(column_array.getOffsetsColumn(), *stream, limit, 0);
+
+        addToSubstreamsCache(cache, settings.path, arrayOffsetsToSizes(column_array.getOffsetsColumn()));
+    }
+
+    settings.path.back() = Substream::ArrayElements;
+
+    ColumnArray::Offsets & offset_values = column_array.getOffsets();
+    ColumnPtr & nested_column = column_array.getDataPtr();
+
+    /// The number of values corresponding to `offset_values` must be read.
+    size_t last_offset = offset_values.back();
+    if (last_offset < nested_column->size())
+        throw Exception("Nested column is longer than last offset", ErrorCodes::LOGICAL_ERROR);
+    size_t nested_limit = last_offset - nested_column->size();
+
+    /// Adjust the value size hint: divide it by the average array size.
+    settings.avg_value_size_hint = nested_limit ? settings.avg_value_size_hint / nested_limit * offset_values.size() : 0;
+
+    nested->deserializeBinaryBulkWithMultipleStreams(nested_column, nested_limit, settings, state, cache);
+
+    settings.path.pop_back();
+
+    /// Check consistency between the offsets and elements subcolumns.
+    /// An empty elements column is ok, though: this happens for columns of Nested types that were added by ALTER.
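+    /// Worked illustration (editor's note): if the offsets read above are [2, 5, 9], then
+    /// last_offset = 9 and the elements subcolumn must end up holding exactly 9 values;
+    /// only an entirely empty elements subcolumn is tolerated, as checked below.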
+ if (!nested_column->empty() && nested_column->size() != last_offset) + throw ParsingException("Cannot read all array values: read just " + toString(nested_column->size()) + " of " + toString(last_offset), + ErrorCodes::CANNOT_READ_ALL_DATA); + + column = std::move(mutable_column); +} + + +template +static void serializeTextImpl(const IColumn & column, size_t row_num, WriteBuffer & ostr, Writer && write_nested) +{ + const ColumnArray & column_array = assert_cast(column); + const ColumnArray::Offsets & offsets = column_array.getOffsets(); + + size_t offset = offsets[row_num - 1]; + size_t next_offset = offsets[row_num]; + + const IColumn & nested_column = column_array.getData(); + + writeChar('[', ostr); + for (size_t i = offset; i < next_offset; ++i) + { + if (i != offset) + writeChar(',', ostr); + write_nested(nested_column, i); + } + writeChar(']', ostr); +} + + +template +static void deserializeTextImpl(IColumn & column, ReadBuffer & istr, Reader && read_nested, bool allow_unenclosed) +{ + ColumnArray & column_array = assert_cast(column); + ColumnArray::Offsets & offsets = column_array.getOffsets(); + + IColumn & nested_column = column_array.getData(); + + size_t size = 0; + + bool has_braces = false; + if (checkChar('[', istr)) + has_braces = true; + else if (!allow_unenclosed) + throw Exception(ErrorCodes::CANNOT_READ_ARRAY_FROM_TEXT, "Array does not start with '[' character"); + + try + { + bool first = true; + while (!istr.eof() && *istr.position() != ']') + { + if (!first) + { + if (*istr.position() == ',') + ++istr.position(); + else + throw ParsingException(ErrorCodes::CANNOT_READ_ARRAY_FROM_TEXT, + "Cannot read array from text, expected comma or end of array, found '{}'", + *istr.position()); + } + + first = false; + + skipWhitespaceIfAny(istr); + + if (*istr.position() == ']') + break; + + read_nested(nested_column); + ++size; + + skipWhitespaceIfAny(istr); + } + + if (has_braces) + assertChar(']', istr); + else /// If array is not enclosed in braces, we read until EOF. + assertEOF(istr); + } + catch (...) 
+    {
+        if (size)
+            nested_column.popBack(size);
+        throw;
+    }
+
+    offsets.push_back(offsets.back() + size);
+}
+
+
+void SerializationArray::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
+{
+    serializeTextImpl(column, row_num, ostr,
+        [&](const IColumn & nested_column, size_t i)
+        {
+            nested->serializeTextQuoted(nested_column, i, ostr, settings);
+        });
+}
+
+
+void SerializationArray::deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
+{
+    deserializeTextImpl(column, istr,
+        [&](IColumn & nested_column)
+        {
+            nested->deserializeTextQuoted(nested_column, istr, settings);
+        }, false);
+}
+
+void SerializationArray::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
+{
+    const ColumnArray & column_array = assert_cast<const ColumnArray &>(column);
+    const ColumnArray::Offsets & offsets = column_array.getOffsets();
+
+    size_t offset = offsets[row_num - 1];
+    size_t next_offset = offsets[row_num];
+
+    const IColumn & nested_column = column_array.getData();
+
+    writeChar('[', ostr);
+    for (size_t i = offset; i < next_offset; ++i)
+    {
+        if (i != offset)
+            writeChar(',', ostr);
+        nested->serializeTextJSON(nested_column, i, ostr, settings);
+    }
+    writeChar(']', ostr);
+}
+
+
+void SerializationArray::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
+{
+    deserializeTextImpl(column, istr,
+        [&](IColumn & nested_column)
+        {
+            nested->deserializeTextJSON(nested_column, istr, settings);
+        }, false);
+}
+
+
+void SerializationArray::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
+{
+    const ColumnArray & column_array = assert_cast<const ColumnArray &>(column);
+    const ColumnArray::Offsets & offsets = column_array.getOffsets();
+
+    size_t offset = offsets[row_num - 1];
+    size_t next_offset = offsets[row_num];
+
+    const IColumn & nested_column = column_array.getData();
+
+    writeCString("<array>", ostr);
+    for (size_t i = offset; i < next_offset; ++i)
+    {
+        writeCString("<elem>", ostr);
+        nested->serializeTextXML(nested_column, i, ostr, settings);
+        writeCString("</elem>", ostr);
+    }
+    writeCString("</array>", ostr);
+}
+
+
+void SerializationArray::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
+{
+    /// There is no good way to serialize an array in CSV. Therefore, we serialize it into a string and then write the resulting string in CSV.
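+    /// For example (editor's illustration): the array [1,'a,b'] is rendered as the text [1,'a,b']
+    /// and then CSV-quoted as "[1,'a,b']", so the comma inside the string element cannot break the CSV row.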
+ WriteBufferFromOwnString wb; + serializeText(column, row_num, wb, settings); + writeCSV(wb.str(), ostr); +} + + +void SerializationArray::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String s; + readCSV(s, istr, settings.csv); + ReadBufferFromString rb(s); + + if (settings.csv.input_format_arrays_as_nested_csv) + { + deserializeTextImpl(column, rb, + [&](IColumn & nested_column) + { + nested->deserializeTextCSV(nested_column, rb, settings); + }, true); + } + else + { + deserializeTextImpl(column, rb, + [&](IColumn & nested_column) + { + nested->deserializeTextQuoted(nested_column, rb, settings); + }, true); + } +} + +} diff --git a/src/DataTypes/Serializations/SerializationArray.h b/src/DataTypes/Serializations/SerializationArray.h new file mode 100644 index 0000000000000000000000000000000000000000..71037090a4837b0e47532c1a84ad427a0a9cac88 --- /dev/null +++ b/src/DataTypes/Serializations/SerializationArray.h @@ -0,0 +1,69 @@ +#pragma once + +#include + +namespace DB +{ + +class SerializationArray final : public SimpleTextSerialization +{ +private: + SerializationPtr nested; + +public: + SerializationArray(const SerializationPtr & nested_) : nested(nested_) {} + + void serializeBinary(const Field & field, WriteBuffer & ostr) const override; + void deserializeBinary(Field & field, ReadBuffer & istr) const override; + void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const override; + void deserializeBinary(IColumn & column, ReadBuffer & istr) const override; + + void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + + void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + + void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + + void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + + /** Streaming serialization of arrays is arranged in a special way: + * - elements placed in a row are written/read without array sizes; + * - the sizes are written/read in a separate stream, + * This is necessary, because when implementing nested structures, several arrays can have common sizes. 
+ */ + + void enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const override; + + void serializeBinaryBulkStatePrefix( + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void serializeBinaryBulkStateSuffix( + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void deserializeBinaryBulkStatePrefix( + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state) const override; + + void serializeBinaryBulkWithMultipleStreams( + const IColumn & column, + size_t offset, + size_t limit, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void deserializeBinaryBulkWithMultipleStreams( + ColumnPtr & column, + size_t limit, + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state, + SubstreamsCache * cache) const override; +}; + +ColumnPtr arrayOffsetsToSizes(const IColumn & column); + +} diff --git a/src/DataTypes/Serializations/SerializationCustomSimpleText.cpp b/src/DataTypes/Serializations/SerializationCustomSimpleText.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9347c4f60f31e1af6b18064c26569c025d3bf626 --- /dev/null +++ b/src/DataTypes/Serializations/SerializationCustomSimpleText.cpp @@ -0,0 +1,97 @@ +#include + +#include +#include +#include +#include + +namespace +{ + +using namespace DB; + +String serializeToString(const SerializationCustomSimpleText & domain, const IColumn & column, size_t row_num, const FormatSettings & settings) +{ + WriteBufferFromOwnString buffer; + domain.serializeText(column, row_num, buffer, settings); + + return buffer.str(); +} + +void deserializeFromString(const SerializationCustomSimpleText & domain, IColumn & column, const String & s, const FormatSettings & settings) +{ + ReadBufferFromString istr(s); + domain.deserializeText(column, istr, settings); +} + +} + +namespace DB +{ + +SerializationCustomSimpleText::SerializationCustomSimpleText(const SerializationPtr & nested_) + : SerializationWrapper(nested_) +{ +} + +void SerializationCustomSimpleText::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String str; + readString(str, istr); + deserializeFromString(*this, column, str, settings); +} + +void SerializationCustomSimpleText::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + writeEscapedString(serializeToString(*this, column, row_num, settings), ostr); +} + +void SerializationCustomSimpleText::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String str; + readEscapedString(str, istr); + deserializeFromString(*this, column, str, settings); +} + +void SerializationCustomSimpleText::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + writeQuotedString(serializeToString(*this, column, row_num, settings), ostr); +} + +void SerializationCustomSimpleText::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String str; + readQuotedString(str, istr); + deserializeFromString(*this, column, str, settings); +} + +void SerializationCustomSimpleText::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + writeCSVString(serializeToString(*this, column, row_num, 
settings), ostr); +} + +void SerializationCustomSimpleText::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String str; + readCSVString(str, istr, settings.csv); + deserializeFromString(*this, column, str, settings); +} + +void SerializationCustomSimpleText::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + writeJSONString(serializeToString(*this, column, row_num, settings), ostr, settings); +} + +void SerializationCustomSimpleText::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String str; + readJSONString(str, istr); + deserializeFromString(*this, column, str, settings); +} + +void SerializationCustomSimpleText::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + writeXMLStringForTextElement(serializeToString(*this, column, row_num, settings), ostr); +} + +} diff --git a/src/DataTypes/DataTypeCustomSimpleTextSerialization.h b/src/DataTypes/Serializations/SerializationCustomSimpleText.h similarity index 92% rename from src/DataTypes/DataTypeCustomSimpleTextSerialization.h rename to src/DataTypes/Serializations/SerializationCustomSimpleText.h index d983b66eecc89bcbf2729cff1ce8ec51e465acb4..ae938b1104b92f1e2a93e0e61803a51065e068b0 100644 --- a/src/DataTypes/DataTypeCustomSimpleTextSerialization.h +++ b/src/DataTypes/Serializations/SerializationCustomSimpleText.h @@ -1,6 +1,6 @@ #pragma once -#include +#include namespace DB { @@ -12,9 +12,11 @@ class IColumn; /** Simple IDataTypeCustomTextSerialization that uses serializeText/deserializeText * for all serialization and deserialization. */ -class DataTypeCustomSimpleTextSerialization : public IDataTypeCustomTextSerialization +class SerializationCustomSimpleText : public SerializationWrapper { public: + SerializationCustomSimpleText(const SerializationPtr & nested_); + // Methods that subclasses must override in order to get full serialization/deserialization support. 
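+    //
+    // Minimal sketch (editor's illustration; SerializationFoo is a hypothetical subclass):
+    //
+    //     class SerializationFoo final : public SerializationCustomSimpleText
+    //     {
+    //     public:
+    //         using SerializationCustomSimpleText::SerializationCustomSimpleText;
+    //         void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override;
+    //         void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override;
+    //     };
+    //
+    // All other text formats (escaped, quoted, CSV, JSON, XML) are then derived from these two
+    // methods by the wrappers implemented in SerializationCustomSimpleText.cpp.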
virtual void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override = 0; virtual void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const = 0; diff --git a/src/DataTypes/Serializations/SerializationDate.cpp b/src/DataTypes/Serializations/SerializationDate.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ee9110d360d821c63d67295fe757072e503273fa --- /dev/null +++ b/src/DataTypes/Serializations/SerializationDate.cpp @@ -0,0 +1,83 @@ +#include + +#include +#include + +#include +#include +#include + +#include + +namespace DB +{ + +void SerializationDate::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const +{ + writeDateText(DayNum(assert_cast(column).getData()[row_num]), ostr); +} + +void SerializationDate::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + deserializeTextEscaped(column, istr, settings); +} + +void SerializationDate::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + DayNum x; + readDateText(x, istr); + assert_cast(column).getData().push_back(x); +} + +void SerializationDate::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + serializeText(column, row_num, ostr, settings); +} + +void SerializationDate::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + writeChar('\'', ostr); + serializeText(column, row_num, ostr, settings); + writeChar('\'', ostr); +} + +void SerializationDate::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + DayNum x; + assertChar('\'', istr); + readDateText(x, istr); + assertChar('\'', istr); + assert_cast(column).getData().push_back(x); /// It's important to do this at the end - for exception safety. 
+} + +void SerializationDate::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + writeChar('"', ostr); + serializeText(column, row_num, ostr, settings); + writeChar('"', ostr); +} + +void SerializationDate::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + DayNum x; + assertChar('"', istr); + readDateText(x, istr); + assertChar('"', istr); + assert_cast(column).getData().push_back(x); +} + +void SerializationDate::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + writeChar('"', ostr); + serializeText(column, row_num, ostr, settings); + writeChar('"', ostr); +} + +void SerializationDate::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + LocalDate value; + readCSV(value, istr); + assert_cast(column).getData().push_back(value.getDayNum()); +} + +} diff --git a/src/DataTypes/Serializations/SerializationDate.h b/src/DataTypes/Serializations/SerializationDate.h new file mode 100644 index 0000000000000000000000000000000000000000..099d7444c3d92d07443af0df85b2c0afabfbbd59 --- /dev/null +++ b/src/DataTypes/Serializations/SerializationDate.h @@ -0,0 +1,23 @@ +#pragma once + +#include + +namespace DB +{ + +class SerializationDate final : public SerializationNumber +{ +public: + void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; +}; + +} diff --git a/src/DataTypes/Serializations/SerializationDateTime.cpp b/src/DataTypes/Serializations/SerializationDateTime.cpp new file mode 100644 index 0000000000000000000000000000000000000000..16e47601eeb0c0efd3d1905d4e95ef52883b8b47 --- /dev/null +++ b/src/DataTypes/Serializations/SerializationDateTime.cpp @@ -0,0 +1,155 @@ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace +{ + +inline void readText(time_t & x, ReadBuffer & istr, const FormatSettings & settings, const DateLUTImpl & time_zone, const DateLUTImpl & utc_time_zone) +{ + switch (settings.date_time_input_format) + { + case FormatSettings::DateTimeInputFormat::Basic: + readDateTimeText(x, istr, time_zone); + return; + case FormatSettings::DateTimeInputFormat::BestEffort: + parseDateTimeBestEffort(x, istr, time_zone, utc_time_zone); + return; + } +} + +} + +SerializationDateTime::SerializationDateTime( + const 
DateLUTImpl & time_zone_, const DateLUTImpl & utc_time_zone_) + : time_zone(time_zone_), utc_time_zone(utc_time_zone_) +{ +} + +void SerializationDateTime::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + auto value = assert_cast(column).getData()[row_num]; + switch (settings.date_time_output_format) + { + case FormatSettings::DateTimeOutputFormat::Simple: + writeDateTimeText(value, ostr, time_zone); + return; + case FormatSettings::DateTimeOutputFormat::UnixTimestamp: + writeIntText(value, ostr); + return; + case FormatSettings::DateTimeOutputFormat::ISO: + writeDateTimeTextISO(value, ostr, utc_time_zone); + return; + } +} + +void SerializationDateTime::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + serializeText(column, row_num, ostr, settings); +} + +void SerializationDateTime::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + deserializeTextEscaped(column, istr, settings); +} + +void SerializationDateTime::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + time_t x = 0; + readText(x, istr, settings, time_zone, utc_time_zone); + if (x < 0) + x = 0; + assert_cast(column).getData().push_back(x); +} + +void SerializationDateTime::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + writeChar('\'', ostr); + serializeText(column, row_num, ostr, settings); + writeChar('\'', ostr); +} + +void SerializationDateTime::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + time_t x = 0; + if (checkChar('\'', istr)) /// Cases: '2017-08-31 18:36:48' or '1504193808' + { + readText(x, istr, settings, time_zone, utc_time_zone); + assertChar('\'', istr); + } + else /// Just 1504193808 or 01504193808 + { + readIntText(x, istr); + } + if (x < 0) + x = 0; + assert_cast(column).getData().push_back(x); /// It's important to do this at the end - for exception safety. 
+} + +void SerializationDateTime::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + writeChar('"', ostr); + serializeText(column, row_num, ostr, settings); + writeChar('"', ostr); +} + +void SerializationDateTime::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + time_t x = 0; + if (checkChar('"', istr)) + { + readText(x, istr, settings, time_zone, utc_time_zone); + assertChar('"', istr); + } + else + { + readIntText(x, istr); + } + if (x < 0) + x = 0; + assert_cast(column).getData().push_back(x); +} + +void SerializationDateTime::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + writeChar('"', ostr); + serializeText(column, row_num, ostr, settings); + writeChar('"', ostr); +} + +void SerializationDateTime::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + time_t x = 0; + + if (istr.eof()) + throwReadAfterEOF(); + + char maybe_quote = *istr.position(); + + if (maybe_quote == '\'' || maybe_quote == '\"') + ++istr.position(); + + readText(x, istr, settings, time_zone, utc_time_zone); + + if (maybe_quote == '\'' || maybe_quote == '\"') + assertChar(maybe_quote, istr); + + if (x < 0) + x = 0; + + assert_cast(column).getData().push_back(x); +} + +} diff --git a/src/DataTypes/Serializations/SerializationDateTime.h b/src/DataTypes/Serializations/SerializationDateTime.h new file mode 100644 index 0000000000000000000000000000000000000000..8cf57ddef89da99c4794d698809b0c10d7b134f7 --- /dev/null +++ b/src/DataTypes/Serializations/SerializationDateTime.h @@ -0,0 +1,32 @@ +#pragma once + +#include + +class DateLUTImpl; + +namespace DB +{ + +class SerializationDateTime final : public SerializationNumber +{ +private: + const DateLUTImpl & time_zone; + const DateLUTImpl & utc_time_zone; + +public: + SerializationDateTime(const DateLUTImpl & time_zone_, const DateLUTImpl & utc_time_zone_); + + void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; +}; + +} + diff --git a/src/DataTypes/Serializations/SerializationDateTime64.cpp b/src/DataTypes/Serializations/SerializationDateTime64.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9f3958faa4d0e2594ee26afa1d53c22e9f16f906 --- /dev/null +++ b/src/DataTypes/Serializations/SerializationDateTime64.cpp @@ -0,0 
+1,151 @@ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +SerializationDateTime64::SerializationDateTime64( + const DateLUTImpl & time_zone_, const DateLUTImpl & utc_time_zone_, UInt32 scale_) + : SerializationDecimalBase(DecimalUtils::max_precision, scale_) + , time_zone(time_zone_), utc_time_zone(utc_time_zone_) +{ +} + +void SerializationDateTime64::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + auto value = assert_cast(column).getData()[row_num]; + switch (settings.date_time_output_format) + { + case FormatSettings::DateTimeOutputFormat::Simple: + writeDateTimeText(value, scale, ostr, time_zone); + return; + case FormatSettings::DateTimeOutputFormat::UnixTimestamp: + writeDateTimeUnixTimestamp(value, scale, ostr); + return; + case FormatSettings::DateTimeOutputFormat::ISO: + writeDateTimeTextISO(value, scale, ostr, utc_time_zone); + return; + } +} + +void SerializationDateTime64::deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + DateTime64 result = 0; + readDateTime64Text(result, scale, istr, time_zone); + assert_cast(column).getData().push_back(result); +} + +void SerializationDateTime64::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + deserializeTextEscaped(column, istr, settings); +} + +void SerializationDateTime64::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + serializeText(column, row_num, ostr, settings); +} + +static inline void readText(DateTime64 & x, UInt32 scale, ReadBuffer & istr, const FormatSettings & settings, const DateLUTImpl & time_zone, const DateLUTImpl & utc_time_zone) +{ + switch (settings.date_time_input_format) + { + case FormatSettings::DateTimeInputFormat::Basic: + readDateTime64Text(x, scale, istr, time_zone); + return; + case FormatSettings::DateTimeInputFormat::BestEffort: + parseDateTime64BestEffort(x, scale, istr, time_zone, utc_time_zone); + return; + } +} + +void SerializationDateTime64::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + DateTime64 x = 0; + readText(x, scale, istr, settings, time_zone, utc_time_zone); + assert_cast(column).getData().push_back(x); +} + +void SerializationDateTime64::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + writeChar('\'', ostr); + serializeText(column, row_num, ostr, settings); + writeChar('\'', ostr); +} + +void SerializationDateTime64::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + DateTime64 x = 0; + if (checkChar('\'', istr)) /// Cases: '2017-08-31 18:36:48' or '1504193808' + { + readText(x, scale, istr, settings, time_zone, utc_time_zone); + assertChar('\'', istr); + } + else /// Just 1504193808 or 01504193808 + { + readIntText(x, istr); + } + assert_cast(column).getData().push_back(x); /// It's important to do this at the end - for exception safety. 
+} + +void SerializationDateTime64::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + writeChar('"', ostr); + serializeText(column, row_num, ostr, settings); + writeChar('"', ostr); +} + +void SerializationDateTime64::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + DateTime64 x = 0; + if (checkChar('"', istr)) + { + readText(x, scale, istr, settings, time_zone, utc_time_zone); + assertChar('"', istr); + } + else + { + readIntText(x, istr); + } + assert_cast(column).getData().push_back(x); +} + +void SerializationDateTime64::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + writeChar('"', ostr); + serializeText(column, row_num, ostr, settings); + writeChar('"', ostr); +} + +void SerializationDateTime64::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + DateTime64 x = 0; + + if (istr.eof()) + throwReadAfterEOF(); + + char maybe_quote = *istr.position(); + + if (maybe_quote == '\'' || maybe_quote == '\"') + ++istr.position(); + + readText(x, scale, istr, settings, time_zone, utc_time_zone); + + if (maybe_quote == '\'' || maybe_quote == '\"') + assertChar(maybe_quote, istr); + + assert_cast(column).getData().push_back(x); +} + +} diff --git a/src/DataTypes/Serializations/SerializationDateTime64.h b/src/DataTypes/Serializations/SerializationDateTime64.h new file mode 100644 index 0000000000000000000000000000000000000000..c36649daef1985897d517ac90caddabd15e3da0a --- /dev/null +++ b/src/DataTypes/Serializations/SerializationDateTime64.h @@ -0,0 +1,32 @@ +#pragma once + +#include + +class DateLUTImpl; + +namespace DB +{ + +class SerializationDateTime64 final : public SerializationDecimalBase +{ +private: + const DateLUTImpl & time_zone; + const DateLUTImpl & utc_time_zone; + +public: + SerializationDateTime64(const DateLUTImpl & time_zone_, const DateLUTImpl & utc_time_zone_, UInt32 scale_); + + void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; +}; + +} diff --git a/src/DataTypes/Serializations/SerializationDecimal.cpp b/src/DataTypes/Serializations/SerializationDecimal.cpp new file mode 100644 index 
0000000000000000000000000000000000000000..e0073c80aca00179c7a013bcb87ca174e53b858a --- /dev/null +++ b/src/DataTypes/Serializations/SerializationDecimal.cpp @@ -0,0 +1,74 @@ +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int DECIMAL_OVERFLOW; +} + +template +bool SerializationDecimal::tryReadText(T & x, ReadBuffer & istr, UInt32 precision, UInt32 scale) +{ + UInt32 unread_scale = scale; + if (!tryReadDecimalText(istr, x, precision, unread_scale)) + return false; + + if (common::mulOverflow(x.value, DecimalUtils::scaleMultiplier(unread_scale), x.value)) + return false; + + return true; +} + +template +void SerializationDecimal::readText(T & x, ReadBuffer & istr, UInt32 precision, UInt32 scale, bool csv) +{ + UInt32 unread_scale = scale; + if (csv) + readCSVDecimalText(istr, x, precision, unread_scale); + else + readDecimalText(istr, x, precision, unread_scale); + + if (common::mulOverflow(x.value, DecimalUtils::scaleMultiplier(unread_scale), x.value)) + throw Exception("Decimal math overflow", ErrorCodes::DECIMAL_OVERFLOW); +} + +template +void SerializationDecimal::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const +{ + T value = assert_cast(column).getData()[row_num]; + writeText(value, this->scale, ostr); +} + +template +void SerializationDecimal::deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + T x; + readText(x, istr); + assert_cast(column).getData().push_back(x); +} + +template +void SerializationDecimal::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + T x; + readText(x, istr, true); + assert_cast(column).getData().push_back(x); +} + +template class SerializationDecimal; +template class SerializationDecimal; +template class SerializationDecimal; +template class SerializationDecimal; + +} diff --git a/src/DataTypes/Serializations/SerializationDecimal.h b/src/DataTypes/Serializations/SerializationDecimal.h new file mode 100644 index 0000000000000000000000000000000000000000..dc193cdf0d3ab4df9dc4e3a27966fa822c7daf6c --- /dev/null +++ b/src/DataTypes/Serializations/SerializationDecimal.h @@ -0,0 +1,27 @@ +#pragma once + +#include + +namespace DB +{ + +template +class SerializationDecimal final : public SerializationDecimalBase +{ +public: + using typename SerializationDecimalBase::ColumnType; + + SerializationDecimal(UInt32 precision_, UInt32 scale_) + : SerializationDecimalBase(precision_, scale_) {} + + void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + + void readText(T & x, ReadBuffer & istr, bool csv = false) const { readText(x, istr, this->precision, this->scale, csv); } + + static void readText(T & x, ReadBuffer & istr, UInt32 precision_, UInt32 scale_, bool csv = false); + static bool tryReadText(T & x, ReadBuffer & istr, UInt32 precision_, UInt32 scale_); +}; + +} diff --git a/src/DataTypes/Serializations/SerializationDecimalBase.cpp b/src/DataTypes/Serializations/SerializationDecimalBase.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8557c9ff719ea0085393cf799a3b53c2a4645407 --- /dev/null +++ b/src/DataTypes/Serializations/SerializationDecimalBase.cpp 
@@ -0,0 +1,73 @@ +#include + +#include +#include +#include +#include +#include +#include + + +namespace DB +{ + +template +void SerializationDecimalBase::serializeBinary(const Field & field, WriteBuffer & ostr) const +{ + FieldType x = get>(field); + writeBinary(x, ostr); +} + +template +void SerializationDecimalBase::serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const +{ + const FieldType & x = assert_cast(column).getElement(row_num); + writeBinary(x, ostr); +} + +template +void SerializationDecimalBase::serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const +{ + const typename ColumnType::Container & x = typeid_cast(column).getData(); + + size_t size = x.size(); + + if (limit == 0 || offset + limit > size) + limit = size - offset; + + ostr.write(reinterpret_cast(&x[offset]), sizeof(FieldType) * limit); +} + +template +void SerializationDecimalBase::deserializeBinary(Field & field, ReadBuffer & istr) const +{ + typename FieldType::NativeType x; + readBinary(x, istr); + field = DecimalField(T(x), this->scale); +} + +template +void SerializationDecimalBase::deserializeBinary(IColumn & column, ReadBuffer & istr) const +{ + typename FieldType::NativeType x; + readBinary(x, istr); + assert_cast(column).getData().push_back(FieldType(x)); +} + +template +void SerializationDecimalBase::deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double) const +{ + typename ColumnType::Container & x = typeid_cast(column).getData(); + size_t initial_size = x.size(); + x.resize(initial_size + limit); + size_t size = istr.readBig(reinterpret_cast(&x[initial_size]), sizeof(FieldType) * limit); + x.resize(initial_size + size / sizeof(FieldType)); +} + +template class SerializationDecimalBase; +template class SerializationDecimalBase; +template class SerializationDecimalBase; +template class SerializationDecimalBase; +template class SerializationDecimalBase; + +} diff --git a/src/DataTypes/Serializations/SerializationDecimalBase.h b/src/DataTypes/Serializations/SerializationDecimalBase.h new file mode 100644 index 0000000000000000000000000000000000000000..fd3dcb17e35a02ad97ac46178782d1bcae3c2f86 --- /dev/null +++ b/src/DataTypes/Serializations/SerializationDecimalBase.h @@ -0,0 +1,32 @@ +#pragma once + +#include +#include + +namespace DB +{ + +template +class SerializationDecimalBase : public SimpleTextSerialization +{ +protected: + const UInt32 precision; + const UInt32 scale; + +public: + using FieldType = T; + using ColumnType = ColumnDecimal; + + SerializationDecimalBase(UInt32 precision_, UInt32 scale_) + : precision(precision_), scale(scale_) {} + + void serializeBinary(const Field & field, WriteBuffer & ostr) const override; + void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const override; + void serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const override; + + void deserializeBinary(Field & field, ReadBuffer & istr) const override; + void deserializeBinary(IColumn & column, ReadBuffer & istr) const override; + void deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double avg_value_size_hint) const override; +}; + +} diff --git a/src/DataTypes/Serializations/SerializationEnum.cpp b/src/DataTypes/Serializations/SerializationEnum.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a1550e706087265f013a84cbf005bbe9d3c416b0 --- /dev/null +++ b/src/DataTypes/Serializations/SerializationEnum.cpp 
@@ -0,0 +1,112 @@ +#include + +#include +#include +#include +#include +#include + +namespace DB +{ + +template +void SerializationEnum::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const +{ + writeString(this->getNameForValue(assert_cast(column).getData()[row_num]), ostr); +} + +template +void SerializationEnum::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const +{ + writeEscapedString(this->getNameForValue(assert_cast(column).getData()[row_num]), ostr); +} + +template +void SerializationEnum::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + if (settings.tsv.input_format_enum_as_number) + assert_cast(column).getData().push_back(readValue(istr)); + else + { + /// NOTE It would be nice to do without creating a temporary object - at least extract std::string out. + std::string field_name; + readEscapedString(field_name, istr); + assert_cast(column).getData().push_back(this->getValue(StringRef(field_name), true)); + } +} + +template +void SerializationEnum::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const +{ + writeQuotedString(this->getNameForValue(assert_cast(column).getData()[row_num]), ostr); +} + +template +void SerializationEnum::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + std::string field_name; + readQuotedStringWithSQLStyle(field_name, istr); + assert_cast(column).getData().push_back(this->getValue(StringRef(field_name))); +} + +template +void SerializationEnum::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + if (settings.tsv.input_format_enum_as_number) + assert_cast(column).getData().push_back(readValue(istr)); + else + { + std::string field_name; + readString(field_name, istr); + assert_cast(column).getData().push_back(this->getValue(StringRef(field_name), true)); + } +} + +template +void SerializationEnum::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + writeJSONString(this->getNameForValue(assert_cast(column).getData()[row_num]), ostr, settings); +} + +template +void SerializationEnum::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const +{ + writeXMLStringForTextElement(this->getNameForValue(assert_cast(column).getData()[row_num]), ostr); +} + +template +void SerializationEnum::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + if (!istr.eof() && *istr.position() != '"') + assert_cast(column).getData().push_back(readValue(istr)); + else + { + std::string field_name; + readJSONString(field_name, istr); + assert_cast(column).getData().push_back(this->getValue(StringRef(field_name))); + } +} + +template +void SerializationEnum::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const +{ + writeCSVString(this->getNameForValue(assert_cast(column).getData()[row_num]), ostr); +} + +template +void SerializationEnum::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + if (settings.csv.input_format_enum_as_number) + assert_cast(column).getData().push_back(readValue(istr)); + else + { + std::string field_name; + readCSVString(field_name, istr, settings.csv); + 
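/// Note: the second argument of getValue(..., true) apparently allows a numeric
/// string to be interpreted as the enum's underlying id when it matches no enum
/// name, complementing the input_format_*_enum_as_number fast path above.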
assert_cast(column).getData().push_back(this->getValue(StringRef(field_name), true)); + } +} + +template class SerializationEnum; +template class SerializationEnum; + +} diff --git a/src/DataTypes/Serializations/SerializationEnum.h b/src/DataTypes/Serializations/SerializationEnum.h new file mode 100644 index 0000000000000000000000000000000000000000..dfa9e74c7a170400552fd1c48d53097dae83ee2a --- /dev/null +++ b/src/DataTypes/Serializations/SerializationEnum.h @@ -0,0 +1,40 @@ +#pragma once + +#include +#include + +namespace DB +{ + +template +class SerializationEnum : public SerializationNumber, public EnumValues +{ +public: + using typename SerializationNumber::FieldType; + using typename SerializationNumber::ColumnType; + using typename EnumValues::Values; + + SerializationEnum(const Values & values_) : EnumValues(values_) {} + + void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + + void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + + FieldType readValue(ReadBuffer & istr) const + { + FieldType x; + readText(x, istr); + return this->findByValue(x)->first; + } +}; + +} diff --git a/src/DataTypes/Serializations/SerializationFixedString.cpp b/src/DataTypes/Serializations/SerializationFixedString.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5c63631e2a374b547ffe08df55141e913d650b87 --- /dev/null +++ b/src/DataTypes/Serializations/SerializationFixedString.cpp @@ -0,0 +1,203 @@ +#include + +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int CANNOT_READ_ALL_DATA; + extern const int TOO_LARGE_STRING_SIZE; +} + +void SerializationFixedString::serializeBinary(const Field & field, WriteBuffer & ostr) const +{ + const String & s = get(field); + ostr.write(s.data(), std::min(s.size(), n)); + if (s.size() < n) + for (size_t i = s.size(); i < n; ++i) + ostr.write(0); +} + + +void SerializationFixedString::deserializeBinary(Field & field, ReadBuffer & istr) const +{ + field = String(); + String & s = get(field); + s.resize(n); + istr.readStrict(s.data(), n); +} + + +void SerializationFixedString::serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const +{ + ostr.write(reinterpret_cast(&assert_cast(column).getChars()[n * row_num]), n); +} + + +void 
SerializationFixedString::deserializeBinary(IColumn & column, ReadBuffer & istr) const +{ + ColumnFixedString::Chars & data = assert_cast(column).getChars(); + size_t old_size = data.size(); + data.resize(old_size + n); + try + { + istr.readStrict(reinterpret_cast(data.data() + old_size), n); + } + catch (...) + { + data.resize_assume_reserved(old_size); + throw; + } +} + + +void SerializationFixedString::serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const +{ + const ColumnFixedString::Chars & data = typeid_cast(column).getChars(); + + size_t size = data.size() / n; + + if (limit == 0 || offset + limit > size) + limit = size - offset; + + if (limit) + ostr.write(reinterpret_cast(&data[n * offset]), n * limit); +} + + +void SerializationFixedString::deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double /*avg_value_size_hint*/) const +{ + ColumnFixedString::Chars & data = typeid_cast(column).getChars(); + + size_t initial_size = data.size(); + size_t max_bytes = limit * n; + data.resize(initial_size + max_bytes); + size_t read_bytes = istr.readBig(reinterpret_cast(&data[initial_size]), max_bytes); + + if (read_bytes % n != 0) + throw Exception("Cannot read all data of type FixedString. Bytes read:" + toString(read_bytes) + ". String size:" + toString(n) + ".", + ErrorCodes::CANNOT_READ_ALL_DATA); + + data.resize(initial_size + read_bytes); +} + + +void SerializationFixedString::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const +{ + writeString(reinterpret_cast(&assert_cast(column).getChars()[n * row_num]), n, ostr); +} + + +void SerializationFixedString::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const +{ + const char * pos = reinterpret_cast(&assert_cast(column).getChars()[n * row_num]); + writeAnyEscapedString<'\''>(pos, pos + n, ostr); +} + + +void SerializationFixedString::alignStringLength(size_t n, PaddedPODArray & data, size_t string_start) +{ + size_t length = data.size() - string_start; + if (length < n) + { + data.resize_fill(string_start + n); + } + else if (length > n) + { + data.resize_assume_reserved(string_start); + throw Exception(ErrorCodes::TOO_LARGE_STRING_SIZE, "Too large value for FixedString({})", n); + } +} + +template +static inline void read(const SerializationFixedString & self, IColumn & column, Reader && reader) +{ + ColumnFixedString::Chars & data = typeid_cast(column).getChars(); + size_t prev_size = data.size(); + try + { + reader(data); + SerializationFixedString::alignStringLength(self.getN(), data, prev_size); + } + catch (...) 
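/// Either the reader or alignStringLength may throw; the handler below rolls the
/// chars buffer back to its previous size so the column stays consistent.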
+ { + data.resize_assume_reserved(prev_size); + throw; + } +} + + +void SerializationFixedString::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + read(*this, column, [&istr](ColumnFixedString::Chars & data) { readEscapedStringInto(data, istr); }); +} + + +void SerializationFixedString::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const +{ + const char * pos = reinterpret_cast(&assert_cast(column).getChars()[n * row_num]); + writeAnyQuotedString<'\''>(pos, pos + n, ostr); +} + + +void SerializationFixedString::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + read(*this, column, [&istr](ColumnFixedString::Chars & data) { readQuotedStringInto(data, istr); }); +} + + +void SerializationFixedString::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + read(*this, column, [&istr](ColumnFixedString::Chars & data) { readStringInto(data, istr); }); +} + + +void SerializationFixedString::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const char * pos = reinterpret_cast(&assert_cast(column).getChars()[n * row_num]); + writeJSONString(pos, pos + n, ostr, settings); +} + + +void SerializationFixedString::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + read(*this, column, [&istr](ColumnFixedString::Chars & data) { readJSONStringInto(data, istr); }); +} + + +void SerializationFixedString::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const +{ + const char * pos = reinterpret_cast(&assert_cast(column).getChars()[n * row_num]); + writeXMLStringForTextElement(pos, pos + n, ostr); +} + + +void SerializationFixedString::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const +{ + const char * pos = reinterpret_cast(&assert_cast(column).getChars()[n * row_num]); + writeCSVString(pos, pos + n, ostr); +} + + +void SerializationFixedString::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + read(*this, column, [&istr, &csv = settings.csv](ColumnFixedString::Chars & data) { readCSVStringInto(data, istr, csv); }); +} + + +} diff --git a/src/DataTypes/Serializations/SerializationFixedString.h b/src/DataTypes/Serializations/SerializationFixedString.h new file mode 100644 index 0000000000000000000000000000000000000000..82559d1080044309b50b0aa2dea25c6072f81bfa --- /dev/null +++ b/src/DataTypes/Serializations/SerializationFixedString.h @@ -0,0 +1,50 @@ +#pragma once + +#include +#include + +namespace DB +{ + +class SerializationFixedString : public ISerialization +{ +private: + size_t n; + +public: + SerializationFixedString(size_t n_) : n(n_) {} + size_t getN() const { return n; } + + void serializeBinary(const Field & field, WriteBuffer & ostr) const override; + void deserializeBinary(Field & field, ReadBuffer & istr) const override; + void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const override; + void deserializeBinary(IColumn & column, ReadBuffer & istr) const override; + + void serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const override; + void deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double avg_value_size_hint) const override; + + void 
serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + + void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + + void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + + void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + + void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + + void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + + void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + + /// Makes sure that the length of a newly inserted string to `chars` is equal to getN(). + /// If the length is less than getN() the function will add zero characters up to getN(). + /// If the length is greater than getN() the function will throw an exception. + static void alignStringLength(size_t n, PaddedPODArray & data, size_t string_start); +}; + +} diff --git a/src/DataTypes/Serializations/SerializationIP.cpp b/src/DataTypes/Serializations/SerializationIP.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ec49f960c77d143b1a2a47cd633ee9c095c43f9a --- /dev/null +++ b/src/DataTypes/Serializations/SerializationIP.cpp @@ -0,0 +1,94 @@ +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int CANNOT_PARSE_DOMAIN_VALUE_FROM_STRING; + extern const int ILLEGAL_COLUMN; +} + +SerializationIPv4::SerializationIPv4(const SerializationPtr & nested_) + : SerializationCustomSimpleText(nested_) +{ +} + +void SerializationIPv4::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const +{ + const auto * col = checkAndGetColumn(&column); + if (!col) + { + throw Exception("IPv4 type can only serialize columns of type UInt32." + column.getName(), ErrorCodes::ILLEGAL_COLUMN); + } + + char buffer[IPV4_MAX_TEXT_LENGTH + 1] = {'\0'}; + char * ptr = buffer; + formatIPv4(reinterpret_cast(&col->getData()[row_num]), ptr); + + ostr.write(buffer, strlen(buffer)); +} + +void SerializationIPv4::deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + ColumnUInt32 * col = typeid_cast(&column); + if (!col) + { + throw Exception("IPv4 type can only deserialize columns of type UInt32." 
+ column.getName(), ErrorCodes::ILLEGAL_COLUMN); + } + + char buffer[IPV4_MAX_TEXT_LENGTH + 1] = {'\0'}; + istr.read(buffer, sizeof(buffer) - 1); + UInt32 ipv4_value = 0; + if (!parseIPv4(buffer, reinterpret_cast(&ipv4_value))) + { + throw Exception("Invalid IPv4 value.", ErrorCodes::CANNOT_PARSE_DOMAIN_VALUE_FROM_STRING); + } + + col->insert(ipv4_value); +} + +SerializationIPv6::SerializationIPv6(const SerializationPtr & nested_) + : SerializationCustomSimpleText(nested_) +{ +} +void SerializationIPv6::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const +{ + const auto * col = checkAndGetColumn(&column); + if (!col) + { + throw Exception("IPv6 type domain can only serialize columns of type FixedString(16)." + column.getName(), ErrorCodes::ILLEGAL_COLUMN); + } + + char buffer[IPV6_MAX_TEXT_LENGTH + 1] = {'\0'}; + char * ptr = buffer; + formatIPv6(reinterpret_cast(col->getDataAt(row_num).data), ptr); + + ostr.write(buffer, strlen(buffer)); +} + +void SerializationIPv6::deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + ColumnFixedString * col = typeid_cast(&column); + if (!col) + { + throw Exception("IPv6 type domain can only deserialize columns of type FixedString(16)." + column.getName(), ErrorCodes::ILLEGAL_COLUMN); + } + + char buffer[IPV6_MAX_TEXT_LENGTH + 1] = {'\0'}; + istr.read(buffer, sizeof(buffer) - 1); + + std::string ipv6_value(IPV6_BINARY_LENGTH, '\0'); + if (!parseIPv6(buffer, reinterpret_cast(ipv6_value.data()))) + { + throw Exception("Invalid IPv6 value.", ErrorCodes::CANNOT_PARSE_DOMAIN_VALUE_FROM_STRING); + } + + col->insertString(ipv6_value); +} + +} diff --git a/src/DataTypes/Serializations/SerializationIP.h b/src/DataTypes/Serializations/SerializationIP.h new file mode 100644 index 0000000000000000000000000000000000000000..f1f4d90aba53a280309edf07f1f17a6d666589ba --- /dev/null +++ b/src/DataTypes/Serializations/SerializationIP.h @@ -0,0 +1,26 @@ +#pragma once + +#include + +namespace DB +{ + +class SerializationIPv4 final : public SerializationCustomSimpleText +{ +public: + SerializationIPv4(const SerializationPtr & nested_); + + void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; +}; + +class SerializationIPv6 : public SerializationCustomSimpleText +{ +public: + SerializationIPv6(const SerializationPtr & nested_); + + void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; +}; + +} diff --git a/src/DataTypes/Serializations/SerializationLowCardinality.cpp b/src/DataTypes/Serializations/SerializationLowCardinality.cpp new file mode 100644 index 0000000000000000000000000000000000000000..31058cb6e570da7578582bed4e9ba58dcfa5928a --- /dev/null +++ b/src/DataTypes/Serializations/SerializationLowCardinality.cpp @@ -0,0 +1,827 @@ +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; +} + +namespace +{ + const ColumnLowCardinality & getColumnLowCardinality(const IColumn & column) + { + return typeid_cast(column); + } + + ColumnLowCardinality & getColumnLowCardinality(IColumn & column) + { + return typeid_cast(column); + } 
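/// Note: typeid_cast to a reference throws on a type mismatch (unlike the
/// pointer form, which returns nullptr), so these helpers fail loudly when
/// given a column of the wrong type.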
+} + +SerializationLowCardinality::SerializationLowCardinality(const DataTypePtr & dictionary_type_) + : dictionary_type(dictionary_type_) + , dict_inner_serialization(removeNullable(dictionary_type_)->getDefaultSerialization()) +{ +} + +void SerializationLowCardinality::enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const +{ + path.push_back(Substream::DictionaryKeys); + dict_inner_serialization->enumerateStreams(callback, path); + path.back() = Substream::DictionaryIndexes; + callback(path); + path.pop_back(); +} + +struct KeysSerializationVersion +{ + enum Value + { + /// Version is written at the start of . + /// Dictionary is written as number N and N keys after them. + /// Dictionary can be shared for continuous range of granules, so some marks may point to the same position. + /// Shared dictionary is stored in state and is read once. + SharedDictionariesWithAdditionalKeys = 1, + }; + + Value value; + + static void checkVersion(UInt64 version) + { + if (version != SharedDictionariesWithAdditionalKeys) + throw Exception("Invalid version for SerializationLowCardinality key column.", ErrorCodes::LOGICAL_ERROR); + } + + explicit KeysSerializationVersion(UInt64 version) : value(static_cast(version)) { checkVersion(version); } +}; + +/// Version is stored at the start of each granule. It's used to store indexes type and flags. +struct IndexesSerializationType +{ + using SerializationType = UInt64; + /// Need to read dictionary if it wasn't. + static constexpr SerializationType NeedGlobalDictionaryBit = 1u << 8u; + /// Need to read additional keys. Additional keys are stored before indexes as value N and N keys after them. + static constexpr SerializationType HasAdditionalKeysBit = 1u << 9u; + /// Need to update dictionary. It means that previous granule has different dictionary. 
+ static constexpr SerializationType NeedUpdateDictionary = 1u << 10u; + + enum Type + { + TUInt8 = 0, + TUInt16, + TUInt32, + TUInt64, + }; + + Type type; + bool has_additional_keys; + bool need_global_dictionary; + bool need_update_dictionary; + + static constexpr SerializationType resetFlags(SerializationType type) + { + return type & (~(HasAdditionalKeysBit | NeedGlobalDictionaryBit | NeedUpdateDictionary)); + } + + static void checkType(SerializationType type) + { + UInt64 value = resetFlags(type); + if (value <= TUInt64) + return; + + throw Exception("Invalid type for SerializationLowCardinality index column.", ErrorCodes::LOGICAL_ERROR); + } + + void serialize(WriteBuffer & buffer) const + { + SerializationType val = type; + if (has_additional_keys) + val |= HasAdditionalKeysBit; + if (need_global_dictionary) + val |= NeedGlobalDictionaryBit; + if (need_update_dictionary) + val |= NeedUpdateDictionary; + writeIntBinary(val, buffer); + } + + void deserialize(ReadBuffer & buffer) + { + SerializationType val; + readIntBinary(val, buffer); + checkType(val); + has_additional_keys = (val & HasAdditionalKeysBit) != 0; + need_global_dictionary = (val & NeedGlobalDictionaryBit) != 0; + need_update_dictionary = (val & NeedUpdateDictionary) != 0; + type = static_cast(resetFlags(val)); + } + + IndexesSerializationType(const IColumn & column, + bool has_additional_keys_, + bool need_global_dictionary_, + bool enumerate_dictionaries) + : has_additional_keys(has_additional_keys_) + , need_global_dictionary(need_global_dictionary_) + , need_update_dictionary(enumerate_dictionaries) + { + if (typeid_cast(&column)) + type = TUInt8; + else if (typeid_cast(&column)) + type = TUInt16; + else if (typeid_cast(&column)) + type = TUInt32; + else if (typeid_cast(&column)) + type = TUInt64; + else + throw Exception("Invalid Indexes column for IndexesSerializationType. Expected ColumnUInt*, got " + + column.getName(), ErrorCodes::LOGICAL_ERROR); + } + + DataTypePtr getDataType() const + { + if (type == TUInt8) + return std::make_shared(); + if (type == TUInt16) + return std::make_shared(); + if (type == TUInt32) + return std::make_shared(); + if (type == TUInt64) + return std::make_shared(); + + throw Exception("Can't create DataType from IndexesSerializationType.", ErrorCodes::LOGICAL_ERROR); + } + + IndexesSerializationType() = default; +}; + +struct SerializeStateLowCardinality : public ISerialization::SerializeBinaryBulkState +{ + KeysSerializationVersion key_version; + MutableColumnUniquePtr shared_dictionary; + + explicit SerializeStateLowCardinality(UInt64 key_version_) : key_version(key_version_) {} +}; + +struct DeserializeStateLowCardinality : public ISerialization::DeserializeBinaryBulkState +{ + KeysSerializationVersion key_version; + ColumnUniquePtr global_dictionary; + + IndexesSerializationType index_type; + ColumnPtr additional_keys; + ColumnPtr null_map; + UInt64 num_pending_rows = 0; + + /// If dictionary should be updated. + /// Can happen is some granules was skipped while reading from MergeTree. + /// We should store this flag in State because + /// in case of long block of empty arrays we may not need read dictionary at first reading. 
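/// The flag is set on every non-continuous read and cleared only after the
/// global dictionary has actually been re-read (see read_dictionary below).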
+ bool need_update_dictionary = false; + + explicit DeserializeStateLowCardinality(UInt64 key_version_) : key_version(key_version_) {} +}; + +static SerializeStateLowCardinality * checkAndGetLowCardinalitySerializeState( + ISerialization::SerializeBinaryBulkStatePtr & state) +{ + if (!state) + throw Exception("Got empty state for SerializationLowCardinality.", ErrorCodes::LOGICAL_ERROR); + + auto * low_cardinality_state = typeid_cast(state.get()); + if (!low_cardinality_state) + { + auto & state_ref = *state; + throw Exception("Invalid SerializeBinaryBulkState for SerializationLowCardinality. Expected: " + + demangle(typeid(SerializeStateLowCardinality).name()) + ", got " + + demangle(typeid(state_ref).name()), ErrorCodes::LOGICAL_ERROR); + } + + return low_cardinality_state; +} + +static DeserializeStateLowCardinality * checkAndGetLowCardinalityDeserializeState( + ISerialization::DeserializeBinaryBulkStatePtr & state) +{ + if (!state) + throw Exception("Got empty state for SerializationLowCardinality.", ErrorCodes::LOGICAL_ERROR); + + auto * low_cardinality_state = typeid_cast(state.get()); + if (!low_cardinality_state) + { + auto & state_ref = *state; + throw Exception("Invalid DeserializeBinaryBulkState for SerializationLowCardinality. Expected: " + + demangle(typeid(DeserializeStateLowCardinality).name()) + ", got " + + demangle(typeid(state_ref).name()), ErrorCodes::LOGICAL_ERROR); + } + + return low_cardinality_state; +} + +void SerializationLowCardinality::serializeBinaryBulkStatePrefix( + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + settings.path.push_back(Substream::DictionaryKeys); + auto * stream = settings.getter(settings.path); + settings.path.pop_back(); + + if (!stream) + throw Exception("Got empty stream in SerializationLowCardinality::serializeBinaryBulkStatePrefix", + ErrorCodes::LOGICAL_ERROR); + + /// Write version and create SerializeBinaryBulkState. 
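/// Only one version exists so far, so SharedDictionariesWithAdditionalKeys
/// is written unconditionally.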
+ UInt64 key_version = KeysSerializationVersion::SharedDictionariesWithAdditionalKeys; + + writeIntBinary(key_version, *stream); + + state = std::make_shared(key_version); +} + +void SerializationLowCardinality::serializeBinaryBulkStateSuffix( + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + auto * low_cardinality_state = checkAndGetLowCardinalitySerializeState(state); + KeysSerializationVersion::checkVersion(low_cardinality_state->key_version.value); + + if (low_cardinality_state->shared_dictionary && settings.low_cardinality_max_dictionary_size) + { + auto nested_column = low_cardinality_state->shared_dictionary->getNestedNotNullableColumn(); + + settings.path.push_back(Substream::DictionaryKeys); + auto * stream = settings.getter(settings.path); + settings.path.pop_back(); + + if (!stream) + throw Exception("Got empty stream in SerializationLowCardinality::serializeBinaryBulkStateSuffix", + ErrorCodes::LOGICAL_ERROR); + + UInt64 num_keys = nested_column->size(); + writeIntBinary(num_keys, *stream); + dict_inner_serialization->serializeBinaryBulk(*nested_column, *stream, 0, num_keys); + low_cardinality_state->shared_dictionary = nullptr; + } +} + +void SerializationLowCardinality::deserializeBinaryBulkStatePrefix( + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state) const +{ + settings.path.push_back(Substream::DictionaryKeys); + auto * stream = settings.getter(settings.path); + settings.path.pop_back(); + + if (!stream) + return; + + UInt64 keys_version; + readIntBinary(keys_version, *stream); + + state = std::make_shared(keys_version); +} + +namespace +{ + template + PaddedPODArray * getIndexesData(IColumn & indexes) + { + auto * column = typeid_cast *>(&indexes); + if (column) + return &column->getData(); + + return nullptr; + } + + struct IndexMapsWithAdditionalKeys + { + MutableColumnPtr dictionary_map; + MutableColumnPtr additional_keys_map; + }; + + template + IndexMapsWithAdditionalKeys mapIndexWithAdditionalKeysRef(PaddedPODArray & index, size_t dict_size) + { + PaddedPODArray copy(index.cbegin(), index.cend()); + + HashMap dict_map; + HashMap add_keys_map; + + for (auto val : index) + { + if (val < dict_size) + dict_map.insert({val, dict_map.size()}); + else + add_keys_map.insert({val, add_keys_map.size()}); + } + + auto dictionary_map = ColumnVector::create(dict_map.size()); + auto additional_keys_map = ColumnVector::create(add_keys_map.size()); + auto & dict_data = dictionary_map->getData(); + auto & add_keys_data = additional_keys_map->getData(); + + for (auto val : dict_map) + dict_data[val.second] = val.first; + + for (auto val : add_keys_map) + add_keys_data[val.second] = val.first - dict_size; + + for (auto & val : index) + val = val < dict_size ? dict_map[val] + : add_keys_map[val] + dict_map.size(); + + for (size_t i = 0; i < index.size(); ++i) + { + T expected = index[i] < dict_data.size() ? 
dict_data[index[i]] + : add_keys_data[index[i] - dict_data.size()] + dict_size; + if (expected != copy[i]) + throw Exception("Expected " + toString(expected) + ", but got " + toString(copy[i]), ErrorCodes::LOGICAL_ERROR); + + } + + return {std::move(dictionary_map), std::move(additional_keys_map)}; + } + + template + IndexMapsWithAdditionalKeys mapIndexWithAdditionalKeys(PaddedPODArray & index, size_t dict_size) + { + T max_less_dict_size = 0; + T max_value = 0; + + auto size = index.size(); + if (size == 0) + return {ColumnVector::create(), ColumnVector::create()}; + + for (size_t i = 0; i < size; ++i) + { + auto val = index[i]; + if (val < dict_size) + max_less_dict_size = std::max(max_less_dict_size, val); + + max_value = std::max(max_value, val); + } + + auto map_size = UInt64(max_less_dict_size) + 1; + auto overflow_map_size = max_value >= dict_size ? (UInt64(max_value - dict_size) + 1) : 0; + PaddedPODArray map(map_size, 0); + PaddedPODArray overflow_map(overflow_map_size, 0); + + T zero_pos_value = 0; + T zero_pos_overflowed_value = 0; + UInt64 cur_pos = 0; + UInt64 cur_overflowed_pos = 0; + + for (size_t i = 0; i < size; ++i) + { + T val = index[i]; + if (val < dict_size) + { + if (cur_pos == 0) + { + zero_pos_value = val; + ++cur_pos; + } + else if (map[val] == 0 && val != zero_pos_value) + { + map[val] = cur_pos; + ++cur_pos; + } + } + else + { + T shifted_val = val - dict_size; + if (cur_overflowed_pos == 0) + { + zero_pos_overflowed_value = shifted_val; + ++cur_overflowed_pos; + } + else if (overflow_map[shifted_val] == 0 && shifted_val != zero_pos_overflowed_value) + { + overflow_map[shifted_val] = cur_overflowed_pos; + ++cur_overflowed_pos; + } + } + } + + auto dictionary_map = ColumnVector::create(cur_pos); + auto additional_keys_map = ColumnVector::create(cur_overflowed_pos); + auto & dict_data = dictionary_map->getData(); + auto & add_keys_data = additional_keys_map->getData(); + + for (size_t i = 0; i < map_size; ++i) + if (map[i]) + dict_data[map[i]] = static_cast(i); + + for (size_t i = 0; i < overflow_map_size; ++i) + if (overflow_map[i]) + add_keys_data[overflow_map[i]] = static_cast(i); + + if (cur_pos) + dict_data[0] = zero_pos_value; + if (cur_overflowed_pos) + add_keys_data[0] = zero_pos_overflowed_value; + + for (size_t i = 0; i < size; ++i) + { + T & val = index[i]; + if (val < dict_size) + val = map[val]; + else + val = overflow_map[val - dict_size] + cur_pos; + } + + return {std::move(dictionary_map), std::move(additional_keys_map)}; + } + + /// Update column and return map with old indexes. 
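/// (Editorial illustration) With dict_size = 4, the index column [1, 5, 1, 6]
/// is rewritten to [0, 1, 0, 2], and the returned maps are
/// dictionary_map = [1], additional_keys_map = [1, 2].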
+ /// Let N is the number of distinct values which are less than max_size; + /// old_column - column before function call; + /// new_column - column after function call: + /// * if old_column[i] < max_size, than + /// dictionary_map[new_column[i]] = old_column[i] + /// * else + /// additional_keys_map[new_column[i]] = old_column[i] - dict_size + N + IndexMapsWithAdditionalKeys mapIndexWithAdditionalKeys(IColumn & column, size_t dict_size) + { + if (auto * data_uint8 = getIndexesData(column)) + return mapIndexWithAdditionalKeys(*data_uint8, dict_size); + else if (auto * data_uint16 = getIndexesData(column)) + return mapIndexWithAdditionalKeys(*data_uint16, dict_size); + else if (auto * data_uint32 = getIndexesData(column)) + return mapIndexWithAdditionalKeys(*data_uint32, dict_size); + else if (auto * data_uint64 = getIndexesData(column)) + return mapIndexWithAdditionalKeys(*data_uint64, dict_size); + else + throw Exception("Indexes column for mapIndexWithAdditionalKeys must be UInt, got" + column.getName(), + ErrorCodes::LOGICAL_ERROR); + } +} + +void SerializationLowCardinality::serializeBinaryBulkWithMultipleStreams( + const IColumn & column, + size_t offset, + size_t limit, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + settings.path.push_back(Substream::DictionaryKeys); + auto * keys_stream = settings.getter(settings.path); + settings.path.back() = Substream::DictionaryIndexes; + auto * indexes_stream = settings.getter(settings.path); + settings.path.pop_back(); + + if (!keys_stream && !indexes_stream) + return; + + if (!keys_stream) + throw Exception("Got empty stream for SerializationLowCardinality keys.", ErrorCodes::LOGICAL_ERROR); + + if (!indexes_stream) + throw Exception("Got empty stream for SerializationLowCardinality indexes.", ErrorCodes::LOGICAL_ERROR); + + const ColumnLowCardinality & low_cardinality_column = typeid_cast(column); + + auto * low_cardinality_state = checkAndGetLowCardinalitySerializeState(state); + auto & global_dictionary = low_cardinality_state->shared_dictionary; + KeysSerializationVersion::checkVersion(low_cardinality_state->key_version.value); + + bool need_update_dictionary = global_dictionary == nullptr; + if (need_update_dictionary) + global_dictionary = DataTypeLowCardinality::createColumnUnique(*dictionary_type); + + size_t max_limit = column.size() - offset; + limit = limit ? std::min(limit, max_limit) : max_limit; + + /// Do not write anything for empty column. (May happen while writing empty arrays.) + if (limit == 0) + return; + + auto sub_column = low_cardinality_column.cutAndCompact(offset, limit); + ColumnPtr positions = sub_column->getIndexesPtr(); + ColumnPtr keys = sub_column->getDictionary().getNestedColumn(); + + if (settings.low_cardinality_max_dictionary_size) + { + /// Insert used_keys into global dictionary and update sub_index. 
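/// Keys that do not fit under low_cardinality_max_dictionary_size come back in
/// overflowed_keys and are later written into the granule as additional keys.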
+ auto indexes_with_overflow = global_dictionary->uniqueInsertRangeWithOverflow(*keys, 0, keys->size(), + settings.low_cardinality_max_dictionary_size); + size_t max_size = settings.low_cardinality_max_dictionary_size + indexes_with_overflow.overflowed_keys->size(); + ColumnLowCardinality::Index(indexes_with_overflow.indexes->getPtr()).check(max_size); + + if (global_dictionary->size() > settings.low_cardinality_max_dictionary_size) + throw Exception("Got dictionary with size " + toString(global_dictionary->size()) + + " but max dictionary size is " + toString(settings.low_cardinality_max_dictionary_size), + ErrorCodes::LOGICAL_ERROR); + + positions = indexes_with_overflow.indexes->index(*positions, 0); + keys = std::move(indexes_with_overflow.overflowed_keys); + + if (global_dictionary->size() < settings.low_cardinality_max_dictionary_size && !keys->empty()) + throw Exception("Has additional keys, but dict size is " + toString(global_dictionary->size()) + + " which is less then max dictionary size (" + toString(settings.low_cardinality_max_dictionary_size) + ")", + ErrorCodes::LOGICAL_ERROR); + } + + if (const auto * nullable_keys = checkAndGetColumn(*keys)) + keys = nullable_keys->getNestedColumnPtr(); + + bool need_additional_keys = !keys->empty(); + bool need_dictionary = settings.low_cardinality_max_dictionary_size != 0; + bool need_write_dictionary = !settings.low_cardinality_use_single_dictionary_for_part + && global_dictionary->size() >= settings.low_cardinality_max_dictionary_size; + + IndexesSerializationType index_version(*positions, need_additional_keys, need_dictionary, need_update_dictionary); + index_version.serialize(*indexes_stream); + + if (need_write_dictionary) + { + const auto & nested_column = global_dictionary->getNestedNotNullableColumn(); + UInt64 num_keys = nested_column->size(); + writeIntBinary(num_keys, *keys_stream); + dict_inner_serialization->serializeBinaryBulk(*nested_column, *keys_stream, 0, num_keys); + low_cardinality_state->shared_dictionary = nullptr; + } + + if (need_additional_keys) + { + UInt64 num_keys = keys->size(); + writeIntBinary(num_keys, *indexes_stream); + dict_inner_serialization->serializeBinaryBulk(*keys, *indexes_stream, 0, num_keys); + } + + UInt64 num_rows = positions->size(); + writeIntBinary(num_rows, *indexes_stream); + auto index_serialization = index_version.getDataType()->getDefaultSerialization(); + index_serialization->serializeBinaryBulk(*positions, *indexes_stream, 0, num_rows); +} + +void SerializationLowCardinality::deserializeBinaryBulkWithMultipleStreams( + ColumnPtr & column, + size_t limit, + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state, + SubstreamsCache * /* cache */) const +{ + auto mutable_column = column->assumeMutable(); + ColumnLowCardinality & low_cardinality_column = typeid_cast(*mutable_column); + + settings.path.push_back(Substream::DictionaryKeys); + auto * keys_stream = settings.getter(settings.path); + settings.path.back() = Substream::DictionaryIndexes; + auto * indexes_stream = settings.getter(settings.path); + settings.path.pop_back(); + + if (!keys_stream && !indexes_stream) + return; + + if (!keys_stream) + throw Exception("Got empty stream for SerializationLowCardinality keys.", ErrorCodes::LOGICAL_ERROR); + + if (!indexes_stream) + throw Exception("Got empty stream for SerializationLowCardinality indexes.", ErrorCodes::LOGICAL_ERROR); + + auto * low_cardinality_state = checkAndGetLowCardinalityDeserializeState(state); + 
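/// Validate the serialization version read in the stream prefix before
/// touching any per-granule data.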
KeysSerializationVersion::checkVersion(low_cardinality_state->key_version.value); + + auto read_dictionary = [this, low_cardinality_state, keys_stream]() + { + UInt64 num_keys; + readIntBinary(num_keys, *keys_stream); + + auto keys_type = removeNullable(dictionary_type); + auto global_dict_keys = keys_type->createColumn(); + dict_inner_serialization->deserializeBinaryBulk(*global_dict_keys, *keys_stream, num_keys, 0); + + auto column_unique = DataTypeLowCardinality::createColumnUnique(*dictionary_type, std::move(global_dict_keys)); + low_cardinality_state->global_dictionary = std::move(column_unique); + }; + + auto read_additional_keys = [this, low_cardinality_state, indexes_stream]() + { + UInt64 num_keys; + readIntBinary(num_keys, *indexes_stream); + auto keys_type = removeNullable(dictionary_type); + auto additional_keys = keys_type->createColumn(); + dict_inner_serialization->deserializeBinaryBulk(*additional_keys, *indexes_stream, num_keys, 0); + low_cardinality_state->additional_keys = std::move(additional_keys); + + if (!low_cardinality_state->index_type.need_global_dictionary && dictionary_type->isNullable()) + { + auto null_map = ColumnUInt8::create(num_keys, 0); + if (num_keys) + null_map->getElement(0) = 1; + + low_cardinality_state->null_map = std::move(null_map); + } + }; + + auto read_indexes = [this, low_cardinality_state, indexes_stream, &low_cardinality_column](UInt64 num_rows) + { + auto indexes_type = low_cardinality_state->index_type.getDataType(); + MutableColumnPtr indexes_column = indexes_type->createColumn(); + indexes_type->getDefaultSerialization()->deserializeBinaryBulk(*indexes_column, *indexes_stream, num_rows, 0); + + auto & global_dictionary = low_cardinality_state->global_dictionary; + const auto & additional_keys = low_cardinality_state->additional_keys; + + bool has_additional_keys = low_cardinality_state->index_type.has_additional_keys; + bool column_is_empty = low_cardinality_column.empty(); + + if (!low_cardinality_state->index_type.need_global_dictionary) + { + ColumnPtr keys_column = additional_keys; + if (low_cardinality_state->null_map) + keys_column = ColumnNullable::create(additional_keys, low_cardinality_state->null_map); + low_cardinality_column.insertRangeFromDictionaryEncodedColumn(*keys_column, *indexes_column); + } + else if (!has_additional_keys) + { + if (column_is_empty) + low_cardinality_column.setSharedDictionary(global_dictionary); + + auto local_column = ColumnLowCardinality::create(global_dictionary, std::move(indexes_column)); + low_cardinality_column.insertRangeFrom(*local_column, 0, num_rows); + } + else + { + auto maps = mapIndexWithAdditionalKeys(*indexes_column, global_dictionary->size()); + + ColumnLowCardinality::Index(maps.additional_keys_map->getPtr()).check(additional_keys->size()); + + ColumnLowCardinality::Index(indexes_column->getPtr()).check( + maps.dictionary_map->size() + maps.additional_keys_map->size()); + + auto used_keys = IColumn::mutate(global_dictionary->getNestedColumn()->index(*maps.dictionary_map, 0)); + + if (!maps.additional_keys_map->empty()) + { + auto used_add_keys = additional_keys->index(*maps.additional_keys_map, 0); + + if (dictionary_type->isNullable()) + { + ColumnPtr null_map = ColumnUInt8::create(used_add_keys->size(), 0); + used_add_keys = ColumnNullable::create(used_add_keys, null_map); + } + + used_keys->insertRangeFrom(*used_add_keys, 0, used_add_keys->size()); + } + + low_cardinality_column.insertRangeFromDictionaryEncodedColumn(*used_keys, *indexes_column); + } + }; + + if 
(!settings.continuous_reading) + { + low_cardinality_state->num_pending_rows = 0; + + /// Remember in state that some granules were skipped and we need to update dictionary. + low_cardinality_state->need_update_dictionary = true; + } + + while (limit) + { + if (low_cardinality_state->num_pending_rows == 0) + { + if (indexes_stream->eof()) + break; + + auto & index_type = low_cardinality_state->index_type; + auto & global_dictionary = low_cardinality_state->global_dictionary; + + index_type.deserialize(*indexes_stream); + + bool need_update_dictionary = + !global_dictionary || index_type.need_update_dictionary || low_cardinality_state->need_update_dictionary; + if (index_type.need_global_dictionary && need_update_dictionary) + { + read_dictionary(); + low_cardinality_state->need_update_dictionary = false; + } + + if (low_cardinality_state->index_type.has_additional_keys) + read_additional_keys(); + else + low_cardinality_state->additional_keys = nullptr; + + readIntBinary(low_cardinality_state->num_pending_rows, *indexes_stream); + } + + size_t num_rows_to_read = std::min(limit, low_cardinality_state->num_pending_rows); + read_indexes(num_rows_to_read); + limit -= num_rows_to_read; + low_cardinality_state->num_pending_rows -= num_rows_to_read; + } + + column = std::move(mutable_column); +} + +void SerializationLowCardinality::serializeBinary(const Field & field, WriteBuffer & ostr) const +{ + dictionary_type->getDefaultSerialization()->serializeBinary(field, ostr); +} +void SerializationLowCardinality::deserializeBinary(Field & field, ReadBuffer & istr) const +{ + dictionary_type->getDefaultSerialization()->deserializeBinary(field, istr); +} + +void SerializationLowCardinality::serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const +{ + serializeImpl(column, row_num, &ISerialization::serializeBinary, ostr); +} +void SerializationLowCardinality::deserializeBinary(IColumn & column, ReadBuffer & istr) const +{ + deserializeImpl(column, &ISerialization::deserializeBinary, istr); +} + +void SerializationLowCardinality::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + serializeImpl(column, row_num, &ISerialization::serializeTextEscaped, ostr, settings); +} + +void SerializationLowCardinality::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + deserializeImpl(column, &ISerialization::deserializeTextEscaped, istr, settings); +} + +void SerializationLowCardinality::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + serializeImpl(column, row_num, &ISerialization::serializeTextQuoted, ostr, settings); +} + +void SerializationLowCardinality::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + deserializeImpl(column, &ISerialization::deserializeTextQuoted, istr, settings); +} + +void SerializationLowCardinality::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + deserializeImpl(column, &ISerialization::deserializeWholeText, istr, settings); +} + +void SerializationLowCardinality::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + serializeImpl(column, row_num, &ISerialization::serializeTextCSV, ostr, settings); +} + +void SerializationLowCardinality::deserializeTextCSV(IColumn & column, ReadBuffer & istr, 
const FormatSettings & settings) const +{ + deserializeImpl(column, &ISerialization::deserializeTextCSV, istr, settings); +} + +void SerializationLowCardinality::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + serializeImpl(column, row_num, &ISerialization::serializeText, ostr, settings); +} + +void SerializationLowCardinality::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + serializeImpl(column, row_num, &ISerialization::serializeTextJSON, ostr, settings); +} +void SerializationLowCardinality::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + deserializeImpl(column, &ISerialization::deserializeTextJSON, istr, settings); +} + +void SerializationLowCardinality::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + serializeImpl(column, row_num, &ISerialization::serializeTextXML, ostr, settings); +} + +template +void SerializationLowCardinality::serializeImpl( + const IColumn & column, size_t row_num, SerializationLowCardinality::SerializeFunctionPtr func, Args &&... args) const +{ + const auto & low_cardinality_column = getColumnLowCardinality(column); + size_t unique_row_number = low_cardinality_column.getIndexes().getUInt(row_num); + auto serialization = dictionary_type->getDefaultSerialization(); + (serialization.get()->*func)(*low_cardinality_column.getDictionary().getNestedColumn(), unique_row_number, std::forward(args)...); +} + +template +void SerializationLowCardinality::deserializeImpl( + IColumn & column, SerializationLowCardinality::DeserializeFunctionPtr func, Args &&... args) const +{ + auto & low_cardinality_column= getColumnLowCardinality(column); + auto temp_column = low_cardinality_column.getDictionary().getNestedColumn()->cloneEmpty(); + + auto serialization = dictionary_type->getDefaultSerialization(); + (serialization.get()->*func)(*temp_column, std::forward(args)...); + + low_cardinality_column.insertFromFullColumn(*temp_column, 0); +} + +} diff --git a/src/DataTypes/Serializations/SerializationLowCardinality.h b/src/DataTypes/Serializations/SerializationLowCardinality.h new file mode 100644 index 0000000000000000000000000000000000000000..e9ca0349e38e61a54dd902dd39895aeeaf226d07 --- /dev/null +++ b/src/DataTypes/Serializations/SerializationLowCardinality.h @@ -0,0 +1,81 @@ +#pragma once + +#include + +namespace DB +{ + +class IDataType; +using DataTypePtr = std::shared_ptr; + +class SerializationLowCardinality : public ISerialization +{ +private: + DataTypePtr dictionary_type; + SerializationPtr dict_inner_serialization; + +public: + SerializationLowCardinality(const DataTypePtr & dictionary_type); + + void enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const override; + + void serializeBinaryBulkStatePrefix( + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void serializeBinaryBulkStateSuffix( + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void deserializeBinaryBulkStatePrefix( + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state) const override; + + void serializeBinaryBulkWithMultipleStreams( + const IColumn & column, + size_t offset, + size_t limit, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + 
void deserializeBinaryBulkWithMultipleStreams( + ColumnPtr & column, + size_t limit, + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state, + SubstreamsCache * cache) const override; + + void serializeBinary(const Field & field, WriteBuffer & ostr) const override; + void deserializeBinary(Field & field, ReadBuffer & istr) const override; + void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const override; + void deserializeBinary(IColumn & column, ReadBuffer & istr) const override; + void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + +private: + template + using SerializeFunctionPtr = void (ISerialization::*)(const IColumn &, size_t, Params ...) const; + + template + void serializeImpl(const IColumn & column, size_t row_num, SerializeFunctionPtr func, Args &&... args) const; + + template + using DeserializeFunctionPtr = void (ISerialization::*)(IColumn &, Params ...) const; + + template + void deserializeImpl(IColumn & column, DeserializeFunctionPtr func, Args &&... 
args) const; + + // template + // static MutableColumnUniquePtr createColumnUniqueImpl(const IDataType & keys_type, const Creator & creator); +}; + +} diff --git a/src/DataTypes/Serializations/SerializationMap.cpp b/src/DataTypes/Serializations/SerializationMap.cpp new file mode 100644 index 0000000000000000000000000000000000000000..26b473c9d0a15b77fc26cc6aff8756fe95f6fb21 --- /dev/null +++ b/src/DataTypes/Serializations/SerializationMap.cpp @@ -0,0 +1,291 @@ +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int CANNOT_READ_MAP_FROM_TEXT; +} + +SerializationMap::SerializationMap(const SerializationPtr & key_, const SerializationPtr & value_, const SerializationPtr & nested_) + : key(key_), value(value_), nested(nested_) +{ +} + +static const IColumn & extractNestedColumn(const IColumn & column) +{ + return assert_cast(column).getNestedColumn(); +} + +static IColumn & extractNestedColumn(IColumn & column) +{ + return assert_cast(column).getNestedColumn(); +} + +void SerializationMap::serializeBinary(const Field & field, WriteBuffer & ostr) const +{ + const auto & map = get(field); + writeVarUInt(map.size(), ostr); + for (const auto & elem : map) + { + const auto & tuple = elem.safeGet(); + assert(tuple.size() == 2); + key->serializeBinary(tuple[0], ostr); + value->serializeBinary(tuple[1], ostr); + } +} + +void SerializationMap::deserializeBinary(Field & field, ReadBuffer & istr) const +{ + size_t size; + readVarUInt(size, istr); + field = Map(size); + for (auto & elem : field.get()) + { + Tuple tuple(2); + key->deserializeBinary(tuple[0], istr); + value->deserializeBinary(tuple[1], istr); + elem = std::move(tuple); + } +} + +void SerializationMap::serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const +{ + nested->serializeBinary(extractNestedColumn(column), row_num, ostr); +} + +void SerializationMap::deserializeBinary(IColumn & column, ReadBuffer & istr) const +{ + nested->deserializeBinary(extractNestedColumn(column), istr); +} + + +template +void SerializationMap::serializeTextImpl(const IColumn & column, size_t row_num, WriteBuffer & ostr, Writer && writer) const +{ + const auto & column_map = assert_cast(column); + + const auto & nested_array = column_map.getNestedColumn(); + const auto & nested_tuple = column_map.getNestedData(); + const auto & offsets = nested_array.getOffsets(); + + size_t offset = offsets[row_num - 1]; + size_t next_offset = offsets[row_num]; + + writeChar('{', ostr); + for (size_t i = offset; i < next_offset; ++i) + { + if (i != offset) + writeChar(',', ostr); + writer(key, nested_tuple.getColumn(0), i); + writeChar(':', ostr); + writer(value, nested_tuple.getColumn(1), i); + } + writeChar('}', ostr); +} + +template +void SerializationMap::deserializeTextImpl(IColumn & column, ReadBuffer & istr, Reader && reader) const +{ + auto & column_map = assert_cast(column); + + auto & nested_array = column_map.getNestedColumn(); + auto & nested_tuple = column_map.getNestedData(); + auto & offsets = nested_array.getOffsets(); + + auto & key_column = nested_tuple.getColumn(0); + auto & value_column = nested_tuple.getColumn(1); + + size_t size = 0; + assertChar('{', istr); + + try + { + bool first = true; + while (!istr.eof() && *istr.position() != '}') + { + if (!first) + { + if (*istr.position() == ',') + ++istr.position(); + else + throw 
Exception("Cannot read Map from text", ErrorCodes::CANNOT_READ_MAP_FROM_TEXT);
+            }
+
+            first = false;
+
+            skipWhitespaceIfAny(istr);
+
+            if (*istr.position() == '}')
+                break;
+
+            reader(key, key_column);
+            skipWhitespaceIfAny(istr);
+            assertChar(':', istr);
+
+            ++size;
+            skipWhitespaceIfAny(istr);
+            reader(value, value_column);
+
+            skipWhitespaceIfAny(istr);
+        }
+
+        offsets.push_back(offsets.back() + size);
+        assertChar('}', istr);
+    }
+    catch (...)
+    {
+        throw;
+    }
+}
+
+void SerializationMap::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
+{
+    serializeTextImpl(column, row_num, ostr,
+        [&](const SerializationPtr & subcolumn_serialization, const IColumn & subcolumn, size_t pos)
+        {
+            subcolumn_serialization->serializeTextQuoted(subcolumn, pos, ostr, settings);
+        });
+}
+
+void SerializationMap::deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
+{
+    deserializeTextImpl(column, istr,
+        [&](const SerializationPtr & subcolumn_serialization, IColumn & subcolumn)
+        {
+            subcolumn_serialization->deserializeTextQuoted(subcolumn, istr, settings);
+        });
+}
+
+
+void SerializationMap::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
+{
+    serializeTextImpl(column, row_num, ostr,
+        [&](const SerializationPtr & subcolumn_serialization, const IColumn & subcolumn, size_t pos)
+        {
+            subcolumn_serialization->serializeTextJSON(subcolumn, pos, ostr, settings);
+        });
+}
+
+void SerializationMap::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
+{
+    deserializeTextImpl(column, istr,
+        [&](const SerializationPtr & subcolumn_serialization, IColumn & subcolumn)
+        {
+            subcolumn_serialization->deserializeTextJSON(subcolumn, istr, settings);
+        });
+}
+
+void SerializationMap::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
+{
+    const auto & column_map = assert_cast<const ColumnMap &>(column);
+    const auto & offsets = column_map.getNestedColumn().getOffsets();
+
+    size_t offset = offsets[row_num - 1];
+    size_t next_offset = offsets[row_num];
+
+    const auto & nested_data = column_map.getNestedData();
+
+    writeCString("<map>", ostr);
+    for (size_t i = offset; i < next_offset; ++i)
+    {
+        writeCString("<elem>", ostr);
+        writeCString("<key>", ostr);
+        key->serializeTextXML(nested_data.getColumn(0), i, ostr, settings);
+        writeCString("</key>", ostr);
+
+        writeCString("<value>", ostr);
+        value->serializeTextXML(nested_data.getColumn(1), i, ostr, settings);
+        writeCString("</value>", ostr);
+        writeCString("</elem>", ostr);
+    }
+    writeCString("</map>", ostr);
+}
+
+void SerializationMap::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
+{
+    WriteBufferFromOwnString wb;
+    serializeText(column, row_num, wb, settings);
+    writeCSV(wb.str(), ostr);
+}
+
+void SerializationMap::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
+{
+    String s;
+    readCSV(s, istr, settings.csv);
+    ReadBufferFromString rb(s);
+    deserializeText(column, rb, settings);
+}
+
+
+void SerializationMap::enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const
+{
+    nested->enumerateStreams(callback, path);
+}
+
+void SerializationMap::serializeBinaryBulkStatePrefix(
+    SerializeBinaryBulkSettings & settings,
+    SerializeBinaryBulkStatePtr & state) const
+{
+    nested->serializeBinaryBulkStatePrefix(settings, state);
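    /// All of Map's bulk stream methods likewise delegate to `nested`, the
    /// Array(Tuple(key, value)) serialization that physically stores the data.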
+
+
+void SerializationMap::enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const
+{
+    nested->enumerateStreams(callback, path);
+}
+
+void SerializationMap::serializeBinaryBulkStatePrefix(
+    SerializeBinaryBulkSettings & settings,
+    SerializeBinaryBulkStatePtr & state) const
+{
+    nested->serializeBinaryBulkStatePrefix(settings, state);
+}
+
+void SerializationMap::serializeBinaryBulkStateSuffix(
+    SerializeBinaryBulkSettings & settings,
+    SerializeBinaryBulkStatePtr & state) const
+{
+    nested->serializeBinaryBulkStateSuffix(settings, state);
+}
+
+void SerializationMap::deserializeBinaryBulkStatePrefix(
+    DeserializeBinaryBulkSettings & settings,
+    DeserializeBinaryBulkStatePtr & state) const
+{
+    nested->deserializeBinaryBulkStatePrefix(settings, state);
+}
+
+
+void SerializationMap::serializeBinaryBulkWithMultipleStreams(
+    const IColumn & column,
+    size_t offset,
+    size_t limit,
+    SerializeBinaryBulkSettings & settings,
+    SerializeBinaryBulkStatePtr & state) const
+{
+    nested->serializeBinaryBulkWithMultipleStreams(extractNestedColumn(column), offset, limit, settings, state);
+}
+
+void SerializationMap::deserializeBinaryBulkWithMultipleStreams(
+    ColumnPtr & column,
+    size_t limit,
+    DeserializeBinaryBulkSettings & settings,
+    DeserializeBinaryBulkStatePtr & state,
+    SubstreamsCache * cache) const
+{
+    auto & column_map = assert_cast<ColumnMap &>(*column->assumeMutable());
+    nested->deserializeBinaryBulkWithMultipleStreams(column_map.getNestedColumnPtr(), limit, settings, state, cache);
+}
+
+}
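All of the bulk methods above simply forward to 'nested', because a Map column is physically an Array(Tuple(key, value)): one flat sequence of pairs plus cumulative per-row end offsets. A standalone sketch of that layout (illustrative names only, not the actual column classes):

    #include <iostream>
    #include <string>
    #include <utility>
    #include <vector>

    int main()
    {
        /// Two map rows flattened into one pair sequence, as in ColumnMap.
        std::vector<std::pair<std::string, int>> flat{{"a", 1}, {"b", 2}, {"c", 3}};
        std::vector<size_t> offsets{2, 3}; /// row 0 ends at 2, row 1 ends at 3

        size_t begin = 0;
        for (size_t row = 0; row < offsets.size(); ++row)
        {
            std::cout << '{';
            for (size_t i = begin; i < offsets[row]; ++i)
            {
                if (i != begin)
                    std::cout << ',';
                std::cout << '\'' << flat[i].first << "':" << flat[i].second;
            }
            std::cout << "}\n";
            begin = offsets[row];
        }
        /// prints {'a':1,'b':2} then {'c':3}
    }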
diff --git a/src/DataTypes/Serializations/SerializationMap.h b/src/DataTypes/Serializations/SerializationMap.h
new file mode 100644
index 0000000000000000000000000000000000000000..6f72d5c2594ede45125687446d4577a0438090d5
--- /dev/null
+++ b/src/DataTypes/Serializations/SerializationMap.h
@@ -0,0 +1,71 @@
+#pragma once
+
+#include <DataTypes/Serializations/SimpleTextSerialization.h>
+
+
+namespace DB
+{
+
+class SerializationMap final : public SimpleTextSerialization
+{
+private:
+    SerializationPtr key;
+    SerializationPtr value;
+
+    /// 'nested' is an Array(Tuple(key_type, value_type))
+    SerializationPtr nested;
+
+public:
+    SerializationMap(const SerializationPtr & key_type_, const SerializationPtr & value_type_, const SerializationPtr & nested_);
+
+    void serializeBinary(const Field & field, WriteBuffer & ostr) const override;
+    void deserializeBinary(Field & field, ReadBuffer & istr) const override;
+    void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const override;
+    void deserializeBinary(IColumn & column, ReadBuffer & istr) const override;
+    void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override;
+    void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override;
+    void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override;
+    void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override;
+    void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override;
+
+    void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override;
+    void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override;
+
+    void enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const override;
+
+    void serializeBinaryBulkStatePrefix(
+        SerializeBinaryBulkSettings & settings,
+        SerializeBinaryBulkStatePtr & state) const override;
+
+    void serializeBinaryBulkStateSuffix(
+        SerializeBinaryBulkSettings & settings,
+        SerializeBinaryBulkStatePtr & state) const override;
+
+    void deserializeBinaryBulkStatePrefix(
+        DeserializeBinaryBulkSettings & settings,
+        DeserializeBinaryBulkStatePtr & state) const override;
+
+    void serializeBinaryBulkWithMultipleStreams(
+        const IColumn & column,
+        size_t offset,
+        size_t limit,
+        SerializeBinaryBulkSettings & settings,
+        SerializeBinaryBulkStatePtr & state) const override;
+
+    void deserializeBinaryBulkWithMultipleStreams(
+        ColumnPtr & column,
+        size_t limit,
+        DeserializeBinaryBulkSettings & settings,
+        DeserializeBinaryBulkStatePtr & state,
+        SubstreamsCache * cache) const override;
+
+private:
+    template <typename Writer>
+    void serializeTextImpl(const IColumn & column, size_t row_num, WriteBuffer & ostr, Writer && writer) const;
+
+    template <typename Reader>
+    void deserializeTextImpl(IColumn & column, ReadBuffer & istr, Reader && reader) const;
+};
+
+}
+
diff --git a/src/DataTypes/Serializations/SerializationNothing.cpp b/src/DataTypes/Serializations/SerializationNothing.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..6b11ea6d25243cb122cb3f6486e4812566a0219f
--- /dev/null
+++ b/src/DataTypes/Serializations/SerializationNothing.cpp
@@ -0,0 +1,25 @@
+#include <DataTypes/Serializations/SerializationNothing.h>
+#include <Columns/ColumnNothing.h>
+#include <IO/ReadBuffer.h>
+#include <IO/WriteBuffer.h>
+
+namespace DB
+{
+
+void SerializationNothing::serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const
+{
+    size_t size = column.size();
+
+    if (limit == 0 || offset + limit > size)
+        limit = size - offset;
+
+    for (size_t i = 0; i < limit; ++i)
+        ostr.write('0');
+}
+
+void SerializationNothing::deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double /*avg_value_size_hint*/) const
+{
+    typeid_cast<ColumnNothing &>(column).addSize(istr.tryIgnore(limit));
+}
+
+}
diff --git a/src/DataTypes/Serializations/SerializationNothing.h b/src/DataTypes/Serializations/SerializationNothing.h
new file mode 100644
index 0000000000000000000000000000000000000000..a7b26c117bc481ee09bbf9b9dec1e8cb660226b9
--- /dev/null
+++ b/src/DataTypes/Serializations/SerializationNothing.h
@@ -0,0 +1,34 @@
+#pragma once
+
+#include <Common/Exception.h>
+#include <DataTypes/Serializations/SimpleTextSerialization.h>
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int NOT_IMPLEMENTED;
+}
+
+class SerializationNothing : public SimpleTextSerialization
+{
+private:
+    [[noreturn]] void throwNoSerialization() const
+    {
+        throw Exception("Serialization is not implemented", ErrorCodes::NOT_IMPLEMENTED);
+    }
+public:
+    void serializeBinary(const Field &, WriteBuffer &) const override { throwNoSerialization(); }
+    void deserializeBinary(Field &, ReadBuffer &) const override { throwNoSerialization(); }
+    void serializeBinary(const IColumn &, size_t, WriteBuffer &) const override { throwNoSerialization(); }
+    void deserializeBinary(IColumn &, ReadBuffer &) const override { throwNoSerialization(); }
+    void serializeText(const IColumn &, size_t, WriteBuffer &, const FormatSettings &) const override { throwNoSerialization(); }
+    void deserializeText(IColumn &, ReadBuffer &, const FormatSettings &) const override { throwNoSerialization(); }
+
+    /// These methods read and write zero bytes just to allow to figure out size of column.
+    void serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const override;
+    void deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double avg_value_size_hint) const override;
+};
+
+}
diff --git a/src/DataTypes/Serializations/SerializationNullable.cpp b/src/DataTypes/Serializations/SerializationNullable.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..87805c53aa921e4d3d4bc9b059c38aa60a7b1a52
--- /dev/null
+++ b/src/DataTypes/Serializations/SerializationNullable.cpp
@@ -0,0 +1,476 @@
+#include <DataTypes/Serializations/SerializationNullable.h>
+#include <DataTypes/Serializations/SerializationNumber.h>
+
+#include <Columns/ColumnNullable.h>
+#include <Core/Field.h>
+#include <IO/ReadBuffer.h>
+#include <IO/ReadBufferFromMemory.h>
+#include <IO/ReadHelpers.h>
+#include <IO/WriteBuffer.h>
+#include <IO/WriteHelpers.h>
+#include <IO/ConcatReadBuffer.h>
+#include <Common/assert_cast.h>
+#include <Common/typeid_cast.h>
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int CANNOT_READ_ALL_DATA;
+}
+
+void SerializationNullable::enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const
+{
+    path.push_back(Substream::NullMap);
+    callback(path);
+    path.back() = Substream::NullableElements;
+    nested->enumerateStreams(callback, path);
+    path.pop_back();
+}
+
+
+void SerializationNullable::serializeBinaryBulkStatePrefix(
+    SerializeBinaryBulkSettings & settings,
+    SerializeBinaryBulkStatePtr & state) const
+{
+    settings.path.push_back(Substream::NullableElements);
+    nested->serializeBinaryBulkStatePrefix(settings, state);
+    settings.path.pop_back();
+}
+
+
+void SerializationNullable::serializeBinaryBulkStateSuffix(
+    SerializeBinaryBulkSettings & settings,
+    SerializeBinaryBulkStatePtr & state) const
+{
+    settings.path.push_back(Substream::NullableElements);
+    nested->serializeBinaryBulkStateSuffix(settings, state);
+    settings.path.pop_back();
+}
+
+
+void SerializationNullable::deserializeBinaryBulkStatePrefix(
+    DeserializeBinaryBulkSettings & settings,
+    DeserializeBinaryBulkStatePtr & state) const
+{
+    settings.path.push_back(Substream::NullableElements);
+    nested->deserializeBinaryBulkStatePrefix(settings, state);
+    settings.path.pop_back();
+}
+
+
+void SerializationNullable::serializeBinaryBulkWithMultipleStreams(
+    const IColumn & column,
+    size_t offset,
+    size_t limit,
+    SerializeBinaryBulkSettings & settings,
+    SerializeBinaryBulkStatePtr & state) const
+{
+    const ColumnNullable & col = assert_cast<const ColumnNullable &>(column);
+    col.checkConsistency();
+
+    /// First serialize null map.
+    settings.path.push_back(Substream::NullMap);
+    if (auto * stream = settings.getter(settings.path))
+        SerializationNumber<UInt8>().serializeBinaryBulk(col.getNullMapColumn(), *stream, offset, limit);
+
+    /// Then serialize contents of arrays.
+    settings.path.back() = Substream::NullableElements;
+    nested->serializeBinaryBulkWithMultipleStreams(col.getNestedColumn(), offset, limit, settings, state);
+    settings.path.pop_back();
+}
+
+
+void SerializationNullable::deserializeBinaryBulkWithMultipleStreams(
+    ColumnPtr & column,
+    size_t limit,
+    DeserializeBinaryBulkSettings & settings,
+    DeserializeBinaryBulkStatePtr & state,
+    SubstreamsCache * cache) const
+{
+    auto mutable_column = column->assumeMutable();
+    ColumnNullable & col = assert_cast<ColumnNullable &>(*mutable_column);
+
+    settings.path.push_back(Substream::NullMap);
+    if (auto cached_column = getFromSubstreamsCache(cache, settings.path))
+    {
+        col.getNullMapColumnPtr() = cached_column;
+    }
+    else if (auto * stream = settings.getter(settings.path))
+    {
+        SerializationNumber<UInt8>().deserializeBinaryBulk(col.getNullMapColumn(), *stream, limit, 0);
+        addToSubstreamsCache(cache, settings.path, col.getNullMapColumnPtr());
+    }
+
+    settings.path.back() = Substream::NullableElements;
+    nested->deserializeBinaryBulkWithMultipleStreams(col.getNestedColumnPtr(), limit, settings, state, cache);
+    settings.path.pop_back();
+}
+
+
+void SerializationNullable::serializeBinary(const Field & field, WriteBuffer & ostr) const
+{
+    if (field.isNull())
+    {
+        writeBinary(true, ostr);
+    }
+    else
+    {
+        writeBinary(false, ostr);
+        nested->serializeBinary(field, ostr);
+    }
+}
+
+void SerializationNullable::deserializeBinary(Field & field, ReadBuffer & istr) const
+{
+    bool is_null = false;
+    readBinary(is_null, istr);
+    if (!is_null)
+    {
+        nested->deserializeBinary(field, istr);
+    }
+    else
+    {
+        field = Null();
+    }
+}
+
+void SerializationNullable::serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const
+{
+    const ColumnNullable & col = assert_cast<const ColumnNullable &>(column);
+
+    bool is_null = col.isNullAt(row_num);
+    writeBinary(is_null, ostr);
+    if (!is_null)
+        nested->serializeBinary(col.getNestedColumn(), row_num, ostr);
+}
+
+/// Deserialize value into ColumnNullable.
+/// We need to insert both to nested column and to null byte map, or, in case of exception, to not insert at all.
+template <typename ReturnType = void, typename CheckForNull, typename DeserializeNested, typename std::enable_if_t<std::is_same_v<ReturnType, void>, ReturnType>* = nullptr>
+static ReturnType safeDeserialize(
+    IColumn & column, const ISerialization &,
+    CheckForNull && check_for_null, DeserializeNested && deserialize_nested)
+{
+    ColumnNullable & col = assert_cast<ColumnNullable &>(column);
+
+    if (check_for_null())
+    {
+        col.insertDefault();
+    }
+    else
+    {
+        deserialize_nested(col.getNestedColumn());
+
+        try
+        {
+            col.getNullMapData().push_back(0);
+        }
+        catch (...)
+        {
+            col.getNestedColumn().popBack(1);
+            throw;
+        }
+    }
+}
+
+/// Deserialize value into non-nullable column. In case of NULL, insert default value and return false.
+template <typename ReturnType = void, typename CheckForNull, typename DeserializeNested, typename std::enable_if_t<std::is_same_v<ReturnType, bool>, ReturnType>* = nullptr>
+static ReturnType safeDeserialize(
+    IColumn & column, const ISerialization & nested,
+    CheckForNull && check_for_null, DeserializeNested && deserialize_nested)
+{
+    assert(!dynamic_cast<ColumnNullable *>(&column));
+    assert(!dynamic_cast<const SerializationNullable *>(&nested));
+    UNUSED(nested);
+
+    bool insert_default = check_for_null();
+    if (insert_default)
+        column.insertDefault();
+    else
+        deserialize_nested(column);
+    return !insert_default;
+}
+
+
+void SerializationNullable::deserializeBinary(IColumn & column, ReadBuffer & istr) const
+{
+    safeDeserialize(column, *nested,
+        [&istr] { bool is_null = false; readBinary(is_null, istr); return is_null; },
+        [this, &istr] (IColumn & nested_column) { nested->deserializeBinary(nested_column, istr); });
+}
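The two safeDeserialize overloads above are selected purely by the requested ReturnType: the void flavour fills a ColumnNullable (value plus null-map byte), the bool flavour fills a plain column and reports whether a real value was parsed. A minimal standalone sketch of that enable_if dispatch (illustrative, not the actual helper):

    #include <iostream>
    #include <type_traits>

    template <typename ReturnType = void, std::enable_if_t<std::is_same_v<ReturnType, void>, int> = 0>
    ReturnType readValue(bool is_null)
    {
        /// nullable flavour: always inserts, records NULL in the null map
        std::cout << (is_null ? "default + null flag\n" : "value + not-null flag\n");
    }

    template <typename ReturnType, std::enable_if_t<std::is_same_v<ReturnType, bool>, int> = 0>
    ReturnType readValue(bool is_null)
    {
        return !is_null; /// non-nullable flavour: true if a real value was read
    }

    int main()
    {
        readValue(true);                             /// void overload
        std::cout << readValue<bool>(false) << '\n'; /// bool overload, prints 1
    }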
+
+
+void SerializationNullable::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
+{
+    const ColumnNullable & col = assert_cast<const ColumnNullable &>(column);
+
+    if (col.isNullAt(row_num))
+        writeString(settings.tsv.null_representation, ostr);
+    else
+        nested->serializeTextEscaped(col.getNestedColumn(), row_num, ostr, settings);
+}
+
+
+void SerializationNullable::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
+{
+    deserializeTextEscapedImpl(column, istr, settings, nested);
+}
+
+template <typename ReturnType>
+ReturnType SerializationNullable::deserializeTextEscapedImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings,
+    const SerializationPtr & nested)
+{
+    /// Little tricky, because we cannot discriminate null from first character.
+
+    if (istr.eof())
+        throw ParsingException("Unexpected end of stream, while parsing value of Nullable type", ErrorCodes::CANNOT_READ_ALL_DATA);
+
+    /// This is not null, surely.
+    if (*istr.position() != '\\')
+    {
+        return safeDeserialize<ReturnType>(column, *nested,
+            [] { return false; },
+            [&nested, &istr, &settings] (IColumn & nested_column) { nested->deserializeTextEscaped(nested_column, istr, settings); });
+    }
+    else
+    {
+        /// Now we know, that data in buffer starts with backslash.
+        ++istr.position();
+
+        if (istr.eof())
+            throw ParsingException("Unexpected end of stream, while parsing value of Nullable type, after backslash", ErrorCodes::CANNOT_READ_ALL_DATA);
+
+        return safeDeserialize<ReturnType>(column, *nested,
+            [&istr]
+            {
+                if (*istr.position() == 'N')
+                {
+                    ++istr.position();
+                    return true;
+                }
+                return false;
+            },
+            [&nested, &istr, &settings] (IColumn & nested_column)
+            {
+                if (istr.position() != istr.buffer().begin())
+                {
+                    /// We could step back to consume backslash again.
+                    --istr.position();
+                    nested->deserializeTextEscaped(nested_column, istr, settings);
+                }
+                else
+                {
+                    /// Otherwise, we need to place backslash back in front of istr.
+                    ReadBufferFromMemory prefix("\\", 1);
+                    ConcatReadBuffer prepended_istr(prefix, istr);
+
+                    nested->deserializeTextEscaped(nested_column, prepended_istr, settings);
+
+                    /// Synchronise cursor position in original buffer.
+
+                    if (prepended_istr.count() > 1)
+                        istr.position() = prepended_istr.position();
+                }
+            });
+    }
+}
+
+void SerializationNullable::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
+{
+    const ColumnNullable & col = assert_cast<const ColumnNullable &>(column);
+
+    if (col.isNullAt(row_num))
+        writeCString("NULL", ostr);
+    else
+        nested->serializeTextQuoted(col.getNestedColumn(), row_num, ostr, settings);
+}
+
+
+void SerializationNullable::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
+{
+    deserializeTextQuotedImpl(column, istr, settings, nested);
+}
+
+template <typename ReturnType>
+ReturnType SerializationNullable::deserializeTextQuotedImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings,
+    const SerializationPtr & nested)
+{
+    return safeDeserialize<ReturnType>(column, *nested,
+        [&istr]
+        {
+            return checkStringByFirstCharacterAndAssertTheRestCaseInsensitive("NULL", istr);
+        },
+        [&nested, &istr, &settings] (IColumn & nested_column) { nested->deserializeTextQuoted(nested_column, istr, settings); });
+}
+
+
+void SerializationNullable::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
+{
+    deserializeWholeTextImpl(column, istr, settings, nested);
+}
+
+template <typename ReturnType>
+ReturnType SerializationNullable::deserializeWholeTextImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings,
+    const SerializationPtr & nested)
+{
+    return safeDeserialize<ReturnType>(column, *nested,
+        [&istr]
+        {
+            return checkStringByFirstCharacterAndAssertTheRestCaseInsensitive("NULL", istr)
+                || checkStringByFirstCharacterAndAssertTheRest("ᴺᵁᴸᴸ", istr);
+        },
+        [&nested, &istr, &settings] (IColumn & nested_column) { nested->deserializeWholeText(nested_column, istr, settings); });
+}
+
+
+void SerializationNullable::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
+{
+    const ColumnNullable & col = assert_cast<const ColumnNullable &>(column);
+
+    if (col.isNullAt(row_num))
+        writeCString("\\N", ostr);
+    else
+        nested->serializeTextCSV(col.getNestedColumn(), row_num, ostr, settings);
+}
+
+void SerializationNullable::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
+{
+    deserializeTextCSVImpl(column, istr, settings, nested);
+}
+
+template <typename ReturnType>
+ReturnType SerializationNullable::deserializeTextCSVImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings,
+    const SerializationPtr & nested)
+{
+    constexpr char const * null_literal = "NULL";
+    constexpr size_t len = 4;
+    size_t null_prefix_len = 0;
+
+    auto check_for_null = [&istr, &settings, &null_prefix_len]
+    {
+        if (checkStringByFirstCharacterAndAssertTheRest("\\N", istr))
+            return true;
+        if (!settings.csv.unquoted_null_literal_as_null)
+            return false;
+
+        /// Check for unquoted NULL
+        while (!istr.eof() && null_prefix_len < len && null_literal[null_prefix_len] == *istr.position())
+        {
+            ++null_prefix_len;
+            ++istr.position();
+        }
+        if (null_prefix_len == len)
+            return true;
+
+        /// Value and "NULL" have common prefix, but value is not "NULL".
+        /// Restore previous buffer position if possible.
+        if (null_prefix_len <= istr.offset())
+        {
+            istr.position() -= null_prefix_len;
+            null_prefix_len = 0;
+        }
+        return false;
+    };
+
+    auto deserialize_nested = [&nested, &settings, &istr, &null_prefix_len] (IColumn & nested_column)
+    {
+        if (likely(!null_prefix_len))
+            nested->deserializeTextCSV(nested_column, istr, settings);
+        else
+        {
+            /// Previous buffer position was not restored,
+            /// so we need to prepend extracted characters (rare case)
+            ReadBufferFromMemory prepend(null_literal, null_prefix_len);
+            ConcatReadBuffer buf(prepend, istr);
+            nested->deserializeTextCSV(nested_column, buf, settings);
+
+            /// Check if all extracted characters were read by nested parser and update buffer position
+            if (null_prefix_len < buf.count())
+                istr.position() = buf.position();
+            else if (null_prefix_len > buf.count())
+            {
+                /// It can happen only if there is an unquoted string instead of a number
+                /// or if someone uses 'U' or 'L' as delimiter in CSV.
+                /// In the first case we cannot continue reading anyway. The second case seems to be unlikely.
+                if (settings.csv.delimiter == 'U' || settings.csv.delimiter == 'L')
+                    throw DB::ParsingException("Enabled setting input_format_csv_unquoted_null_literal_as_null may not work correctly "
+                                               "with format_csv_delimiter = 'U' or 'L' for large input.", ErrorCodes::CANNOT_READ_ALL_DATA);
+                WriteBufferFromOwnString parsed_value;
+                nested->serializeTextCSV(nested_column, nested_column.size() - 1, parsed_value, settings);
+                throw DB::ParsingException("Error while parsing \"" + std::string(null_literal, null_prefix_len)
+                                           + std::string(istr.position(), std::min(size_t{10}, istr.available())) + "\" as Nullable"
+                                           + " at position " + std::to_string(istr.count()) + ": got \"" + std::string(null_literal, buf.count())
+                                           + "\", which was deserialized as \""
+                                           + parsed_value.str() + "\". It seems that input data is ill-formatted.",
+                                           ErrorCodes::CANNOT_READ_ALL_DATA);
+            }
+        }
+    };
+
+    return safeDeserialize<ReturnType>(column, *nested, check_for_null, deserialize_nested);
+}
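When the probe for an unquoted NULL consumes a few characters and the value turns out to be something else, the code above either rewinds the buffer (cheap path, prefix still inside the buffer) or prepends the consumed characters via ConcatReadBuffer. A standalone sketch of the rewind decision (illustrative only):

    #include <cstddef>
    #include <iostream>
    #include <string>

    struct Cursor
    {
        const std::string & buf;
        size_t pos = 0; /// current read position
        bool canRewind(size_t n) const { return n <= pos; }
    };

    int main()
    {
        std::string buffer = "NUMBERS,1,2";
        Cursor cur{buffer, 2}; /// consumed "NU" while probing for "NULL"
        if (cur.canRewind(2))
            cur.pos -= 2;      /// cheap path: just restore the position
        std::cout << buffer.substr(cur.pos) << '\n'; /// parse "NUMBERS,1,2" normally
    }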
+
+void SerializationNullable::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
+{
+    const ColumnNullable & col = assert_cast<const ColumnNullable &>(column);
+
+    /// In simple text format (like 'Pretty' format) (these formats are suitable only for output and cannot be parsed back),
+    /// data is printed without escaping.
+    /// It makes theoretically impossible to distinguish between NULL and some string value, regardless on how do we print NULL.
+    /// For this reason, we output NULL in a bit strange way.
+    /// This assumes UTF-8 and proper font support. This is Ok, because Pretty formats are "presentational", not for data exchange.
+
+    if (col.isNullAt(row_num))
+    {
+        if (settings.pretty.charset == FormatSettings::Pretty::Charset::UTF8)
+            writeCString("ᴺᵁᴸᴸ", ostr);
+        else
+            writeCString("NULL", ostr);
+    }
+    else
+        nested->serializeText(col.getNestedColumn(), row_num, ostr, settings);
+}
+
+void SerializationNullable::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
+{
+    const ColumnNullable & col = assert_cast<const ColumnNullable &>(column);
+
+    if (col.isNullAt(row_num))
+        writeCString("null", ostr);
+    else
+        nested->serializeTextJSON(col.getNestedColumn(), row_num, ostr, settings);
+}
+
+void SerializationNullable::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
+{
+    deserializeTextJSONImpl(column, istr, settings, nested);
+}
+
+template <typename ReturnType>
+ReturnType SerializationNullable::deserializeTextJSONImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings,
+    const SerializationPtr & nested)
+{
+    return safeDeserialize<ReturnType>(column, *nested,
+        [&istr] { return checkStringByFirstCharacterAndAssertTheRest("null", istr); },
+        [&nested, &istr, &settings] (IColumn & nested_column) { nested->deserializeTextJSON(nested_column, istr, settings); });
+}
+
+void SerializationNullable::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
+{
+    const ColumnNullable & col = assert_cast<const ColumnNullable &>(column);
+
+    if (col.isNullAt(row_num))
+        writeCString("\\N", ostr);
+    else
+        nested->serializeTextXML(col.getNestedColumn(), row_num, ostr, settings);
+}
+
+template bool SerializationNullable::deserializeWholeTextImpl<bool>(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested);
+template bool SerializationNullable::deserializeTextEscapedImpl<bool>(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested);
+template bool SerializationNullable::deserializeTextQuotedImpl<bool>(IColumn & column, ReadBuffer & istr, const FormatSettings &, const SerializationPtr & nested);
+template bool SerializationNullable::deserializeTextCSVImpl<bool>(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested);
+template bool SerializationNullable::deserializeTextJSONImpl<bool>(IColumn & column, ReadBuffer & istr, const FormatSettings &, const SerializationPtr & nested);
+
+}
diff --git a/src/DataTypes/Serializations/SerializationNullable.h b/src/DataTypes/Serializations/SerializationNullable.h
new file mode 100644
index 0000000000000000000000000000000000000000..b0b96c021d3feccd7d6f6f3aac7d9365ed6a3d7f
--- /dev/null
+++ b/src/DataTypes/Serializations/SerializationNullable.h
@@ -0,0 +1,85 @@
+#pragma once
+
+#include <DataTypes/Serializations/ISerialization.h>
+
+namespace DB
+{
+
+class SerializationNullable : public ISerialization
+{
+private:
+    SerializationPtr nested;
+
+public:
+    SerializationNullable(const SerializationPtr & nested_) : nested(nested_) {}
+
+    void enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const override;
+
+    void serializeBinaryBulkStatePrefix(
+        SerializeBinaryBulkSettings & settings,
+        SerializeBinaryBulkStatePtr & state) const override;
+
+    void serializeBinaryBulkStateSuffix(
+        SerializeBinaryBulkSettings & settings,
+        SerializeBinaryBulkStatePtr & state) const override;
+
+    void deserializeBinaryBulkStatePrefix(
+        DeserializeBinaryBulkSettings & settings,
+        DeserializeBinaryBulkStatePtr & state) const override;
+
+    void serializeBinaryBulkWithMultipleStreams(
+        const IColumn & column,
+        size_t offset,
+        size_t limit,
+        SerializeBinaryBulkSettings & settings,
+        SerializeBinaryBulkStatePtr & state) const override;
+
+    void deserializeBinaryBulkWithMultipleStreams(
+        ColumnPtr & column,
+        size_t limit,
+        DeserializeBinaryBulkSettings & settings,
+        DeserializeBinaryBulkStatePtr & state,
+        SubstreamsCache * cache) const override;
+
+    void serializeBinary(const Field & field, WriteBuffer & ostr) const override;
+    void deserializeBinary(Field & field, ReadBuffer & istr) const override;
+    void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const override;
+    void deserializeBinary(IColumn & column, ReadBuffer & istr) const override;
+    void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override;
+    void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override;
+    void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override;
+    void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override;
+    void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override;
+
+    void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override;
+
+    /** It is questionable, how NULL values could be represented in CSV. There are three variants:
+      * 1. \N
+      * 2. empty string (without quotes)
+      * 3. NULL
+      * We support all of them (however, second variant is supported by CSVRowInputStream, not by deserializeTextCSV).
+      * (see also input_format_defaults_for_omitted_fields and input_format_csv_unquoted_null_literal_as_null settings)
+      * In CSV, non-NULL string value, starting with \N characters, must be placed in quotes, to avoid ambiguity.
+      */
+    void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
+
+    void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override;
+    void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override;
+    void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override;
+    void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override;
+
+    /// If ReturnType is bool, check for NULL and deserialize value into non-nullable column (and return true) or insert default value of nested type (and return false)
+    /// If ReturnType is void, deserialize Nullable(T)
+    template <typename ReturnType = bool>
+    static ReturnType deserializeWholeTextImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested);
+    template <typename ReturnType = bool>
+    static ReturnType deserializeTextEscapedImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested);
+    template <typename ReturnType = bool>
+    static ReturnType deserializeTextQuotedImpl(IColumn & column, ReadBuffer & istr, const FormatSettings &, const SerializationPtr & nested);
+    template <typename ReturnType = bool>
+    static ReturnType deserializeTextCSVImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested);
+    template <typename ReturnType = bool>
+    static ReturnType deserializeTextJSONImpl(IColumn & column, ReadBuffer & istr, const FormatSettings &, const SerializationPtr & nested);
+};
+
+}
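A Nullable column is stored as two parallel pieces: a null map with one byte per row, and the nested values column where a default value occupies the slot of every NULL (which is why deserializeBinaryBulkWithMultipleStreams above reads the NullMap substream before the elements). Standalone sketch of reading row by row under that layout (illustrative):

    #include <iostream>
    #include <vector>

    int main()
    {
        std::vector<unsigned char> null_map{0, 1, 0};
        std::vector<int> values{10, 0, 30}; /// values[1] is only a placeholder

        for (size_t i = 0; i < null_map.size(); ++i)
        {
            if (null_map[i])
                std::cout << "NULL\n";
            else
                std::cout << values[i] << '\n';
        }
    }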
diff --git a/src/DataTypes/Serializations/SerializationNumber.cpp b/src/DataTypes/Serializations/SerializationNumber.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b0a91b11716d5ff20e6d0acdeb2f26b6cd8e038e
--- /dev/null
+++ b/src/DataTypes/Serializations/SerializationNumber.cpp
@@ -0,0 +1,215 @@
+#include <DataTypes/Serializations/SerializationNumber.h>
+#include <Columns/ColumnVector.h>
+#include <Columns/ColumnConst.h>
+#include <Core/Field.h>
+#include <Formats/FormatSettings.h>
+#include <Formats/ProtobufReader.h>
+#include <Formats/ProtobufWriter.h>
+#include <IO/ReadHelpers.h>
+#include <IO/WriteHelpers.h>
+#include <Common/NaNUtils.h>
+#include <Common/typeid_cast.h>
+#include <Common/assert_cast.h>
+
+namespace DB
+{
+
+template <typename T>
+void SerializationNumber<T>::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const
+{
+    writeText(assert_cast<const ColumnVector<T> &>(column).getData()[row_num], ostr);
+}
+
+template <typename T>
+void SerializationNumber<T>::deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const
+{
+    T x;
+
+    if constexpr (is_integer_v<T> && is_arithmetic_v<T>)
+        readIntTextUnsafe(x, istr);
+    else
+        readText(x, istr);
+
+    assert_cast<ColumnVector<T> &>(column).getData().push_back(x);
+}
+
+template <typename T>
+static inline void writeDenormalNumber(T x, WriteBuffer & ostr)
+{
+    if constexpr (std::is_floating_point_v<T>)
+    {
+        if (std::signbit(x))
+        {
+            if (isNaN(x))
+                writeCString("-nan", ostr);
+            else
+                writeCString("-inf", ostr);
+        }
+        else
+        {
+            if (isNaN(x))
+                writeCString("nan", ostr);
+            else
+                writeCString("inf", ostr);
+        }
+    }
+    else
+    {
+        /// This function is not called for non floating point numbers.
+        (void)x;
+    }
+}
+
+
+template <typename T>
+void SerializationNumber<T>::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
+{
+    auto x = assert_cast<const ColumnVector<T> &>(column).getData()[row_num];
+    bool is_finite = isFinite(x);
+
+    const bool need_quote = (is_integer_v<T> && (sizeof(T) >= 8) && settings.json.quote_64bit_integers)
+        || (settings.json.quote_denormals && !is_finite);
+
+    if (need_quote)
+        writeChar('"', ostr);
+
+    if (is_finite)
+        writeText(x, ostr);
+    else if (!settings.json.quote_denormals)
+        writeCString("null", ostr);
+    else
+        writeDenormalNumber(x, ostr);
+
+    if (need_quote)
+        writeChar('"', ostr);
+}
+
+template <typename T>
+void SerializationNumber<T>::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const
+{
+    bool has_quote = false;
+    if (!istr.eof() && *istr.position() == '"') /// We understand the number both in quotes and without.
+    {
+        has_quote = true;
+        ++istr.position();
+    }
+
+    FieldType x;
+
+    /// null
+    if (!has_quote && !istr.eof() && *istr.position() == 'n')
+    {
+        ++istr.position();
+        assertString("ull", istr);
+
+        x = NaNOrZero<T>();
+    }
+    else
+    {
+        static constexpr bool is_uint8 = std::is_same_v<T, UInt8>;
+        static constexpr bool is_int8 = std::is_same_v<T, Int8>;
+
+        if (is_uint8 || is_int8)
+        {
+            // extra conditions to parse true/false strings into 1/0
+            if (istr.eof())
+                throwReadAfterEOF();
+            if (*istr.position() == 't' || *istr.position() == 'f')
+            {
+                bool tmp = false;
+                readBoolTextWord(tmp, istr);
+                x = tmp;
+            }
+            else
+                readText(x, istr);
+        }
+        else
+        {
+            readText(x, istr);
+        }
+
+        if (has_quote)
+            assertChar('"', istr);
+    }
+
+    assert_cast<ColumnVector<T> &>(column).getData().push_back(x);
+}
+
+template <typename T>
+void SerializationNumber<T>::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const
+{
+    FieldType x;
+    readCSV(x, istr);
+    assert_cast<ColumnVector<T> &>(column).getData().push_back(x);
+}
+
+template <typename T>
+void SerializationNumber<T>::serializeBinary(const Field & field, WriteBuffer & ostr) const
+{
+    /// ColumnVector<T>::ValueType is a narrower type. For example, UInt8, when the Field type is UInt64
+    typename ColumnVector<T>::ValueType x = get<NearestFieldType<FieldType>>(field);
+    writeBinary(x, ostr);
+}
+
+template <typename T>
+void SerializationNumber<T>::deserializeBinary(Field & field, ReadBuffer & istr) const
+{
+    typename ColumnVector<T>::ValueType x;
+    readBinary(x, istr);
+    field = NearestFieldType<FieldType>(x);
+}
+
+template <typename T>
+void SerializationNumber<T>::serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const
+{
+    writeBinary(assert_cast<const ColumnVector<T> &>(column).getData()[row_num], ostr);
+}
+
+template <typename T>
+void SerializationNumber<T>::deserializeBinary(IColumn & column, ReadBuffer & istr) const
+{
+    typename ColumnVector<T>::ValueType x;
+    readBinary(x, istr);
+    assert_cast<ColumnVector<T> &>(column).getData().push_back(x);
+}
+
+template <typename T>
+void SerializationNumber<T>::serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const
+{
+    const typename ColumnVector<T>::Container & x = typeid_cast<const ColumnVector<T> &>(column).getData();
+
+    size_t size = x.size();
+
+    if (limit == 0 || offset + limit > size)
+        limit = size - offset;
+
+    if (limit)
+        ostr.write(reinterpret_cast<const char *>(&x[offset]), sizeof(typename ColumnVector<T>::ValueType) * limit);
+}
+
+template <typename T>
+void SerializationNumber<T>::deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double /*avg_value_size_hint*/) const
+{
+    typename ColumnVector<T>::Container & x = typeid_cast<ColumnVector<T> &>(column).getData();
+    size_t initial_size = x.size();
+    x.resize(initial_size + limit);
+    size_t size = istr.readBig(reinterpret_cast<char *>(&x[initial_size]), sizeof(typename ColumnVector<T>::ValueType) * limit);
+    x.resize(initial_size + size / sizeof(typename ColumnVector<T>::ValueType));
+}
+
+template class SerializationNumber<UInt8>;
+template class SerializationNumber<UInt16>;
+template class SerializationNumber<UInt32>;
+template class SerializationNumber<UInt64>;
+template class SerializationNumber<UInt128>; // base for UUID
+template class SerializationNumber<UInt256>;
+template class SerializationNumber<Int8>;
+template class SerializationNumber<Int16>;
+template class SerializationNumber<Int32>;
+template class SerializationNumber<Int64>;
+template class SerializationNumber<Int128>;
+template class SerializationNumber<Int256>;
+template class SerializationNumber<Float32>;
+template class SerializationNumber<Float64>;
+
+}
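serializeTextJSON above quotes 64-bit integers when quote_64bit_integers is set, since JavaScript consumers lose precision past 2^53. A standalone sketch of that decision for an integer type T (illustrative):

    #include <cstdint>
    #include <iostream>
    #include <string>

    template <typename T>
    std::string toJSON(T x, bool quote_64bit_integers)
    {
        const bool need_quote = sizeof(T) >= 8 && quote_64bit_integers;
        std::string s = std::to_string(x);
        return need_quote ? '"' + s + '"' : s;
    }

    int main()
    {
        std::cout << toJSON<int32_t>(42, true) << '\n';        /// 42
        std::cout << toJSON<int64_t>(1LL << 60, true) << '\n'; /// "1152921504606846976"
    }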
diff --git a/src/DataTypes/Serializations/SerializationNumber.h b/src/DataTypes/Serializations/SerializationNumber.h
new file mode 100644
index 0000000000000000000000000000000000000000..09976a4bc4f2ad456d8761c9cdbfaa71276a8117
--- /dev/null
+++ b/src/DataTypes/Serializations/SerializationNumber.h
@@ -0,0 +1,33 @@
+#pragma once
+
+#include <DataTypes/Serializations/SimpleTextSerialization.h>
+#include <Columns/ColumnVector.h>
+
+namespace DB
+{
+
+template <typename T>
+class SerializationNumber : public SimpleTextSerialization
+{
+    static_assert(IsNumber<T>);
+
+public:
+    using FieldType = T;
+    using ColumnType = ColumnVector<T>;
+
+    void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override;
+    void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
+    void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
+    void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override;
+    void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
+
+    /** Format is platform-dependent. */
+    void serializeBinary(const Field & field, WriteBuffer & ostr) const override;
+    void deserializeBinary(Field & field, ReadBuffer & istr) const override;
+    void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const override;
+    void deserializeBinary(IColumn & column, ReadBuffer & istr) const override;
+    void serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const override;
+    void deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double avg_value_size_hint) const override;
+};
+
+}
diff --git a/src/DataTypes/Serializations/SerializationString.cpp b/src/DataTypes/Serializations/SerializationString.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c3c24ed6749c41167658816f15d212a4dc20a55b
--- /dev/null
+++ b/src/DataTypes/Serializations/SerializationString.cpp
@@ -0,0 +1,300 @@
+#include <DataTypes/Serializations/SerializationString.h>
+
+#include <Columns/ColumnString.h>
+#include <Columns/ColumnConst.h>
+
+#include <Common/typeid_cast.h>
+#include <Common/assert_cast.h>
+
+#include <Core/Field.h>
+
+#include <Formats/FormatSettings.h>
+#include <IO/ReadHelpers.h>
+#include <IO/WriteHelpers.h>
+
+#include <IO/VarInt.h>
+#include <IO/ReadBufferFromString.h>
+#include <IO/WriteBufferFromString.h>
+
+#ifdef __SSE2__
+    #include <emmintrin.h>
+#endif
+
+namespace DB
+{
+
+void SerializationString::serializeBinary(const Field & field, WriteBuffer & ostr) const
+{
+    const String & s = get<const String &>(field);
+    writeVarUInt(s.size(), ostr);
+    writeString(s, ostr);
+}
+
+
+void SerializationString::deserializeBinary(Field & field, ReadBuffer & istr) const
+{
+    UInt64 size;
+    readVarUInt(size, istr);
+    field = String();
+    String & s = get<String &>(field);
+    s.resize(size);
+    istr.readStrict(s.data(), size);
+}
+
+
+void SerializationString::serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const
+{
+    const StringRef & s = assert_cast<const ColumnString &>(column).getDataAt(row_num);
+    writeVarUInt(s.size, ostr);
+    writeString(s, ostr);
+}
+
+
+void SerializationString::deserializeBinary(IColumn & column, ReadBuffer & istr) const
+{
+    ColumnString & column_string = assert_cast<ColumnString &>(column);
+    ColumnString::Chars & data = column_string.getChars();
+    ColumnString::Offsets & offsets = column_string.getOffsets();
+
+    UInt64 size;
+    readVarUInt(size, istr);
+
+    size_t old_chars_size = data.size();
+    size_t offset = old_chars_size + size + 1;
+    offsets.push_back(offset);
+
+    try
+    {
+        data.resize(offset);
+        istr.readStrict(reinterpret_cast<char *>(&data[offset - size - 1]), size);
+        data.back() = 0;
+    }
+    catch (...)
+    {
+        offsets.pop_back();
+        data.resize_assume_reserved(old_chars_size);
+        throw;
+    }
+}
+
+
+void SerializationString::serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const
+{
+    const ColumnString & column_string = typeid_cast<const ColumnString &>(column);
+    const ColumnString::Chars & data = column_string.getChars();
+    const ColumnString::Offsets & offsets = column_string.getOffsets();
+
+    size_t size = column.size();
+    if (!size)
+        return;
+
+    size_t end = limit && offset + limit < size
+        ? offset + limit
+        : size;
+
+    if (offset == 0)
+    {
+        UInt64 str_size = offsets[0] - 1;
+        writeVarUInt(str_size, ostr);
+        ostr.write(reinterpret_cast<const char *>(data.data()), str_size);
+
+        ++offset;
+    }
+
+    for (size_t i = offset; i < end; ++i)
+    {
+        UInt64 str_size = offsets[i] - offsets[i - 1] - 1;
+        writeVarUInt(str_size, ostr);
+        ostr.write(reinterpret_cast<const char *>(&data[offsets[i - 1]]), str_size);
+    }
+}
+
+
+template <int UNROLL_TIMES>
+static NO_INLINE void deserializeBinarySSE2(ColumnString::Chars & data, ColumnString::Offsets & offsets, ReadBuffer & istr, size_t limit)
+{
+    size_t offset = data.size();
+    for (size_t i = 0; i < limit; ++i)
+    {
+        if (istr.eof())
+            break;
+
+        UInt64 size;
+        readVarUInt(size, istr);
+
+        offset += size + 1;
+        offsets.push_back(offset);
+
+        data.resize(offset);
+
+        if (size)
+        {
+#ifdef __SSE2__
+            /// An optimistic branch in which more efficient copying is possible.
+            if (offset + 16 * UNROLL_TIMES <= data.capacity() && istr.position() + size + 16 * UNROLL_TIMES <= istr.buffer().end())
+            {
+                const __m128i * sse_src_pos = reinterpret_cast<const __m128i *>(istr.position());
+                const __m128i * sse_src_end = sse_src_pos + (size + (16 * UNROLL_TIMES - 1)) / 16 / UNROLL_TIMES * UNROLL_TIMES;
+                __m128i * sse_dst_pos = reinterpret_cast<__m128i *>(&data[offset - size - 1]);
+
+                while (sse_src_pos < sse_src_end)
+                {
+                    for (size_t j = 0; j < UNROLL_TIMES; ++j)
+                        _mm_storeu_si128(sse_dst_pos + j, _mm_loadu_si128(sse_src_pos + j));
+
+                    sse_src_pos += UNROLL_TIMES;
+                    sse_dst_pos += UNROLL_TIMES;
+                }
+
+                istr.position() += size;
+            }
+            else
+#endif
+            {
+                istr.readStrict(reinterpret_cast<char *>(&data[offset - size - 1]), size);
+            }
+        }
+
+        data[offset - 1] = 0;
+    }
+}
+
+
+void SerializationString::deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double avg_value_size_hint) const
+{
+    ColumnString & column_string = typeid_cast<ColumnString &>(column);
+    ColumnString::Chars & data = column_string.getChars();
+    ColumnString::Offsets & offsets = column_string.getOffsets();
+
+    double avg_chars_size = 1; /// By default reserve only for empty strings.
+
+    if (avg_value_size_hint && avg_value_size_hint > sizeof(offsets[0]))
+    {
+        /// Randomly selected.
+        constexpr auto avg_value_size_hint_reserve_multiplier = 1.2;
+
+        avg_chars_size = (avg_value_size_hint - sizeof(offsets[0])) * avg_value_size_hint_reserve_multiplier;
+    }
+
+    size_t size_to_reserve = data.size() + std::ceil(limit * avg_chars_size);
+
+    /// Never reserve for too big size.
+    if (size_to_reserve < 256 * 1024 * 1024)
+    {
+        try
+        {
+            data.reserve(size_to_reserve);
+        }
+        catch (Exception & e)
+        {
+            e.addMessage(
+                "(avg_value_size_hint = " + toString(avg_value_size_hint)
+                + ", avg_chars_size = " + toString(avg_chars_size)
+                + ", limit = " + toString(limit) + ")");
+            throw;
+        }
+    }
+
+    offsets.reserve(offsets.size() + limit);
+
+    if (avg_chars_size >= 64)
+        deserializeBinarySSE2<4>(data, offsets, istr, limit);
+    else if (avg_chars_size >= 48)
+        deserializeBinarySSE2<3>(data, offsets, istr, limit);
+    else if (avg_chars_size >= 32)
+        deserializeBinarySSE2<2>(data, offsets, istr, limit);
+    else
+        deserializeBinarySSE2<1>(data, offsets, istr, limit);
+}
+
+
+void SerializationString::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const
+{
+    writeString(assert_cast<const ColumnString &>(column).getDataAt(row_num), ostr);
+}
+
+
+void SerializationString::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const
+{
+    writeEscapedString(assert_cast<const ColumnString &>(column).getDataAt(row_num), ostr);
+}
+
+
+template <typename Reader>
+static inline void read(IColumn & column, Reader && reader)
+{
+    ColumnString & column_string = assert_cast<ColumnString &>(column);
+    ColumnString::Chars & data = column_string.getChars();
+    ColumnString::Offsets & offsets = column_string.getOffsets();
+    size_t old_chars_size = data.size();
+    size_t old_offsets_size = offsets.size();
+    try
+    {
+        reader(data);
+        data.push_back(0);
+        offsets.push_back(data.size());
+    }
+    catch (...)
+    {
+        offsets.resize_assume_reserved(old_offsets_size);
+        data.resize_assume_reserved(old_chars_size);
+        throw;
+    }
+}
+
+
+void SerializationString::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const
+{
+    read(column, [&](ColumnString::Chars & data) { readStringInto(data, istr); });
+}
+
+
+void SerializationString::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const
+{
+    read(column, [&](ColumnString::Chars & data) { readEscapedStringInto(data, istr); });
+}
+
+
+void SerializationString::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const
+{
+    writeQuotedString(assert_cast<const ColumnString &>(column).getDataAt(row_num), ostr);
+}
+
+
+void SerializationString::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const
+{
+    read(column, [&](ColumnString::Chars & data) { readQuotedStringInto<true>(data, istr); });
+}
+
+
+void SerializationString::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
+{
+    writeJSONString(assert_cast<const ColumnString &>(column).getDataAt(row_num), ostr, settings);
+}
+
+
+void SerializationString::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const
+{
+    read(column, [&](ColumnString::Chars & data) { readJSONStringInto(data, istr); });
+}
+
+
+void SerializationString::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const
+{
+    writeXMLStringForTextElement(assert_cast<const ColumnString &>(column).getDataAt(row_num), ostr);
+}
+
+
+void SerializationString::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const
+{
+    writeCSVString<>(assert_cast<const ColumnString &>(column).getDataAt(row_num), ostr);
+}
+
+
+void SerializationString::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
+{
+    read(column, [&](ColumnString::Chars & data) { readCSVStringInto(data, istr, settings.csv); });
+}
+
+
+}
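The binary format above is a varint length followed by the raw bytes of each string (plus the terminating zero byte that ColumnString keeps internally after every value). A standalone sketch of that length-prefixed wire format (illustrative; the semantics of writeVarUInt, not its exact code):

    #include <cstdint>
    #include <iostream>
    #include <string>

    void writeVarUInt(uint64_t x, std::string & out)
    {
        while (x >= 0x80) /// 7 data bits per byte, high bit = continuation
        {
            out.push_back(static_cast<char>((x & 0x7F) | 0x80));
            x >>= 7;
        }
        out.push_back(static_cast<char>(x));
    }

    int main()
    {
        std::string wire;
        for (const std::string & s : {std::string("abc"), std::string("de")})
        {
            writeVarUInt(s.size(), wire);
            wire += s;
        }
        std::cout << wire.size() << '\n'; /// 7 = 1 + 3 + 1 + 2 bytes
    }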
diff --git a/src/DataTypes/Serializations/SerializationString.h b/src/DataTypes/Serializations/SerializationString.h
new file mode 100644
index 0000000000000000000000000000000000000000..ee5de2c18f1d785577cbe9e035f68e582e568449
--- /dev/null
+++ b/src/DataTypes/Serializations/SerializationString.h
@@ -0,0 +1,37 @@
+#pragma once
+
+#include <DataTypes/Serializations/ISerialization.h>
+
+namespace DB
+{
+
+class SerializationString final : public ISerialization
+{
+public:
+    void serializeBinary(const Field & field, WriteBuffer & ostr) const override;
+    void deserializeBinary(Field & field, ReadBuffer & istr) const override;
+    void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const override;
+    void deserializeBinary(IColumn & column, ReadBuffer & istr) const override;
+
+    void serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const override;
+    void deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double avg_value_size_hint) const override;
+
+    void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override;
+    void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override;
+
+    void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override;
+    void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override;
+
+    void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override;
+    void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override;
+
+    void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override;
+    void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override;
+
+    void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override;
+
+    void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override;
+    void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override;
+};
+
+}
diff --git a/src/DataTypes/Serializations/SerializationTuple.cpp b/src/DataTypes/Serializations/SerializationTuple.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..bdeea80477ecf23ea41fa484a327246d6a2eab21
--- /dev/null
+++ b/src/DataTypes/Serializations/SerializationTuple.cpp
@@ -0,0 +1,408 @@
+#include <DataTypes/Serializations/SerializationTuple.h>
+#include <DataTypes/Serializations/SerializationTupleElement.h>
+#include <Core/Field.h>
+#include <Columns/ColumnTuple.h>
+#include <Common/assert_cast.h>
+#include <Common/typeid_cast.h>
+#include <IO/ReadHelpers.h>
+#include <IO/WriteHelpers.h>
+
+#include <ext/enumerate.h>
+#include <ext/range.h>
+#include <ext/size.h>
+
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int SIZES_OF_COLUMNS_IN_TUPLE_DOESNT_MATCH;
+    extern const int NOT_FOUND_COLUMN_IN_BLOCK;
+    extern const int LOGICAL_ERROR;
+}
+
+
+static inline IColumn & extractElementColumn(IColumn & column, size_t idx)
+{
+    return assert_cast<ColumnTuple &>(column).getColumn(idx);
+}
+
+static inline const IColumn & extractElementColumn(const IColumn & column, size_t idx)
+{
+    return assert_cast<const ColumnTuple &>(column).getColumn(idx);
+}
+
+void SerializationTuple::serializeBinary(const Field & field, WriteBuffer & ostr) const
+{
+    const auto & tuple = get<const Tuple &>(field);
+    for (const auto idx_elem : ext::enumerate(elems))
+        idx_elem.second->serializeBinary(tuple[idx_elem.first], ostr);
+}
+
+void SerializationTuple::deserializeBinary(Field & field, ReadBuffer & istr) const
+{
+    const size_t size = elems.size();
+
+    Tuple tuple(size);
+    for (const auto i : ext::range(0, size))
+        elems[i]->deserializeBinary(tuple[i], istr);
+
+    field = tuple;
+}
+
+void SerializationTuple::serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const
+{
+    for (const auto idx_elem : ext::enumerate(elems))
+        idx_elem.second->serializeBinary(extractElementColumn(column, idx_elem.first), row_num, ostr);
+}
+
+
+template <typename F>
+static void addElementSafe(size_t num_elems, IColumn & column, F && impl)
+{
+    /// We use the assumption that tuples of zero size do not exist.
+    size_t old_size = column.size();
+
+    try
+    {
+        impl();
+
+        // Check that all columns now have the same size.
+        size_t new_size = column.size();
+        for (auto i : ext::range(1, num_elems))
+        {
+            const auto & element_column = extractElementColumn(column, i);
+            if (element_column.size() != new_size)
+            {
+                // This is not a logical error because it may work with
+                // user-supplied data.
+                throw Exception(ErrorCodes::SIZES_OF_COLUMNS_IN_TUPLE_DOESNT_MATCH,
+                    "Cannot read a tuple because not all elements are present");
+            }
+        }
+    }
+    catch (...)
+    {
+        for (const auto & i : ext::range(0, num_elems))
+        {
+            auto & element_column = extractElementColumn(column, i);
+            if (element_column.size() > old_size)
+                element_column.popBack(1);
+        }
+
+        throw;
+    }
+}
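addElementSafe above keeps all element columns the same length even when parsing throws halfway through a row, by popping back whatever was partially inserted. Standalone sketch of that rollback pattern (illustrative):

    #include <iostream>
    #include <stdexcept>
    #include <vector>

    int main()
    {
        std::vector<int> keys{1};
        std::vector<int> values{10};

        size_t old_size = keys.size();
        try
        {
            keys.push_back(2); /// first element of the new row is in
            throw std::runtime_error("parse error before the value was read");
        }
        catch (...)
        {
            if (keys.size() > old_size)
                keys.pop_back(); /// roll the partial row back
        }
        std::cout << keys.size() << ' ' << values.size() << '\n'; /// 1 1 - consistent
    }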
+
+void SerializationTuple::deserializeBinary(IColumn & column, ReadBuffer & istr) const
+{
+    addElementSafe(elems.size(), column, [&]
+    {
+        for (const auto & i : ext::range(0, ext::size(elems)))
+            elems[i]->deserializeBinary(extractElementColumn(column, i), istr);
+    });
+}
+
+void SerializationTuple::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
+{
+    writeChar('(', ostr);
+    for (const auto i : ext::range(0, ext::size(elems)))
+    {
+        if (i != 0)
+            writeChar(',', ostr);
+        elems[i]->serializeTextQuoted(extractElementColumn(column, i), row_num, ostr, settings);
+    }
+    writeChar(')', ostr);
+}
+
+void SerializationTuple::deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
+{
+    const size_t size = elems.size();
+    assertChar('(', istr);
+
+    addElementSafe(elems.size(), column, [&]
+    {
+        for (const auto i : ext::range(0, size))
+        {
+            skipWhitespaceIfAny(istr);
+            if (i != 0)
+            {
+                assertChar(',', istr);
+                skipWhitespaceIfAny(istr);
+            }
+            elems[i]->deserializeTextQuoted(extractElementColumn(column, i), istr, settings);
+        }
+    });
+
+    // Special format for one element tuple (1,)
+    if (1 == elems.size())
+    {
+        skipWhitespaceIfAny(istr);
+        // Allow both (1) and (1,)
+        checkChar(',', istr);
+    }
+    skipWhitespaceIfAny(istr);
+    assertChar(')', istr);
+}
+
+void SerializationTuple::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
+{
+    if (settings.json.named_tuples_as_objects
+        && have_explicit_names)
+    {
+        writeChar('{', ostr);
+        for (const auto i : ext::range(0, ext::size(elems)))
+        {
+            if (i != 0)
+            {
+                writeChar(',', ostr);
+            }
+            writeJSONString(elems[i]->getElementName(), ostr, settings);
+            writeChar(':', ostr);
+            elems[i]->serializeTextJSON(extractElementColumn(column, i), row_num, ostr, settings);
+        }
+        writeChar('}', ostr);
+    }
+    else
+    {
+        writeChar('[', ostr);
+        for (const auto i : ext::range(0, ext::size(elems)))
+        {
+            if (i != 0)
+                writeChar(',', ostr);
+            elems[i]->serializeTextJSON(extractElementColumn(column, i), row_num, ostr, settings);
+        }
+        writeChar(']', ostr);
+    }
+}
+
+void SerializationTuple::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
+{
+    if (settings.json.named_tuples_as_objects
+        && have_explicit_names)
+    {
+        skipWhitespaceIfAny(istr);
+        assertChar('{', istr);
+        skipWhitespaceIfAny(istr);
+
+        addElementSafe(elems.size(), column, [&]
+        {
+            // Require all elements but in arbitrary order.
+            for (auto i : ext::range(0, ext::size(elems)))
+            {
+                if (i > 0)
+                {
+                    skipWhitespaceIfAny(istr);
+                    assertChar(',', istr);
+                    skipWhitespaceIfAny(istr);
+                }
+
+                std::string name;
+                readDoubleQuotedString(name, istr);
+                skipWhitespaceIfAny(istr);
+                assertChar(':', istr);
+                skipWhitespaceIfAny(istr);
+
+                const size_t element_pos = getPositionByName(name);
+                auto & element_column = extractElementColumn(column, element_pos);
+                elems[element_pos]->deserializeTextJSON(element_column, istr, settings);
+            }
+        });
+
+        skipWhitespaceIfAny(istr);
+        assertChar('}', istr);
+    }
+    else
+    {
+        const size_t size = elems.size();
+        assertChar('[', istr);
+
+        addElementSafe(elems.size(), column, [&]
+        {
+            for (const auto i : ext::range(0, size))
+            {
+                skipWhitespaceIfAny(istr);
+                if (i != 0)
+                {
+                    assertChar(',', istr);
+                    skipWhitespaceIfAny(istr);
+                }
+                elems[i]->deserializeTextJSON(extractElementColumn(column, i), istr, settings);
+            }
+        });
+
+        skipWhitespaceIfAny(istr);
+        assertChar(']', istr);
+    }
+}
+
+void SerializationTuple::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
+{
+    writeCString("<tuple>", ostr);
+    for (const auto i : ext::range(0, ext::size(elems)))
+    {
+        writeCString("<elem>", ostr);
+        elems[i]->serializeTextXML(extractElementColumn(column, i), row_num, ostr, settings);
+        writeCString("</elem>", ostr);
+    }
+    writeCString("</tuple>", ostr);
+}
+
+void SerializationTuple::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
+{
+    for (const auto i : ext::range(0, ext::size(elems)))
+    {
+        if (i != 0)
+            writeChar(',', ostr);
+        elems[i]->serializeTextCSV(extractElementColumn(column, i), row_num, ostr, settings);
+    }
+}
+
+void SerializationTuple::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
+{
+    addElementSafe(elems.size(), column, [&]
+    {
+        const size_t size = elems.size();
+        for (const auto i : ext::range(0, size))
+        {
+            if (i != 0)
+            {
+                skipWhitespaceIfAny(istr);
+                assertChar(settings.csv.delimiter, istr);
+                skipWhitespaceIfAny(istr);
+            }
+            elems[i]->deserializeTextCSV(extractElementColumn(column, i), istr, settings);
+        }
+    });
+}
+
+void SerializationTuple::enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const
+{
+    for (const auto & elem : elems)
+        elem->enumerateStreams(callback, path);
+}
+
+struct SerializeBinaryBulkStateTuple : public ISerialization::SerializeBinaryBulkState
+{
+    std::vector<ISerialization::SerializeBinaryBulkStatePtr> states;
+};
+
+struct DeserializeBinaryBulkStateTuple : public ISerialization::DeserializeBinaryBulkState
+{
+    std::vector<ISerialization::DeserializeBinaryBulkStatePtr> states;
+};
+
+static SerializeBinaryBulkStateTuple * checkAndGetTupleSerializeState(ISerialization::SerializeBinaryBulkStatePtr & state)
+{
+    if (!state)
+        throw Exception("Got empty state for DataTypeTuple.", ErrorCodes::LOGICAL_ERROR);
+
+    auto * tuple_state = typeid_cast<SerializeBinaryBulkStateTuple *>(state.get());
+    if (!tuple_state)
+    {
+        auto & state_ref = *state;
+        throw Exception("Invalid SerializeBinaryBulkState for DataTypeTuple. Expected: "
+                        + demangle(typeid(SerializeBinaryBulkStateTuple).name()) + ", got "
+                        + demangle(typeid(state_ref).name()), ErrorCodes::LOGICAL_ERROR);
+    }
+
+    return tuple_state;
+}
Expected: " + + demangle(typeid(SerializeBinaryBulkStateTuple).name()) + ", got " + + demangle(typeid(state_ref).name()), ErrorCodes::LOGICAL_ERROR); + } + + return tuple_state; +} + +static DeserializeBinaryBulkStateTuple * checkAndGetTupleDeserializeState(ISerialization::DeserializeBinaryBulkStatePtr & state) +{ + if (!state) + throw Exception("Got empty state for DataTypeTuple.", ErrorCodes::LOGICAL_ERROR); + + auto * tuple_state = typeid_cast(state.get()); + if (!tuple_state) + { + auto & state_ref = *state; + throw Exception("Invalid DeserializeBinaryBulkState for DataTypeTuple. Expected: " + + demangle(typeid(DeserializeBinaryBulkStateTuple).name()) + ", got " + + demangle(typeid(state_ref).name()), ErrorCodes::LOGICAL_ERROR); + } + + return tuple_state; +} + +void SerializationTuple::serializeBinaryBulkStatePrefix( + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + auto tuple_state = std::make_shared(); + tuple_state->states.resize(elems.size()); + + for (size_t i = 0; i < elems.size(); ++i) + elems[i]->serializeBinaryBulkStatePrefix(settings, tuple_state->states[i]); + + state = std::move(tuple_state); +} + +void SerializationTuple::serializeBinaryBulkStateSuffix( + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + auto * tuple_state = checkAndGetTupleSerializeState(state); + + for (size_t i = 0; i < elems.size(); ++i) + elems[i]->serializeBinaryBulkStateSuffix(settings, tuple_state->states[i]); +} + +void SerializationTuple::deserializeBinaryBulkStatePrefix( + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state) const +{ + auto tuple_state = std::make_shared(); + tuple_state->states.resize(elems.size()); + + for (size_t i = 0; i < elems.size(); ++i) + elems[i]->deserializeBinaryBulkStatePrefix(settings, tuple_state->states[i]); + + state = std::move(tuple_state); +} + +void SerializationTuple::serializeBinaryBulkWithMultipleStreams( + const IColumn & column, + size_t offset, + size_t limit, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + auto * tuple_state = checkAndGetTupleSerializeState(state); + + for (const auto i : ext::range(0, ext::size(elems))) + { + const auto & element_col = extractElementColumn(column, i); + elems[i]->serializeBinaryBulkWithMultipleStreams(element_col, offset, limit, settings, tuple_state->states[i]); + } +} + +void SerializationTuple::deserializeBinaryBulkWithMultipleStreams( + ColumnPtr & column, + size_t limit, + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state, + SubstreamsCache * cache) const +{ + auto * tuple_state = checkAndGetTupleDeserializeState(state); + + auto mutable_column = column->assumeMutable(); + auto & column_tuple = assert_cast(*mutable_column); + + settings.avg_value_size_hint = 0; + for (const auto i : ext::range(0, ext::size(elems))) + elems[i]->deserializeBinaryBulkWithMultipleStreams(column_tuple.getColumnPtr(i), limit, settings, tuple_state->states[i], cache); +} + +size_t SerializationTuple::getPositionByName(const String & name) const +{ + size_t size = elems.size(); + for (size_t i = 0; i < size; ++i) + if (elems[i]->getElementName() == name) + return i; + throw Exception("Tuple doesn't have element with name '" + name + "'", ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK); +} + +} diff --git a/src/DataTypes/Serializations/SerializationTuple.h b/src/DataTypes/Serializations/SerializationTuple.h new file mode 100644 index 
diff --git a/src/DataTypes/Serializations/SerializationTuple.h b/src/DataTypes/Serializations/SerializationTuple.h
new file mode 100644
index 0000000000000000000000000000000000000000..13668572fffe8a99749123832373f08210e5726c
--- /dev/null
+++ b/src/DataTypes/Serializations/SerializationTuple.h
@@ -0,0 +1,69 @@
+#pragma once
+
+#include <DataTypes/Serializations/SimpleTextSerialization.h>
+#include <DataTypes/Serializations/SerializationTupleElement.h>
+
+namespace DB
+{
+
+class SerializationTuple final : public SimpleTextSerialization
+{
+public:
+    using ElementSerializationPtr = std::shared_ptr<const SerializationTupleElement>;
+    using ElementSerializations = std::vector<ElementSerializationPtr>;
+
+    SerializationTuple(const ElementSerializations & elems_, bool have_explicit_names_)
+        : elems(elems_), have_explicit_names(have_explicit_names_) {}
+
+    void serializeBinary(const Field & field, WriteBuffer & ostr) const override;
+    void deserializeBinary(Field & field, ReadBuffer & istr) const override;
+    void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const override;
+    void deserializeBinary(IColumn & column, ReadBuffer & istr) const override;
+    void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override;
+    void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override;
+    void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override;
+    void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override;
+    void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override;
+
+    /// Tuples in CSV format will be serialized as separate columns (that is, losing their nesting in the tuple).
+    void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override;
+    void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override;
+
+    /** Each sub-column in a tuple is serialized in a separate stream.
+      */
+    void enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const override;
+
+    void serializeBinaryBulkStatePrefix(
+        SerializeBinaryBulkSettings & settings,
+        SerializeBinaryBulkStatePtr & state) const override;
+
+    void serializeBinaryBulkStateSuffix(
+        SerializeBinaryBulkSettings & settings,
+        SerializeBinaryBulkStatePtr & state) const override;
+
+    void deserializeBinaryBulkStatePrefix(
+        DeserializeBinaryBulkSettings & settings,
+        DeserializeBinaryBulkStatePtr & state) const override;
+
+    void serializeBinaryBulkWithMultipleStreams(
+        const IColumn & column,
+        size_t offset,
+        size_t limit,
+        SerializeBinaryBulkSettings & settings,
+        SerializeBinaryBulkStatePtr & state) const override;
+
+    void deserializeBinaryBulkWithMultipleStreams(
+        ColumnPtr & column,
+        size_t limit,
+        DeserializeBinaryBulkSettings & settings,
+        DeserializeBinaryBulkStatePtr & state,
+        SubstreamsCache * cache) const override;
+
+private:
+    ElementSerializations elems;
+    bool have_explicit_names;
+
+    size_t getPositionByName(const String & name) const;
+};
+
+}
diff --git a/src/DataTypes/Serializations/SerializationTupleElement.cpp b/src/DataTypes/Serializations/SerializationTupleElement.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4b50810fcd699022cb0cfd316d6ad3ac86ef02dd
--- /dev/null
+++ b/src/DataTypes/Serializations/SerializationTupleElement.cpp
@@ -0,0 +1,73 @@
+#include <DataTypes/Serializations/SerializationTupleElement.h>
+
+namespace DB
+{
+
+void SerializationTupleElement::enumerateStreams(
+    const StreamCallback & callback,
+    SubstreamPath & path) const
+{
+    addToPath(path);
+    nested_serialization->enumerateStreams(callback, path);
+    path.pop_back();
+}
+
+void SerializationTupleElement::serializeBinaryBulkStatePrefix(
+    SerializeBinaryBulkSettings & settings,
+    SerializeBinaryBulkStatePtr & state) const
+{
+    addToPath(settings.path);
+    nested_serialization->serializeBinaryBulkStatePrefix(settings, state);
+    settings.path.pop_back();
+}
+
+void SerializationTupleElement::serializeBinaryBulkStateSuffix(
+    SerializeBinaryBulkSettings & settings,
+    SerializeBinaryBulkStatePtr & state) const
+{
+    addToPath(settings.path);
+    nested_serialization->serializeBinaryBulkStateSuffix(settings, state);
+    settings.path.pop_back();
+}
+
+void SerializationTupleElement::deserializeBinaryBulkStatePrefix(
+    DeserializeBinaryBulkSettings & settings,
+    DeserializeBinaryBulkStatePtr & state) const
+{
+    addToPath(settings.path);
+    nested_serialization->deserializeBinaryBulkStatePrefix(settings, state);
+    settings.path.pop_back();
+}
+
+void SerializationTupleElement::serializeBinaryBulkWithMultipleStreams(
+    const IColumn & column,
+    size_t offset,
+    size_t limit,
+    SerializeBinaryBulkSettings & settings,
+    SerializeBinaryBulkStatePtr & state) const
+{
+    addToPath(settings.path);
+    nested_serialization->serializeBinaryBulkWithMultipleStreams(column, offset, limit, settings, state);
+    settings.path.pop_back();
+}
+
+void SerializationTupleElement::deserializeBinaryBulkWithMultipleStreams(
+    ColumnPtr & column,
+    size_t limit,
+    DeserializeBinaryBulkSettings & settings,
+    DeserializeBinaryBulkStatePtr & state,
+    SubstreamsCache * cache) const
+{
+    addToPath(settings.path);
+    nested_serialization->deserializeBinaryBulkWithMultipleStreams(column, limit, settings, state, cache);
+    settings.path.pop_back();
+}
+
+void SerializationTupleElement::addToPath(SubstreamPath & path) const
+{
+    path.push_back(Substream::TupleElement);
+    path.back().tuple_element_name = name;
+    path.back().escape_tuple_delimiter = escape_delimiter;
+}
+
+}
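SerializationTupleElement above is a pure decorator over the nested serialization: every override pushes a TupleElement marker onto the substream path, delegates, and pops it on the way out, which is how each tuple element ends up in its own stream. A simplified standalone model of that push/delegate/pop pattern (not the actual ClickHouse API; names are illustrative):

#include <functional>
#include <iostream>
#include <string>
#include <vector>

using SubstreamPath = std::vector<std::string>;

/// Simplified model: a "serialization" only reports which substream it writes to.
struct Serialization
{
    virtual ~Serialization() = default;
    virtual void enumerateStreams(const std::function<void(const SubstreamPath &)> & callback,
                                  SubstreamPath & path) const = 0;
};

struct SerializationNumber : Serialization
{
    void enumerateStreams(const std::function<void(const SubstreamPath &)> & callback,
                          SubstreamPath & path) const override
    {
        callback(path); /// Leaf: one stream at the current path.
    }
};

/// The decorator: push a path element, delegate, pop. Mirrors addToPath() above.
struct SerializationElement : Serialization
{
    const Serialization & nested;
    std::string name;

    SerializationElement(const Serialization & nested_, std::string name_)
        : nested(nested_), name(std::move(name_)) {}

    void enumerateStreams(const std::function<void(const SubstreamPath &)> & callback,
                          SubstreamPath & path) const override
    {
        path.push_back(name);
        nested.enumerateStreams(callback, path);
        path.pop_back();
    }
};

int main()
{
    SerializationNumber number;
    SerializationElement a{number, "a"}, b{number, "b"};

    SubstreamPath path;
    auto print = [](const SubstreamPath & p)
    {
        for (const auto & part : p)
            std::cout << '.' << part;
        std::cout << '\n';
    };
    /// A Tuple(a Number, b Number) enumerates one substream per element: ".a" and ".b".
    a.enumerateStreams(print, path);
    b.enumerateStreams(print, path);
}

Keeping the path manipulation in one addToPath()-style helper means every override gets the same stream naming for free.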
diff --git a/src/DataTypes/Serializations/SerializationTupleElement.h b/src/DataTypes/Serializations/SerializationTupleElement.h
new file mode 100644
index 0000000000000000000000000000000000000000..b85014c9e6418b799438e076046fb96d81a9630b
--- /dev/null
+++ b/src/DataTypes/Serializations/SerializationTupleElement.h
@@ -0,0 +1,57 @@
+#pragma once
+
+#include <DataTypes/Serializations/SerializationWrapper.h>
+
+namespace DB
+{
+
+class SerializationTupleElement final : public SerializationWrapper
+{
+private:
+    String name;
+    bool escape_delimiter;
+
+public:
+    SerializationTupleElement(const SerializationPtr & nested_, const String & name_, bool escape_delimiter_ = true)
+        : SerializationWrapper(nested_)
+        , name(name_), escape_delimiter(escape_delimiter_)
+    {
+    }
+
+    const String & getElementName() const { return name; }
+
+    void enumerateStreams(
+        const StreamCallback & callback,
+        SubstreamPath & path) const override;
+
+    void serializeBinaryBulkStatePrefix(
+        SerializeBinaryBulkSettings & settings,
+        SerializeBinaryBulkStatePtr & state) const override;
+
+    void serializeBinaryBulkStateSuffix(
+        SerializeBinaryBulkSettings & settings,
+        SerializeBinaryBulkStatePtr & state) const override;
+
+    void deserializeBinaryBulkStatePrefix(
+        DeserializeBinaryBulkSettings & settings,
+        DeserializeBinaryBulkStatePtr & state) const override;
+
+    void serializeBinaryBulkWithMultipleStreams(
+        const IColumn & column,
+        size_t offset,
+        size_t limit,
+        SerializeBinaryBulkSettings & settings,
+        SerializeBinaryBulkStatePtr & state) const override;
+
+    void deserializeBinaryBulkWithMultipleStreams(
+        ColumnPtr & column,
+        size_t limit,
+        DeserializeBinaryBulkSettings & settings,
+        DeserializeBinaryBulkStatePtr & state,
+        SubstreamsCache * cache) const override;
+
+private:
+    void addToPath(SubstreamPath & path) const;
+};
+
+}
diff --git a/src/DataTypes/Serializations/SerializationUUID.cpp b/src/DataTypes/Serializations/SerializationUUID.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1a0640a5e69d1d11058839ebace2e9e07dcfee17
--- /dev/null
+++ b/src/DataTypes/Serializations/SerializationUUID.cpp
@@ -0,0 +1,80 @@
+#include <DataTypes/Serializations/SerializationUUID.h>
+#include <Columns/ColumnsNumber.h>
+#include <Common/assert_cast.h>
+#include <Formats/FormatSettings.h>
+#include <IO/WriteHelpers.h>
+#include <IO/ReadHelpers.h>
+#include <IO/ReadBufferFromString.h>
+
+namespace DB
+{
+
+void SerializationUUID::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const
+{
+    writeText(UUID(assert_cast<const ColumnVector<UInt128> &>(column).getData()[row_num]), ostr);
+}
+
+void SerializationUUID::deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const
+{
+    UUID x;
+    readText(x, istr);
+    assert_cast<ColumnVector<UInt128> &>(column).getData().push_back(x);
+}
+
+void SerializationUUID::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
+{
+    deserializeText(column, istr, settings);
+}
+
+void SerializationUUID::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
+{
+    serializeText(column, row_num, ostr, settings);
+}
+
+void SerializationUUID::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
+{
+    writeChar('\'', ostr);
+    serializeText(column, row_num, ostr, settings);
+    writeChar('\'', ostr);
+}
+
+void SerializationUUID::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const
+{
+    UUID x;
+    assertChar('\'', istr);
+    readText(x, istr);
+    assertChar('\'', istr);
+    assert_cast<ColumnVector<UInt128> &>(column).getData().push_back(x);    /// It's important to do this at the end - for exception safety.
+}
+
+void SerializationUUID::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
+{
+    writeChar('"', ostr);
+    serializeText(column, row_num, ostr, settings);
+    writeChar('"', ostr);
+}
+
+void SerializationUUID::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const
+{
+    UUID x;
+    assertChar('"', istr);
+    readText(x, istr);
+    assertChar('"', istr);
+    assert_cast<ColumnVector<UInt128> &>(column).getData().push_back(x);
+}
+
+void SerializationUUID::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
+{
+    writeChar('"', ostr);
+    serializeText(column, row_num, ostr, settings);
+    writeChar('"', ostr);
+}
+
+void SerializationUUID::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const
+{
+    UUID value;
+    readCSV(value, istr);
+    assert_cast<ColumnVector<UInt128> &>(column).getData().push_back(value);
+}
+
+}
diff --git a/src/DataTypes/Serializations/SerializationUUID.h b/src/DataTypes/Serializations/SerializationUUID.h
new file mode 100644
index 0000000000000000000000000000000000000000..93bf166bbd97a913c40807f047d9a35cbe3c12e4
--- /dev/null
+++ b/src/DataTypes/Serializations/SerializationUUID.h
@@ -0,0 +1,23 @@
+#pragma once
+
+#include <DataTypes/Serializations/SerializationNumber.h>
+
+namespace DB
+{
+
+class SerializationUUID : public SerializationNumber<UInt128>
+{
+public:
+    void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override;
+    void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override;
+    void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override;
+    void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override;
+    void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override;
+    void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override;
+    void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override;
+    void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override;
+    void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override;
+    void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
+};
+
+}
diff --git a/src/DataTypes/Serializations/SerializationWrapper.cpp b/src/DataTypes/Serializations/SerializationWrapper.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f75c9a1dd8b3a87e7d4ddc82d2c482c5088030ed
--- /dev/null
+++ b/src/DataTypes/Serializations/SerializationWrapper.cpp
@@ -0,0 +1,140 @@
+#include <DataTypes/Serializations/SerializationWrapper.h>
+#include <Columns/IColumn.h>
+
+namespace DB
+{
+
+void SerializationWrapper::enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const
+{
+    nested_serialization->enumerateStreams(callback, path);
+}
+
+void SerializationWrapper::serializeBinaryBulkStatePrefix(
+    SerializeBinaryBulkSettings & settings,
+    SerializeBinaryBulkStatePtr & state) const
+{
+    nested_serialization->serializeBinaryBulkStatePrefix(settings, state);
+}
+
+void SerializationWrapper::serializeBinaryBulkStateSuffix(
+    SerializeBinaryBulkSettings & settings,
+    SerializeBinaryBulkStatePtr & state) const
+{
+    nested_serialization->serializeBinaryBulkStateSuffix(settings, state);
+}
+
+void SerializationWrapper::deserializeBinaryBulkStatePrefix(
+    DeserializeBinaryBulkSettings & settings,
+    DeserializeBinaryBulkStatePtr & state) const
+{
+    nested_serialization->deserializeBinaryBulkStatePrefix(settings, state);
+}
+
+void SerializationWrapper::serializeBinaryBulkWithMultipleStreams(
+    const IColumn & column,
+    size_t offset,
+    size_t limit,
+    SerializeBinaryBulkSettings & settings,
+    SerializeBinaryBulkStatePtr & state) const
+{
+    nested_serialization->serializeBinaryBulkWithMultipleStreams(column, offset, limit, settings, state);
+}
+
+void SerializationWrapper::deserializeBinaryBulkWithMultipleStreams(
+    ColumnPtr & column,
+    size_t limit,
+    DeserializeBinaryBulkSettings & settings,
+    DeserializeBinaryBulkStatePtr & state,
+    SubstreamsCache * cache) const
+{
+    nested_serialization->deserializeBinaryBulkWithMultipleStreams(column, limit, settings, state, cache);
+}
+
+void SerializationWrapper::serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const
+{
+    nested_serialization->serializeBinaryBulk(column, ostr, offset, limit);
+}
+
+void SerializationWrapper::deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double avg_value_size_hint) const
+{
+    nested_serialization->deserializeBinaryBulk(column, istr, limit, avg_value_size_hint);
+}
+
+void SerializationWrapper::serializeBinary(const Field & field, WriteBuffer & ostr) const
+{
+    nested_serialization->serializeBinary(field, ostr);
+}
+
+void SerializationWrapper::deserializeBinary(Field & field, ReadBuffer & istr) const
+{
+    nested_serialization->deserializeBinary(field, istr);
+}
+
+void SerializationWrapper::serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const
+{
+    nested_serialization->serializeBinary(column, row_num, ostr);
+}
+
+void SerializationWrapper::deserializeBinary(IColumn & column, ReadBuffer & istr) const
+{
+    nested_serialization->deserializeBinary(column, istr);
+}
+
+void SerializationWrapper::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
+{
+    nested_serialization->serializeTextEscaped(column, row_num, ostr, settings);
+}
+
+void SerializationWrapper::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
+{
+    nested_serialization->deserializeTextEscaped(column, istr, settings);
+}
+
+void SerializationWrapper::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
+{
+    nested_serialization->serializeTextQuoted(column, row_num, ostr, settings);
+}
+
+void SerializationWrapper::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
+{
+    nested_serialization->deserializeTextQuoted(column, istr, settings);
+}
+
+void SerializationWrapper::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
+{
+    nested_serialization->serializeTextCSV(column, row_num, ostr, settings);
+}
+
+void SerializationWrapper::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
+{
+    nested_serialization->deserializeTextCSV(column, istr, settings);
+}
+
+void SerializationWrapper::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
+{
+    nested_serialization->serializeText(column, row_num, ostr, settings);
+}
+
+void SerializationWrapper::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
+{
+    nested_serialization->deserializeWholeText(column, istr, settings);
+}
+
+void SerializationWrapper::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
+{
+    nested_serialization->serializeTextJSON(column, row_num, ostr, settings);
+}
+
+void SerializationWrapper::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
+{
+    nested_serialization->deserializeTextJSON(column, istr, settings);
+}
+
+void SerializationWrapper::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
+{
+    nested_serialization->serializeTextXML(column, row_num, ostr, settings);
+}
+
+}
diff --git a/src/DataTypes/Serializations/SerializationWrapper.h b/src/DataTypes/Serializations/SerializationWrapper.h
new file mode 100644
index 0000000000000000000000000000000000000000..399d3b198b3d47ee97e6bf7fba9e57ecee28af29
--- /dev/null
+++ b/src/DataTypes/Serializations/SerializationWrapper.h
@@ -0,0 +1,74 @@
+#pragma once
+
+#include <DataTypes/Serializations/ISerialization.h>
+#include <memory>
+
+namespace DB
+{
+
+/// Wrapper for serialization, which calls methods, which are not overridden, from nested serialization.
+/// You can inherit this class when you need to override a bunch of methods, to avoid boilerplate code.
+class SerializationWrapper : public ISerialization
+{
+protected:
+    SerializationPtr nested_serialization;
+
+public:
+    SerializationWrapper(const SerializationPtr & nested_serialization_) : nested_serialization(nested_serialization_) {}
+
+    void enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const override;
+
+    void serializeBinaryBulkStatePrefix(
+        SerializeBinaryBulkSettings & settings,
+        SerializeBinaryBulkStatePtr & state) const override;
+
+    void serializeBinaryBulkStateSuffix(
+        SerializeBinaryBulkSettings & settings,
+        SerializeBinaryBulkStatePtr & state) const override;
+
+    void deserializeBinaryBulkStatePrefix(
+        DeserializeBinaryBulkSettings & settings,
+        DeserializeBinaryBulkStatePtr & state) const override;
+
+    void serializeBinaryBulkWithMultipleStreams(
+        const IColumn & column,
+        size_t offset,
+        size_t limit,
+        SerializeBinaryBulkSettings & settings,
+        SerializeBinaryBulkStatePtr & state) const override;
+
+    void deserializeBinaryBulkWithMultipleStreams(
+        ColumnPtr & column,
+        size_t limit,
+        DeserializeBinaryBulkSettings & settings,
+        DeserializeBinaryBulkStatePtr & state,
+        SubstreamsCache * cache) const override;
+
+    void serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const override;
+    void deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double avg_value_size_hint) const override;
+
+    void serializeBinary(const Field & field, WriteBuffer & ostr) const override;
+    void deserializeBinary(Field & field, ReadBuffer & istr) const override;
+
+    void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const override;
+    void deserializeBinary(IColumn & column, ReadBuffer & istr) const override;
+
+    void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override;
+    void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override;
+
+    void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override;
+    void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override;
+
+    void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override;
+    void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override;
+
+    void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override;
+    void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override;
+
+    void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override;
+    void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override;
+
+    void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
+};
+
+}
diff --git a/src/DataTypes/DataTypeWithSimpleSerialization.h b/src/DataTypes/Serializations/SimpleTextSerialization.h
similarity index 87%
rename from src/DataTypes/DataTypeWithSimpleSerialization.h
rename to src/DataTypes/Serializations/SimpleTextSerialization.h
index 4f61167fa1b80174240edd5fd9ebc387853f5582..04c5b1902030127d68abfde6fac8173028a7e792 100644
--- a/src/DataTypes/DataTypeWithSimpleSerialization.h
+++ b/src/DataTypes/Serializations/SimpleTextSerialization.h
@@ -1,15 +1,15 @@
 #pragma once
 
-#include <DataTypes/IDataType.h>
+#include <DataTypes/Serializations/ISerialization.h>
 
 namespace DB
 {
 
-/// Helper class to define same IDataType text (de)serialization for all the variants (escaped, quoted, JSON, CSV).
+/// Helper class to define same ISerialization text (de)serialization for all the variants (escaped, quoted, JSON, CSV).
 /// You need to define serializeText() and deserializeText() in derived class.
-class DataTypeWithSimpleSerialization : public IDataType
+class SimpleTextSerialization : public ISerialization
 {
 protected:
-    DataTypeWithSimpleSerialization() = default;
+    SimpleTextSerialization() = default;
 
     void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override
     {
diff --git a/src/DataTypes/tests/gtest_DataType_deserializeAsText.cpp b/src/DataTypes/tests/gtest_DataType_deserializeAsText.cpp
index 48e2f0d80a099af2ef63b2c3cab766884b84c1a9..9d8c32c92b65a4f4805b93a0fe1e231f430e342a 100644
--- a/src/DataTypes/tests/gtest_DataType_deserializeAsText.cpp
+++ b/src/DataTypes/tests/gtest_DataType_deserializeAsText.cpp
@@ -67,7 +67,7 @@ TEST_P(ParseDataTypeTest, parseStringValue)
     for (const auto & value : p.values)
     {
         ReadBuffer buffer(const_cast<char *>(value.data()), value.size(), 0);
-        data_type->deserializeAsWholeText(*col, buffer, FormatSettings{});
+        data_type->getDefaultSerialization()->deserializeWholeText(*col, buffer, FormatSettings{});
     }
 
     ASSERT_EQ(p.expected_values.size(), col->size()) << "Actual items: " << *col;
diff --git a/src/DataTypes/ya.make b/src/DataTypes/ya.make
index 356424af8ddcc4070c2c51fb223dbbc164b12052..e7294c298e5dd59e94b452860ee6ac33e07df979 100644
--- a/src/DataTypes/ya.make
+++ b/src/DataTypes/ya.make
@@ -15,7 +15,6 @@ SRCS(
     DataTypeCustomGeo.cpp
     DataTypeCustomIPv4AndIPv6.cpp
    DataTypeCustomSimpleAggregateFunction.cpp
-    DataTypeCustomSimpleTextSerialization.cpp
    DataTypeDate.cpp
    DataTypeDateTime.cpp
    DataTypeDateTime64.cpp
@@ -32,15 +31,37 @@ SRCS(
    DataTypeNothing.cpp
    DataTypeNullable.cpp
    DataTypeNumberBase.cpp
-    DataTypeOneElementTuple.cpp
    DataTypeString.cpp
    DataTypeTuple.cpp
    DataTypeUUID.cpp
    DataTypesDecimal.cpp
    DataTypesNumber.cpp
+    EnumValues.cpp
    FieldToDataType.cpp
    IDataType.cpp
    NestedUtils.cpp
+    Serializations/ISerialization.cpp
+    Serializations/SerializationAggregateFunction.cpp
+    Serializations/SerializationArray.cpp
+    Serializations/SerializationCustomSimpleText.cpp
+    Serializations/SerializationDate.cpp
+    Serializations/SerializationDateTime.cpp
+    Serializations/SerializationDateTime64.cpp
+    Serializations/SerializationDecimal.cpp
+    Serializations/SerializationDecimalBase.cpp
+    Serializations/SerializationEnum.cpp
+    Serializations/SerializationFixedString.cpp
+    Serializations/SerializationIP.cpp
+    Serializations/SerializationLowCardinality.cpp
+    Serializations/SerializationMap.cpp
+    Serializations/SerializationNothing.cpp
+    Serializations/SerializationNullable.cpp
+    Serializations/SerializationNumber.cpp
+    Serializations/SerializationString.cpp
+    Serializations/SerializationTuple.cpp
+    Serializations/SerializationTupleElement.cpp
+    Serializations/SerializationUUID.cpp
+    Serializations/SerializationWrapper.cpp
    convertMySQLDataType.cpp
    getLeastSupertype.cpp
    getMostSubtype.cpp
diff --git a/src/Databases/DatabaseOnDisk.cpp b/src/Databases/DatabaseOnDisk.cpp
index e5d2b23ace03c895c23a819f6f39d5b7df11c4b9..79373128713cd3025813e1dd8af64708e290d263 100644
--- a/src/Databases/DatabaseOnDisk.cpp
+++ b/src/Databases/DatabaseOnDisk.cpp
@@ -231,8 +231,8 @@ void DatabaseOnDisk::createTable(
     if (create.attach_short_syntax)
     {
         /// Metadata already exists, table was detached
+        removeDetachedPermanentlyFlag(context, table_name, table_metadata_path, true);
         attachTable(table_name, table, getTableDataPath(create));
-        removeDetachedPermanentlyFlag(table_name, table_metadata_path);
         return;
     }
 
@@ -270,12 +270,12 @@ void DatabaseOnDisk::createTable(
 
     commitCreateTable(create, table, table_metadata_tmp_path, table_metadata_path, context);
 
-    removeDetachedPermanentlyFlag(table_name, table_metadata_path);
+    removeDetachedPermanentlyFlag(context, table_name, table_metadata_path, false);
 }
 
 /// If the table was detached permanently we will have a flag file with
 /// .sql.detached extension, which is not needed anymore since we attached the table back
-void DatabaseOnDisk::removeDetachedPermanentlyFlag(const String & table_name, const String & table_metadata_path) const
+void DatabaseOnDisk::removeDetachedPermanentlyFlag(const Context &, const String & table_name, const String & table_metadata_path, bool) const
 {
     try
     {
diff --git a/src/Databases/DatabaseOnDisk.h b/src/Databases/DatabaseOnDisk.h
index fefe6e9160621101051d1b4e61982af96d34f4d2..b96a24f3345fae376522836ae43cba9f7a29cdf9 100644
--- a/src/Databases/DatabaseOnDisk.h
+++ b/src/Databases/DatabaseOnDisk.h
@@ -94,11 +94,10 @@ protected:
     virtual void commitCreateTable(const ASTCreateQuery & query, const StoragePtr & table,
         const String & table_metadata_tmp_path, const String & table_metadata_path, const Context & query_context);
 
+    virtual void removeDetachedPermanentlyFlag(const Context & context, const String & table_name, const String & table_metadata_path, bool attach) const;
+
     const String metadata_path;
     const String data_path;
-
-private:
-    void removeDetachedPermanentlyFlag(const String & table_name, const String & table_metadata_path) const;
 };
 
 }
diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp
index 12cff3407d3cf69bff03d6711e20f9c39ab5df5b..b4b822df233f4f57b8da2f216644f9383bff8322 100644
--- a/src/Databases/DatabaseReplicated.cpp
+++ b/src/Databases/DatabaseReplicated.cpp
@@ -18,6 +18,7 @@
 #include
 #include
 #include
+#include <Interpreters/Cluster.h>
 #include
 #include
 #include
@@ -105,7 +106,22 @@ std::pair<String, String> DatabaseReplicated::parseFullReplicaName(const String
 
 ClusterPtr DatabaseReplicated::getCluster() const
 {
-    /// TODO Maintain up-to-date Cluster and allow to use it in Distributed tables
+    std::lock_guard lock{mutex};
+    if (cluster)
+        return cluster;
+
+    cluster = getClusterImpl();
+    return cluster;
+}
+
+void DatabaseReplicated::setCluster(ClusterPtr && new_cluster)
+{
+    std::lock_guard lock{mutex};
+    cluster = std::move(new_cluster);
+}
+
+ClusterPtr DatabaseReplicated::getClusterImpl() const
+{
     Strings hosts;
     Strings host_ids;
 
@@ -120,7 +136,7 @@ ClusterPtr DatabaseReplicated::getCluster() const
         hosts = zookeeper->getChildren(zookeeper_path + "/replicas", &stat);
         if (hosts.empty())
             throw Exception(ErrorCodes::LOGICAL_ERROR, "No hosts found");
-        Int32 cver = stat.cversion;
+        Int32 cversion = stat.cversion;
         std::sort(hosts.begin(), hosts.end());
 
         std::vector<zkutil::ZooKeeper::FutureGet> futures;
@@ -139,7 +155,9 @@ ClusterPtr DatabaseReplicated::getCluster() const
         }
         zookeeper->get(zookeeper_path + "/replicas", &stat);
-        if (success && cver == stat.version)
+        if (cversion != stat.cversion)
+            success = false;
+        if (success)
             break;
     }
     if (!success)
@@ -157,22 +175,23 @@ ClusterPtr DatabaseReplicated::getCluster() const
         if (id == DROPPED_MARK)
             continue;
         auto [shard, replica] = parseFullReplicaName(hosts[i]);
-        auto pos = id.find(':');
-        String host = id.substr(0, pos);
+        auto pos = id.rfind(':');
+        String host_port = id.substr(0, pos);
         if (shard != current_shard)
         {
             current_shard = shard;
             if (!shards.back().empty())
                 shards.emplace_back();
         }
-        shards.back().emplace_back(unescapeForFileName(host));
+        shards.back().emplace_back(unescapeForFileName(host_port));
     }
 
-    /// TODO make it configurable
-    String username = "default";
-    String password;
+    String username = db_settings.cluster_username;
+    String password = db_settings.cluster_password;
+    UInt16 default_port = global_context.getTCPPort();
+    bool secure = db_settings.cluster_secure_connection;
 
-    return std::make_shared<Cluster>(global_context.getSettingsRef(), shards, username, password, global_context.getTCPPort(), false);
+    return std::make_shared<Cluster>(global_context.getSettingsRef(), shards, username, password, default_port, false, secure);
 }
 
 void DatabaseReplicated::tryConnectToZooKeeperAndInitDatabase(bool force_attach)
@@ -253,11 +272,8 @@ bool DatabaseReplicated::createDatabaseNodesInZooKeeper(const zkutil::ZooKeeperP
     __builtin_unreachable();
 }
 
-void DatabaseReplicated::createReplicaNodesInZooKeeper(const zkutil::ZooKeeperPtr & current_zookeeper)
+void DatabaseReplicated::createEmptyLogEntry(Coordination::Requests & ops, const ZooKeeperPtr & current_zookeeper)
 {
-    /// Write host name to replica_path, it will protect from multiple replicas with the same name
-    auto host_id = getHostID(global_context, db_uuid);
-
     /// On replica creation add empty entry to log. Can be used to trigger some actions on other replicas (e.g. update cluster info).
     DDLLogEntry entry{};
 
@@ -266,11 +282,20 @@ void DatabaseReplicated::createReplicaNodesInZooKeeper(const zkutil::ZooKeeperPt
     String counter_path = current_zookeeper->create(counter_prefix, "", zkutil::CreateMode::EphemeralSequential);
     String query_path = query_path_prefix + counter_path.substr(counter_prefix.size());
 
+    ops.emplace_back(zkutil::makeCreateRequest(query_path, entry.toString(), zkutil::CreateMode::Persistent));
+    ops.emplace_back(zkutil::makeCreateRequest(query_path + "/committed", getFullReplicaName(), zkutil::CreateMode::Persistent));
+    ops.emplace_back(zkutil::makeRemoveRequest(counter_path, -1));
+}
+
+void DatabaseReplicated::createReplicaNodesInZooKeeper(const zkutil::ZooKeeperPtr & current_zookeeper)
+{
+    /// Write host name to replica_path, it will protect from multiple replicas with the same name
+    auto host_id = getHostID(global_context, db_uuid);
+
     Coordination::Requests ops;
     ops.emplace_back(zkutil::makeCreateRequest(replica_path, host_id, zkutil::CreateMode::Persistent));
     ops.emplace_back(zkutil::makeCreateRequest(replica_path + "/log_ptr", "0", zkutil::CreateMode::Persistent));
-    ops.emplace_back(zkutil::makeCreateRequest(query_path, entry.toString(), zkutil::CreateMode::Persistent));
-    ops.emplace_back(zkutil::makeRemoveRequest(counter_path, -1));
+    createEmptyLogEntry(ops, current_zookeeper);
     current_zookeeper->multi(ops);
 }
 
@@ -294,7 +319,11 @@ BlockIO DatabaseReplicated::tryEnqueueReplicatedDDL(const ASTPtr & query, const
 
     /// Replicas will set correct name of current database in query context (database name can be different on replicas)
     if (auto * ddl_query = query->as<ASTQueryWithTableAndOutput>())
+    {
+        if (ddl_query->database != getDatabaseName())
+            throw Exception(ErrorCodes::UNKNOWN_DATABASE, "Database was renamed");
         ddl_query->database.clear();
+    }
 
     if (const auto * query_alter = query->as<ASTAlterQuery>())
     {
@@ -305,23 +334,26 @@ BlockIO DatabaseReplicated::tryEnqueueReplicatedDDL(const ASTPtr & query, const
         }
     }
 
+    if (auto * query_drop = query->as<ASTDropQuery>())
+    {
+        if (query_drop->kind == ASTDropQuery::Kind::Detach && query_context.getSettingsRef().database_replicated_always_detach_permanently)
+            query_drop->permanently = true;
+        if (query_drop->kind == ASTDropQuery::Kind::Detach && !query_drop->permanently)
+            throw Exception(ErrorCodes::INCORRECT_QUERY, "DETACH TABLE is not allowed for Replicated databases. "
+                            "Use DETACH TABLE PERMANENTLY or SYSTEM RESTART REPLICA or set "
+                            "database_replicated_always_detach_permanently to 1");
+    }
" + "Use DETACH TABLE PERMANENTLY or SYSTEM RESTART REPLICA or set " + "database_replicated_always_detach_permanently to 1"); + } + LOG_DEBUG(log, "Proposing query: {}", queryToString(query)); - /// TODO maybe write current settings to log entry? DDLLogEntry entry; entry.query = queryToString(query); entry.initiator = ddl_worker->getCommonHostID(); + entry.setSettingsIfRequired(query_context); String node_path = ddl_worker->tryEnqueueAndExecuteEntry(entry, query_context); - BlockIO io; - if (query_context.getSettingsRef().distributed_ddl_task_timeout == 0) - return io; - Strings hosts_to_wait = getZooKeeper()->getChildren(zookeeper_path + "/replicas"); - auto stream = std::make_shared(node_path, entry, query_context, hosts_to_wait); - if (query_context.getSettingsRef().database_replicated_ddl_output) - io.in = std::move(stream); - return io; + return getDistributedDDLStatus(node_path, entry, query_context, hosts_to_wait); } static UUID getTableUUIDIfReplicated(const String & metadata, const Context & context) @@ -557,12 +589,14 @@ ASTPtr DatabaseReplicated::parseQueryFromMetadataInZooKeeper(const String & node auto ast = parseQuery(parser, query, description, 0, global_context.getSettingsRef().max_parser_depth); auto & create = ast->as(); - if (create.uuid == UUIDHelpers::Nil || create.table != TABLE_WITH_UUID_NAME_PLACEHOLDER || ! create.database.empty()) + if (create.uuid == UUIDHelpers::Nil || create.table != TABLE_WITH_UUID_NAME_PLACEHOLDER || !create.database.empty()) throw Exception(ErrorCodes::LOGICAL_ERROR, "Got unexpected query from {}: {}", node_name, query); + bool is_materialized_view_with_inner_table = create.is_materialized_view && create.to_table_id.empty(); + create.database = getDatabaseName(); create.table = unescapeForFileName(node_name); - create.attach = false; + create.attach = is_materialized_view_with_inner_table; return ast; } @@ -570,8 +604,13 @@ ASTPtr DatabaseReplicated::parseQueryFromMetadataInZooKeeper(const String & node void DatabaseReplicated::drop(const Context & context_) { auto current_zookeeper = getZooKeeper(); - current_zookeeper->set(replica_path, DROPPED_MARK); + Coordination::Requests ops; + ops.emplace_back(zkutil::makeSetRequest(replica_path, DROPPED_MARK, -1)); + createEmptyLogEntry(ops, current_zookeeper); + current_zookeeper->multi(ops); + DatabaseAtomic::drop(context_); + current_zookeeper->tryRemoveRecursive(replica_path); /// TODO it may leave garbage in ZooKeeper if the last node lost connection here if (current_zookeeper->tryRemove(zookeeper_path + "/replicas") == Coordination::Error::ZOK) @@ -598,7 +637,7 @@ void DatabaseReplicated::shutdown() void DatabaseReplicated::dropTable(const Context & context, const String & table_name, bool no_delay) { auto txn = context.getZooKeeperMetadataTransaction(); - assert(!ddl_worker->isCurrentlyActive() || txn); + assert(!ddl_worker->isCurrentlyActive() || txn || startsWith(table_name, ".inner_id.")); if (txn && txn->isInitialQuery()) { String metadata_zk_path = zookeeper_path + "/metadata/" + escapeForFileName(table_name); @@ -702,12 +741,28 @@ void DatabaseReplicated::detachTablePermanently(const Context & context, const S assert(!ddl_worker->isCurrentlyActive() || txn); if (txn && txn->isInitialQuery()) { + /// We have to remove metadata from zookeeper, because we do not distinguish permanently detached tables + /// from attached tables when recovering replica. 
String metadata_zk_path = zookeeper_path + "/metadata/" + escapeForFileName(table_name); txn->addOp(zkutil::makeRemoveRequest(metadata_zk_path, -1)); } DatabaseAtomic::detachTablePermanently(context, table_name); } +void DatabaseReplicated::removeDetachedPermanentlyFlag(const Context & context, const String & table_name, const String & table_metadata_path, bool attach) const +{ + auto txn = context.getZooKeeperMetadataTransaction(); + assert(!ddl_worker->isCurrentlyActive() || txn); + if (txn && txn->isInitialQuery() && attach) + { + String metadata_zk_path = zookeeper_path + "/metadata/" + escapeForFileName(table_name); + String statement = readMetadataFile(table_name); + txn->addOp(zkutil::makeCreateRequest(metadata_zk_path, statement, zkutil::CreateMode::Persistent)); + } + DatabaseAtomic::removeDetachedPermanentlyFlag(context, table_name, table_metadata_path, attach); +} + + String DatabaseReplicated::readMetadataFile(const String & table_name) const { String statement; diff --git a/src/Databases/DatabaseReplicated.h b/src/Databases/DatabaseReplicated.h index fde53cf2c2940a0543802d3b8dd3ed6e1661cdb6..8f2ccd276276a231edd1dc11c839a89b0b608a04 100644 --- a/src/Databases/DatabaseReplicated.h +++ b/src/Databases/DatabaseReplicated.h @@ -45,6 +45,7 @@ public: const ASTPtr & query) override; void removeDictionary(const Context & context, const String & dictionary_name) override; void detachTablePermanently(const Context & context, const String & table_name) override; + void removeDetachedPermanentlyFlag(const Context & context, const String & table_name, const String & table_metadata_path, bool attach) const override; /// Try to execute DLL query on current host as initial query. If query is succeed, /// then it will be executed on all replicas. @@ -76,6 +77,11 @@ private: ASTPtr parseQueryFromMetadataInZooKeeper(const String & node_name, const String & query); String readMetadataFile(const String & table_name) const; + ClusterPtr getClusterImpl() const; + void setCluster(ClusterPtr && new_cluster); + + void createEmptyLogEntry(Coordination::Requests & ops, const ZooKeeperPtr & current_zookeeper); + String zookeeper_path; String shard_name; String replica_name; @@ -86,6 +92,8 @@ private: std::atomic_bool is_readonly = true; std::unique_ptr ddl_worker; + + mutable ClusterPtr cluster; }; } diff --git a/src/Databases/DatabaseReplicatedSettings.h b/src/Databases/DatabaseReplicatedSettings.h index 11d5b3820e4309e74237cf43e047f4030251945f..43003af1120caaa3e3589f0284731d8186a23a42 100644 --- a/src/Databases/DatabaseReplicatedSettings.h +++ b/src/Databases/DatabaseReplicatedSettings.h @@ -11,6 +11,9 @@ class ASTStorage; M(Float, max_broken_tables_ratio, 0.5, "Do not recover replica automatically if the ratio of staled tables to all tables is greater", 0) \ M(UInt64, max_replication_lag_to_enqueue, 10, "Replica will throw exception on attempt to execute query if its replication lag greater", 0) \ M(UInt64, wait_entry_commited_timeout_sec, 3600, "Replicas will try to cancel query if timeout exceed, but initiator host has not executed it yet", 0) \ + M(String, cluster_username, "default", "Username to use when connecting to hosts of cluster", 0) \ + M(String, cluster_password, "", "Password to use when connecting to hosts of cluster", 0) \ + M(Bool, cluster_secure_connection, false, "Enable TLS when connecting to hosts of cluster", 0) \ DECLARE_SETTINGS_TRAITS(DatabaseReplicatedSettingsTraits, LIST_OF_DATABASE_REPLICATED_SETTINGS) diff --git a/src/Databases/DatabaseReplicatedWorker.cpp 
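The getCluster()/setCluster() pair added to DatabaseReplicated is a mutex-guarded lazy cache: the Cluster built from ZooKeeper is computed once and reused until a change in the replica set invalidates it (the dummy-entry hook in DatabaseReplicatedWorker.cpp below triggers the refresh). A condensed standalone sketch of that caching discipline, with simplified types and a stubbed getClusterImpl(), not the actual ClickHouse classes:

#include <memory>
#include <mutex>
#include <string>
#include <vector>

struct Cluster { std::vector<std::string> hosts; };
using ClusterPtr = std::shared_ptr<Cluster>;

class ReplicatedDatabase
{
public:
    /// Lazily build and cache the cluster under the lock.
    ClusterPtr getCluster() const
    {
        std::lock_guard lock{mutex};
        if (cluster)
            return cluster;
        cluster = getClusterImpl();
        return cluster;
    }

    /// Called when the set of replicas changes (the "dummy log entry" path):
    /// rebuild eagerly and publish the fresh cluster.
    void setCluster(ClusterPtr && new_cluster)
    {
        std::lock_guard lock{mutex};
        cluster = std::move(new_cluster);
    }

    ClusterPtr getClusterImpl() const
    {
        /// Stand-in for the ZooKeeper round-trips in the real code.
        return std::make_shared<Cluster>(Cluster{{"host1:9000", "host2:9000"}});
    }

private:
    mutable std::mutex mutex;
    mutable ClusterPtr cluster;
};

int main()
{
    ReplicatedDatabase db;
    auto c1 = db.getCluster();          /// built once
    auto c2 = db.getCluster();          /// served from cache
    db.setCluster(db.getClusterImpl()); /// replica set changed: refresh
    return (c1 == c2) ? 0 : 1;
}

Returning the shared_ptr by value keeps readers safe even while a concurrent setCluster() swaps the cached object.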
diff --git a/src/Databases/DatabaseReplicatedWorker.cpp b/src/Databases/DatabaseReplicatedWorker.cpp
index ee5a3b5eed0764301b13385a32cf8f0cf62bdc3f..b69e76697b0e56a0d8b833a1c28089eca292de88 100644
--- a/src/Databases/DatabaseReplicatedWorker.cpp
+++ b/src/Databases/DatabaseReplicatedWorker.cpp
@@ -237,6 +237,8 @@ DDLTaskPtr DatabaseReplicatedDDLWorker::initAndCheckTask(const String & entry_na
 
     if (task->entry.query.empty())
     {
+        /// Some replica is added or removed, let's update cached cluster
+        database->setCluster(database->getClusterImpl());
         out_reason = fmt::format("Entry {} is a dummy task", entry_name);
         return {};
     }
diff --git a/src/Dictionaries/DictionaryStructure.cpp b/src/Dictionaries/DictionaryStructure.cpp
index ea3e3efa03d02f80121330f5b92b4d268a120851..d3ee194bf6019576fd14ff7dae738a205252209f 100644
--- a/src/Dictionaries/DictionaryStructure.cpp
+++ b/src/Dictionaries/DictionaryStructure.cpp
@@ -412,7 +412,7 @@ std::vector<DictionaryAttribute> DictionaryStructure::getAttributes(
         {
             ReadBufferFromString null_value_buffer{null_value_string};
             auto column_with_null_value = type->createColumn();
-            type->deserializeAsTextEscaped(*column_with_null_value, null_value_buffer, format_settings);
+            type->getDefaultSerialization()->deserializeTextEscaped(*column_with_null_value, null_value_buffer, format_settings);
             null_value = (*column_with_null_value)[0];
         }
     }
@@ -443,6 +443,7 @@ std::vector<DictionaryAttribute> DictionaryStructure::getAttributes(
             name,
             underlying_type,
             initial_type,
+            initial_type->getDefaultSerialization(),
             type,
             expression,
             null_value,
diff --git a/src/Dictionaries/DictionaryStructure.h b/src/Dictionaries/DictionaryStructure.h
index 39332f2dff2241247b9d98c3a43147b4d941d620..2dedb1be0ceedbb572ee31e92f94154636108bf7 100644
--- a/src/Dictionaries/DictionaryStructure.h
+++ b/src/Dictionaries/DictionaryStructure.h
@@ -58,6 +58,7 @@ struct DictionaryAttribute final
     const std::string name;
     const AttributeUnderlyingType underlying_type;
     const DataTypePtr type;
+    const SerializationPtr serialization;
     const DataTypePtr nested_type;
     const std::string expression;
     const Field null_value;
diff --git a/src/Dictionaries/ExternalQueryBuilder.cpp b/src/Dictionaries/ExternalQueryBuilder.cpp
index e8d71b1fd85aa663dcc8e915f4d14369b7803083..454c918fd1f5a6552e8f0451a1da82190b4a4c63 100644
--- a/src/Dictionaries/ExternalQueryBuilder.cpp
+++ b/src/Dictionaries/ExternalQueryBuilder.cpp
@@ -358,7 +358,7 @@ void ExternalQueryBuilder::composeKeyCondition(const Columns & key_columns, cons
             /// key_i=value_i
             writeQuoted(key_description.name, out);
             writeString("=", out);
-            key_description.type->serializeAsTextQuoted(*key_columns[i], row, out, format_settings);
+            key_description.serialization->serializeTextQuoted(*key_columns[i], row, out, format_settings);
         }
     }
 
@@ -415,7 +415,7 @@ void ExternalQueryBuilder::composeKeyTuple(const Columns & key_columns, const si
             writeString(", ", out);
 
         first = false;
-        (*dict_struct.key)[i].type->serializeAsTextQuoted(*key_columns[i], row, out, format_settings);
+        (*dict_struct.key)[i].serialization->serializeTextQuoted(*key_columns[i], row, out, format_settings);
     }
 
     writeString(")", out);
diff --git a/src/Dictionaries/readInvalidateQuery.cpp b/src/Dictionaries/readInvalidateQuery.cpp
index 9b5b34133b2acc26567ae74781c01dfe204739fe..587fce6631c9c345ae0620c3fa7e72a1ec01bfdc 100644
--- a/src/Dictionaries/readInvalidateQuery.cpp
+++ b/src/Dictionaries/readInvalidateQuery.cpp
@@ -34,7 +34,7 @@ std::string readInvalidateQuery(IBlockInputStream & block_input_stream)
 
     WriteBufferFromOwnString out;
     auto & column_type = block.getByPosition(0);
-    column_type.type->serializeAsTextQuoted(*column_type.column->convertToFullColumnIfConst(), 0, out, FormatSettings());
+    column_type.type->getDefaultSerialization()->serializeTextQuoted(*column_type.column->convertToFullColumnIfConst(), 0, out, FormatSettings());
 
     while ((block = block_input_stream.read()))
         if (block.rows() > 0)
diff --git a/src/Formats/MySQLBlockInputStream.cpp b/src/Formats/MySQLBlockInputStream.cpp
index 29cf749de3bd8936e03f2179beb9b23e18a49762..da2a175e49f31ed15955c4ec4b371a98ecd80cb8 100644
--- a/src/Formats/MySQLBlockInputStream.cpp
+++ b/src/Formats/MySQLBlockInputStream.cpp
@@ -117,7 +117,7 @@ namespace
                 case ValueType::vtDecimal256:
                 {
                     ReadBuffer buffer(const_cast<char *>(value.data()), value.size(), 0);
-                    data_type.deserializeAsWholeText(column, buffer, FormatSettings{});
+                    data_type.getDefaultSerialization()->deserializeWholeText(column, buffer, FormatSettings{});
                     break;
                 }
                 case ValueType::vtFixedString:
diff --git a/src/Formats/ProtobufSerializer.cpp b/src/Formats/ProtobufSerializer.cpp
index f99888975bc522dfe024a1f397d29ea6b06038a1..3539628f98e9b90378c9add188a7fd91e1df8402 100644
--- a/src/Formats/ProtobufSerializer.cpp
+++ b/src/Formats/ProtobufSerializer.cpp
@@ -24,6 +24,8 @@
 # include
 # include
 # include
+# include <DataTypes/Serializations/SerializationDecimal.h>
+# include <DataTypes/Serializations/SerializationFixedString.h>
 # include
 # include
 # include
@@ -583,11 +585,11 @@
         {
             if (row_num < old_size)
             {
-                fixed_string_data_type->alignStringLength(text_buffer, 0);
+                SerializationFixedString::alignStringLength(n, text_buffer, 0);
                 memcpy(data.data() + row_num * n, text_buffer.data(), n);
             }
             else
-                fixed_string_data_type->alignStringLength(data, old_data_size);
+                SerializationFixedString::alignStringLength(n, data, old_data_size);
         }
         else
         {
@@ -817,7 +819,7 @@
                 auto str = default_function();
                 arr.insert(str.data(), str.data() + str.size());
                 if constexpr (is_fixed_string)
-                    fixed_string_data_type->alignStringLength(arr, 0);
+                    SerializationFixedString::alignStringLength(n, arr, 0);
                 default_string = std::move(arr);
             }
             return *default_string;
@@ -1326,7 +1328,7 @@
             if constexpr (std::is_same_v<DecimalType, DateTime64>)
                 readDateTime64Text(decimal, scale, buf);
             else
-                DataTypeDecimal<DecimalType>::readText(decimal, buf, precision, scale);
+                SerializationDecimal<DecimalType>::readText(decimal, buf, precision, scale);
             return decimal;
         }
diff --git a/src/Functions/FunctionsConversion.h b/src/Functions/FunctionsConversion.h
index 5e3831b37a2a9db0765b8b5c57bac5cdd70682b6..10aa1987559e6b58db716ca0fe9d622bfc446b1c 100644
--- a/src/Functions/FunctionsConversion.h
+++ b/src/Functions/FunctionsConversion.h
@@ -26,6 +26,7 @@
 #include
 #include
 #include
+#include <DataTypes/Serializations/SerializationDecimal.h>
 #include
 #include
 #include
@@ -736,9 +737,10 @@ struct ConvertImplGenericToString
         WriteBufferFromVector<ColumnString::Chars> write_buffer(data_to);
 
         FormatSettings format_settings;
+        auto serialization = type.getDefaultSerialization();
         for (size_t i = 0; i < size; ++i)
         {
-            type.serializeAsText(col_from, i, write_buffer, format_settings);
+            serialization->serializeText(col_from, i, write_buffer, format_settings);
             writeChar(0, write_buffer);
             offsets_to[i] = write_buffer.count();
         }
@@ -1026,7 +1028,7 @@ struct ConvertThroughParsing
                         vec_to[i] = value;
                     }
                     else if constexpr (IsDataTypeDecimal<ToDataType>)
-                        ToDataType::readText(vec_to[i], read_buffer, ToDataType::maxPrecision(), vec_to.getScale());
+                        SerializationDecimal<typename ToDataType::FieldType>::readText(vec_to[i], read_buffer, ToDataType::maxPrecision(), vec_to.getScale());
                     else
                         parseImpl<ToDataType>(vec_to[i], read_buffer, local_time_zone);
                 }
@@ -1068,7 +1070,7 @@ struct ConvertThroughParsing
                         vec_to[i] = value;
                     }
                     else if constexpr (IsDataTypeDecimal<ToDataType>)
-                        parsed = ToDataType::tryReadText(vec_to[i], read_buffer, ToDataType::maxPrecision(), vec_to.getScale());
+                        parsed = SerializationDecimal<typename ToDataType::FieldType>::tryReadText(vec_to[i], read_buffer, ToDataType::maxPrecision(), vec_to.getScale());
                     else
                         parsed = tryParseImpl<ToDataType>(vec_to[i], read_buffer, local_time_zone);
                 }
@@ -1132,11 +1134,12 @@ struct ConvertImplGenericFromString
         size_t current_offset = 0;
 
         FormatSettings format_settings;
+        auto serialization = data_type_to.getDefaultSerialization();
         for (size_t i = 0; i < size; ++i)
         {
             ReadBufferFromMemory read_buffer(&chars[current_offset], offsets[i] - current_offset - 1);
 
-            data_type_to.deserializeAsWholeText(column_to, read_buffer, format_settings);
+            serialization->deserializeWholeText(column_to, read_buffer, format_settings);
 
             if (!read_buffer.eof())
                 throwExceptionForIncompletelyParsedValue(read_buffer, result_type);
diff --git a/src/Functions/blockSerializedSize.cpp b/src/Functions/blockSerializedSize.cpp
index 9c9b380ffef7fa0dc47704a72278c914f18845da..5a66c5d08afc1758c6cc70e2f8138c815c8f1c3f 100644
--- a/src/Functions/blockSerializedSize.cpp
+++ b/src/Functions/blockSerializedSize.cpp
@@ -44,18 +44,20 @@ public:
     {
         ColumnPtr full_column = elem.column->convertToFullColumnIfConst();
 
-        IDataType::SerializeBinaryBulkSettings settings;
+        ISerialization::SerializeBinaryBulkSettings settings;
         NullWriteBuffer out;
 
-        settings.getter = [&out](IDataType::SubstreamPath) -> WriteBuffer * { return &out; };
+        settings.getter = [&out](ISerialization::SubstreamPath) -> WriteBuffer * { return &out; };
 
-        IDataType::SerializeBinaryBulkStatePtr state;
+        ISerialization::SerializeBinaryBulkStatePtr state;
 
-        elem.type->serializeBinaryBulkStatePrefix(settings, state);
-        elem.type->serializeBinaryBulkWithMultipleStreams(*full_column,
+        auto serialization = elem.type->getDefaultSerialization();
+
+        serialization->serializeBinaryBulkStatePrefix(settings, state);
+        serialization->serializeBinaryBulkWithMultipleStreams(*full_column,
             0 /** offset */, 0 /** limit */,
             settings, state);
-        elem.type->serializeBinaryBulkStateSuffix(settings, state);
+        serialization->serializeBinaryBulkStateSuffix(settings, state);
 
         return out.count();
     }
diff --git a/src/Functions/connectionID.cpp b/src/Functions/connectionID.cpp
index 8e9c81aed6c2cbbabffc56f60987756161c832d7..0f45d5cb1849754ac186108d2c9d91bba2883c4e 100644
--- a/src/Functions/connectionID.cpp
+++ b/src/Functions/connectionID.cpp
@@ -34,8 +34,8 @@ private:
 
 void registerFunctionConnectionID(FunctionFactory & factory)
 {
-    factory.registerFunction<FunctionConnectionID>();
-    factory.registerAlias("connection_id", "connectionID");
+    factory.registerFunction<FunctionConnectionID>(FunctionFactory::CaseInsensitive);
+    factory.registerAlias("connection_id", "connectionID", FunctionFactory::CaseInsensitive);
 }
 
 }
diff --git a/src/Functions/geometryConverters.h b/src/Functions/geometryConverters.h
index 283bb1bb7f43b5f19bb878c1d29f650e7437d0e5..dc0571dcd70ae05e7f66d997c46c0170939c7cf2 100644
--- a/src/Functions/geometryConverters.h
+++ b/src/Functions/geometryConverters.h
@@ -13,7 +13,7 @@
 #include
 #include
 #include
-#include <DataTypes/DataTypeCustomGeo.h>
+#include <DataTypes/DataTypeFactory.h>
 #include
 #include
 
@@ -328,14 +328,16 @@ struct ConverterType
 template <typename Point, typename F>
 static void callOnGeometryDataType(DataTypePtr type, F && f)
 {
+    const auto & factory = DataTypeFactory::instance();
+
     /// There is no Point type, because for most of geometry functions it is useless.
-    if (DataTypeCustomPointSerialization::nestedDataType()->equals(*type))
+    if (factory.get("Point")->equals(*type))
         return f(ConverterType<ColumnToPointsConverter<Point>>());
-    else if (DataTypeCustomRingSerialization::nestedDataType()->equals(*type))
+    else if (factory.get("Ring")->equals(*type))
         return f(ConverterType<ColumnToRingsConverter<Point>>());
-    else if (DataTypeCustomPolygonSerialization::nestedDataType()->equals(*type))
+    else if (factory.get("Polygon")->equals(*type))
         return f(ConverterType<ColumnToPolygonsConverter<Point>>());
-    else if (DataTypeCustomMultiPolygonSerialization::nestedDataType()->equals(*type))
+    else if (factory.get("MultiPolygon")->equals(*type))
         return f(ConverterType<ColumnToMultiPolygonsConverter<Point>>());
     throw Exception(fmt::format("Unknown geometry type {}", type->getName()), ErrorCodes::BAD_ARGUMENTS);
 }
diff --git a/src/Functions/polygonArea.cpp b/src/Functions/polygonArea.cpp
index fb1ba7c4a0149c43db3948ad4bd950d1a5847bca..7de0869609ee670344e8622ba44dab5c72fabbbf 100644
--- a/src/Functions/polygonArea.cpp
+++ b/src/Functions/polygonArea.cpp
@@ -13,6 +13,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
diff --git a/src/Functions/polygonConvexHull.cpp b/src/Functions/polygonConvexHull.cpp
index 5f545cf8ea142c18226a377d3f7b4e21d950c9bc..f445b4e40729624a8912a27b26a5ddff5a680b64 100644
--- a/src/Functions/polygonConvexHull.cpp
+++ b/src/Functions/polygonConvexHull.cpp
@@ -55,7 +55,7 @@ public:
 
     DataTypePtr getReturnTypeImpl(const DataTypes &) const override
     {
-        return DataTypeCustomPolygonSerialization::nestedDataType();
+        return DataTypeFactory::instance().get("Polygon");
     }
 
     ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & /*result_type*/, size_t input_rows_count) const override
diff --git a/src/Functions/polygonPerimeter.cpp b/src/Functions/polygonPerimeter.cpp
index c3aadbd187a8e722f8a78274a57903b49e2fc342..e361fa2a38776c6c2c295346f4fab86354d83058 100644
--- a/src/Functions/polygonPerimeter.cpp
+++ b/src/Functions/polygonPerimeter.cpp
@@ -12,7 +12,7 @@
 #include
 #include
 #include
-#include
+#include
 #include
 #include
diff --git a/src/Functions/polygonsIntersection.cpp b/src/Functions/polygonsIntersection.cpp
index 0de3d023044686b4ea6be5e5e163754a4bcd52af..8eaf6b97b2b32851ecab7d7cba875f3dce2485f7 100644
--- a/src/Functions/polygonsIntersection.cpp
+++ b/src/Functions/polygonsIntersection.cpp
@@ -57,7 +57,7 @@ public:
 
     DataTypePtr getReturnTypeImpl(const DataTypes &) const override
     {
        /// Intersection of each figure with each could easily be represented as MultiPolygon.
-        return DataTypeCustomMultiPolygonSerialization::nestedDataType();
+        return DataTypeFactory::instance().get("MultiPolygon");
     }
 
     ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & /*result_type*/, size_t input_rows_count) const override
diff --git a/src/Functions/polygonsSymDifference.cpp b/src/Functions/polygonsSymDifference.cpp
index 9ab6b79c5f59f0872dbe0bf6a7d75a9886d980a3..a7bdf0af32f369a742dc6875ba00244401a968ee 100644
--- a/src/Functions/polygonsSymDifference.cpp
+++ b/src/Functions/polygonsSymDifference.cpp
@@ -56,7 +56,7 @@ public:
 
     DataTypePtr getReturnTypeImpl(const DataTypes &) const override
     {
-        return DataTypeCustomMultiPolygonSerialization::nestedDataType();
+        return DataTypeFactory::instance().get("MultiPolygon");
     }
 
     ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & /*result_type*/, size_t input_rows_count) const override
diff --git a/src/Functions/polygonsUnion.cpp b/src/Functions/polygonsUnion.cpp
index eab2e2e588fb2bc83ba4d019231f8e754abee608..834629f90ad67505006bc8ea9c4e2ab560dc9c4d 100644
--- a/src/Functions/polygonsUnion.cpp
+++ b/src/Functions/polygonsUnion.cpp
@@ -56,7 +56,7 @@ public:
 
     DataTypePtr getReturnTypeImpl(const DataTypes &) const override
     {
-        return DataTypeCustomMultiPolygonSerialization::nestedDataType();
+        return DataTypeFactory::instance().get("MultiPolygon");
     }
 
     ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & /*result_type*/, size_t input_rows_count) const override
diff --git a/src/Functions/readWkt.cpp b/src/Functions/readWkt.cpp
index 101902a00f54976d04336a4480261c16c32deb6d..47312be2aa0fb22df001a5f2c3f746bb0fe8e8c8 100644
--- a/src/Functions/readWkt.cpp
+++ b/src/Functions/readWkt.cpp
@@ -17,7 +17,7 @@ namespace ErrorCodes
 }
 
 
-template <class DataType, class Geometry, class Serializer, class NameHolder>
+template <class DataTypeName, class Geometry, class Serializer, class NameHolder>
 class FunctionReadWkt : public IFunction
 {
 public:
@@ -43,7 +43,7 @@ public:
                 ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
         }
 
-        return DataType::nestedDataType();
+        return DataTypeFactory::instance().get(DataTypeName().getName());
     }
 
     ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & /*result_type*/, size_t input_rows_count) const override
@@ -70,7 +70,7 @@ public:
 
     static FunctionPtr create(const Context &)
     {
-        return std::make_shared<FunctionReadWkt<DataType, Geometry, Serializer, NameHolder>>();
+        return std::make_shared<FunctionReadWkt<DataTypeName, Geometry, Serializer, NameHolder>>();
     }
 };
 
@@ -96,10 +96,10 @@ struct ReadWktMultiPolygonNameHolder
 
 void registerFunctionReadWkt(FunctionFactory & factory)
 {
-    factory.registerFunction<FunctionReadWkt<DataTypeCustomPointSerialization, CartesianPoint, PointSerializer<CartesianPoint>, ReadWktPointNameHolder>>();
-    factory.registerFunction<FunctionReadWkt<DataTypeCustomRingSerialization, CartesianRing, RingSerializer<CartesianPoint>, ReadWktRingNameHolder>>();
-    factory.registerFunction<FunctionReadWkt<DataTypeCustomPolygonSerialization, CartesianPolygon, PolygonSerializer<CartesianPoint>, ReadWktPolygonNameHolder>>();
-    factory.registerFunction<FunctionReadWkt<DataTypeCustomMultiPolygonSerialization, CartesianMultiPolygon, MultiPolygonSerializer<CartesianPoint>, ReadWktMultiPolygonNameHolder>>();
+    factory.registerFunction<FunctionReadWkt<DataTypePointName, CartesianPoint, PointSerializer<CartesianPoint>, ReadWktPointNameHolder>>();
+    factory.registerFunction<FunctionReadWkt<DataTypeRingName, CartesianRing, RingSerializer<CartesianPoint>, ReadWktRingNameHolder>>();
+    factory.registerFunction<FunctionReadWkt<DataTypePolygonName, CartesianPolygon, PolygonSerializer<CartesianPoint>, ReadWktPolygonNameHolder>>();
+    factory.registerFunction<FunctionReadWkt<DataTypeMultiPolygonName, CartesianMultiPolygon, MultiPolygonSerializer<CartesianPoint>, ReadWktMultiPolygonNameHolder>>();
 }
 
 }
diff --git a/src/Functions/runningConcurrency.cpp b/src/Functions/runningConcurrency.cpp
index a225e3152e703cb055033e89ccd847a3f7ae550c..e8be7b9c200b9ee2f71f91c522b389037a6cee2c 100644
--- a/src/Functions/runningConcurrency.cpp
+++ b/src/Functions/runningConcurrency.cpp
@@ -47,6 +47,8 @@ namespace DB
             typename ColVecConc::Container & vec_concurrency = col_concurrency->getData();
 
             std::multiset ongoing_until;
+            auto begin_serialization = arguments[0].type->getDefaultSerialization();
+            auto end_serialization = arguments[1].type->getDefaultSerialization();
             for (size_t i = 0; i < input_rows_count; ++i)
             {
                 const auto begin = vec_begin[i];
@@ -56,8 +58,8 @@ namespace DB
                 {
                     const FormatSettings default_format;
                     WriteBufferFromOwnString buf_begin, buf_end;
-                    arguments[0].type->serializeAsTextQuoted(*(arguments[0].column), i, buf_begin, default_format);
-                    arguments[1].type->serializeAsTextQuoted(*(arguments[1].column), i, buf_end, default_format);
+                    begin_serialization->serializeTextQuoted(*(arguments[0].column), i, buf_begin, default_format);
+                    end_serialization->serializeTextQuoted(*(arguments[1].column), i, buf_end, default_format);
                     throw Exception(
                         "Incorrect order of events: " + buf_begin.str() + " > " + buf_end.str(),
                         ErrorCodes::INCORRECT_DATA);
diff --git a/src/Functions/version.cpp b/src/Functions/version.cpp
index 404c03223ad534081f7671d531ff03b4e40786c8..37f9c7b1e98567dcdccb7120d17895502388799e 100644
--- a/src/Functions/version.cpp
+++ b/src/Functions/version.cpp
@@ -49,7 +49,7 @@ public:
 
 void registerFunctionVersion(FunctionFactory & factory)
 {
-    factory.registerFunction<FunctionVersion>();
+    factory.registerFunction<FunctionVersion>(FunctionFactory::CaseInsensitive);
 }
 
 }
diff --git a/src/Functions/visibleWidth.cpp b/src/Functions/visibleWidth.cpp
index 1fe36208de9567e932244191b0445d2172aac2a5..ed4d2b3d501251dc6fec611ae4ea9b18cb117ddc 100644
--- a/src/Functions/visibleWidth.cpp
+++ b/src/Functions/visibleWidth.cpp
@@ -58,11 +58,12 @@ public:
 
         String tmp;
         FormatSettings format_settings;
+        auto serialization = src.type->getDefaultSerialization();
 
         for (size_t i = 0; i < size; ++i)
         {
             {
                 WriteBufferFromString out(tmp);
-                src.type->serializeAsText(*src.column, i, out, format_settings);
+                serialization->serializeText(*src.column, i, out, format_settings);
             }
 
             res_data[i] = UTF8::countCodePoints(reinterpret_cast<const UInt8 *>(tmp.data()), tmp.size());
diff --git a/src/Interpreters/Cluster.cpp b/src/Interpreters/Cluster.cpp
index b77d5019d48b48865b5a6d24bca1bb6a8c0c01ee..ad759b99c296c1305c34d5e873405c8e1ea70688 100644
--- a/src/Interpreters/Cluster.cpp
+++ b/src/Interpreters/Cluster.cpp
@@ -116,7 +116,9 @@ Cluster::Address::Address(
     const String & password_,
     UInt16 clickhouse_port,
     bool secure_,
-    Int64 priority_)
+    Int64 priority_,
+    UInt32 shard_index_,
+    UInt32 replica_index_)
     : user(user_)
     , password(password_)
 {
@@ -126,6 +128,8 @@ Cluster::Address::Address(
     secure = secure_ ? Protocol::Secure::Enable : Protocol::Secure::Disable;
Protocol::Secure::Enable : Protocol::Secure::Disable; priority = priority_; is_local = isLocal(clickhouse_port); + shard_index = shard_index_; + replica_index = replica_index_; } @@ -491,7 +495,7 @@ Cluster::Cluster(const Settings & settings, const std::vector #include #include +#include #include #include #include @@ -1800,11 +1801,14 @@ std::optional<UInt16> Context::getTCPPortSecure() const std::shared_ptr<Cluster> Context::getCluster(const std::string & cluster_name) const { auto res = getClusters().getCluster(cluster_name); + if (res) + return res; - if (!res) - throw Exception("Requested cluster '" + cluster_name + "' not found", ErrorCodes::BAD_GET); + res = tryGetReplicatedDatabaseCluster(cluster_name); + if (res) + return res; - return res; + throw Exception("Requested cluster '" + cluster_name + "' not found", ErrorCodes::BAD_GET); } diff --git a/src/Interpreters/DDLTask.cpp b/src/Interpreters/DDLTask.cpp index 0c4c8a1bc3422e85da5f48719bf0caacc4fcf17d..1cfd113e81f5af6f52a9d77f399334f5306762b9 100644 --- a/src/Interpreters/DDLTask.cpp +++ b/src/Interpreters/DDLTask.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -43,20 +44,47 @@ bool HostID::isLocalAddress(UInt16 clickhouse_port) const } } +void DDLLogEntry::assertVersion() const +{ + constexpr UInt64 max_version = 2; + if (version == 0 || max_version < version) + throw Exception(ErrorCodes::UNKNOWN_FORMAT_VERSION, "Unknown DDLLogEntry format version: {}. " + "Maximum supported version is {}", version, max_version); +} + +void DDLLogEntry::setSettingsIfRequired(const Context & context) +{ + version = context.getSettingsRef().distributed_ddl_entry_format_version; + if (version == 2) + settings.emplace(context.getSettingsRef().changes()); +} String DDLLogEntry::toString() const { WriteBufferFromOwnString wb; - Strings host_id_strings(hosts.size()); - std::transform(hosts.begin(), hosts.end(), host_id_strings.begin(), HostID::applyToString); - - auto version = CURRENT_VERSION; wb << "version: " << version << "\n"; wb << "query: " << escape << query << "\n"; + + bool write_hosts = version == 1 || !hosts.empty(); + if (write_hosts) + { + Strings host_id_strings(hosts.size()); + std::transform(hosts.begin(), hosts.end(), host_id_strings.begin(), HostID::applyToString); + wb << "hosts: " << host_id_strings << "\n"; + } + wb << "initiator: " << initiator << "\n"; + bool write_settings = 1 <= version && settings && !settings->empty(); + if (write_settings) + { + ASTSetQuery ast; + ast.is_standalone = false; + ast.changes = *settings; + wb << "settings: " << serializeAST(ast) << "\n"; + } + return wb.str(); } @@ -64,25 +92,46 @@ void DDLLogEntry::parse(const String & data) { ReadBufferFromString rb(data); - int version; rb >> "version: " >> version >> "\n"; - - if (version != CURRENT_VERSION) - throw Exception(ErrorCodes::UNKNOWN_FORMAT_VERSION, "Unknown DDLLogEntry format version: {}", version); + assertVersion(); Strings host_id_strings; rb >> "query: " >> escape >> query >> "\n"; - rb >> "hosts: " >> host_id_strings >> "\n"; + if (version == 1) + { + rb >> "hosts: " >> host_id_strings >> "\n"; - if (!rb.eof()) - rb >> "initiator: " >> initiator >> "\n"; - else - initiator.clear(); + if (!rb.eof()) + rb >> "initiator: " >> initiator >> "\n"; + else + initiator.clear(); + } + else if (version == 2) + { + + if (!rb.eof() && *rb.position() == 'h') + rb >> "hosts: " >> host_id_strings >> "\n"; + if (!rb.eof() && *rb.position() == 'i') + rb >> "initiator: " >> initiator >> "\n"; + if
(!rb.eof() && *rb.position() == 's') + { + String settings_str; + rb >> "settings: " >> settings_str >> "\n"; + ParserSetQuery parser{true}; + constexpr UInt64 max_size = 4096; + constexpr UInt64 max_depth = 16; + ASTPtr settings_ast = parseQuery(parser, settings_str, max_size, max_depth); + settings.emplace(std::move(settings_ast->as<ASTSetQuery>()->changes)); + } + } assertEOF(rb); - hosts.resize(host_id_strings.size()); - std::transform(host_id_strings.begin(), host_id_strings.end(), hosts.begin(), HostID::fromString); + if (!host_id_strings.empty()) + { + hosts.resize(host_id_strings.size()); + std::transform(host_id_strings.begin(), host_id_strings.end(), hosts.begin(), HostID::fromString); + } } @@ -102,6 +151,8 @@ std::unique_ptr<Context> DDLTaskBase::makeQueryContext(Context & from_context, c query_context->makeQueryContext(); query_context->setCurrentQueryId(""); // generate random query_id query_context->getClientInfo().query_kind = ClientInfo::QueryKind::SECONDARY_QUERY; + if (entry.settings) + query_context->applySettingsChanges(*entry.settings); return query_context; } @@ -345,4 +396,11 @@ void ZooKeeperMetadataTransaction::commit() state = COMMITTED; } +ClusterPtr tryGetReplicatedDatabaseCluster(const String & cluster_name) +{ + if (const auto * replicated_db = dynamic_cast<const DatabaseReplicated *>(DatabaseCatalog::instance().tryGetDatabase(cluster_name).get())) + return replicated_db->getCluster(); + return {}; +} + } diff --git a/src/Interpreters/DDLTask.h b/src/Interpreters/DDLTask.h index 45702599fcf6e2e7e8a5399010f868be3c7a2299..b794668f80272064a338457c8af0021b6bdc003d 100644 --- a/src/Interpreters/DDLTask.h +++ b/src/Interpreters/DDLTask.h @@ -18,6 +18,7 @@ namespace DB class ASTQueryWithOnCluster; using ZooKeeperPtr = std::shared_ptr<zkutil::ZooKeeper>; +using ClusterPtr = std::shared_ptr<Cluster>; class DatabaseReplicated; class ZooKeeperMetadataTransaction; @@ -56,15 +57,16 @@ struct HostID struct DDLLogEntry { + UInt64 version = 1; String query; std::vector<HostID> hosts; String initiator; // optional + std::optional<SettingsChanges> settings; - static constexpr int CURRENT_VERSION = 1; - + void setSettingsIfRequired(const Context & context); String toString() const; - void parse(const String & data); + void assertVersion() const; }; struct DDLTaskBase @@ -192,4 +194,6 @@ public: ~ZooKeeperMetadataTransaction() { assert(isExecuted() || std::uncaught_exceptions()); } }; +ClusterPtr tryGetReplicatedDatabaseCluster(const String & cluster_name); + }
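For reference, the version-2 entry produced by DDLLogEntry::toString() above is a line-oriented text record in which the hosts, initiator and settings lines may each be absent; that is why parse() peeks at the first character ('h', 'i', 's') before consuming a line. An illustrative entry (query, host names and settings invented for the example):

    version: 2
    query: CREATE TABLE default.t ON CLUSTER test_cluster (n Int32) ENGINE = Memory
    hosts: ['host1:9000','host2:9000']
    initiator: host1:9000
    settings: distributed_ddl_task_timeout = 180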
diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index f8bcbf02ab48b14f088086a4becc7705498ad56a..8cd93546ad0a6d79e0e31ee8d01a70e171d725ae 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -713,6 +713,19 @@ void InterpreterCreateQuery::setEngine(ASTCreateQuery & create) const } } +static void generateUUIDForTable(ASTCreateQuery & create) +{ + if (create.uuid == UUIDHelpers::Nil) + create.uuid = UUIDHelpers::generateV4(); + + /** If the destination table (to_table_id) is not specified for a materialized view, + * then the MV will create an inner table. We should generate the UUID of the inner table here, + * so it is the same on all hosts when the query is ON CLUSTER or the database engine is Replicated. + */ + bool need_uuid_for_inner_table = create.is_materialized_view && !create.to_table_id; + if (need_uuid_for_inner_table && create.to_inner_uuid == UUIDHelpers::Nil) + create.to_inner_uuid = UUIDHelpers::generateV4(); +} + void InterpreterCreateQuery::assertOrSetUUID(ASTCreateQuery & create, const DatabasePtr & database) const { const auto * kind = create.is_dictionary ? "Dictionary" : "Table"; @@ -744,18 +757,19 @@ void InterpreterCreateQuery::assertOrSetUUID(ASTCreateQuery & create, const Data kind_upper, create.table); } - if (create.uuid == UUIDHelpers::Nil) - create.uuid = UUIDHelpers::generateV4(); + generateUUIDForTable(create); } else { bool is_on_cluster = context.getClientInfo().query_kind == ClientInfo::QueryKind::SECONDARY_QUERY; - if (create.uuid != UUIDHelpers::Nil && !is_on_cluster) + bool has_uuid = create.uuid != UUIDHelpers::Nil || create.to_inner_uuid != UUIDHelpers::Nil; + if (has_uuid && !is_on_cluster) throw Exception(ErrorCodes::INCORRECT_QUERY, "{} UUID specified, but engine of database {} is not Atomic", kind, create.database); /// Ignore UUID if it's ON CLUSTER query create.uuid = UUIDHelpers::Nil; + create.to_inner_uuid = UUIDHelpers::Nil; } if (create.replace_table) @@ -804,6 +818,17 @@ BlockIO InterpreterCreateQuery::createTable(ASTCreateQuery & create) if (create.attach && !create.storage && !create.columns_list) { auto database = DatabaseCatalog::instance().getDatabase(database_name); + if (database->getEngineName() == "Replicated") + { + auto guard = DatabaseCatalog::instance().getDDLGuard(database_name, create.table); + if (typeid_cast<DatabaseReplicated *>(database.get()) && context.getClientInfo().query_kind != ClientInfo::QueryKind::SECONDARY_QUERY) + { + create.database = database_name; + guard->releaseTableLock(); + return typeid_cast<DatabaseReplicated *>(database.get())->tryEnqueueReplicatedDDL(query_ptr, context); + } + } + bool if_not_exists = create.if_not_exists; // Table SQL definition is available even if the table is detached (even permanently) @@ -877,7 +902,6 @@ BlockIO InterpreterCreateQuery::createTable(ASTCreateQuery & create) if (need_add_to_database && database->getEngineName() == "Replicated") { auto guard = DatabaseCatalog::instance().getDDLGuard(create.database, create.table); - database = DatabaseCatalog::instance().getDatabase(create.database); if (typeid_cast<DatabaseReplicated *>(database.get()) && context.getClientInfo().query_kind != ClientInfo::QueryKind::SECONDARY_QUERY) { assertOrSetUUID(create, database); @@ -1136,8 +1160,7 @@ void InterpreterCreateQuery::prepareOnClusterQuery(ASTCreateQuery & create, cons /// For CREATE query generate UUID on initiator, so it will be the same on all hosts. /// It will be ignored if database does not support UUIDs. - if (create.uuid == UUIDHelpers::Nil) - create.uuid = UUIDHelpers::generateV4(); + generateUUIDForTable(create);
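A minimal, self-contained sketch of the invariant that generateUUIDForTable() establishes at both call sites above (stand-in types, not the real ASTCreateQuery or UUIDHelpers): UUIDs are assigned exactly once, on the initiator, and a materialized view without an explicit TO table also gets a UUID for its inner table, so every host of an ON CLUSTER query creates identical objects:

    #include <cassert>
    #include <random>

    /// Illustrative stand-ins for UUIDHelpers::Nil and generateV4(); not the real types.
    using UUID = unsigned long long;
    constexpr UUID Nil = 0;
    UUID generateV4() { static std::mt19937_64 gen{42}; return gen() | 1; }

    struct CreateQuery              /// simplified stand-in for ASTCreateQuery
    {
        bool is_materialized_view = false;
        bool has_to_table = false;  /// models create.to_table_id
        UUID uuid = Nil;
        UUID to_inner_uuid = Nil;
    };

    void generateUUIDForTable(CreateQuery & create)
    {
        if (create.uuid == Nil)
            create.uuid = generateV4();
        /// An MV without an explicit TO table owns an inner table,
        /// whose UUID must likewise be fixed on the initiator.
        if (create.is_materialized_view && !create.has_to_table && create.to_inner_uuid == Nil)
            create.to_inner_uuid = generateV4();
    }

    int main()
    {
        CreateQuery mv;
        mv.is_materialized_view = true;
        generateUUIDForTable(mv);
        UUID u = mv.uuid, inner = mv.to_inner_uuid;
        generateUUIDForTable(mv);   /// idempotent: a second call changes nothing
        assert(mv.uuid == u && mv.to_inner_uuid == inner && inner != Nil);
    }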
/// For cross-replication cluster we cannot use UUID in replica path. String cluster_name_expanded = context.getMacros()->expand(cluster_name); diff --git a/src/Interpreters/InterpreterDropQuery.cpp b/src/Interpreters/InterpreterDropQuery.cpp index 33e93a79c418caa60c28374c211b1bac9cd9502e..b30996b1dbf62404ce060b4a7be478b354227aed 100644 --- a/src/Interpreters/InterpreterDropQuery.cpp +++ b/src/Interpreters/InterpreterDropQuery.cpp @@ -133,10 +133,6 @@ BlockIO InterpreterDropQuery::executeToTableImpl(const ASTDropQuery & query, Dat !is_drop_or_detach_database; if (is_replicated_ddl_query) { - if (query.kind == ASTDropQuery::Kind::Detach && !query.permanently) - throw Exception(ErrorCodes::INCORRECT_QUERY, "DETACH TABLE is not allowed for Replicated databases. " - "Use DETACH TABLE PERMANENTLY or SYSTEM RESTART REPLICA"); - if (query.kind == ASTDropQuery::Kind::Detach) context.checkAccess(table->isView() ? AccessType::DROP_VIEW : AccessType::DROP_TABLE, table_id); else if (query.kind == ASTDropQuery::Kind::Truncate) diff --git a/src/Interpreters/InterpreterShowCreateQuery.cpp b/src/Interpreters/InterpreterShowCreateQuery.cpp index 10c8339c13528421b7df3f690a1e23ba8e3e5f7f..a853c4b9a9bb39b6a2310a3ef0b40f6f394bb597 100644 --- a/src/Interpreters/InterpreterShowCreateQuery.cpp +++ b/src/Interpreters/InterpreterShowCreateQuery.cpp @@ -82,6 +82,7 @@ BlockInputStreamPtr InterpreterShowCreateQuery::executeImpl() { auto & create = create_query->as<ASTCreateQuery &>(); create.uuid = UUIDHelpers::Nil; + create.to_inner_uuid = UUIDHelpers::Nil; } WriteBufferFromOwnString buf; diff --git a/src/Interpreters/ReplaceQueryParameterVisitor.cpp b/src/Interpreters/ReplaceQueryParameterVisitor.cpp index 9b4223b8947220158cfb03cfbb1ada9570377cd3..8d737f27e6421b8745479fc422df961729839d49 100644 --- a/src/Interpreters/ReplaceQueryParameterVisitor.cpp +++ b/src/Interpreters/ReplaceQueryParameterVisitor.cpp @@ -61,7 +61,7 @@ void ReplaceQueryParameterVisitor::visitQueryParameter(ASTPtr & ast) IColumn & temp_column = *temp_column_ptr; ReadBufferFromString read_buffer{value}; FormatSettings format_settings; - data_type->deserializeAsTextEscaped(temp_column, read_buffer, format_settings); + data_type->getDefaultSerialization()->deserializeTextEscaped(temp_column, read_buffer, format_settings); if (!read_buffer.eof()) throw Exception(ErrorCodes::BAD_QUERY_PARAMETER, diff --git a/src/Interpreters/convertFieldToType.cpp b/src/Interpreters/convertFieldToType.cpp index 5d124add0df3df335226a76e40a8b643ebaea8a7..90b840ce8bdded01c57463917ade29256b37f7d9 100644 --- a/src/Interpreters/convertFieldToType.cpp +++ b/src/Interpreters/convertFieldToType.cpp @@ -344,7 +344,7 @@ Field convertFieldToTypeImpl(const Field & src, const IDataType & type, const ID ReadBufferFromString in_buffer(src.get<String>()); try { - type_to_parse->deserializeAsWholeText(*col, in_buffer, FormatSettings{}); + type_to_parse->getDefaultSerialization()->deserializeWholeText(*col, in_buffer, FormatSettings{}); } catch (Exception & e) { diff --git a/src/Interpreters/evaluateConstantExpression.cpp b/src/Interpreters/evaluateConstantExpression.cpp index db19c000cfd88e11064d79f76d8a01d919ed3979..a3301bcf55bbd21a310ae7fb5d6d374d9a0b35ef 100644 --- a/src/Interpreters/evaluateConstantExpression.cpp +++ b/src/Interpreters/evaluateConstantExpression.cpp @@ -166,9 +166,9 @@ namespace return result; } - Disjunction analyzeFunction(const ASTFunction * fn, const ExpressionActionsPtr & expr) + Disjunction analyzeFunction(const ASTFunction * fn, const ExpressionActionsPtr & expr, size_t & limit) { - if (!fn) + if (!fn || !limit) { return {}; } @@ -182,6
namespace const auto * identifier = left->as<ASTIdentifier>() ? left->as<ASTIdentifier>() : right->as<ASTIdentifier>(); const auto * literal = left->as<ASTLiteral>() ? left->as<ASTLiteral>() : right->as<ASTLiteral>(); + --limit; return analyzeEquals(identifier, literal, expr); } else if (fn->name == "in") @@ -192,6 +193,19 @@ namespace Disjunction result; + auto add_dnf = [&](const auto & dnf) + { + if (dnf.size() > limit) + { + result.clear(); + return false; + } + + result.insert(result.end(), dnf.begin(), dnf.end()); + limit -= dnf.size(); + return true; + }; + if (const auto * tuple_func = right->as<ASTFunction>(); tuple_func && tuple_func->name == "tuple") { const auto * tuple_elements = tuple_func->children.front()->as<ASTExpressionList>(); @@ -205,7 +219,10 @@ namespace return {}; } - result.insert(result.end(), dnf.begin(), dnf.end()); + if (!add_dnf(dnf)) + { + return {}; + } } } else if (const auto * tuple_literal = right->as<ASTLiteral>(); @@ -221,7 +238,10 @@ namespace return {}; } - result.insert(result.end(), dnf.begin(), dnf.end()); + if (!add_dnf(dnf)) + { + return {}; + } } } else @@ -244,13 +264,14 @@ namespace for (const auto & arg : args->children) { - const auto dnf = analyzeFunction(arg->as<ASTFunction>(), expr); + const auto dnf = analyzeFunction(arg->as<ASTFunction>(), expr, limit); if (dnf.empty()) { return {}; } + /// limit accounted in analyzeFunction() result.insert(result.end(), dnf.begin(), dnf.end()); } @@ -269,13 +290,14 @@ namespace for (const auto & arg : args->children) { - const auto dnf = analyzeFunction(arg->as<ASTFunction>(), expr); + const auto dnf = analyzeFunction(arg->as<ASTFunction>(), expr, limit); if (dnf.empty()) { continue; } + /// limit accounted in analyzeFunction() result = andDNF(result, dnf); } @@ -286,15 +308,15 @@ namespace } } -std::optional<Blocks> evaluateExpressionOverConstantCondition(const ASTPtr & node, const ExpressionActionsPtr & target_expr) +std::optional<Blocks> evaluateExpressionOverConstantCondition(const ASTPtr & node, const ExpressionActionsPtr & target_expr, size_t & limit) { Blocks result; if (const auto * fn = node->as<ASTFunction>()) { - const auto dnf = analyzeFunction(fn, target_expr); + const auto dnf = analyzeFunction(fn, target_expr, limit); - if (dnf.empty()) + if (dnf.empty() || !limit) { return {}; } diff --git a/src/Interpreters/evaluateConstantExpression.h b/src/Interpreters/evaluateConstantExpression.h index 8e3fa08a626dc253dc0c3dd840151fe2a1e0fd15..c797b8461de0b8b413ef8c1591ebdb183213c88e 100644 --- a/src/Interpreters/evaluateConstantExpression.h +++ b/src/Interpreters/evaluateConstantExpression.h @@ -46,10 +46,11 @@ ASTPtr evaluateConstantExpressionForDatabaseName(const ASTPtr & node, const Cont /** Try to fold condition to countable set of constant values. * @param node a condition that we try to fold. * @param target_expr expression evaluated over a set of constants. + * @param limit maximum number of constant values the fold may produce * @return optional blocks each with a single row and a single column for target expression, * or empty blocks if condition is always false, * or nothing if condition can't be folded to a set of constants. */
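The new limit parameter acts as a running budget on the size of the folded DNF: every produced disjunct decrements it, and once it reaches zero the fold gives up. A self-contained sketch of that accounting, with a toy stand-in for the IN-list case (not the real AST types):

    #include <cassert>
    #include <optional>
    #include <vector>

    /// Toy model of the budget: each element of an IN-list becomes one disjunct,
    /// and the fold refuses ("can't fold") once the budget is spent.
    using Disjunction = std::vector<int>;

    std::optional<Disjunction> foldInList(const std::vector<int> & elements, size_t & limit)
    {
        Disjunction result;
        for (int value : elements)
        {
            if (limit == 0)
                return std::nullopt;    /// too many constant values: give up
            result.push_back(value);
            --limit;
        }
        return result;
    }

    int main()
    {
        size_t budget = 3;
        assert(foldInList({1, 2, 3}, budget) && budget == 0);   /// fits exactly
        size_t small = 3;
        assert(!foldInList({1, 2, 3, 4}, small));               /// exceeds the budget
    }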
-std::optional<Blocks> evaluateExpressionOverConstantCondition(const ASTPtr & node, const ExpressionActionsPtr & target_expr); +std::optional<Blocks> evaluateExpressionOverConstantCondition(const ASTPtr & node, const ExpressionActionsPtr & target_expr, size_t & limit); } diff --git a/src/Interpreters/executeDDLQueryOnCluster.cpp b/src/Interpreters/executeDDLQueryOnCluster.cpp index c498eb21379fe1f9dcd89ccab49d0e052886265c..d4e8d06e6134747623b720de12f8b5d09aac8c27 100644 --- a/src/Interpreters/executeDDLQueryOnCluster.cpp +++ b/src/Interpreters/executeDDLQueryOnCluster.cpp @@ -13,6 +13,9 @@ #include #include #include +#include +#include +#include #include namespace fs = std::filesystem; @@ -160,18 +163,32 @@ BlockIO executeDDLQueryOnCluster(const ASTPtr & query_ptr_, const Context & cont entry.hosts = std::move(hosts); entry.query = queryToString(query_ptr); entry.initiator = ddl_worker.getCommonHostID(); + entry.setSettingsIfRequired(context); String node_path = ddl_worker.enqueueQuery(entry); + return getDistributedDDLStatus(node_path, entry, context); +} + +BlockIO getDistributedDDLStatus(const String & node_path, const DDLLogEntry & entry, const Context & context, const std::optional<Strings> & hosts_to_wait) +{ BlockIO io; if (context.getSettingsRef().distributed_ddl_task_timeout == 0) return io; - auto stream = std::make_shared<DDLQueryStatusInputStream>(node_path, entry, context); - io.in = std::move(stream); + auto stream = std::make_shared<DDLQueryStatusInputStream>(node_path, entry, context, hosts_to_wait); + if (context.getSettingsRef().distributed_ddl_output_mode == DistributedDDLOutputMode::NONE) + { + /// Wait for query to finish, but ignore output + NullBlockOutputStream output{Block{}}; + copyData(*stream, output); + } + else + { + io.in = std::move(stream); + } return io; } - DDLQueryStatusInputStream::DDLQueryStatusInputStream(const String & zk_node_path, const DDLLogEntry & entry, const Context & context_, const std::optional<Strings> & hosts_to_wait) : node_path(zk_node_path) @@ -179,19 +196,36 @@ DDLQueryStatusInputStream::DDLQueryStatusInputStream(const String & zk_node_path , watch(CLOCK_MONOTONIC_COARSE) , log(&Poco::Logger::get("DDLQueryStatusInputStream")) { + if (context.getSettingsRef().distributed_ddl_output_mode == DistributedDDLOutputMode::THROW || + context.getSettingsRef().distributed_ddl_output_mode == DistributedDDLOutputMode::NONE) + throw_on_timeout = true; + else if (context.getSettingsRef().distributed_ddl_output_mode == DistributedDDLOutputMode::NULL_STATUS_ON_TIMEOUT || + context.getSettingsRef().distributed_ddl_output_mode == DistributedDDLOutputMode::NEVER_THROW) + throw_on_timeout = false; + else + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown output mode"); + + auto maybe_make_nullable = [&](const DataTypePtr & type) -> DataTypePtr + { + if (throw_on_timeout) + return type; + return std::make_shared<DataTypeNullable>(type); + }; + sample = Block{ - {std::make_shared<DataTypeString>(), "host"}, - {std::make_shared<DataTypeUInt16>(), "port"}, - {std::make_shared<DataTypeInt64>(), "status"}, - {std::make_shared<DataTypeString>(), "error"}, - {std::make_shared<DataTypeUInt64>(), "num_hosts_remaining"}, - {std::make_shared<DataTypeUInt64>(), "num_hosts_active"}, + {std::make_shared<DataTypeString>(), "host"}, + {std::make_shared<DataTypeUInt16>(), "port"}, + {maybe_make_nullable(std::make_shared<DataTypeInt64>()), "status"}, + {maybe_make_nullable(std::make_shared<DataTypeString>()), "error"}, + {std::make_shared<DataTypeUInt64>(), "num_hosts_remaining"}, + {std::make_shared<DataTypeUInt64>(), "num_hosts_active"}, }; if (hosts_to_wait) { waiting_hosts = NameSet(hosts_to_wait->begin(), hosts_to_wait->end()); by_hostname = false; + sample.erase("port"); } else { @@ -204,12 +238,29 @@
DDLQueryStatusInputStream::DDLQueryStatusInputStream(const String & zk_node_path timeout_seconds = context.getSettingsRef().distributed_ddl_task_timeout; } +std::pair<String, UInt16> DDLQueryStatusInputStream::parseHostAndPort(const String & host_id) const +{ + String host = host_id; + UInt16 port = 0; + if (by_hostname) + { + auto host_and_port = Cluster::Address::fromString(host_id); + host = host_and_port.first; + port = host_and_port.second; + } + return {host, port}; +} + Block DDLQueryStatusInputStream::readImpl() { Block res; - if (num_hosts_finished >= waiting_hosts.size()) + bool all_hosts_finished = num_hosts_finished >= waiting_hosts.size(); + /// Seems like num_hosts_finished cannot be strictly greater than waiting_hosts.size() + assert(num_hosts_finished <= waiting_hosts.size()); + if (all_hosts_finished || timeout_exceeded) { - if (first_exception) + bool throw_if_error_on_host = context.getSettingsRef().distributed_ddl_output_mode != DistributedDDLOutputMode::NEVER_THROW; + if (first_exception && throw_if_error_on_host) throw Exception(*first_exception); return res; @@ -222,7 +273,8 @@ Block DDLQueryStatusInputStream::readImpl() { if (isCancelled()) { - if (first_exception) + bool throw_if_error_on_host = context.getSettingsRef().distributed_ddl_output_mode != DistributedDDLOutputMode::NEVER_THROW; + if (first_exception && throw_if_error_on_host) throw Exception(*first_exception); return res; @@ -233,11 +285,36 @@ Block DDLQueryStatusInputStream::readImpl() size_t num_unfinished_hosts = waiting_hosts.size() - num_hosts_finished; size_t num_active_hosts = current_active_hosts.size(); + constexpr const char * msg_format = "Watching task {} is executing longer than distributed_ddl_task_timeout (={}) seconds. " + "There are {} unfinished hosts ({} of them are currently active), " + "they are going to execute the query in the background"; + if (throw_on_timeout) + throw Exception(ErrorCodes::TIMEOUT_EXCEEDED, msg_format, + node_path, timeout_seconds, num_unfinished_hosts, num_active_hosts); + + timeout_exceeded = true; + LOG_INFO(log, msg_format, node_path, timeout_seconds, num_unfinished_hosts, num_active_hosts); + + NameSet unfinished_hosts = waiting_hosts; + for (const auto & host_id : finished_hosts) + unfinished_hosts.erase(host_id); - throw Exception(ErrorCodes::TIMEOUT_EXCEEDED, - "Watching task {} is executing longer than distributed_ddl_task_timeout (={}) seconds. " - "There are {} unfinished hosts ({} of them are currently active), they are going to execute the query in background", - node_path, timeout_seconds, num_unfinished_hosts, num_active_hosts);
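The constructor above reduces the four distributed_ddl_output_mode values to two flags; together with the NONE special case in getDistributedDDLStatus(), the mapping can be summarized as follows (a sketch; the enum definition itself lives elsewhere in this changeset, so its values are assumed here):

    #include <stdexcept>

    enum class DistributedDDLOutputMode { NONE, THROW, NULL_STATUS_ON_TIMEOUT, NEVER_THROW };

    struct Behaviour
    {
        bool throw_on_timeout;      /// timeout raises TIMEOUT_EXCEEDED instead of producing NULL rows
        bool throw_on_host_error;   /// per-host failures rethrow instead of appearing as result rows
        bool return_rows;           /// NONE waits but swallows the result set
    };

    Behaviour resolve(DistributedDDLOutputMode mode)
    {
        switch (mode)
        {
            case DistributedDDLOutputMode::THROW:                   return {true, true, true};
            case DistributedDDLOutputMode::NONE:                    return {true, true, false};
            case DistributedDDLOutputMode::NULL_STATUS_ON_TIMEOUT:  return {false, true, true};
            case DistributedDDLOutputMode::NEVER_THROW:             return {false, false, true};
        }
        throw std::logic_error("Unknown output mode");
    }

    int main() { return resolve(DistributedDDLOutputMode::THROW).throw_on_timeout ? 0 : 1; }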
+ /// The query has not finished on the remaining hosts, so fill the corresponding rows with NULLs. + MutableColumns columns = sample.cloneEmptyColumns(); + for (const String & host_id : unfinished_hosts) + { + auto [host, port] = parseHostAndPort(host_id); + size_t num = 0; + columns[num++]->insert(host); + if (by_hostname) + columns[num++]->insert(port); + columns[num++]->insert(Field{}); + columns[num++]->insert(Field{}); + columns[num++]->insert(num_unfinished_hosts); + columns[num++]->insert(num_active_hosts); + } + res = sample.cloneWithColumns(std::move(columns)); + return res; } if (num_hosts_finished != 0 || try_number != 0) @@ -269,26 +346,21 @@ Block DDLQueryStatusInputStream::readImpl() status.tryDeserializeText(status_data); } - String host = host_id; - UInt16 port = 0; - if (by_hostname) - { - auto host_and_port = Cluster::Address::fromString(host_id); - host = host_and_port.first; - port = host_and_port.second; - } + auto [host, port] = parseHostAndPort(host_id); if (status.code != 0 && first_exception == nullptr) first_exception = std::make_unique<Exception>(status.code, "There was an error on [{}:{}]: {}", host, port, status.message); ++num_hosts_finished; - columns[0]->insert(host); - columns[1]->insert(port); - columns[2]->insert(status.code); - columns[3]->insert(status.message); - columns[4]->insert(waiting_hosts.size() - num_hosts_finished); - columns[5]->insert(current_active_hosts.size()); + size_t num = 0; + columns[num++]->insert(host); + if (by_hostname) + columns[num++]->insert(port); + columns[num++]->insert(status.code); + columns[num++]->insert(status.message); + columns[num++]->insert(waiting_hosts.size() - num_hosts_finished); + columns[num++]->insert(current_active_hosts.size()); } res = sample.cloneWithColumns(std::move(columns)); } diff --git a/src/Interpreters/executeDDLQueryOnCluster.h b/src/Interpreters/executeDDLQueryOnCluster.h index 1bcbff3617877faede5ce9edc7b6b0da9203c8da..a33b89d0cb3b8545752e6ad5a25fe873e2e84467 100644 --- a/src/Interpreters/executeDDLQueryOnCluster.h +++ b/src/Interpreters/executeDDLQueryOnCluster.h @@ -24,6 +24,7 @@ BlockIO executeDDLQueryOnCluster(const ASTPtr & query_ptr, const Context & conte BlockIO executeDDLQueryOnCluster(const ASTPtr & query_ptr, const Context & context, const AccessRightsElements & query_requires_access); BlockIO executeDDLQueryOnCluster(const ASTPtr & query_ptr, const Context & context, AccessRightsElements && query_requires_access); +BlockIO getDistributedDDLStatus(const String & node_path, const DDLLogEntry & entry, const Context & context, const std::optional<Strings> & hosts_to_wait = {}); class DDLQueryStatusInputStream final : public IBlockInputStream { @@ -44,6 +45,8 @@ private: Strings getNewAndUpdate(const Strings & current_list_of_finished_hosts); + std::pair<String, UInt16> parseHostAndPort(const String & host_id) const; + String node_path; const Context & context; Stopwatch watch; @@ -62,6 +65,8 @@ private: Int64 timeout_seconds = 120; bool by_hostname = true; + bool throw_on_timeout = true; + bool timeout_exceeded = false; }; } diff --git a/src/Parsers/ASTCreateQuery.cpp b/src/Parsers/ASTCreateQuery.cpp index 2af0d2d4a4561e8e021a9a88dc06c535fa12d334..1192fcc6ebd8aa061c384b01017ba90dc01f561e 100644 --- a/src/Parsers/ASTCreateQuery.cpp +++ b/src/Parsers/ASTCreateQuery.cpp @@ -297,12 +297,20 @@ void ASTCreateQuery::formatQueryImpl(const FormatSettings & settings, FormatStat if (to_table_id) { + assert(is_materialized_view && to_inner_uuid == UUIDHelpers::Nil); settings.ostr << (settings.hilite ? hilite_keyword : "") << " TO " << (settings.hilite ? hilite_none : "") << (!to_table_id.database_name.empty() ?
backQuoteIfNeed(to_table_id.database_name) + "." : "") << backQuoteIfNeed(to_table_id.table_name); } + if (to_inner_uuid != UUIDHelpers::Nil) + { + assert(is_materialized_view && !to_table_id); + settings.ostr << (settings.hilite ? hilite_keyword : "") << " TO INNER UUID " << (settings.hilite ? hilite_none : "") + << quoteString(toString(to_inner_uuid)); + } + if (!as_table.empty()) { settings.ostr diff --git a/src/Parsers/ASTCreateQuery.h b/src/Parsers/ASTCreateQuery.h index c9a6251cb948624f12fba5c2498e642391347eb3..d6d5c22240c302cec1806e16e49653c9d28c3cfc 100644 --- a/src/Parsers/ASTCreateQuery.h +++ b/src/Parsers/ASTCreateQuery.h @@ -66,6 +66,7 @@ public: ASTExpressionList * tables = nullptr; StorageID to_table_id = StorageID::createEmpty(); /// For CREATE MATERIALIZED VIEW mv TO table. + UUID to_inner_uuid = UUIDHelpers::Nil; /// For materialized view with inner table ASTStorage * storage = nullptr; String as_database; String as_table; diff --git a/src/Parsers/ASTFunction.cpp b/src/Parsers/ASTFunction.cpp index d5a39ca505ef2d9ecea0b5df13b8a0ffa14b8b5f..3cb2e8bfa370e1bae3a51d4633dfeb58581fdb89 100644 --- a/src/Parsers/ASTFunction.cpp +++ b/src/Parsers/ASTFunction.cpp @@ -214,27 +214,81 @@ void ASTFunction::formatImplWithoutAlias(const FormatSettings & settings, Format for (const char ** func = operators; *func; func += 2) { - if (0 == strcmp(name.c_str(), func[0])) + if (strcmp(name.c_str(), func[0]) != 0) { - if (frame.need_parens) - settings.ostr << '('; + continue; + } - settings.ostr << (settings.hilite ? hilite_operator : "") << func[1] << (settings.hilite ? hilite_none : ""); + const auto * literal = arguments->children[0]->as<ASTLiteral>(); + /* A particularly stupid case. If we have a unary minus before + * a literal that is a negative number "-(-1)" or "- -1", this + * cannot be formatted as `--1`, since this will be + * interpreted as a comment. Instead, negate the literal + * in place. Another possible solution is to use parentheses, + * but the old comment said it is impossible, without mentioning + * the reason. + */ + if (literal && name == "negate") + { + written = applyVisitor( + [&settings](const auto & value) + // -INT_MAX is negated to -INT_MAX by the negate() + // function, so we can implement this behavior here as + // well. Technically it is UB to perform such negation + // w/o a cast to unsigned type. + NO_SANITIZE_UNDEFINED + { + using ValueType = std::decay_t<decltype(value)>; + if constexpr (isDecimalField<ValueType>()) + { + // The parser doesn't create decimal literals, but + // they can be produced by constant folding or the + // fuzzer. + const auto int_value = value.getValue().value; + // We compare to zero so we don't care about scale. + if (int_value >= 0) + { + return false; + } + + settings.ostr << ValueType{-int_value, + value.getScale()}; + return true; + } + else if constexpr (std::is_arithmetic_v<ValueType>) + { + if (value >= 0) + { + return false; + } + // We don't need parentheses around a single + // literal. + settings.ostr << -value; + return true; + } + + return false; + }, + literal->value); + + if (written) + { + break; + } + } - /** A particularly stupid case. If we have a unary minus before a literal that is a negative number - * "-(-1)" or "- -1", this can not be formatted as `--1`, since this will be interpreted as a comment. - * Instead, add a space. - * PS. You can not just ask to add parentheses - see formatImpl for ASTLiteral. - */ - if (name == "negate" && arguments->children[0]->as<ASTLiteral>()) - settings.ostr << ' ';
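The point of the negate() folding above, in one runnable toy (plain long long instead of Field and Decimal): a naive '-' + literal rendering of negate(-1) would produce "--1", which SQL reads as a line comment, so the sign is folded into the literal instead:

    #include <iostream>
    #include <string>

    /// Toy model of the folding: when negate() wraps an integer literal,
    /// fold the sign into the literal instead of emitting '-' twice.
    std::string formatNegate(long long literal)
    {
        if (literal < 0)
            return std::to_string(-literal);    /// negate(-1) -> "1", never "--1"
        return "-" + std::to_string(literal);   /// negate(1)  -> "-1"
    }

    int main()
    {
        std::cout << formatNegate(1) << '\n';   // -1
        std::cout << formatNegate(-1) << '\n';  // 1
    }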
+ // We don't need parentheses around a single literal. + if (!literal && frame.need_parens) + settings.ostr << '('; - arguments->formatImpl(settings, state, nested_need_parens); - written = true; + settings.ostr << (settings.hilite ? hilite_operator : "") << func[1] << (settings.hilite ? hilite_none : ""); - if (frame.need_parens) - settings.ostr << ')'; - } + arguments->formatImpl(settings, state, nested_need_parens); + written = true; + + if (!literal && frame.need_parens) + settings.ostr << ')'; + + break; } } diff --git a/src/Parsers/ParserCreateQuery.cpp b/src/Parsers/ParserCreateQuery.cpp index 4cef79fdf42b01f6b6da2bfc5c6f77e546f67945..bfd51b7633d3763ee54c7ead8aa55436215cf790 100644 --- a/src/Parsers/ParserCreateQuery.cpp +++ b/src/Parsers/ParserCreateQuery.cpp @@ -780,6 +780,7 @@ bool ParserCreateViewQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expec ASTPtr table; ASTPtr to_table; + ASTPtr to_inner_uuid; ASTPtr columns_list; ASTPtr storage; ASTPtr as_database; @@ -830,9 +831,16 @@ bool ParserCreateViewQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expec return false; } - // TO [db.]table - if (ParserKeyword{"TO"}.ignore(pos, expected)) + + if (ParserKeyword{"TO INNER UUID"}.ignore(pos, expected)) + { + ParserLiteral literal_p; + if (!literal_p.parse(pos, to_inner_uuid, expected)) + return false; + } + else if (ParserKeyword{"TO"}.ignore(pos, expected)) { + // TO [db.]table if (!table_name_p.parse(pos, to_table, expected)) return false; } @@ -883,6 +891,8 @@ bool ParserCreateViewQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expec if (to_table) query->to_table_id = getTableIdentifier(to_table); + if (to_inner_uuid) + query->to_inner_uuid = parseFromString<UUID>(to_inner_uuid->as<ASTLiteral>()->value.get<String>()); query->set(query->columns_list, columns_list); query->set(query->storage, storage); diff --git a/src/Processors/Formats/IRowInputFormat.cpp b/src/Processors/Formats/IRowInputFormat.cpp index 79090ae2b89c146abb23331ccce20965a89c44c7..75a9abf6845e26a9552f1fb44a3737142ef0be74 100644 --- a/src/Processors/Formats/IRowInputFormat.cpp +++ b/src/Processors/Formats/IRowInputFormat.cpp @@ -39,6 +39,16 @@ bool isParseError(int code) || code == ErrorCodes::INCORRECT_DATA; /// For some ReadHelpers } +IRowInputFormat::IRowInputFormat(Block header, ReadBuffer & in_, Params params_) + : IInputFormat(std::move(header), in_), params(params_) +{ + const auto & port_header = getPort().getHeader(); + size_t num_columns = port_header.columns(); + serializations.resize(num_columns); + for (size_t i = 0; i < num_columns; ++i) + serializations[i] = port_header.getByPosition(i).type->getDefaultSerialization(); +} + Chunk IRowInputFormat::generate() { diff --git a/src/Processors/Formats/IRowInputFormat.h b/src/Processors/Formats/IRowInputFormat.h index b78637040626821ae2ee53fed20ddc7d516a963c..c802bd3066b6c67fde3e548de404f534bb7924bf 100644 --- a/src/Processors/Formats/IRowInputFormat.h +++ b/src/Processors/Formats/IRowInputFormat.h @@ -40,13 +40,7 @@ class IRowInputFormat : public IInputFormat public: using Params = RowInputFormatParams; - IRowInputFormat( - Block header, - ReadBuffer & in_, - Params params_) - : IInputFormat(std::move(header), in_), params(params_) - { - } + IRowInputFormat(Block header, ReadBuffer & in_, Params params_); Chunk generate() override; @@ -76,6 +70,8 @@ protected: size_t getTotalRows() const { return total_rows; } + Serializations serializations; + private: Params params; diff --git a/src/Processors/Formats/IRowOutputFormat.cpp b/src/Processors/Formats/IRowOutputFormat.cpp index
f5f01643f4ea57c688dce7876f115fa1ac5326ce..b714844feeaf1de15eff303ffe3611e2d1df1127 100644 --- a/src/Processors/Formats/IRowOutputFormat.cpp +++ b/src/Processors/Formats/IRowOutputFormat.cpp @@ -10,6 +10,16 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } +IRowOutputFormat::IRowOutputFormat(const Block & header, WriteBuffer & out_, const Params & params_) + : IOutputFormat(header, out_) + , types(header.getDataTypes()) + , params(params_) +{ + serializations.reserve(types.size()); + for (const auto & type : types) + serializations.push_back(type->getDefaultSerialization()); +} + void IRowOutputFormat::consume(DB::Chunk chunk) { writePrefixIfNot(); @@ -82,7 +92,7 @@ void IRowOutputFormat::write(const Columns & columns, size_t row_num) if (i != 0) writeFieldDelimiter(); - writeField(*columns[i], *types[i], row_num); + writeField(*columns[i], *serializations[i], row_num); } writeRowEndDelimiter(); diff --git a/src/Processors/Formats/IRowOutputFormat.h b/src/Processors/Formats/IRowOutputFormat.h index 4fb94f7b7f77419b878b0d1304a800e1cf4b9d22..c35d93b6133ac2af1570208dcfc8f52fc25f128c 100644 --- a/src/Processors/Formats/IRowOutputFormat.h +++ b/src/Processors/Formats/IRowOutputFormat.h @@ -25,6 +25,7 @@ class IRowOutputFormat : public IOutputFormat { protected: DataTypes types; + Serializations serializations; bool first_row = true; void consume(Chunk chunk) override; @@ -35,10 +36,7 @@ protected: public: using Params = RowOutputFormatParams; - IRowOutputFormat(const Block & header, WriteBuffer & out_, const Params & params_) - : IOutputFormat(header, out_), types(header.getDataTypes()), params(params_) - { - } + IRowOutputFormat(const Block & header, WriteBuffer & out_, const Params & params_); /** Write a row. * Default implementation calls methods to write single values and delimiters @@ -50,7 +48,7 @@ public: virtual void writeTotals(const Columns & columns, size_t row_num); /** Write single value. */ - virtual void writeField(const IColumn & column, const IDataType & type, size_t row_num) = 0; + virtual void writeField(const IColumn & column, const ISerialization & serialization, size_t row_num) = 0; /** Write delimiter. 
*/ virtual void writeFieldDelimiter() {} /// delimiter between values diff --git a/src/Processors/Formats/Impl/AvroRowOutputFormat.h b/src/Processors/Formats/Impl/AvroRowOutputFormat.h index 08370154d9a735d34f45db99a9344d6a05ca8f60..8d0581d330734eb40995722a9707abe8a7b028e6 100644 --- a/src/Processors/Formats/Impl/AvroRowOutputFormat.h +++ b/src/Processors/Formats/Impl/AvroRowOutputFormat.h @@ -48,7 +48,7 @@ public: String getName() const override { return "AvroRowOutputFormat"; } void write(const Columns & columns, size_t row_num) override; - void writeField(const IColumn &, const IDataType &, size_t) override {} + void writeField(const IColumn &, const ISerialization &, size_t) override {} virtual void writePrefix() override; virtual void writeSuffix() override; diff --git a/src/Processors/Formats/Impl/BinaryRowInputFormat.cpp b/src/Processors/Formats/Impl/BinaryRowInputFormat.cpp index f49f521d4748a5d61cb9f609f502f552d4e3f254..36b57e242d73f1961389e4265200d4dea4b83a7b 100644 --- a/src/Processors/Formats/Impl/BinaryRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/BinaryRowInputFormat.cpp @@ -20,7 +20,7 @@ bool BinaryRowInputFormat::readRow(MutableColumns & columns, RowReadExtension &) size_t num_columns = columns.size(); for (size_t i = 0; i < num_columns; ++i) - getPort().getHeader().getByPosition(i).type->deserializeBinary(*columns[i], in); + serializations[i]->deserializeBinary(*columns[i], in); return true; } diff --git a/src/Processors/Formats/Impl/BinaryRowOutputFormat.cpp b/src/Processors/Formats/Impl/BinaryRowOutputFormat.cpp index d74a0a075fe668fd447b28e2247dcd31f79a1d09..424eb375fa3551b35bfb92d1a9870bd5cd17ed19 100644 --- a/src/Processors/Formats/Impl/BinaryRowOutputFormat.cpp +++ b/src/Processors/Formats/Impl/BinaryRowOutputFormat.cpp @@ -41,9 +41,9 @@ void BinaryRowOutputFormat::writePrefix() } } -void BinaryRowOutputFormat::writeField(const IColumn & column, const IDataType & type, size_t row_num) +void BinaryRowOutputFormat::writeField(const IColumn & column, const ISerialization & serialization, size_t row_num) { - type.serializeBinary(column, row_num, out); + serialization.serializeBinary(column, row_num, out); } diff --git a/src/Processors/Formats/Impl/BinaryRowOutputFormat.h b/src/Processors/Formats/Impl/BinaryRowOutputFormat.h index 562ed7b18aa85dee247400872058947b410f88d9..36a62098b7578b31285bce9609382fc1ee84d879 100644 --- a/src/Processors/Formats/Impl/BinaryRowOutputFormat.h +++ b/src/Processors/Formats/Impl/BinaryRowOutputFormat.h @@ -21,7 +21,7 @@ public: String getName() const override { return "BinaryRowOutputFormat"; } - void writeField(const IColumn & column, const IDataType & type, size_t row_num) override; + void writeField(const IColumn & column, const ISerialization & serialization, size_t row_num) override; void writePrefix() override; String getContentType() const override { return "application/octet-stream"; } diff --git a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp index 4cec07f38dc55b3a5142eb25b1ed4ad745b91338..00381ab96d095371a37924770e679d1637b30a07 100644 --- a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp @@ -4,7 +4,7 @@ #include #include #include -#include +#include #include @@ -230,7 +230,9 @@ bool CSVRowInputFormat::readRow(MutableColumns & columns, RowReadExtension & ext if (table_column) { skipWhitespacesAndTabs(in); - ext.read_columns[*table_column] = readField(*columns[*table_column], data_types[*table_column], 
is_last_file_column); + ext.read_columns[*table_column] = readField(*columns[*table_column], data_types[*table_column], + serializations[*table_column], is_last_file_column); + if (!ext.read_columns[*table_column]) have_default_columns = true; skipWhitespacesAndTabs(in); @@ -360,10 +362,11 @@ void CSVRowInputFormat::syncAfterError() void CSVRowInputFormat::tryDeserializeField(const DataTypePtr & type, IColumn & column, size_t file_column) { - if (column_mapping->column_indexes_for_input_fields[file_column]) + const auto & index = column_mapping->column_indexes_for_input_fields[file_column]; + if (index) { const bool is_last_file_column = file_column + 1 == column_mapping->column_indexes_for_input_fields.size(); - readField(column, type, is_last_file_column); + readField(column, type, serializations[*index], is_last_file_column); } else { @@ -372,7 +375,7 @@ void CSVRowInputFormat::tryDeserializeField(const DataTypePtr & type, IColumn & } } -bool CSVRowInputFormat::readField(IColumn & column, const DataTypePtr & type, bool is_last_file_column) +bool CSVRowInputFormat::readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool is_last_file_column) { const bool at_delimiter = !in.eof() && *in.position() == format_settings.csv.delimiter; const bool at_last_column_line_end = is_last_file_column @@ -395,12 +398,12 @@ bool CSVRowInputFormat::readField(IColumn & column, const DataTypePtr & type, bo else if (format_settings.null_as_default && !type->isNullable()) { /// If value is null but type is not nullable then use default value instead. - return DataTypeNullable::deserializeTextCSV(column, in, format_settings, type); + return SerializationNullable::deserializeTextCSVImpl(column, in, format_settings, serialization); } else { /// Read the column normally. 
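Condensed, the branch structure of readField() here (ignoring is_last_file_column handling and custom CSV null representations) is a three-way decision; a toy model with stand-in types, not the real column interfaces:

    #include <cassert>
    #include <stdexcept>
    #include <string_view>

    /// Toy decision table for CSVRowInputFormat::readField(). Returns true when
    /// a real value was read, false when a default value was inserted instead.
    bool readField(std::string_view field, bool column_is_nullable, bool null_as_default)
    {
        if (field.empty())
            return false;               /// empty unquoted field at a delimiter -> default value
        if (field == "\\N" && !column_is_nullable)
        {
            if (!null_as_default)
                throw std::runtime_error("NULL for a non-Nullable column");
            return false;               /// SerializationNullable::deserializeTextCSVImpl() path
        }
        return true;                    /// normal path: serialization->deserializeTextCSV(...)
    }

    int main()
    {
        assert(!readField("", false, false));       /// default inserted
        assert(!readField("\\N", false, true));     /// NULL turned into the default
        assert(readField("42", false, false));      /// real value read
    }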
- type->deserializeAsTextCSV(column, in, format_settings); + serialization->deserializeTextCSV(column, in, format_settings); return true; } } diff --git a/src/Processors/Formats/Impl/CSVRowInputFormat.h b/src/Processors/Formats/Impl/CSVRowInputFormat.h index 86e41cf0a43b1362ce51df939218368bb62d8034..230acc5126866623d738e7227a9b1fdce9c8f86b 100644 --- a/src/Processors/Formats/Impl/CSVRowInputFormat.h +++ b/src/Processors/Formats/Impl/CSVRowInputFormat.h @@ -52,7 +52,7 @@ private: return *pos != '\n' && *pos != '\r' && *pos != format_settings.csv.delimiter && *pos != ' ' && *pos != '\t'; } - bool readField(IColumn & column, const DataTypePtr & type, bool is_last_file_column); + bool readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool is_last_file_column); }; } diff --git a/src/Processors/Formats/Impl/CSVRowOutputFormat.cpp b/src/Processors/Formats/Impl/CSVRowOutputFormat.cpp index 90fc768d311ea9aea09b61395a75b09ba88dce4b..b9945ddec15c1c426147e8a7ebe357f48f7b9cbb 100644 --- a/src/Processors/Formats/Impl/CSVRowOutputFormat.cpp +++ b/src/Processors/Formats/Impl/CSVRowOutputFormat.cpp @@ -40,9 +40,9 @@ void CSVRowOutputFormat::doWritePrefix() } -void CSVRowOutputFormat::writeField(const IColumn & column, const IDataType & type, size_t row_num) +void CSVRowOutputFormat::writeField(const IColumn & column, const ISerialization & serialization, size_t row_num) { - type.serializeAsTextCSV(column, row_num, out, format_settings); + serialization.serializeTextCSV(column, row_num, out, format_settings); } diff --git a/src/Processors/Formats/Impl/CSVRowOutputFormat.h b/src/Processors/Formats/Impl/CSVRowOutputFormat.h index 55803aeb53e2b37e1763a52dc7edddb6c0445f83..780a6c4d3ce5b7eb7c405f13e4777bac22c7c235 100644 --- a/src/Processors/Formats/Impl/CSVRowOutputFormat.h +++ b/src/Processors/Formats/Impl/CSVRowOutputFormat.h @@ -24,7 +24,7 @@ public: String getName() const override { return "CSVRowOutputFormat"; } - void writeField(const IColumn & column, const IDataType & type, size_t row_num) override; + void writeField(const IColumn & column, const ISerialization & serialization, size_t row_num) override; void writeFieldDelimiter() override; void writeRowEndDelimiter() override; void writeBeforeTotals() override; diff --git a/src/Processors/Formats/Impl/ConstantExpressionTemplate.cpp b/src/Processors/Formats/Impl/ConstantExpressionTemplate.cpp index 3e870b409d30ad6b8e94afece6ae26af8cf5562d..caf57ded8b7104934e189ec337bb53bc3519ba3e 100644 --- a/src/Processors/Formats/Impl/ConstantExpressionTemplate.cpp +++ b/src/Processors/Formats/Impl/ConstantExpressionTemplate.cpp @@ -305,6 +305,7 @@ ConstantExpressionTemplate::TemplateStructure::TemplateStructure(LiteralsInfo & /// Make sequence of tokens and determine IDataType by Field::Types::Which for each literal.
token_after_literal_idx.reserve(replaced_literals.size()); special_parser.resize(replaced_literals.size()); + serializations.resize(replaced_literals.size()); TokenIterator prev_end = expression_begin; for (size_t i = 0; i < replaced_literals.size(); ++i) @@ -325,6 +326,8 @@ ConstantExpressionTemplate::TemplateStructure::TemplateStructure(LiteralsInfo & literals.insert({nullptr, info.type, info.dummy_column_name}); prev_end = info.literal->end.value(); + + serializations[i] = info.type->getDefaultSerialization(); } while (prev_end < expression_end) @@ -458,7 +461,7 @@ bool ConstantExpressionTemplate::tryParseExpression(ReadBuffer & istr, const For return false; } else - type->deserializeAsTextQuoted(*columns[cur_column], istr, format_settings); + structure->serializations[cur_column]->deserializeTextQuoted(*columns[cur_column], istr, format_settings); ++cur_column; } diff --git a/src/Processors/Formats/Impl/ConstantExpressionTemplate.h b/src/Processors/Formats/Impl/ConstantExpressionTemplate.h index 299ce4c9925ed50b355424495048672625a31d72..4317cf4a3da1fcc66104285fbfbe965bf7a4f85b 100644 --- a/src/Processors/Formats/Impl/ConstantExpressionTemplate.h +++ b/src/Processors/Formats/Impl/ConstantExpressionTemplate.h @@ -36,6 +36,7 @@ class ConstantExpressionTemplate : boost::noncopyable Block literals; ExpressionActionsPtr actions_on_literals; + Serializations serializations; std::vector special_parser; bool null_as_default; diff --git a/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.cpp b/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.cpp index 1fc5041b1f3cc3129c4c599727e5b3abaf8bfcf0..682a4fbf69aefff8eb62facc4bfd4d67838c4b7f 100644 --- a/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.cpp @@ -4,7 +4,7 @@ #include #include #include -#include +#include namespace DB { @@ -202,6 +202,7 @@ void JSONCompactEachRowRowInputFormat::readField(size_t index, MutableColumns & { read_columns[index] = true; const auto & type = data_types[index]; + const auto & serialization = serializations[index]; if (yield_strings) { @@ -211,16 +212,16 @@ void JSONCompactEachRowRowInputFormat::readField(size_t index, MutableColumns & ReadBufferFromString buf(str); if (format_settings.null_as_default && !type->isNullable()) - read_columns[index] = DataTypeNullable::deserializeWholeText(*columns[index], buf, format_settings, type); + read_columns[index] = SerializationNullable::deserializeWholeTextImpl(*columns[index], buf, format_settings, serialization); else - type->deserializeAsWholeText(*columns[index], buf, format_settings); + serialization->deserializeWholeText(*columns[index], buf, format_settings); } else { if (format_settings.null_as_default && !type->isNullable()) - read_columns[index] = DataTypeNullable::deserializeTextJSON(*columns[index], in, format_settings, type); + read_columns[index] = SerializationNullable::deserializeTextJSONImpl(*columns[index], in, format_settings, serialization); else - type->deserializeAsTextJSON(*columns[index], in, format_settings); + serialization->deserializeTextJSON(*columns[index], in, format_settings); } } catch (Exception & e) diff --git a/src/Processors/Formats/Impl/JSONCompactEachRowRowOutputFormat.cpp b/src/Processors/Formats/Impl/JSONCompactEachRowRowOutputFormat.cpp index 11134499984638ecae33344daeefb6f1374f4ea5..a3055873c0170de3de4f83438ebf9c2496eaa9f2 100644 --- a/src/Processors/Formats/Impl/JSONCompactEachRowRowOutputFormat.cpp +++ 
b/src/Processors/Formats/Impl/JSONCompactEachRowRowOutputFormat.cpp @@ -22,17 +22,17 @@ JSONCompactEachRowRowOutputFormat::JSONCompactEachRowRowOutputFormat(WriteBuffer } -void JSONCompactEachRowRowOutputFormat::writeField(const IColumn & column, const IDataType & type, size_t row_num) +void JSONCompactEachRowRowOutputFormat::writeField(const IColumn & column, const ISerialization & serialization, size_t row_num) { if (yield_strings) { WriteBufferFromOwnString buf; - type.serializeAsText(column, row_num, buf, settings); + serialization.serializeText(column, row_num, buf, settings); writeJSONString(buf.str(), out, settings); } else - type.serializeAsTextJSON(column, row_num, out, settings); + serialization.serializeTextJSON(column, row_num, out, settings); } @@ -63,7 +63,7 @@ void JSONCompactEachRowRowOutputFormat::writeTotals(const Columns & columns, siz if (i != 0) JSONCompactEachRowRowOutputFormat::writeFieldDelimiter(); - JSONCompactEachRowRowOutputFormat::writeField(*columns[i], *types[i], row_num); + JSONCompactEachRowRowOutputFormat::writeField(*columns[i], *serializations[i], row_num); } writeCString("]\n", out); } diff --git a/src/Processors/Formats/Impl/JSONCompactEachRowRowOutputFormat.h b/src/Processors/Formats/Impl/JSONCompactEachRowRowOutputFormat.h index 3d4b80247b82fe6120fece5773f36f21544e1209..792eb906f4bd5c5c2581435a8170e8f5ddf08884 100644 --- a/src/Processors/Formats/Impl/JSONCompactEachRowRowOutputFormat.h +++ b/src/Processors/Formats/Impl/JSONCompactEachRowRowOutputFormat.h @@ -31,7 +31,7 @@ public: void writeTotals(const Columns & columns, size_t row_num) override; void writeAfterTotals() override {} - void writeField(const IColumn & column, const IDataType & type, size_t row_num) override; + void writeField(const IColumn & column, const ISerialization & serialization, size_t row_num) override; void writeFieldDelimiter() override; void writeRowStartDelimiter() override; void writeRowEndDelimiter() override; diff --git a/src/Processors/Formats/Impl/JSONCompactRowOutputFormat.cpp b/src/Processors/Formats/Impl/JSONCompactRowOutputFormat.cpp index 97304afbebd9bcfae677b791dc9d24621b055eb7..cefaded6912ba3df0331d44b29fa14dea714cd97 100644 --- a/src/Processors/Formats/Impl/JSONCompactRowOutputFormat.cpp +++ b/src/Processors/Formats/Impl/JSONCompactRowOutputFormat.cpp @@ -18,17 +18,17 @@ JSONCompactRowOutputFormat::JSONCompactRowOutputFormat( } -void JSONCompactRowOutputFormat::writeField(const IColumn & column, const IDataType & type, size_t row_num) +void JSONCompactRowOutputFormat::writeField(const IColumn & column, const ISerialization & serialization, size_t row_num) { if (yield_strings) { WriteBufferFromOwnString buf; - type.serializeAsText(column, row_num, buf, settings); + serialization.serializeText(column, row_num, buf, settings); writeJSONString(buf.str(), *ostr, settings); } else - type.serializeAsTextJSON(column, row_num, *ostr, settings); + serialization.serializeTextJSON(column, row_num, *ostr, settings); ++field_number; } @@ -82,7 +82,7 @@ void JSONCompactRowOutputFormat::writeExtremesElement(const char * title, const if (i != 0) writeTotalsFieldDelimiter(); - writeField(*columns[i], *types[i], row_num); + writeField(*columns[i], *serializations[i], row_num); } writeChar(']', *ostr); diff --git a/src/Processors/Formats/Impl/JSONCompactRowOutputFormat.h b/src/Processors/Formats/Impl/JSONCompactRowOutputFormat.h index 71ba3579837b7a9cac4bebfd2f9664660d1bd50a..9bb433c50b13fa488ac404bc68e58783bedeeb6e 100644 --- 
a/src/Processors/Formats/Impl/JSONCompactRowOutputFormat.h +++ b/src/Processors/Formats/Impl/JSONCompactRowOutputFormat.h @@ -25,7 +25,7 @@ public: String getName() const override { return "JSONCompactRowOutputFormat"; } - void writeField(const IColumn & column, const IDataType & type, size_t row_num) override; + void writeField(const IColumn & column, const ISerialization & serialization, size_t row_num) override; void writeFieldDelimiter() override; void writeRowStartDelimiter() override; void writeRowEndDelimiter() override; @@ -36,9 +36,9 @@ public: protected: void writeExtremesElement(const char * title, const Columns & columns, size_t row_num) override; - void writeTotalsField(const IColumn & column, const IDataType & type, size_t row_num) override + void writeTotalsField(const IColumn & column, const ISerialization & serialization, size_t row_num) override { - return writeField(column, type, row_num); + return writeField(column, serialization, row_num); } void writeTotalsFieldDelimiter() override; diff --git a/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp b/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp index 720b606be4f89acc48ed44bad2abc8faba28bdd6..e0f6514295b26f7e83dfc503c6a7cab450060bee 100644 --- a/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp @@ -5,7 +5,7 @@ #include #include #include -#include +#include namespace DB { @@ -140,6 +140,7 @@ void JSONEachRowRowInputFormat::readField(size_t index, MutableColumns & columns { seen_columns[index] = read_columns[index] = true; const auto & type = getPort().getHeader().getByPosition(index).type; + const auto & serialization = serializations[index]; if (yield_strings) { @@ -149,16 +150,16 @@ void JSONEachRowRowInputFormat::readField(size_t index, MutableColumns & columns ReadBufferFromString buf(str); if (format_settings.null_as_default && !type->isNullable()) - read_columns[index] = DataTypeNullable::deserializeWholeText(*columns[index], buf, format_settings, type); + read_columns[index] = SerializationNullable::deserializeWholeTextImpl(*columns[index], buf, format_settings, serialization); else - type->deserializeAsWholeText(*columns[index], buf, format_settings); + serialization->deserializeWholeText(*columns[index], buf, format_settings); } else { if (format_settings.null_as_default && !type->isNullable()) - read_columns[index] = DataTypeNullable::deserializeTextJSON(*columns[index], in, format_settings, type); + read_columns[index] = SerializationNullable::deserializeTextJSONImpl(*columns[index], in, format_settings, serialization); else - type->deserializeAsTextJSON(*columns[index], in, format_settings); + serialization->deserializeTextJSON(*columns[index], in, format_settings); } } catch (Exception & e) diff --git a/src/Processors/Formats/Impl/JSONEachRowRowOutputFormat.cpp b/src/Processors/Formats/Impl/JSONEachRowRowOutputFormat.cpp index 30cd0660682eacd63b7907f1e205f2c3f85fa7b4..a69499de813ffb1eee8b222dac0963a292c9710b 100644 --- a/src/Processors/Formats/Impl/JSONEachRowRowOutputFormat.cpp +++ b/src/Processors/Formats/Impl/JSONEachRowRowOutputFormat.cpp @@ -28,7 +28,7 @@ JSONEachRowRowOutputFormat::JSONEachRowRowOutputFormat( } -void JSONEachRowRowOutputFormat::writeField(const IColumn & column, const IDataType & type, size_t row_num) +void JSONEachRowRowOutputFormat::writeField(const IColumn & column, const ISerialization & serialization, size_t row_num) { writeString(fields[field_number], out); writeChar(':', out); @@ -37,11 
+37,11 @@ void JSONEachRowRowOutputFormat::writeField(const IColumn & column, const IDataT { WriteBufferFromOwnString buf; - type.serializeAsText(column, row_num, buf, settings); + serialization.serializeText(column, row_num, buf, settings); writeJSONString(buf.str(), out, settings); } else - type.serializeAsTextJSON(column, row_num, out, settings); + serialization.serializeTextJSON(column, row_num, out, settings); ++field_number; } diff --git a/src/Processors/Formats/Impl/JSONEachRowRowOutputFormat.h b/src/Processors/Formats/Impl/JSONEachRowRowOutputFormat.h index 38760379056f4c78974b00363c4aed81e8f5a63d..10b15f3e7b264607159e2c73e4fc66a201634632 100644 --- a/src/Processors/Formats/Impl/JSONEachRowRowOutputFormat.h +++ b/src/Processors/Formats/Impl/JSONEachRowRowOutputFormat.h @@ -23,7 +23,7 @@ public: String getName() const override { return "JSONEachRowRowOutputFormat"; } - void writeField(const IColumn & column, const IDataType & type, size_t row_num) override; + void writeField(const IColumn & column, const ISerialization & serialization, size_t row_num) override; void writeFieldDelimiter() override; void writeRowStartDelimiter() override; void writeRowEndDelimiter() override; diff --git a/src/Processors/Formats/Impl/JSONRowOutputFormat.cpp b/src/Processors/Formats/Impl/JSONRowOutputFormat.cpp index 517f126060ff45b08af6e6a38b2a950847ef9710..38c6eefac1c779f8a6793d57bece9b5edf0054ad 100644 --- a/src/Processors/Formats/Impl/JSONRowOutputFormat.cpp +++ b/src/Processors/Formats/Impl/JSONRowOutputFormat.cpp @@ -71,7 +71,7 @@ void JSONRowOutputFormat::writePrefix() } -void JSONRowOutputFormat::writeField(const IColumn & column, const IDataType & type, size_t row_num) +void JSONRowOutputFormat::writeField(const IColumn & column, const ISerialization & serialization, size_t row_num) { writeCString("\t\t\t", *ostr); writeString(fields[field_number].name, *ostr); @@ -81,16 +81,16 @@ void JSONRowOutputFormat::writeField(const IColumn & column, const IDataType & t { WriteBufferFromOwnString buf; - type.serializeAsText(column, row_num, buf, settings); + serialization.serializeText(column, row_num, buf, settings); writeJSONString(buf.str(), *ostr, settings); } else - type.serializeAsTextJSON(column, row_num, *ostr, settings); + serialization.serializeTextJSON(column, row_num, *ostr, settings); ++field_number; } -void JSONRowOutputFormat::writeTotalsField(const IColumn & column, const IDataType & type, size_t row_num) +void JSONRowOutputFormat::writeTotalsField(const IColumn & column, const ISerialization & serialization, size_t row_num) { writeCString("\t\t", *ostr); writeString(fields[field_number].name, *ostr); @@ -100,11 +100,11 @@ void JSONRowOutputFormat::writeTotalsField(const IColumn & column, const IDataTy { WriteBufferFromOwnString buf; - type.serializeAsText(column, row_num, buf, settings); + serialization.serializeText(column, row_num, buf, settings); writeJSONString(buf.str(), *ostr, settings); } else - type.serializeAsTextJSON(column, row_num, *ostr, settings); + serialization.serializeTextJSON(column, row_num, *ostr, settings); ++field_number; } @@ -159,7 +159,7 @@ void JSONRowOutputFormat::writeTotals(const Columns & columns, size_t row_num) if (i != 0) writeTotalsFieldDelimiter(); - writeTotalsField(*columns[i], *types[i], row_num); + writeTotalsField(*columns[i], *serializations[i], row_num); } } @@ -191,7 +191,7 @@ void JSONRowOutputFormat::writeExtremesElement(const char * title, const Columns if (i != 0) writeFieldDelimiter(); - writeField(*columns[i], *types[i], row_num); + 
writeField(*columns[i], *serializations[i], row_num); } writeChar('\n', *ostr); diff --git a/src/Processors/Formats/Impl/JSONRowOutputFormat.h b/src/Processors/Formats/Impl/JSONRowOutputFormat.h index 88b74afbabdf3d641ea13fe90e886a96e8b1f20b..75d4aa5d2018d17bba16f5a050547377da2d5c6c 100644 --- a/src/Processors/Formats/Impl/JSONRowOutputFormat.h +++ b/src/Processors/Formats/Impl/JSONRowOutputFormat.h @@ -25,7 +25,7 @@ public: String getName() const override { return "JSONRowOutputFormat"; } - void writeField(const IColumn & column, const IDataType & type, size_t row_num) override; + void writeField(const IColumn & column, const ISerialization & serialization, size_t row_num) override; void writeFieldDelimiter() override; void writeRowStartDelimiter() override; void writeRowEndDelimiter() override; @@ -63,7 +63,7 @@ public: String getContentType() const override { return "application/json; charset=UTF-8"; } protected: - virtual void writeTotalsField(const IColumn & column, const IDataType & type, size_t row_num); + virtual void writeTotalsField(const IColumn & column, const ISerialization & serialization, size_t row_num); virtual void writeExtremesElement(const char * title, const Columns & columns, size_t row_num); virtual void writeTotalsFieldDelimiter() { writeFieldDelimiter(); } diff --git a/src/Processors/Formats/Impl/MarkdownRowOutputFormat.cpp b/src/Processors/Formats/Impl/MarkdownRowOutputFormat.cpp index 51bba07d995c22a5017326d0e8a6ebb131179743..5108650ff0d347ca4ed7da3bd05e7e85ee493510 100644 --- a/src/Processors/Formats/Impl/MarkdownRowOutputFormat.cpp +++ b/src/Processors/Formats/Impl/MarkdownRowOutputFormat.cpp @@ -50,9 +50,9 @@ void MarkdownRowOutputFormat::writeRowEndDelimiter() writeCString(" |\n", out); } -void MarkdownRowOutputFormat::writeField(const IColumn & column, const IDataType & type, size_t row_num) +void MarkdownRowOutputFormat::writeField(const IColumn & column, const ISerialization & serialization, size_t row_num) { - type.serializeAsTextEscaped(column, row_num, out, format_settings); + serialization.serializeTextEscaped(column, row_num, out, format_settings); } void registerOutputFormatProcessorMarkdown(FormatFactory & factory) diff --git a/src/Processors/Formats/Impl/MarkdownRowOutputFormat.h b/src/Processors/Formats/Impl/MarkdownRowOutputFormat.h index 6bfb763d818d7edc4c94989052bfa3c781152ba5..0b2a4dd0b23065b7266173ed99e8ebbd4a1e6851 100644 --- a/src/Processors/Formats/Impl/MarkdownRowOutputFormat.h +++ b/src/Processors/Formats/Impl/MarkdownRowOutputFormat.h @@ -28,7 +28,7 @@ public: /// Write '|\n' after each row void writeRowEndDelimiter() override ; - void writeField(const IColumn & column, const IDataType & type, size_t row_num) override; + void writeField(const IColumn & column, const ISerialization & serialization, size_t row_num) override; String getName() const override { return "MarkdownRowOutputFormat"; } protected: diff --git a/src/Processors/Formats/Impl/MsgPackRowOutputFormat.h b/src/Processors/Formats/Impl/MsgPackRowOutputFormat.h index b6764ed4a4f403e790c332598a737e8a743dd7b2..9c66bb9d207c13ec1968064aa9cb0525891c18b9 100644 --- a/src/Processors/Formats/Impl/MsgPackRowOutputFormat.h +++ b/src/Processors/Formats/Impl/MsgPackRowOutputFormat.h @@ -25,7 +25,7 @@ public: String getName() const override { return "MsgPackRowOutputFormat"; } void write(const Columns & columns, size_t row_num) override; - void writeField(const IColumn &, const IDataType &, size_t) override {} + void writeField(const IColumn &, const ISerialization &, size_t) override {} 
void serializeField(const IColumn & column, DataTypePtr data_type, size_t row_num); private: diff --git a/src/Processors/Formats/Impl/MySQLOutputFormat.cpp b/src/Processors/Formats/Impl/MySQLOutputFormat.cpp index f40261b45617ea4aef1bb12b39ac398373b55492..9733d479a77b14aff561ae0d4e0288487a67a667 100644 --- a/src/Processors/Formats/Impl/MySQLOutputFormat.cpp +++ b/src/Processors/Formats/Impl/MySQLOutputFormat.cpp @@ -26,6 +26,10 @@ void MySQLOutputFormat::initialize() const auto & header = getPort(PortKind::Main).getHeader(); data_types = header.getDataTypes(); + serializations.reserve(data_types.size()); + for (const auto & type : data_types) + serializations.emplace_back(type->getDefaultSerialization()); + if (header.columns()) { packet_endpoint->sendPacket(LengthEncodedNumber(header.columns())); @@ -51,7 +55,7 @@ void MySQLOutputFormat::consume(Chunk chunk) for (size_t i = 0; i < chunk.getNumRows(); i++) { - ProtocolText::ResultSetRow row_packet(data_types, chunk.getColumns(), i); + ProtocolText::ResultSetRow row_packet(serializations, chunk.getColumns(), i); packet_endpoint->sendPacket(row_packet); } } diff --git a/src/Processors/Formats/Impl/MySQLOutputFormat.h b/src/Processors/Formats/Impl/MySQLOutputFormat.h index c030067524004229d64bce628157f7454ba5dbfa..c47bbaadc33b49a270bce4aa21bab011001b5031 100644 --- a/src/Processors/Formats/Impl/MySQLOutputFormat.h +++ b/src/Processors/Formats/Impl/MySQLOutputFormat.h @@ -47,6 +47,7 @@ private: std::unique_ptr packet_endpoint; FormatSettings format_settings; DataTypes data_types; + Serializations serializations; }; } diff --git a/src/Processors/Formats/Impl/ODBCDriver2BlockOutputFormat.cpp b/src/Processors/Formats/Impl/ODBCDriver2BlockOutputFormat.cpp index 3dd72a7a5c79f5faf19c15b5c6956537db757891..7a14966e220721f6bd388100f4e294025153df3a 100644 --- a/src/Processors/Formats/Impl/ODBCDriver2BlockOutputFormat.cpp +++ b/src/Processors/Formats/Impl/ODBCDriver2BlockOutputFormat.cpp @@ -23,7 +23,7 @@ static void writeODBCString(WriteBuffer & out, const std::string & str) out.write(str.data(), str.size()); } -void ODBCDriver2BlockOutputFormat::writeRow(const Block & header, const Columns & columns, size_t row_idx, std::string & buffer) +void ODBCDriver2BlockOutputFormat::writeRow(const Serializations & serializations, const Columns & columns, size_t row_idx, std::string & buffer) { size_t num_columns = columns.size(); for (size_t column_idx = 0; column_idx < num_columns; ++column_idx) @@ -39,7 +39,7 @@ void ODBCDriver2BlockOutputFormat::writeRow(const Block & header, const Columns { { WriteBufferFromString text_out(buffer); - header.getByPosition(column_idx).type->serializeAsText(*column, row_idx, text_out, format_settings); + serializations[column_idx]->serializeText(*column, row_idx, text_out, format_settings); } writeODBCString(out, buffer); } @@ -51,9 +51,15 @@ void ODBCDriver2BlockOutputFormat::write(Chunk chunk, PortKind port_kind) String text_value; const auto & header = getPort(port_kind).getHeader(); const auto & columns = chunk.getColumns(); + + size_t num_columns = columns.size(); + Serializations serializations(num_columns); + for (size_t i = 0; i < num_columns; ++i) + serializations[i] = header.getByPosition(i).type->getDefaultSerialization(); + const size_t rows = chunk.getNumRows(); for (size_t i = 0; i < rows; ++i) - writeRow(header, columns, i, text_value); + writeRow(serializations, columns, i, text_value); } void ODBCDriver2BlockOutputFormat::consume(Chunk chunk) diff --git 
a/src/Processors/Formats/Impl/ODBCDriver2BlockOutputFormat.h b/src/Processors/Formats/Impl/ODBCDriver2BlockOutputFormat.h index 2a20c7a061921226d3490a3a61deda40e3a626f5..4545e429cc24429efae670306639220f50a6db7d 100644 --- a/src/Processors/Formats/Impl/ODBCDriver2BlockOutputFormat.h +++ b/src/Processors/Formats/Impl/ODBCDriver2BlockOutputFormat.h @@ -45,7 +45,7 @@ private: prefix_written = true; } - void writeRow(const Block & header, const Columns & columns, size_t row_idx, std::string & buffer); + void writeRow(const Serializations & serializations, const Columns & columns, size_t row_idx, std::string & buffer); void write(Chunk chunk, PortKind port_kind); void writePrefix(); }; diff --git a/src/Processors/Formats/Impl/PostgreSQLOutputFormat.cpp b/src/Processors/Formats/Impl/PostgreSQLOutputFormat.cpp index 50b3def929e34e938cfcfccca62a9259e184b834..8c4da279fc5c3e7283b0869666f11cd85f87b0aa 100644 --- a/src/Processors/Formats/Impl/PostgreSQLOutputFormat.cpp +++ b/src/Processors/Formats/Impl/PostgreSQLOutputFormat.cpp @@ -18,7 +18,7 @@ void PostgreSQLOutputFormat::doWritePrefix() initialized = true; const auto & header = getPort(PortKind::Main).getHeader(); - data_types = header.getDataTypes(); + auto data_types = header.getDataTypes(); if (header.columns()) { @@ -29,6 +29,7 @@ void PostgreSQLOutputFormat::doWritePrefix() { const auto & column_name = header.getColumnsWithTypeAndName()[i].name; columns.emplace_back(column_name, data_types[i]->getTypeId()); + serializations.emplace_back(data_types[i]->getDefaultSerialization()); } message_transport.send(PostgreSQLProtocol::Messaging::RowDescription(columns)); } @@ -51,7 +52,7 @@ void PostgreSQLOutputFormat::consume(Chunk chunk) else { WriteBufferFromOwnString ostr; - data_types[j]->serializeAsText(*columns[j], i, ostr, format_settings); + serializations[j]->serializeText(*columns[j], i, ostr, format_settings); row.push_back(std::make_shared(std::move(ostr.str()))); } } diff --git a/src/Processors/Formats/Impl/PostgreSQLOutputFormat.h b/src/Processors/Formats/Impl/PostgreSQLOutputFormat.h index 8ff5aae5067b19eb0918a19fbc0d295463a6a568..257fbdff34177c53b987e8ac419f7a716b2a8f4d 100644 --- a/src/Processors/Formats/Impl/PostgreSQLOutputFormat.h +++ b/src/Processors/Formats/Impl/PostgreSQLOutputFormat.h @@ -27,7 +27,7 @@ private: FormatSettings format_settings; PostgreSQLProtocol::Messaging::MessageTransport message_transport; - DataTypes data_types; + Serializations serializations; }; } diff --git a/src/Processors/Formats/Impl/PrettyBlockOutputFormat.cpp b/src/Processors/Formats/Impl/PrettyBlockOutputFormat.cpp index 8bd4d36532d167c6eb6c38c5e377b37c8cc57976..0825d9f329e2072895dbdb9842bd34c9394fb3f1 100644 --- a/src/Processors/Formats/Impl/PrettyBlockOutputFormat.cpp +++ b/src/Processors/Formats/Impl/PrettyBlockOutputFormat.cpp @@ -59,7 +59,8 @@ void PrettyBlockOutputFormat::calculateWidths( { { WriteBufferFromString out_serialize(serialized_value); - elem.type->serializeAsText(*column, j, out_serialize, format_settings); + auto serialization = elem.type->getDefaultSerialization(); + serialization->serializeText(*column, j, out_serialize, format_settings); } /// Avoid calculating width of too long strings by limiting the size in bytes. 
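
[Reviewer note, not part of the patch] Every output-format hunk above follows one calling convention: resolve an ISerialization once from the column's IDataType via getDefaultSerialization(), then stream values through ISerialization::serializeText* instead of the old IDataType::serializeAsText* entry points. A minimal sketch of that convention, assuming the real ClickHouse headers and types seen in this diff; the helper function itself is hypothetical:

#include <Columns/IColumn.h>
#include <DataTypes/IDataType.h>
#include <DataTypes/Serializations/ISerialization.h>
#include <Formats/FormatSettings.h>
#include <IO/WriteBufferFromString.h>

namespace DB
{

/// Hypothetical helper: render one cell of a column as plain text.
String serializeCell(const IColumn & column, const DataTypePtr & type, size_t row_num, const FormatSettings & settings)
{
    /// The lookup is cheap, but the formats above still cache the result per column
    /// (see the Serializations vectors they add) to avoid repeating it per row.
    SerializationPtr serialization = type->getDefaultSerialization();

    WriteBufferFromOwnString buf;   /// in-memory sink, as in calculateWidths() above
    serialization->serializeText(column, row_num, buf, settings);
    return buf.str();
}

}
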
@@ -154,6 +155,10 @@ void PrettyBlockOutputFormat::write(const Chunk & chunk, PortKind port_kind) const auto & columns = chunk.getColumns(); const auto & header = getPort(port_kind).getHeader(); + Serializations serializations(num_columns); + for (size_t i = 0; i < num_columns; ++i) + serializations[i] = header.getByPosition(i).type->getDefaultSerialization(); + WidthsPerColumn widths; Widths max_widths; Widths name_widths; @@ -290,11 +295,11 @@ void PrettyBlockOutputFormat::write(const Chunk & chunk, PortKind port_kind) { if (j != 0) writeCString(grid_symbols.bar, out); const auto & type = *header.getByPosition(j).type; - writeValueWithPadding(*columns[j], type, i, + writeValueWithPadding(*columns[j], *serializations[j], i, widths[j].empty() ? max_widths[j] : widths[j][i], - max_widths[j]); + max_widths[j], type.shouldAlignRightInPrettyFormats()); } writeCString(grid_symbols.bar, out); @@ -313,12 +317,13 @@ void PrettyBlockOutputFormat::write(const Chunk & chunk, PortKind port_kind) void PrettyBlockOutputFormat::writeValueWithPadding( - const IColumn & column, const IDataType & type, size_t row_num, size_t value_width, size_t pad_to_width) + const IColumn & column, const ISerialization & serialization, size_t row_num, + size_t value_width, size_t pad_to_width, bool align_right) { String serialized_value = " "; { WriteBufferFromString out_serialize(serialized_value, WriteBufferFromString::AppendModeTag()); - type.serializeAsText(column, row_num, out_serialize, format_settings); + serialization.serializeText(column, row_num, out_serialize, format_settings); } if (value_width > format_settings.pretty.max_value_width) @@ -348,7 +353,7 @@ void PrettyBlockOutputFormat::writeValueWithPadding( writeChar(' ', out); }; - if (type.shouldAlignRightInPrettyFormats()) + if (align_right) { write_padding(); out.write(serialized_value.data(), serialized_value.size()); diff --git a/src/Processors/Formats/Impl/PrettyBlockOutputFormat.h b/src/Processors/Formats/Impl/PrettyBlockOutputFormat.h index de79fe5ee2ab2f9fce65b8c3bb43638a05380fd8..02b438d25717ea2d427ce62b6ab5eaf12420739d 100644 --- a/src/Processors/Formats/Impl/PrettyBlockOutputFormat.h +++ b/src/Processors/Formats/Impl/PrettyBlockOutputFormat.h @@ -57,7 +57,8 @@ protected: WidthsPerColumn & widths, Widths & max_padded_widths, Widths & name_widths); void writeValueWithPadding( - const IColumn & column, const IDataType & type, size_t row_num, size_t value_width, size_t pad_to_width); + const IColumn & column, const ISerialization & serialization, size_t row_num, + size_t value_width, size_t pad_to_width, bool align_right); }; } diff --git a/src/Processors/Formats/Impl/PrettyCompactBlockOutputFormat.cpp b/src/Processors/Formats/Impl/PrettyCompactBlockOutputFormat.cpp index cfa669ae8add8ff516df2ca1f27912947a0e7a60..c4902ea4c26831ee99d7e78bf2a3fdb465fdf2d6 100644 --- a/src/Processors/Formats/Impl/PrettyCompactBlockOutputFormat.cpp +++ b/src/Processors/Formats/Impl/PrettyCompactBlockOutputFormat.cpp @@ -149,6 +149,7 @@ void PrettyCompactBlockOutputFormat::writeBottom(const Widths & max_widths) void PrettyCompactBlockOutputFormat::writeRow( size_t row_num, const Block & header, + const Serializations & serializations, const Columns & columns, const WidthsPerColumn & widths, const Widths & max_widths) @@ -179,7 +180,7 @@ void PrettyCompactBlockOutputFormat::writeRow( const auto & type = *header.getByPosition(j).type; const auto & cur_widths = widths[j].empty() ?
max_widths[j] : widths[j][row_num]; - writeValueWithPadding(*columns[j], type, row_num, cur_widths, max_widths[j]); + writeValueWithPadding(*columns[j], *serializations[j], row_num, cur_widths, max_widths[j], type.shouldAlignRightInPrettyFormats()); } writeCString(grid_symbols.bar, out); @@ -240,8 +241,13 @@ void PrettyCompactBlockOutputFormat::writeChunk(const Chunk & chunk, PortKind po writeHeader(header, max_widths, name_widths); + size_t num_columns = header.columns(); + Serializations serializations(num_columns); + for (size_t i = 0; i < num_columns; ++i) + serializations[i] = header.getByPosition(i).type->getDefaultSerialization(); + for (size_t i = 0; i < num_rows && total_rows + i < max_rows; ++i) - writeRow(i, header, columns, widths, max_widths); + writeRow(i, header, serializations, columns, widths, max_widths); writeBottom(max_widths); diff --git a/src/Processors/Formats/Impl/PrettyCompactBlockOutputFormat.h b/src/Processors/Formats/Impl/PrettyCompactBlockOutputFormat.h index 90c9d3751928f286f35e0e849c2bf5060444a245..96344397a0c3c3308085178d84406b025165b6ef 100644 --- a/src/Processors/Formats/Impl/PrettyCompactBlockOutputFormat.h +++ b/src/Processors/Formats/Impl/PrettyCompactBlockOutputFormat.h @@ -23,6 +23,7 @@ protected: void writeRow( size_t row_num, const Block & header, + const Serializations & serializations, const Columns & columns, const WidthsPerColumn & widths, const Widths & max_widths); diff --git a/src/Processors/Formats/Impl/PrettySpaceBlockOutputFormat.cpp b/src/Processors/Formats/Impl/PrettySpaceBlockOutputFormat.cpp index f3fb27a5558dd8aeb856edc0ecb8ad8a9d4c3121..fa987c6b9496ca3524f9f6d31ec369196dab0227 100644 --- a/src/Processors/Formats/Impl/PrettySpaceBlockOutputFormat.cpp +++ b/src/Processors/Formats/Impl/PrettySpaceBlockOutputFormat.cpp @@ -24,6 +24,10 @@ void PrettySpaceBlockOutputFormat::write(const Chunk & chunk, PortKind port_kind const auto & header = getPort(port_kind).getHeader(); const auto & columns = chunk.getColumns(); + Serializations serializations(num_columns); + for (size_t i = 0; i < num_columns; ++i) + serializations[i] = header.getByPosition(i).type->getDefaultSerialization(); + WidthsPerColumn widths; Widths max_widths; Widths name_widths; @@ -87,7 +91,8 @@ void PrettySpaceBlockOutputFormat::write(const Chunk & chunk, PortKind port_kind const auto & type = *header.getByPosition(column).type; auto & cur_width = widths[column].empty() ? 
max_widths[column] : widths[column][row]; - writeValueWithPadding(*columns[column], type, row, cur_width, max_widths[column]); + writeValueWithPadding(*columns[column], *serializations[column], + row, cur_width, max_widths[column], type.shouldAlignRightInPrettyFormats()); } writeChar('\n', out); diff --git a/src/Processors/Formats/Impl/ProtobufRowOutputFormat.h b/src/Processors/Formats/Impl/ProtobufRowOutputFormat.h index 5f82950e891a4470320405167987a823194abe96..54324490a3bc0d18d222c6ff282346cae32a1ab9 100644 --- a/src/Processors/Formats/Impl/ProtobufRowOutputFormat.h +++ b/src/Processors/Formats/Impl/ProtobufRowOutputFormat.h @@ -42,7 +42,7 @@ public: String getName() const override { return "ProtobufRowOutputFormat"; } void write(const Columns & columns, size_t row_num) override; - void writeField(const IColumn &, const IDataType &, size_t) override {} + void writeField(const IColumn &, const ISerialization &, size_t) override {} std::string getContentType() const override { return "application/octet-stream"; } private: diff --git a/src/Processors/Formats/Impl/RawBLOBRowOutputFormat.cpp b/src/Processors/Formats/Impl/RawBLOBRowOutputFormat.cpp index bcee94d8ad5494cc108417541f97fa046c63c694..49f1159d48de34aa6ce929553fa34915c8b0a8f8 100644 --- a/src/Processors/Formats/Impl/RawBLOBRowOutputFormat.cpp +++ b/src/Processors/Formats/Impl/RawBLOBRowOutputFormat.cpp @@ -15,7 +15,7 @@ RawBLOBRowOutputFormat::RawBLOBRowOutputFormat( } -void RawBLOBRowOutputFormat::writeField(const IColumn & column, const IDataType &, size_t row_num) +void RawBLOBRowOutputFormat::writeField(const IColumn & column, const ISerialization &, size_t row_num) { StringRef value = column.getDataAt(row_num); out.write(value.data, value.size); diff --git a/src/Processors/Formats/Impl/RawBLOBRowOutputFormat.h b/src/Processors/Formats/Impl/RawBLOBRowOutputFormat.h index 6a9a70bb12f861654b557ac4960407cbf8c8bbc6..7a29c62e4d836b0a29346841f959aba363dbe26e 100644 --- a/src/Processors/Formats/Impl/RawBLOBRowOutputFormat.h +++ b/src/Processors/Formats/Impl/RawBLOBRowOutputFormat.h @@ -34,7 +34,7 @@ public: String getName() const override { return "RawBLOBRowOutputFormat"; } - void writeField(const IColumn & column, const IDataType &, size_t row_num) override; + void writeField(const IColumn & column, const ISerialization &, size_t row_num) override; }; } diff --git a/src/Processors/Formats/Impl/RegexpRowInputFormat.cpp b/src/Processors/Formats/Impl/RegexpRowInputFormat.cpp index 108f4d9d32136ecc702a843b44bb795652fbb4e9..555c79f80641cc0e56c407de7e797b252e559f15 100644 --- a/src/Processors/Formats/Impl/RegexpRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/RegexpRowInputFormat.cpp @@ -1,7 +1,7 @@ #include #include #include -#include <DataTypes/DataTypeNullable.h> +#include <DataTypes/Serializations/SerializationNullable.h> #include namespace DB @@ -65,37 +65,38 @@ bool RegexpRowInputFormat::readField(size_t index, MutableColumns & columns) ReadBuffer field_buf(const_cast<char *>(matched_fields[index].data()), matched_fields[index].size(), 0); try { + const auto & serialization = serializations[index]; switch (field_format) { case ColumnFormat::Escaped: if (parse_as_nullable) - read = DataTypeNullable::deserializeTextEscaped(*columns[index], field_buf, format_settings, type); + read = SerializationNullable::deserializeTextEscapedImpl(*columns[index], field_buf, format_settings, serialization); else - type->deserializeAsTextEscaped(*columns[index], field_buf, format_settings); + serialization->deserializeTextEscaped(*columns[index], field_buf, format_settings); break; case ColumnFormat::Quoted: if (parse_as_nullable) -
read = DataTypeNullable::deserializeTextQuoted(*columns[index], field_buf, format_settings, type); + read = SerializationNullable::deserializeTextQuotedImpl(*columns[index], field_buf, format_settings, serialization); else - type->deserializeAsTextQuoted(*columns[index], field_buf, format_settings); + serialization->deserializeTextQuoted(*columns[index], field_buf, format_settings); break; case ColumnFormat::Csv: if (parse_as_nullable) - read = DataTypeNullable::deserializeTextCSV(*columns[index], field_buf, format_settings, type); + read = SerializationNullable::deserializeTextCSVImpl(*columns[index], field_buf, format_settings, serialization); else - type->deserializeAsTextCSV(*columns[index], field_buf, format_settings); + serialization->deserializeTextCSV(*columns[index], field_buf, format_settings); break; case ColumnFormat::Json: if (parse_as_nullable) - read = DataTypeNullable::deserializeTextJSON(*columns[index], field_buf, format_settings, type); + read = SerializationNullable::deserializeTextJSONImpl(*columns[index], field_buf, format_settings, serialization); else - type->deserializeAsTextJSON(*columns[index], field_buf, format_settings); + serialization->deserializeTextJSON(*columns[index], field_buf, format_settings); break; case ColumnFormat::Raw: if (parse_as_nullable) - read = DataTypeNullable::deserializeWholeText(*columns[index], field_buf, format_settings, type); + read = SerializationNullable::deserializeWholeTextImpl(*columns[index], field_buf, format_settings, serialization); else - type->deserializeAsWholeText(*columns[index], field_buf, format_settings); + serialization->deserializeWholeText(*columns[index], field_buf, format_settings); break; default: break; diff --git a/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp b/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp index 8d769cab346f145930e6fbe09ff628088f05cf16..ee6fce833583c973b5bffda2edbc9690947170ff 100644 --- a/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp @@ -1,7 +1,7 @@ #include #include #include -#include <DataTypes/DataTypeNullable.h> +#include <DataTypes/Serializations/SerializationNullable.h> namespace DB @@ -142,10 +142,11 @@ bool TSKVRowInputFormat::readRow(MutableColumns & columns, RowReadExtension & ex seen_columns[index] = read_columns[index] = true; const auto & type = getPort().getHeader().getByPosition(index).type; + const auto & serialization = serializations[index]; if (format_settings.null_as_default && !type->isNullable()) - read_columns[index] = DataTypeNullable::deserializeTextEscaped(*columns[index], in, format_settings, type); + read_columns[index] = SerializationNullable::deserializeTextEscapedImpl(*columns[index], in, format_settings, serialization); else - header.getByPosition(index).type->deserializeAsTextEscaped(*columns[index], in, format_settings); + serialization->deserializeTextEscaped(*columns[index], in, format_settings); } } else diff --git a/src/Processors/Formats/Impl/TSKVRowOutputFormat.cpp b/src/Processors/Formats/Impl/TSKVRowOutputFormat.cpp index 149ba3f0a2a5f0e6df99a7b9035a7314d7dc587b..627ae67fa31a0f3590842729266ceb17ae9972d0 100644 --- a/src/Processors/Formats/Impl/TSKVRowOutputFormat.cpp +++ b/src/Processors/Formats/Impl/TSKVRowOutputFormat.cpp @@ -24,10 +24,10 @@ TSKVRowOutputFormat::TSKVRowOutputFormat(WriteBuffer & out_, const Block & heade } -void TSKVRowOutputFormat::writeField(const IColumn & column, const IDataType & type, size_t row_num) +void TSKVRowOutputFormat::writeField(const IColumn & column, const ISerialization & serialization, size_t row_num) {
writeString(fields[field_number].name, out); - type.serializeAsTextEscaped(column, row_num, out, format_settings); + serialization.serializeTextEscaped(column, row_num, out, format_settings); ++field_number; } diff --git a/src/Processors/Formats/Impl/TSKVRowOutputFormat.h b/src/Processors/Formats/Impl/TSKVRowOutputFormat.h index 1b341cbbc72f0126c2bb9aa1394eac6de6dbae8a..24c4e5ca866be93bd53df93500579f7f5308434f 100644 --- a/src/Processors/Formats/Impl/TSKVRowOutputFormat.h +++ b/src/Processors/Formats/Impl/TSKVRowOutputFormat.h @@ -18,7 +18,7 @@ public: String getName() const override { return "TSKVRowOutputFormat"; } - void writeField(const IColumn & column, const IDataType & type, size_t row_num) override; + void writeField(const IColumn & column, const ISerialization & serialization, size_t row_num) override; void writeRowEndDelimiter() override; protected: diff --git a/src/Processors/Formats/Impl/TabSeparatedRawRowInputFormat.h b/src/Processors/Formats/Impl/TabSeparatedRawRowInputFormat.h index bbcfec8e6da4e23c0cf68882dbf52ac25dd1fdd6..07c8edf9e6ec5b9d6d2c626b8077313f7ee2a4a7 100644 --- a/src/Processors/Formats/Impl/TabSeparatedRawRowInputFormat.h +++ b/src/Processors/Formats/Impl/TabSeparatedRawRowInputFormat.h @@ -31,7 +31,7 @@ public: String getName() const override { return "TabSeparatedRawRowInputFormat"; } - bool readField(IColumn & column, const DataTypePtr & type, bool) override + bool readField(IColumn & column, const DataTypePtr &, const SerializationPtr & serialization, bool) override { String tmp; @@ -49,8 +49,7 @@ public: } ReadBufferFromString cell(tmp); - - type->deserializeAsWholeText(column, cell, format_settings); + serialization->deserializeWholeText(column, cell, format_settings); return true; } diff --git a/src/Processors/Formats/Impl/TabSeparatedRawRowOutputFormat.h b/src/Processors/Formats/Impl/TabSeparatedRawRowOutputFormat.h index 6aa7f7bdfadb6342e19db398a40e2d1ad22bca0c..dc9312e53bcac20a642bc12a248c8d076325f3b1 100644 --- a/src/Processors/Formats/Impl/TabSeparatedRawRowOutputFormat.h +++ b/src/Processors/Formats/Impl/TabSeparatedRawRowOutputFormat.h @@ -26,9 +26,9 @@ public: String getName() const override { return "TabSeparatedRawRowOutputFormat"; } - void writeField(const IColumn & column, const IDataType & type, size_t row_num) override + void writeField(const IColumn & column, const ISerialization & serialization, size_t row_num) override { - type.serializeAsText(column, row_num, out, format_settings); + serialization.serializeText(column, row_num, out, format_settings); } }; diff --git a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp index ffb1b96f70e32949bb21e87e355fe9f0fde2ea6e..41adb6fc612f33cf234d96c66c56c2e42f8465fe 100644 --- a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp @@ -7,7 +7,7 @@ #include #include #include -#include <DataTypes/DataTypeNullable.h> +#include <DataTypes/Serializations/SerializationNullable.h> namespace DB { @@ -195,7 +195,7 @@ bool TabSeparatedRowInputFormat::readRow(MutableColumns & columns, RowReadExtens if (column_index) { const auto & type = data_types[*column_index]; - ext.read_columns[*column_index] = readField(*columns[*column_index], type, is_last_file_column); + ext.read_columns[*column_index] = readField(*columns[*column_index], type, serializations[*column_index], is_last_file_column); } else { @@ -223,18 +223,21 @@ bool TabSeparatedRowInputFormat::readRow(MutableColumns & columns, RowReadExtens } -bool TabSeparatedRowInputFormat::readField(IColumn
& column, const DataTypePtr & type, bool is_last_file_column) +bool TabSeparatedRowInputFormat::readField(IColumn & column, const DataTypePtr & type, + const SerializationPtr & serialization, bool is_last_file_column) { const bool at_delimiter = !is_last_file_column && !in.eof() && *in.position() == '\t'; const bool at_last_column_line_end = is_last_file_column && (in.eof() || *in.position() == '\n'); + if (format_settings.tsv.empty_as_default && (at_delimiter || at_last_column_line_end)) { column.insertDefault(); return false; } else if (format_settings.null_as_default && !type->isNullable()) - return DataTypeNullable::deserializeTextEscaped(column, in, format_settings, type); - type->deserializeAsTextEscaped(column, in, format_settings); + return SerializationNullable::deserializeTextEscapedImpl(column, in, format_settings, serialization); + + serialization->deserializeTextEscaped(column, in, format_settings); return true; } @@ -332,7 +335,8 @@ bool TabSeparatedRowInputFormat::parseRowAndPrintDiagnosticInfo(MutableColumns & void TabSeparatedRowInputFormat::tryDeserializeField(const DataTypePtr & type, IColumn & column, size_t file_column) { - if (column_mapping->column_indexes_for_input_fields[file_column]) + const auto & index = column_mapping->column_indexes_for_input_fields[file_column]; + if (index) { // check null value for type is not nullable. don't cross buffer bound for simplicity, so maybe missing some case if (!type->isNullable() && !in.eof()) @@ -351,8 +355,9 @@ void TabSeparatedRowInputFormat::tryDeserializeField(const DataTypePtr & type, I } } } + const bool is_last_file_column = file_column + 1 == column_mapping->column_indexes_for_input_fields.size(); - readField(column, type, is_last_file_column); + readField(column, type, serializations[*index], is_last_file_column); } else { diff --git a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.h b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.h index db70b4d3fea6999b995c018c1d85ce17f75b8bb4..8127b5ceba75e368e1971d04525fd0bdfb7c8ccc 100644 --- a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.h +++ b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.h @@ -33,7 +33,8 @@ protected: bool with_types; const FormatSettings format_settings; - virtual bool readField(IColumn & column, const DataTypePtr & type, bool is_last_file_column); + virtual bool readField(IColumn & column, const DataTypePtr & type, + const SerializationPtr & serialization, bool is_last_file_column); private: DataTypes data_types; diff --git a/src/Processors/Formats/Impl/TabSeparatedRowOutputFormat.cpp b/src/Processors/Formats/Impl/TabSeparatedRowOutputFormat.cpp index dd3adfa40ebf75745be2d0ee7da045bc303c437f..3e99264785e33c5dfe8b5abf66bcec2a1871593b 100644 --- a/src/Processors/Formats/Impl/TabSeparatedRowOutputFormat.cpp +++ b/src/Processors/Formats/Impl/TabSeparatedRowOutputFormat.cpp @@ -43,9 +43,9 @@ void TabSeparatedRowOutputFormat::doWritePrefix() } -void TabSeparatedRowOutputFormat::writeField(const IColumn & column, const IDataType & type, size_t row_num) +void TabSeparatedRowOutputFormat::writeField(const IColumn & column, const ISerialization & serialization, size_t row_num) { - type.serializeAsTextEscaped(column, row_num, out, format_settings); + serialization.serializeTextEscaped(column, row_num, out, format_settings); } diff --git a/src/Processors/Formats/Impl/TabSeparatedRowOutputFormat.h b/src/Processors/Formats/Impl/TabSeparatedRowOutputFormat.h index 
7985d6a1c86dde236fdbb64f1c7fac3e9854f901..e3190be70e84b442fed5d29ee07417620e3e27a9 100644 --- a/src/Processors/Formats/Impl/TabSeparatedRowOutputFormat.h +++ b/src/Processors/Formats/Impl/TabSeparatedRowOutputFormat.h @@ -28,7 +28,7 @@ public: String getName() const override { return "TabSeparatedRowOutputFormat"; } - void writeField(const IColumn & column, const IDataType & type, size_t row_num) override; + void writeField(const IColumn & column, const ISerialization & serialization, size_t row_num) override; void writeFieldDelimiter() override; void writeRowEndDelimiter() override; void writeBeforeTotals() override; diff --git a/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp b/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp index 6e33c7d90c9d9210c6220c00ffe5f95307e14065..d65f6dd9e38705aed45f0c84dad2e21dca84c96b 100644 --- a/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp +++ b/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp @@ -21,9 +21,9 @@ TemplateBlockOutputFormat::TemplateBlockOutputFormat(const Block & header_, Writ { const auto & sample = getPort(PortKind::Main).getHeader(); size_t columns = sample.columns(); - types.resize(columns); + serializations.resize(columns); for (size_t i = 0; i < columns; ++i) - types[i] = sample.safeGetByPosition(i).type; + serializations[i] = sample.safeGetByPosition(i).type->getDefaultSerialization(); /// Validate format string for whole output size_t data_idx = format.format_idx_to_column_idx.size() + 1; @@ -105,32 +105,32 @@ void TemplateBlockOutputFormat::writeRow(const Chunk & chunk, size_t row_num) writeString(row_format.delimiters[j], out); size_t col_idx = *row_format.format_idx_to_column_idx[j]; - serializeField(*chunk.getColumns()[col_idx], *types[col_idx], row_num, row_format.formats[j]); + serializeField(*chunk.getColumns()[col_idx], *serializations[col_idx], row_num, row_format.formats[j]); } writeString(row_format.delimiters[columns], out); } -void TemplateBlockOutputFormat::serializeField(const IColumn & column, const IDataType & type, size_t row_num, ColumnFormat col_format) +void TemplateBlockOutputFormat::serializeField(const IColumn & column, const ISerialization & serialization, size_t row_num, ColumnFormat col_format) { switch (col_format) { case ColumnFormat::Escaped: - type.serializeAsTextEscaped(column, row_num, out, settings); + serialization.serializeTextEscaped(column, row_num, out, settings); break; case ColumnFormat::Quoted: - type.serializeAsTextQuoted(column, row_num, out, settings); + serialization.serializeTextQuoted(column, row_num, out, settings); break; case ColumnFormat::Csv: - type.serializeAsTextCSV(column, row_num, out, settings); + serialization.serializeTextCSV(column, row_num, out, settings); break; case ColumnFormat::Json: - type.serializeAsTextJSON(column, row_num, out, settings); + serialization.serializeTextJSON(column, row_num, out, settings); break; case ColumnFormat::Xml: - type.serializeAsTextXML(column, row_num, out, settings); + serialization.serializeTextXML(column, row_num, out, settings); break; case ColumnFormat::Raw: - type.serializeAsText(column, row_num, out, settings); + serialization.serializeText(column, row_num, out, settings); break; default: __builtin_unreachable(); @@ -142,7 +142,7 @@ template <typename U, typename V> void TemplateBlockOutputFormat::writeValue(U v auto type = std::make_unique<V>(); auto col = type->createColumn(); col->insert(value); - serializeField(*col, *type, 0, col_format); + serializeField(*col, *type->getDefaultSerialization(), 0, col_format); } void
TemplateBlockOutputFormat::consume(Chunk chunk) diff --git a/src/Processors/Formats/Impl/TemplateBlockOutputFormat.h b/src/Processors/Formats/Impl/TemplateBlockOutputFormat.h index f29d31eb3f10feb5ddc2ca653d40260239844860..0d41b8888d4e1200e3448abb0ed2e91dd2e6c4d9 100644 --- a/src/Processors/Formats/Impl/TemplateBlockOutputFormat.h +++ b/src/Processors/Formats/Impl/TemplateBlockOutputFormat.h @@ -47,12 +47,12 @@ protected: void finalize() override; void writeRow(const Chunk & chunk, size_t row_num); - void serializeField(const IColumn & column, const IDataType & type, size_t row_num, ColumnFormat format); + void serializeField(const IColumn & column, const ISerialization & serialization, size_t row_num, ColumnFormat format); template <typename U, typename V> void writeValue(U value, ColumnFormat col_format); protected: const FormatSettings settings; - DataTypes types; + Serializations serializations; ParsedTemplateFormatString format; ParsedTemplateFormatString row_format; diff --git a/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp b/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp index 6023b38e4dee1278e0cc26e70ced061af52da34d..0e5a962a037a847099ab7682b334d7c82a8ab991 100644 --- a/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp @@ -4,7 +4,7 @@ #include #include #include -#include <DataTypes/DataTypeNullable.h> +#include <DataTypes/Serializations/SerializationNullable.h> #include namespace DB { @@ -173,7 +173,7 @@ bool TemplateRowInputFormat::readRow(MutableColumns & columns, RowReadExtension if (row_format.format_idx_to_column_idx[i]) { size_t col_idx = *row_format.format_idx_to_column_idx[i]; - extra.read_columns[col_idx] = deserializeField(data_types[col_idx], *columns[col_idx], i); + extra.read_columns[col_idx] = deserializeField(data_types[col_idx], serializations[col_idx], *columns[col_idx], i); } else skipField(row_format.formats[i]); @@ -189,7 +189,8 @@ bool TemplateRowInputFormat::readRow(MutableColumns & columns, RowReadExtension return true; } -bool TemplateRowInputFormat::deserializeField(const DataTypePtr & type, IColumn & column, size_t file_column) +bool TemplateRowInputFormat::deserializeField(const DataTypePtr & type, + const SerializationPtr & serialization, IColumn & column, size_t file_column) { ColumnFormat col_format = row_format.formats[file_column]; bool read = true; @@ -200,30 +201,30 @@ bool TemplateRowInputFormat::deserializeField(const DataTypePtr & type, IColumn { case ColumnFormat::Escaped: if (parse_as_nullable) - read = DataTypeNullable::deserializeTextEscaped(column, buf, settings, type); + read = SerializationNullable::deserializeTextEscapedImpl(column, buf, settings, serialization); else - type->deserializeAsTextEscaped(column, buf, settings); + serialization->deserializeTextEscaped(column, buf, settings); break; case ColumnFormat::Quoted: if (parse_as_nullable) - read = DataTypeNullable::deserializeTextQuoted(column, buf, settings, type); + read = SerializationNullable::deserializeTextQuotedImpl(column, buf, settings, serialization); else - type->deserializeAsTextQuoted(column, buf, settings); + serialization->deserializeTextQuoted(column, buf, settings); break; case ColumnFormat::Csv: /// Will read unquoted string until settings.csv.delimiter settings.csv.delimiter = row_format.delimiters[file_column + 1].empty() ?
default_csv_delimiter : row_format.delimiters[file_column + 1].front(); if (parse_as_nullable) - read = DataTypeNullable::deserializeTextCSV(column, buf, settings, type); + read = SerializationNullable::deserializeTextCSVImpl(column, buf, settings, serialization); else - type->deserializeAsTextCSV(column, buf, settings); + serialization->deserializeTextCSV(column, buf, settings); break; case ColumnFormat::Json: if (parse_as_nullable) - read = DataTypeNullable::deserializeTextJSON(column, buf, settings, type); + read = SerializationNullable::deserializeTextJSONImpl(column, buf, settings, serialization); else - type->deserializeAsTextJSON(column, buf, settings); + serialization->deserializeTextJSON(column, buf, settings); break; default: __builtin_unreachable(); @@ -412,8 +413,9 @@ void TemplateRowInputFormat::writeErrorStringForWrongDelimiter(WriteBuffer & out void TemplateRowInputFormat::tryDeserializeField(const DataTypePtr & type, IColumn & column, size_t file_column) { - if (row_format.format_idx_to_column_idx[file_column]) - deserializeField(type, column, file_column); + const auto & index = row_format.format_idx_to_column_idx[file_column]; + if (index) + deserializeField(type, serializations[*index], column, file_column); else skipField(row_format.formats[file_column]); } diff --git a/src/Processors/Formats/Impl/TemplateRowInputFormat.h b/src/Processors/Formats/Impl/TemplateRowInputFormat.h index 6adfe0a34b449eb410ab510b717722b121be3eb0..322f8570ab72600f3baba13e07628893ccd97c4b 100644 --- a/src/Processors/Formats/Impl/TemplateRowInputFormat.h +++ b/src/Processors/Formats/Impl/TemplateRowInputFormat.h @@ -32,7 +32,9 @@ public: void resetParser() override; private: - bool deserializeField(const DataTypePtr & type, IColumn & column, size_t file_column); + bool deserializeField(const DataTypePtr & type, + const SerializationPtr & serialization, IColumn & column, size_t file_column); + void skipField(ColumnFormat col_format); inline void skipSpaces() { if (ignore_spaces) skipWhitespaceIfAny(buf); } @@ -43,6 +45,7 @@ private: bool parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out) override; void tryDeserializeField(const DataTypePtr & type, IColumn & column, size_t file_column) override; + bool isGarbageAfterField(size_t after_col_idx, ReadBuffer::Position pos) override; void writeErrorStringForWrongDelimiter(WriteBuffer & out, const String & description, const String & delim); diff --git a/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp b/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp index 1455b8f674028d89404d9fd3809a65ee4b25c9c2..c054145016d3696941e8feee73931c8123ed7ede 100644 --- a/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp @@ -12,6 +12,7 @@ #include #include #include +#include <DataTypes/Serializations/SerializationNullable.h> #include #include #include @@ -40,6 +41,9 @@ ValuesBlockInputFormat::ValuesBlockInputFormat(ReadBuffer & in_, const Block & h attempts_to_deduce_template(num_columns), attempts_to_deduce_template_cached(num_columns), rows_parsed_using_template(num_columns), templates(num_columns), types(header_.getDataTypes()) { + serializations.resize(types.size()); + for (size_t i = 0; i < types.size(); ++i) + serializations[i] = types[i]->getDefaultSerialization(); } Chunk ValuesBlockInputFormat::generate() @@ -164,10 +168,12 @@ bool ValuesBlockInputFormat::tryReadValue(IColumn & column, size_t column_idx) { bool read = true; const auto & type = types[column_idx]; + const auto & serialization =
serializations[column_idx]; if (format_settings.null_as_default && !type->isNullable()) - read = DataTypeNullable::deserializeTextQuoted(column, buf, format_settings, type); + read = SerializationNullable::deserializeTextQuotedImpl(column, buf, format_settings, serialization); else - type->deserializeAsTextQuoted(column, buf, format_settings); + serialization->deserializeTextQuoted(column, buf, format_settings); + rollback_on_exception = true; skipWhitespaceIfAny(buf); @@ -310,7 +316,8 @@ bool ValuesBlockInputFormat::parseExpression(IColumn & column, size_t column_idx bool ok = false; try { - header.getByPosition(column_idx).type->deserializeAsTextQuoted(column, buf, format_settings); + const auto & serialization = serializations[column_idx]; + serialization->deserializeTextQuoted(column, buf, format_settings); rollback_on_exception = true; skipWhitespaceIfAny(buf); if (checkDelimiterAfterValue(column_idx)) diff --git a/src/Processors/Formats/Impl/ValuesBlockInputFormat.h b/src/Processors/Formats/Impl/ValuesBlockInputFormat.h index a541870e48401491aeb153fc5570ed1daeaeb4b8..8e7e15c572d3f9619a57c314d6b4e93336ac8f2f 100644 --- a/src/Processors/Formats/Impl/ValuesBlockInputFormat.h +++ b/src/Processors/Formats/Impl/ValuesBlockInputFormat.h @@ -89,6 +89,7 @@ private: ConstantExpressionTemplate::Cache templates_cache; const DataTypes types; + Serializations serializations; BlockMissingValues block_missing_values; }; diff --git a/src/Processors/Formats/Impl/ValuesRowOutputFormat.cpp b/src/Processors/Formats/Impl/ValuesRowOutputFormat.cpp index 7791e1296e074106a9343cf2b6bb75590f9a2ef5..e0152a7ffeec06a266d618a6c20121a790ac162e 100644 --- a/src/Processors/Formats/Impl/ValuesRowOutputFormat.cpp +++ b/src/Processors/Formats/Impl/ValuesRowOutputFormat.cpp @@ -15,9 +15,9 @@ ValuesRowOutputFormat::ValuesRowOutputFormat(WriteBuffer & out_, const Block & h { } -void ValuesRowOutputFormat::writeField(const IColumn & column, const IDataType & type, size_t row_num) +void ValuesRowOutputFormat::writeField(const IColumn & column, const ISerialization & serialization, size_t row_num) { - type.serializeAsTextQuoted(column, row_num, out, format_settings); + serialization.serializeTextQuoted(column, row_num, out, format_settings); } void ValuesRowOutputFormat::writeFieldDelimiter() diff --git a/src/Processors/Formats/Impl/ValuesRowOutputFormat.h b/src/Processors/Formats/Impl/ValuesRowOutputFormat.h index 73f91866f4379d21132ca75410e308d176a10d50..493ce458b1ee2c10e048aa1d73af88d788dcfc1d 100644 --- a/src/Processors/Formats/Impl/ValuesRowOutputFormat.h +++ b/src/Processors/Formats/Impl/ValuesRowOutputFormat.h @@ -19,7 +19,7 @@ public: String getName() const override { return "ValuesRowOutputFormat"; } - void writeField(const IColumn & column, const IDataType & type, size_t row_num) override; + void writeField(const IColumn & column, const ISerialization & serialization, size_t row_num) override; void writeFieldDelimiter() override; void writeRowStartDelimiter() override; void writeRowEndDelimiter() override; diff --git a/src/Processors/Formats/Impl/VerticalRowOutputFormat.cpp b/src/Processors/Formats/Impl/VerticalRowOutputFormat.cpp index a3c71cbde59010482b9eb243ce5e78bc2cdd62f9..c6f37d270b0f52d3a6cdf1809f4b60aa24b4651f 100644 --- a/src/Processors/Formats/Impl/VerticalRowOutputFormat.cpp +++ b/src/Processors/Formats/Impl/VerticalRowOutputFormat.cpp @@ -50,22 +50,22 @@ VerticalRowOutputFormat::VerticalRowOutputFormat( } -void VerticalRowOutputFormat::writeField(const IColumn & column, const IDataType & type, size_t 
row_num) +void VerticalRowOutputFormat::writeField(const IColumn & column, const ISerialization & serialization, size_t row_num) { if (row_number > format_settings.pretty.max_rows) return; writeString(names_and_paddings[field_number], out); - writeValue(column, type, row_num); + writeValue(column, serialization, row_num); writeChar('\n', out); ++field_number; } -void VerticalRowOutputFormat::writeValue(const IColumn & column, const IDataType & type, size_t row_num) const +void VerticalRowOutputFormat::writeValue(const IColumn & column, const ISerialization & serialization, size_t row_num) const { - type.serializeAsText(column, row_num, out, format_settings); + serialization.serializeText(column, row_num, out, format_settings); } @@ -123,26 +123,25 @@ void VerticalRowOutputFormat::writeBeforeExtremes() void VerticalRowOutputFormat::writeMinExtreme(const Columns & columns, size_t row_num) { - writeSpecialRow(columns, row_num, PortKind::Totals, "Min"); + writeSpecialRow(columns, row_num, "Min"); } void VerticalRowOutputFormat::writeMaxExtreme(const Columns & columns, size_t row_num) { - writeSpecialRow(columns, row_num, PortKind::Totals, "Max"); + writeSpecialRow(columns, row_num, "Max"); } void VerticalRowOutputFormat::writeTotals(const Columns & columns, size_t row_num) { - writeSpecialRow(columns, row_num, PortKind::Totals, "Totals"); + writeSpecialRow(columns, row_num, "Totals"); was_totals_written = true; } -void VerticalRowOutputFormat::writeSpecialRow(const Columns & columns, size_t row_num, PortKind port_kind, const char * title) +void VerticalRowOutputFormat::writeSpecialRow(const Columns & columns, size_t row_num, const char * title) { row_number = 0; field_number = 0; - const auto & header = getPort(port_kind).getHeader(); size_t num_columns = columns.size(); writeCString(title, out); @@ -158,8 +157,7 @@ void VerticalRowOutputFormat::writeSpecialRow(const Columns & columns, size_t ro if (i != 0) writeFieldDelimiter(); - const auto & col = header.getByPosition(i); - writeField(*columns[i], *col.type, row_num); + writeField(*columns[i], *serializations[i], row_num); } } diff --git a/src/Processors/Formats/Impl/VerticalRowOutputFormat.h b/src/Processors/Formats/Impl/VerticalRowOutputFormat.h index d372f5f611a0872197a592bad1f662507bbf8dde..9e89f677f87ba79826728436a588518f39a572ec 100644 --- a/src/Processors/Formats/Impl/VerticalRowOutputFormat.h +++ b/src/Processors/Formats/Impl/VerticalRowOutputFormat.h @@ -22,7 +22,7 @@ public: String getName() const override { return "VerticalRowOutputFormat"; } - void writeField(const IColumn & column, const IDataType & type, size_t row_num) override; + void writeField(const IColumn & column, const ISerialization & serialization, size_t row_num) override; void writeRowStartDelimiter() override; void writeRowBetweenDelimiter() override; void writeSuffix() override; @@ -35,10 +35,10 @@ public: void writeBeforeExtremes() override; protected: - virtual void writeValue(const IColumn & column, const IDataType & type, size_t row_num) const; + virtual void writeValue(const IColumn & column, const ISerialization & serialization, size_t row_num) const; /// For totals and extremes. 
- void writeSpecialRow(const Columns & columns, size_t row_num, PortKind port_kind, const char * title); + void writeSpecialRow(const Columns & columns, size_t row_num, const char * title); const FormatSettings format_settings; size_t field_number = 0; diff --git a/src/Processors/Formats/Impl/XMLRowOutputFormat.cpp b/src/Processors/Formats/Impl/XMLRowOutputFormat.cpp index 6fd63a18147dee884fb770f43a47c34f1d02cb25..893c4e229c72e48b713b44cb145653e5e832ad8c 100644 --- a/src/Processors/Formats/Impl/XMLRowOutputFormat.cpp +++ b/src/Processors/Formats/Impl/XMLRowOutputFormat.cpp @@ -82,12 +82,12 @@ void XMLRowOutputFormat::writePrefix() } -void XMLRowOutputFormat::writeField(const IColumn & column, const IDataType & type, size_t row_num) +void XMLRowOutputFormat::writeField(const IColumn & column, const ISerialization & serialization, size_t row_num) { writeCString("\t\t\t<", *ostr); writeString(field_tag_names[field_number], *ostr); writeCString(">", *ostr); - type.serializeAsTextXML(column, row_num, *ostr, format_settings); + serialization.serializeTextXML(column, row_num, *ostr, format_settings); writeCString("</", *ostr); writeString(field_tag_names[field_number], *ostr); writeCString(">\n", *ostr); @@ -132,7 +132,7 @@ void XMLRowOutputFormat::writeTotals(const Columns & columns, size_t row_num) writeCString("\t\t<", *ostr); writeString(field_tag_names[i], *ostr); writeCString(">", *ostr); - column.type->serializeAsTextXML(*columns[i], row_num, *ostr, format_settings); + column.type->getDefaultSerialization()->serializeTextXML(*columns[i], row_num, *ostr, format_settings); writeCString("</", *ostr); writeString(field_tag_names[i], *ostr); writeCString(">\n", *ostr); @@ -181,7 +181,7 @@ void XMLRowOutputFormat::writeExtremesElement(const char * title, const Columns writeCString("\t\t\t<", *ostr); writeString(field_tag_names[i], *ostr); writeCString(">", *ostr); - column.type->serializeAsTextXML(*columns[i], row_num, *ostr, format_settings); + column.type->getDefaultSerialization()->serializeTextXML(*columns[i], row_num, *ostr, format_settings); writeCString("</", *ostr); writeString(field_tag_names[i], *ostr); writeCString(">\n", *ostr); diff --git a/src/Processors/Formats/Impl/XMLRowOutputFormat.h b/src/Processors/Formats/Impl/XMLRowOutputFormat.h index 233ee773c1c57b6a33a25147366415daff40dfc5..8ca4721c4598b4427dda1fda93cd039f90ae282a 100644 --- a/src/Processors/Formats/Impl/XMLRowOutputFormat.h +++ b/src/Processors/Formats/Impl/XMLRowOutputFormat.h @@ -20,7 +20,7 @@ public: String getName() const override { return "XMLRowOutputFormat"; } - void writeField(const IColumn & column, const IDataType & type, size_t row_num) override; + void writeField(const IColumn & column, const ISerialization & serialization, size_t row_num) override; void writeRowStartDelimiter() override; void writeRowEndDelimiter() override; void writePrefix() override; diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index 3d15681a27e011089738f37a9757919929e08ff7..7c9f7b8104d1fa3cebae98937da3fb23d15ba896 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -71,12 +71,12 @@ void IMergeTreeDataPart::MinMaxIndex::load(const MergeTreeData & data, const Dis { String file_name = part_path + "minmax_" + escapeForFileName(minmax_column_names[i]) + ".idx"; auto file = openForReading(disk_, file_name); - const DataTypePtr & data_type = minmax_column_types[i]; + auto serialization = minmax_column_types[i]->getDefaultSerialization(); Field min_val; - data_type->deserializeBinary(min_val, *file); + serialization->deserializeBinary(min_val, *file); Field max_val; - data_type->deserializeBinary(max_val, *file); +
serialization->deserializeBinary(max_val, *file); hyperrectangle.emplace_back(min_val, true, max_val, true); } @@ -109,12 +109,12 @@ void IMergeTreeDataPart::MinMaxIndex::store( for (size_t i = 0; i < column_names.size(); ++i) { String file_name = "minmax_" + escapeForFileName(column_names[i]) + ".idx"; - const DataTypePtr & data_type = data_types.at(i); + auto serialization = data_types.at(i)->getDefaultSerialization(); auto out = disk_->writeFile(part_path + file_name); HashingWriteBuffer out_hashing(*out); - data_type->serializeBinary(hyperrectangle[i].left, out_hashing); - data_type->serializeBinary(hyperrectangle[i].right, out_hashing); + serialization->serializeBinary(hyperrectangle[i].left, out_hashing); + serialization->serializeBinary(hyperrectangle[i].right, out_hashing); out_hashing.next(); out_checksums.files[file_name].file_size = out_hashing.count(); out_checksums.files[file_name].file_hash = out_hashing.getHash(); @@ -611,7 +611,7 @@ void IMergeTreeDataPart::loadIndex() for (size_t i = 0; i < marks_count; ++i) //-V756 for (size_t j = 0; j < key_size; ++j) - primary_key.data_types[j]->deserializeBinary(*loaded_index[j], *index_file); + primary_key.data_types[j]->getDefaultSerialization()->deserializeBinary(*loaded_index[j], *index_file); for (size_t i = 0; i < key_size; ++i) { @@ -702,12 +702,18 @@ CompressionCodecPtr IMergeTreeDataPart::detectDefaultCompressionCodec() const auto column_size = getColumnSize(part_column.name, *part_column.type); if (column_size.data_compressed != 0 && !storage_columns.hasCompressionCodec(part_column.name)) { + auto serialization = IDataType::getSerialization(part_column, + [&](const String & stream_name) + { + return volume->getDisk()->exists(stream_name + IMergeTreeDataPart::DATA_FILE_EXTENSION); + }); + String path_to_data_file; - part_column.type->enumerateStreams([&](const IDataType::SubstreamPath & substream_path, const IDataType & /* substream_type */) + serialization->enumerateStreams([&](const ISerialization::SubstreamPath & substream_path) { if (path_to_data_file.empty()) { - String candidate_path = getFullRelativePath() + IDataType::getFileNameForStream(part_column, substream_path) + ".bin"; + String candidate_path = getFullRelativePath() + ISerialization::getFileNameForStream(part_column, substream_path) + ".bin"; /// We can have existing, but empty .bin files. Example: LowCardinality(Nullable(...)) columns and column_name.dict.null.bin file. 
if (volume->getDisk()->exists(candidate_path) && volume->getDisk()->getFileSize(candidate_path) != 0) @@ -1319,6 +1325,15 @@ bool IMergeTreeDataPart::checkAllTTLCalculated(const StorageMetadataPtr & metada return true; } +SerializationPtr IMergeTreeDataPart::getSerializationForColumn(const NameAndTypePair & column) const +{ + return IDataType::getSerialization(column, + [&](const String & stream_name) + { + return checksums.files.count(stream_name + DATA_FILE_EXTENSION) != 0; + }); +} + String IMergeTreeDataPart::getUniqueId() const { String id; diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h index 92b05e5cbd2e18b8dcd7f0c13e04b7477c41bad6..b64022d2b5ae06f3fbbbbecde5e2a529db8e4751 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.h +++ b/src/Storages/MergeTree/IMergeTreeDataPart.h @@ -16,7 +16,6 @@ #include #include #include -#include #include @@ -54,6 +53,8 @@ namespace ErrorCodes class IMergeTreeDataPart : public std::enable_shared_from_this<IMergeTreeDataPart> { public: + static constexpr auto DATA_FILE_EXTENSION = ".bin"; + using Checksums = MergeTreeDataPartChecksums; using Checksum = MergeTreeDataPartChecksums::Checksum; using ValueSizeMap = std::map<std::string, double>; @@ -62,7 +63,7 @@ public: using MergeTreeWriterPtr = std::unique_ptr<IMergeTreeDataPartWriter>; using ColumnSizeByName = std::unordered_map<std::string, ColumnSize>; - using NameToPosition = std::unordered_map<std::string, size_t>; + using NameToNumber = std::unordered_map<std::string, size_t>; using Type = MergeTreeDataPartType; @@ -365,6 +366,9 @@ public: /// part creation (using alter query with materialize_ttl setting). bool checkAllTTLCalculated(const StorageMetadataPtr & metadata_snapshot) const; + /// Returns serialization for column according to files in which column is written in part. + SerializationPtr getSerializationForColumn(const NameAndTypePair & column) const; + /// Return some uniq string for file /// Required for distinguish different copies of the same part on S3 String getUniqueId() const; @@ -398,7 +402,7 @@ protected: private: /// In compact parts order of columns is necessary - NameToPosition column_name_to_position; + NameToNumber column_name_to_position; /// Reads part unique identifier (if exists) from uuid.txt void loadUUID(); diff --git a/src/Storages/MergeTree/IMergeTreeReader.cpp b/src/Storages/MergeTree/IMergeTreeReader.cpp index a80f2c8fe8c5673ff5c3fb484072e3ebdbd6296f..53ab471326754583e9b344eb2a093a7b4450f774 100644 --- a/src/Storages/MergeTree/IMergeTreeReader.cpp +++ b/src/Storages/MergeTree/IMergeTreeReader.cpp @@ -231,8 +231,9 @@ NameAndTypePair IMergeTreeReader::getColumnFromPart(const NameAndTypePair & requ { auto subcolumn_name = required_column.getSubcolumnName(); auto subcolumn_type = it->second->tryGetSubcolumnType(subcolumn_name); + if (!subcolumn_type) - subcolumn_type = required_column.type; + return required_column; return {it->first, subcolumn_name, it->second, subcolumn_type}; } diff --git a/src/Storages/MergeTree/IMergeTreeReader.h b/src/Storages/MergeTree/IMergeTreeReader.h index d192339432f13ebaf6db816c5c8726d3960189a9..0771bc3d5cb670dbdfa5d6b88948db3ac8ce5a7a 100644 --- a/src/Storages/MergeTree/IMergeTreeReader.h +++ b/src/Storages/MergeTree/IMergeTreeReader.h @@ -16,7 +16,7 @@ class IMergeTreeReader : private boost::noncopyable { public: using ValueSizeMap = std::map<std::string, double>; - using DeserializeBinaryBulkStateMap = std::map<std::string, IDataType::DeserializeBinaryBulkStatePtr>; + using DeserializeBinaryBulkStateMap = std::map<std::string, ISerialization::DeserializeBinaryBulkStatePtr>; IMergeTreeReader( const MergeTreeData::DataPartPtr & data_part_, diff --git a/src/Storages/MergeTree/IMergedBlockOutputStream.cpp
b/src/Storages/MergeTree/IMergedBlockOutputStream.cpp index 7e562ae03d63b20702b71f7b3725cded3b40bbab..e334cd486ef111978512e3fd9a7c5e412ff860d2 100644 --- a/src/Storages/MergeTree/IMergedBlockOutputStream.cpp +++ b/src/Storages/MergeTree/IMergedBlockOutputStream.cpp @@ -30,10 +30,11 @@ NameSet IMergedBlockOutputStream::removeEmptyColumnsFromPart( std::map<String, size_t> stream_counts; for (const NameAndTypePair & column : columns) { - column.type->enumerateStreams( - [&](const IDataType::SubstreamPath & substream_path, const IDataType & /* substream_path */) + auto serialization = data_part->getSerializationForColumn(column); + serialization->enumerateStreams( + [&](const ISerialization::SubstreamPath & substream_path) { - ++stream_counts[IDataType::getFileNameForStream(column, substream_path)]; + ++stream_counts[ISerialization::getFileNameForStream(column, substream_path)]; }, {}); } @@ -46,9 +47,9 @@ NameSet IMergedBlockOutputStream::removeEmptyColumnsFromPart( if (!column_with_type) continue; - IDataType::StreamCallback callback = [&](const IDataType::SubstreamPath & substream_path, const IDataType & /* substream_path */) + ISerialization::StreamCallback callback = [&](const ISerialization::SubstreamPath & substream_path) { - String stream_name = IDataType::getFileNameForStream(*column_with_type, substream_path); + String stream_name = ISerialization::getFileNameForStream(*column_with_type, substream_path); /// Delete files if they are no longer shared with another column. if (--stream_counts[stream_name] == 0) { @@ -57,8 +58,8 @@ NameSet IMergedBlockOutputStream::removeEmptyColumnsFromPart( } }; - IDataType::SubstreamPath stream_path; - column_with_type->type->enumerateStreams(callback, stream_path); + auto serialization = data_part->getSerializationForColumn(*column_with_type); + serialization->enumerateStreams(callback); } /// Remove files on disk and checksums diff --git a/src/Storages/MergeTree/IMergedBlockOutputStream.h b/src/Storages/MergeTree/IMergedBlockOutputStream.h index ed8da4d334ba3393c75471ba1cd0049af5245492..b2ad5309017723fd05a32f15731a002cecb69b0c 100644 --- a/src/Storages/MergeTree/IMergedBlockOutputStream.h +++ b/src/Storages/MergeTree/IMergedBlockOutputStream.h @@ -24,9 +24,7 @@ public: } protected: - using SerializationState = IDataType::SerializeBinaryBulkStatePtr; - IDataType::OutputStreamGetter createStreamGetter(const String & name, WrittenOffsetColumns & offset_columns); /// Remove all columns marked expired in data_part. Also, clears checksums /// and columns array. Return set of removed files names.
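The stream-counting pattern above reappears almost verbatim in MergeTreeDataMergerMutator and checkDataPart below. As a rough sketch of the new API shape (here `data_part` and `columns` stand in for whatever the caller already has at hand):

std::map<String, size_t> stream_counts;
for (const NameAndTypePair & column : columns)
{
    /// The serialization is chosen from the files actually present in the part,
    /// not from the column type alone.
    auto serialization = data_part->getSerializationForColumn(column);
    serialization->enumerateStreams([&](const ISerialization::SubstreamPath & substream_path)
    {
        ++stream_counts[ISerialization::getFileNameForStream(column, substream_path)];
    });
}
/// A stream file may be dropped only when its counter reaches zero, i.e. when
/// no remaining column still shares that substream (e.g. Nested offsets).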
diff --git a/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp b/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp index afcb85c2479f34128402f448cf1d7be8847876fb..4269aa89ad1d8002ada772f92682faff700b3798 100644 --- a/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp +++ b/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp @@ -637,7 +637,7 @@ public: } }; -static bool needSyncPart(const size_t input_rows, size_t input_bytes, const MergeTreeSettings & settings) +static bool needSyncPart(size_t input_rows, size_t input_bytes, const MergeTreeSettings & settings) { return ((settings.min_rows_to_fsync_after_merge && input_rows >= settings.min_rows_to_fsync_after_merge) || (settings.min_compressed_bytes_to_fsync_after_merge && input_bytes >= settings.min_compressed_bytes_to_fsync_after_merge)); @@ -1489,10 +1489,11 @@ NameToNameVector MergeTreeDataMergerMutator::collectFilesForRenames( std::map<String, size_t> stream_counts; for (const NameAndTypePair & column : source_part->getColumns()) { - column.type->enumerateStreams( - [&](const IDataType::SubstreamPath & substream_path, const IDataType & /* substream_type */) + auto serialization = source_part->getSerializationForColumn(column); + serialization->enumerateStreams( + [&](const ISerialization::SubstreamPath & substream_path) { - ++stream_counts[IDataType::getFileNameForStream(column, substream_path)]; + ++stream_counts[ISerialization::getFileNameForStream(column, substream_path)]; }, {}); } @@ -1508,9 +1509,9 @@ NameToNameVector MergeTreeDataMergerMutator::collectFilesForRenames( } else if (command.type == MutationCommand::Type::DROP_COLUMN) { - IDataType::StreamCallback callback = [&](const IDataType::SubstreamPath & substream_path, const IDataType & /* substream_type */) + ISerialization::StreamCallback callback = [&](const ISerialization::SubstreamPath & substream_path) { - String stream_name = IDataType::getFileNameForStream({command.column_name, command.data_type}, substream_path); + String stream_name = ISerialization::getFileNameForStream({command.column_name, command.data_type}, substream_path); /// Delete files if they are no longer shared with another column.
if (--stream_counts[stream_name] == 0) { @@ -1519,19 +1520,21 @@ NameToNameVector MergeTreeDataMergerMutator::collectFilesForRenames( } }; - IDataType::SubstreamPath stream_path; auto column = source_part->getColumns().tryGetByName(command.column_name); if (column) - column->type->enumerateStreams(callback, stream_path); + { + auto serialization = source_part->getSerializationForColumn(*column); + serialization->enumerateStreams(callback); + } } else if (command.type == MutationCommand::Type::RENAME_COLUMN) { String escaped_name_from = escapeForFileName(command.column_name); String escaped_name_to = escapeForFileName(command.rename_to); - IDataType::StreamCallback callback = [&](const IDataType::SubstreamPath & substream_path, const IDataType & /* substream_type */) + ISerialization::StreamCallback callback = [&](const ISerialization::SubstreamPath & substream_path) { - String stream_from = IDataType::getFileNameForStream({command.column_name, command.data_type}, substream_path); + String stream_from = ISerialization::getFileNameForStream({command.column_name, command.data_type}, substream_path); String stream_to = boost::replace_first_copy(stream_from, escaped_name_from, escaped_name_to); @@ -1541,10 +1544,13 @@ NameToNameVector MergeTreeDataMergerMutator::collectFilesForRenames( rename_vector.emplace_back(stream_from + mrk_extension, stream_to + mrk_extension); } }; - IDataType::SubstreamPath stream_path; + auto column = source_part->getColumns().tryGetByName(command.column_name); if (column) - column->type->enumerateStreams(callback, stream_path); + { + auto serialization = source_part->getSerializationForColumn(*column); + serialization->enumerateStreams(callback); + } } } @@ -1562,15 +1568,15 @@ NameSet MergeTreeDataMergerMutator::collectFilesToSkip( /// Skip updated files for (const auto & entry : updated_header) { - IDataType::StreamCallback callback = [&](const IDataType::SubstreamPath & substream_path, const IDataType & /* substream_type */) + ISerialization::StreamCallback callback = [&](const ISerialization::SubstreamPath & substream_path) { - String stream_name = IDataType::getFileNameForStream({entry.name, entry.type}, substream_path); + String stream_name = ISerialization::getFileNameForStream({entry.name, entry.type}, substream_path); files_to_skip.insert(stream_name + ".bin"); files_to_skip.insert(stream_name + mrk_extension); }; - IDataType::SubstreamPath stream_path; - entry.type->enumerateStreams(callback, stream_path); + auto serialization = source_part->getSerializationForColumn({entry.name, entry.type}); + serialization->enumerateStreams(callback); } for (const auto & index : indices_to_recalc) { diff --git a/src/Storages/MergeTree/MergeTreeDataPartCompact.h b/src/Storages/MergeTree/MergeTreeDataPartCompact.h index 2c0c4020bb08c76f82af827504eb820a01b3fa19..564d59c919819eff9d7105a75fc5c1e3a20cdb7f 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartCompact.h +++ b/src/Storages/MergeTree/MergeTreeDataPartCompact.h @@ -19,7 +19,6 @@ class MergeTreeDataPartCompact : public IMergeTreeDataPart { public: static constexpr auto DATA_FILE_NAME = "data"; - static constexpr auto DATA_FILE_EXTENSION = ".bin"; static constexpr auto DATA_FILE_NAME_WITH_EXTENSION = "data.bin"; MergeTreeDataPartCompact( diff --git a/src/Storages/MergeTree/MergeTreeDataPartWide.cpp b/src/Storages/MergeTree/MergeTreeDataPartWide.cpp index 5dd8c26f224e7833fb3bc4ce0617667fc3349e19..1da115efa707119db9db5017c36d633e9f3c56f4 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWide.cpp +++ 
b/src/Storages/MergeTree/MergeTreeDataPartWide.cpp @@ -82,9 +82,10 @@ ColumnSize MergeTreeDataPartWide::getColumnSizeImpl( if (checksums.empty()) return size; - column.type->enumerateStreams([&](const IDataType::SubstreamPath & substream_path, const IDataType & /* substream_type */) + auto serialization = getSerializationForColumn(column); + serialization->enumerateStreams([&](const ISerialization::SubstreamPath & substream_path) { - String file_name = IDataType::getFileNameForStream(column, substream_path); + String file_name = ISerialization::getFileNameForStream(column, substream_path); if (processed_substreams && !processed_substreams->insert(file_name).second) return; @@ -159,19 +160,19 @@ void MergeTreeDataPartWide::checkConsistency(bool require_part_metadata) const { for (const NameAndTypePair & name_type : columns) { - IDataType::SubstreamPath stream_path; - name_type.type->enumerateStreams([&](const IDataType::SubstreamPath & substream_path, const IDataType & /* substream_type */) + auto serialization = getSerializationForColumn(name_type); + serialization->enumerateStreams([&](const ISerialization::SubstreamPath & substream_path) { - String file_name = IDataType::getFileNameForStream(name_type, substream_path); + String file_name = ISerialization::getFileNameForStream(name_type, substream_path); String mrk_file_name = file_name + index_granularity_info.marks_file_extension; - String bin_file_name = file_name + ".bin"; + String bin_file_name = file_name + DATA_FILE_EXTENSION; if (!checksums.files.count(mrk_file_name)) throw Exception("No " + mrk_file_name + " file checksum for column " + name_type.name + " in part " + fullPath(volume->getDisk(), path), ErrorCodes::NO_FILE_IN_DATA_PART); if (!checksums.files.count(bin_file_name)) throw Exception("No " + bin_file_name + " file checksum for column " + name_type.name + " in part " + fullPath(volume->getDisk(), path), ErrorCodes::NO_FILE_IN_DATA_PART); - }, stream_path); + }); } } @@ -182,9 +183,15 @@ void MergeTreeDataPartWide::checkConsistency(bool require_part_metadata) const std::optional<UInt64> marks_size; for (const NameAndTypePair & name_type : columns) { - name_type.type->enumerateStreams([&](const IDataType::SubstreamPath & substream_path, const IDataType & /* substream_type */) + auto serialization = IDataType::getSerialization(name_type, + [&](const String & stream_name) + { + return volume->getDisk()->exists(stream_name + DATA_FILE_EXTENSION); + }); + + serialization->enumerateStreams([&](const ISerialization::SubstreamPath & substream_path) { - auto file_path = path + IDataType::getFileNameForStream(name_type, substream_path) + index_granularity_info.marks_file_extension; + auto file_path = path + ISerialization::getFileNameForStream(name_type, substream_path) + index_granularity_info.marks_file_extension; /// Missing file is Ok for case when new column was added.
if (volume->getDisk()->exists(file_path)) @@ -208,18 +215,22 @@ void MergeTreeDataPartWide::checkConsistency(bool require_part_metadata) const bool MergeTreeDataPartWide::hasColumnFiles(const NameAndTypePair & column) const { - bool res = true; - - column.type->enumerateStreams([&](const IDataType::SubstreamPath & substream_path, const IDataType & /* substream_type */) + auto check_stream_exists = [this](const String & stream_name) { - String file_name = IDataType::getFileNameForStream(column, substream_path); + auto bin_checksum = checksums.files.find(stream_name + DATA_FILE_EXTENSION); + auto mrk_checksum = checksums.files.find(stream_name + index_granularity_info.marks_file_extension); - auto bin_checksum = checksums.files.find(file_name + ".bin"); - auto mrk_checksum = checksums.files.find(file_name + index_granularity_info.marks_file_extension); + return bin_checksum != checksums.files.end() && mrk_checksum != checksums.files.end(); + }; - if (bin_checksum == checksums.files.end() || mrk_checksum == checksums.files.end()) + bool res = true; + auto serialization = IDataType::getSerialization(column, check_stream_exists); + serialization->enumerateStreams([&](const ISerialization::SubstreamPath & substream_path) + { + String file_name = ISerialization::getFileNameForStream(column, substream_path); + if (!check_stream_exists(file_name)) res = false; - }, {}); + }); return res; } @@ -227,10 +238,11 @@ bool MergeTreeDataPartWide::hasColumnFiles(const NameAndTypePair & column) const String MergeTreeDataPartWide::getFileNameForColumn(const NameAndTypePair & column) const { String filename; - column.type->enumerateStreams([&](const IDataType::SubstreamPath & substream_path, const IDataType & /* substream_type */) + auto serialization = column.type->getDefaultSerialization(); + serialization->enumerateStreams([&](const ISerialization::SubstreamPath & substream_path) { if (filename.empty()) - filename = IDataType::getFileNameForStream(column, substream_path); + filename = ISerialization::getFileNameForStream(column, substream_path); }); return filename; } diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp b/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp index 20a32a461852f36b7bbaa95d2446be43f078f31d..2efda206cf9bbcb4d90d56554ed0b4c0eea6b1e8 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp @@ -39,9 +39,9 @@ MergeTreeDataPartWriterCompact::MergeTreeDataPartWriterCompact( void MergeTreeDataPartWriterCompact::addStreams(const NameAndTypePair & column, const ASTPtr & effective_codec_desc) { - IDataType::StreamCallback callback = [&] (const IDataType::SubstreamPath & substream_path, const IDataType & substream_type) + IDataType::StreamCallbackWithType callback = [&] (const ISerialization::SubstreamPath & substream_path, const IDataType & substream_type) { - String stream_name = IDataType::getFileNameForStream(column, substream_path); + String stream_name = ISerialization::getFileNameForStream(column, substream_path); /// Shared offsets for Nested type. 
if (compressed_streams.count(stream_name)) @@ -50,7 +50,7 @@ void MergeTreeDataPartWriterCompact::addStreams(const NameAndTypePair & column, CompressionCodecPtr compression_codec; /// If we can use special codec than just get it - if (IDataType::isSpecialCompressionAllowed(substream_path)) + if (ISerialization::isSpecialCompressionAllowed(substream_path)) compression_codec = CompressionCodecFactory::instance().get(effective_codec_desc, &substream_type, default_codec); else /// otherwise return only generic codecs and don't use info about data_type compression_codec = CompressionCodecFactory::instance().get(effective_codec_desc, nullptr, default_codec, true); @@ -63,8 +63,7 @@ void MergeTreeDataPartWriterCompact::addStreams(const NameAndTypePair & column, compressed_streams.emplace(stream_name, stream); }; - IDataType::SubstreamPath stream_path; - column.type->enumerateStreams(callback, stream_path); + column.type->enumerateStreams(serializations[column.name], callback); } namespace @@ -106,20 +105,21 @@ Granules getGranulesToWrite(const MergeTreeIndexGranularity & index_granularity, /// Write single granule of one column (rows between 2 marks) void writeColumnSingleGranule( const ColumnWithTypeAndName & column, - IDataType::OutputStreamGetter stream_getter, + const SerializationPtr & serialization, + ISerialization::OutputStreamGetter stream_getter, size_t from_row, size_t number_of_rows) { - IDataType::SerializeBinaryBulkStatePtr state; - IDataType::SerializeBinaryBulkSettings serialize_settings; + ISerialization::SerializeBinaryBulkStatePtr state; + ISerialization::SerializeBinaryBulkSettings serialize_settings; serialize_settings.getter = stream_getter; serialize_settings.position_independent_encoding = true; serialize_settings.low_cardinality_max_dictionary_size = 0; - column.type->serializeBinaryBulkStatePrefix(serialize_settings, state); - column.type->serializeBinaryBulkWithMultipleStreams(*column.column, from_row, number_of_rows, serialize_settings, state); - column.type->serializeBinaryBulkStateSuffix(serialize_settings, state); + serialization->serializeBinaryBulkStatePrefix(serialize_settings, state); + serialization->serializeBinaryBulkWithMultipleStreams(*column.column, from_row, number_of_rows, serialize_settings, state); + serialization->serializeBinaryBulkStateSuffix(serialize_settings, state); } } @@ -181,9 +181,9 @@ void MergeTreeDataPartWriterCompact::writeDataBlock(const Block & block, const G /// So we flush each stream (using next()) before using new one, because otherwise we will override /// data in result file. CompressedStreamPtr prev_stream; - auto stream_getter = [&, this](const IDataType::SubstreamPath & substream_path) -> WriteBuffer * + auto stream_getter = [&, this](const ISerialization::SubstreamPath & substream_path) -> WriteBuffer * { - String stream_name = IDataType::getFileNameForStream(*name_and_type, substream_path); + String stream_name = ISerialization::getFileNameForStream(*name_and_type, substream_path); auto & result_stream = compressed_streams[stream_name]; /// Write one compressed block per column in granule for more optimal reading. 
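writeColumnSingleGranule above follows the general ISerialization bulk protocol: one state prefix per column, then one or more bulk writes, then a suffix that flushes whatever the serialization buffered. A minimal sketch of that life cycle (`out`, `column`, `from_row`, `number_of_rows` and the `serialization` pointer are assumed to come from the caller):

ISerialization::SerializeBinaryBulkSettings settings;
settings.getter = [&](const ISerialization::SubstreamPath &) -> WriteBuffer * { return &out; };
settings.position_independent_encoding = true;
settings.low_cardinality_max_dictionary_size = 0;

ISerialization::SerializeBinaryBulkStatePtr state;
serialization->serializeBinaryBulkStatePrefix(settings, state);
serialization->serializeBinaryBulkWithMultipleStreams(column, from_row, number_of_rows, settings, state);
serialization->serializeBinaryBulkStateSuffix(settings, state);   /// flushes pending data, e.g. LowCardinality dictionaries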
@@ -203,9 +203,9 @@ void MergeTreeDataPartWriterCompact::writeDataBlock(const Block & block, const G writeIntBinary(plain_hashing.count(), marks); writeIntBinary(UInt64(0), marks); - //const auto & col = block.getByName(name_and_type->name); - //std::cerr << "======== writing " << col.dumpStructure() << std::endl; - writeColumnSingleGranule(block.getByName(name_and_type->name), stream_getter, granule.start_row, granule.rows_to_write); + writeColumnSingleGranule( + block.getByName(name_and_type->name), serializations[name_and_type->name], + stream_getter, granule.start_row, granule.rows_to_write); /// Each type always have at least one substream prev_stream->hashing_buf.next(); //-V522 diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.cpp b/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.cpp index fcc4f92be24a37e9ca2a4d64aa17576da0dc7fd7..9902add9847d7d463b80961dc910ca767a95c535 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.cpp @@ -90,6 +90,9 @@ MergeTreeDataPartWriterOnDisk::MergeTreeDataPartWriterOnDisk( if (!disk->exists(part_path)) disk->createDirectories(part_path); + for (const auto & column : columns_list) + serializations.emplace(column.name, column.type->getDefaultSerialization()); + if (settings.rewrite_primary_key) initPrimaryIndex(); initSkipIndices(); @@ -200,7 +203,7 @@ void MergeTreeDataPartWriterOnDisk::calculateAndSerializePrimaryIndex(const Bloc { const auto & primary_column = primary_index_block.getByPosition(j); index_columns[j]->insertFrom(*primary_column.column, granule.start_row); - primary_column.type->serializeBinary(*primary_column.column, granule.start_row, *index_stream); + primary_column.type->getDefaultSerialization()->serializeBinary(*primary_column.column, granule.start_row, *index_stream); } } } @@ -265,7 +268,7 @@ void MergeTreeDataPartWriterOnDisk::finishPrimaryIndexSerialization( const auto & column = *last_block_index_columns[j]; size_t last_row_number = column.size() - 1; index_columns[j]->insertFrom(column, last_row_number); - index_types[j]->serializeBinary(column, last_row_number, *index_stream); + index_types[j]->getDefaultSerialization()->serializeBinary(column, last_row_number, *index_stream); } last_block_index_columns.clear(); } diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.h b/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.h index 704b38ba6d533df32fe76f043b1a4ef7bf6a5395..d952950e4619ba9ded92e70fd77b9e6cc00e9abc 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.h +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.h @@ -132,6 +132,9 @@ protected: MergeTreeIndexAggregators skip_indices_aggregators; std::vector<size_t> skip_index_accumulated_marks; + using SerializationsMap = std::unordered_map<String, SerializationPtr>; + SerializationsMap serializations; + std::unique_ptr<WriteBufferFromFileBase> index_file_stream; std::unique_ptr<HashingWriteBuffer> index_stream; DataTypes index_types; diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp b/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp index 866980b81f53912b86cd3c376b4818aecc135f8b..a2f7440b2e39cc2a769cb92973e042fd671abffe 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp @@ -2,6 +2,7 @@ #include #include #include +#include namespace DB { @@ -88,18 +89,18 @@ void MergeTreeDataPartWriterWide::addStreams( const NameAndTypePair & column, const ASTPtr & effective_codec_desc) { - IDataType::StreamCallback callback = [&]
(const IDataType::SubstreamPath & substream_path, const IDataType & substream_type) + IDataType::StreamCallbackWithType callback = [&] (const ISerialization::SubstreamPath & substream_path, const IDataType & substream_type) { - String stream_name = IDataType::getFileNameForStream(column, substream_path); + String stream_name = ISerialization::getFileNameForStream(column, substream_path); /// Shared offsets for Nested type. if (column_streams.count(stream_name)) return; CompressionCodecPtr compression_codec; /// If we can use special codec then just get it - if (IDataType::isSpecialCompressionAllowed(substream_path)) + if (ISerialization::isSpecialCompressionAllowed(substream_path)) compression_codec = CompressionCodecFactory::instance().get(effective_codec_desc, &substream_type, default_codec); else /// otherwise return only generic codecs and don't use info about the data_type compression_codec = CompressionCodecFactory::instance().get(effective_codec_desc, nullptr, default_codec, true); column_streams[stream_name] = std::make_unique<Stream>( @@ -111,19 +112,18 @@ void MergeTreeDataPartWriterWide::addStreams( settings.max_compress_block_size); }; - IDataType::SubstreamPath stream_path; - column.type->enumerateStreams(callback, stream_path); + column.type->enumerateStreams(serializations[column.name], callback); } -IDataType::OutputStreamGetter MergeTreeDataPartWriterWide::createStreamGetter( +ISerialization::OutputStreamGetter MergeTreeDataPartWriterWide::createStreamGetter( const NameAndTypePair & column, WrittenOffsetColumns & offset_columns) const { - return [&, this] (const IDataType::SubstreamPath & substream_path) -> WriteBuffer * + return [&, this] (const ISerialization::SubstreamPath & substream_path) -> WriteBuffer * { - bool is_offsets = !substream_path.empty() && substream_path.back().type == IDataType::Substream::ArraySizes; + bool is_offsets = !substream_path.empty() && substream_path.back().type == ISerialization::Substream::ArraySizes; - String stream_name = IDataType::getFileNameForStream(column, substream_path); + String stream_name = ISerialization::getFileNameForStream(column, substream_path); /// Don't write offsets more than one time for Nested type.
if (is_offsets && offset_columns.count(stream_name)) @@ -242,7 +242,7 @@ void MergeTreeDataPartWriterWide::writeSingleMark( const NameAndTypePair & column, WrittenOffsetColumns & offset_columns, size_t number_of_rows, - DB::IDataType::SubstreamPath & path) + ISerialization::SubstreamPath & path) { StreamsWithMarks marks = getCurrentMarksForColumn(column, offset_columns, path); for (const auto & mark : marks) @@ -261,14 +261,14 @@ void MergeTreeDataPartWriterWide::flushMarkToFile(const StreamNameAndMark & stre StreamsWithMarks MergeTreeDataPartWriterWide::getCurrentMarksForColumn( const NameAndTypePair & column, WrittenOffsetColumns & offset_columns, - DB::IDataType::SubstreamPath & path) + ISerialization::SubstreamPath & path) { StreamsWithMarks result; - column.type->enumerateStreams([&] (const IDataType::SubstreamPath & substream_path, const IDataType & /* substream_type */) + serializations[column.name]->enumerateStreams([&] (const ISerialization::SubstreamPath & substream_path) { - bool is_offsets = !substream_path.empty() && substream_path.back().type == IDataType::Substream::ArraySizes; + bool is_offsets = !substream_path.empty() && substream_path.back().type == ISerialization::Substream::ArraySizes; - String stream_name = IDataType::getFileNameForStream(column, substream_path); + String stream_name = ISerialization::getFileNameForStream(column, substream_path); /// Don't write offsets more than one time for Nested type. if (is_offsets && offset_columns.count(stream_name)) @@ -295,18 +295,19 @@ void MergeTreeDataPartWriterWide::writeSingleGranule( const NameAndTypePair & name_and_type, const IColumn & column, WrittenOffsetColumns & offset_columns, - IDataType::SerializeBinaryBulkStatePtr & serialization_state, - IDataType::SerializeBinaryBulkSettings & serialize_settings, + ISerialization::SerializeBinaryBulkStatePtr & serialization_state, + ISerialization::SerializeBinaryBulkSettings & serialize_settings, const Granule & granule) { - name_and_type.type->serializeBinaryBulkWithMultipleStreams(column, granule.start_row, granule.rows_to_write, serialize_settings, serialization_state); + const auto & serialization = serializations[name_and_type.name]; + serialization->serializeBinaryBulkWithMultipleStreams(column, granule.start_row, granule.rows_to_write, serialize_settings, serialization_state); /// So that instead of the marks pointing to the end of the compressed block, there were marks pointing to the beginning of the next one. - name_and_type.type->enumerateStreams([&] (const IDataType::SubstreamPath & substream_path, const IDataType & /* substream_type */) + serialization->enumerateStreams([&] (const ISerialization::SubstreamPath & substream_path) { - bool is_offsets = !substream_path.empty() && substream_path.back().type == IDataType::Substream::ArraySizes; + bool is_offsets = !substream_path.empty() && substream_path.back().type == ISerialization::Substream::ArraySizes; - String stream_name = IDataType::getFileNameForStream(name_and_type, substream_path); + String stream_name = ISerialization::getFileNameForStream(name_and_type, substream_path); /// Don't write offsets more than one time for Nested type. 
if (is_offsets && offset_columns.count(stream_name)) @@ -331,13 +332,13 @@ void MergeTreeDataPartWriterWide::writeColumn( if (inserted) { - IDataType::SerializeBinaryBulkSettings serialize_settings; + ISerialization::SerializeBinaryBulkSettings serialize_settings; serialize_settings.getter = createStreamGetter(name_and_type, offset_columns); - type->serializeBinaryBulkStatePrefix(serialize_settings, it->second); + serializations[name]->serializeBinaryBulkStatePrefix(serialize_settings, it->second); } const auto & global_settings = storage.global_context.getSettingsRef(); - IDataType::SerializeBinaryBulkSettings serialize_settings; + ISerialization::SerializeBinaryBulkSettings serialize_settings; serialize_settings.getter = createStreamGetter(name_and_type, offset_columns); serialize_settings.low_cardinality_max_dictionary_size = global_settings.low_cardinality_max_dictionary_size; serialize_settings.low_cardinality_use_single_dictionary_for_part = global_settings.low_cardinality_use_single_dictionary_for_part != 0; @@ -374,12 +375,12 @@ void MergeTreeDataPartWriterWide::writeColumn( } } - name_and_type.type->enumerateStreams([&] (const IDataType::SubstreamPath & substream_path, const IDataType & /* substream_type */) + serializations[name]->enumerateStreams([&] (const ISerialization::SubstreamPath & substream_path) { - bool is_offsets = !substream_path.empty() && substream_path.back().type == IDataType::Substream::ArraySizes; + bool is_offsets = !substream_path.empty() && substream_path.back().type == ISerialization::Substream::ArraySizes; if (is_offsets) { - String stream_name = IDataType::getFileNameForStream(name_and_type, substream_path); + String stream_name = ISerialization::getFileNameForStream(name_and_type, substream_path); offset_columns.insert(stream_name); } }, serialize_settings.path); @@ -403,6 +404,7 @@ void MergeTreeDataPartWriterWide::validateColumnOfFixedSize(const String & name, size_t mark_num; + const auto & serialization = serializations[name]; for (mark_num = 0; !mrk_in.eof(); ++mark_num) { if (mark_num > index_granularity.getMarksCount()) @@ -430,7 +432,7 @@ void MergeTreeDataPartWriterWide::validateColumnOfFixedSize(const String & name, { auto column = type.createColumn(); - type.deserializeBinaryBulk(*column, bin_in, 1000000000, 0.0); + serialization->deserializeBinaryBulk(*column, bin_in, 1000000000, 0.0); throw Exception(ErrorCodes::LOGICAL_ERROR, "Still have {} rows in bin stream, last mark #{} index granularity size {}, last rows {}", column->size(), mark_num, index_granularity.getMarksCount(), index_granularity_rows); @@ -450,7 +452,7 @@ void MergeTreeDataPartWriterWide::validateColumnOfFixedSize(const String & name, auto column = type.createColumn(); - type.deserializeBinaryBulk(*column, bin_in, index_granularity_rows, 0.0); + serialization->deserializeBinaryBulk(*column, bin_in, index_granularity_rows, 0.0); if (bin_in.eof()) { @@ -489,7 +491,7 @@ void MergeTreeDataPartWriterWide::validateColumnOfFixedSize(const String & name, { auto column = type.createColumn(); - type.deserializeBinaryBulk(*column, bin_in, 1000000000, 0.0); + serialization->deserializeBinaryBulk(*column, bin_in, 1000000000, 0.0); throw Exception(ErrorCodes::LOGICAL_ERROR, "Still have {} rows in bin stream, last mark #{} index granularity size {}, last rows {}", column->size(), mark_num, index_granularity.getMarksCount(), index_granularity_rows); @@ -500,7 +502,7 @@ void MergeTreeDataPartWriterWide::validateColumnOfFixedSize(const String & name, void 
MergeTreeDataPartWriterWide::finishDataSerialization(IMergeTreeDataPart::Checksums & checksums, bool sync) { const auto & global_settings = storage.global_context.getSettingsRef(); - IDataType::SerializeBinaryBulkSettings serialize_settings; + ISerialization::SerializeBinaryBulkSettings serialize_settings; serialize_settings.low_cardinality_max_dictionary_size = global_settings.low_cardinality_max_dictionary_size; serialize_settings.low_cardinality_use_single_dictionary_for_part = global_settings.low_cardinality_use_single_dictionary_for_part != 0; WrittenOffsetColumns offset_columns; @@ -523,7 +525,7 @@ void MergeTreeDataPartWriterWide::finishDataSerialization(IMergeTreeDataPart::Ch if (!serialization_states.empty()) { serialize_settings.getter = createStreamGetter(*it, written_offset_columns ? *written_offset_columns : offset_columns); - it->type->serializeBinaryBulkStateSuffix(serialize_settings, serialization_states[it->name]); + serializations[it->name]->serializeBinaryBulkStateSuffix(serialize_settings, serialization_states[it->name]); } if (write_final_mark) @@ -565,16 +567,16 @@ void MergeTreeDataPartWriterWide::finish(IMergeTreeDataPart::Checksums & checksu void MergeTreeDataPartWriterWide::writeFinalMark( const NameAndTypePair & column, WrittenOffsetColumns & offset_columns, - DB::IDataType::SubstreamPath & path) + ISerialization::SubstreamPath & path) { writeSingleMark(column, offset_columns, 0, path); /// Memoize information about offsets - column.type->enumerateStreams([&] (const IDataType::SubstreamPath & substream_path, const IDataType & /* substream_type */) + serializations[column.name]->enumerateStreams([&] (const ISerialization::SubstreamPath & substream_path) { - bool is_offsets = !substream_path.empty() && substream_path.back().type == IDataType::Substream::ArraySizes; + bool is_offsets = !substream_path.empty() && substream_path.back().type == ISerialization::Substream::ArraySizes; if (is_offsets) { - String stream_name = IDataType::getFileNameForStream(column, substream_path); + String stream_name = ISerialization::getFileNameForStream(column, substream_path); offset_columns.insert(stream_name); } }, path); diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterWide.h b/src/Storages/MergeTree/MergeTreeDataPartWriterWide.h index e6f96f3f146f5dd5ee493b1262f6af92930be4c4..5eaaa0c1bbe245d6147caab9aaca11ad90ae3905 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterWide.h +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterWide.h @@ -50,15 +50,15 @@ private: const NameAndTypePair & name_and_type, const IColumn & column, WrittenOffsetColumns & offset_columns, - IDataType::SerializeBinaryBulkStatePtr & serialization_state, - IDataType::SerializeBinaryBulkSettings & serialize_settings, + ISerialization::SerializeBinaryBulkStatePtr & serialization_state, + ISerialization::SerializeBinaryBulkSettings & serialize_settings, const Granule & granule); /// Take offsets from column and return as MarkInCompressed file with stream name StreamsWithMarks getCurrentMarksForColumn( const NameAndTypePair & column, WrittenOffsetColumns & offset_columns, - DB::IDataType::SubstreamPath & path); + ISerialization::SubstreamPath & path); /// Write mark to disk using stream and rows count void flushMarkToFile( @@ -70,12 +70,12 @@ private: const NameAndTypePair & column, WrittenOffsetColumns & offset_columns, size_t number_of_rows, - DB::IDataType::SubstreamPath & path); + ISerialization::SubstreamPath & path); void writeFinalMark( const NameAndTypePair & column, WrittenOffsetColumns & 
offset_columns, - DB::IDataType::SubstreamPath & path); + ISerialization::SubstreamPath & path); void addStreams( const NameAndTypePair & column, @@ -100,15 +100,16 @@ private: /// Also useful to have exact amount of rows in last (non-final) mark. void adjustLastMarkIfNeedAndFlushToDisk(size_t new_rows_in_last_mark); - IDataType::OutputStreamGetter createStreamGetter(const NameAndTypePair & column, WrittenOffsetColumns & offset_columns) const; + ISerialization::OutputStreamGetter createStreamGetter(const NameAndTypePair & column, WrittenOffsetColumns & offset_columns) const; - using SerializationState = IDataType::SerializeBinaryBulkStatePtr; + using SerializationState = ISerialization::SerializeBinaryBulkStatePtr; using SerializationStates = std::unordered_map<String, SerializationState>; SerializationStates serialization_states; using ColumnStreams = std::map<String, StreamPtr>; ColumnStreams column_streams; + /// Non written marks to disk (for each column). Waiting until all rows for /// this marks will be written to disk. using MarksForColumns = std::unordered_map<String, StreamsWithMarks>; diff --git a/src/Storages/MergeTree/MergeTreeDataWriter.cpp b/src/Storages/MergeTree/MergeTreeDataWriter.cpp index 019cad2e195320685bd5378e9ba2e20a46986dd3..3b4cb385a34074bf74158149054ca6a7d5e6d6a2 100644 --- a/src/Storages/MergeTree/MergeTreeDataWriter.cpp +++ b/src/Storages/MergeTree/MergeTreeDataWriter.cpp @@ -352,6 +352,7 @@ MergeTreeData::MutableDataPartPtr MergeTreeDataWriter::writeTempPart(BlockWithPa new_data_part->uuid = UUIDHelpers::generateV4(); new_data_part->setColumns(columns); + new_data_part->rows_count = block.rows(); new_data_part->partition = std::move(partition); new_data_part->minmax_idx = std::move(minmax_idx); new_data_part->is_temp = true; diff --git a/src/Storages/MergeTree/MergeTreeIndexMinMax.cpp b/src/Storages/MergeTree/MergeTreeIndexMinMax.cpp index de89a27ab460aeb5d277e6da240ab8b6cbd3e0de..e8b526d1426bbe069fe3f86d5301da32b146ec56 100644 --- a/src/Storages/MergeTree/MergeTreeIndexMinMax.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexMinMax.cpp @@ -37,10 +37,12 @@ void MergeTreeIndexGranuleMinMax::serializeBinary(WriteBuffer & ostr) const for (size_t i = 0; i < index_sample_block.columns(); ++i) { const DataTypePtr & type = index_sample_block.getByPosition(i).type; + auto serialization = type->getDefaultSerialization(); + if (!type->isNullable()) { - type->serializeBinary(hyperrectangle[i].left, ostr); - type->serializeBinary(hyperrectangle[i].right, ostr); + serialization->serializeBinary(hyperrectangle[i].left, ostr); + serialization->serializeBinary(hyperrectangle[i].right, ostr); } else { @@ -48,8 +50,8 @@ void MergeTreeIndexGranuleMinMax::serializeBinary(WriteBuffer & ostr) const writeBinary(is_null, ostr); if (!is_null) { - type->serializeBinary(hyperrectangle[i].left, ostr); - type->serializeBinary(hyperrectangle[i].right, ostr); + serialization->serializeBinary(hyperrectangle[i].left, ostr); + serialization->serializeBinary(hyperrectangle[i].right, ostr); } } } @@ -60,13 +62,17 @@ void MergeTreeIndexGranuleMinMax::deserializeBinary(ReadBuffer & istr) hyperrectangle.clear(); Field min_val; Field max_val; + + for (size_t i = 0; i < index_sample_block.columns(); ++i) { const DataTypePtr & type = index_sample_block.getByPosition(i).type; + auto serialization = type->getDefaultSerialization(); + if (!type->isNullable()) { - type->deserializeBinary(min_val, istr); - type->deserializeBinary(max_val, istr); + serialization->deserializeBinary(min_val, istr); + serialization->deserializeBinary(max_val, istr); } else { @@ -74,8 +80,8 @@ void
MergeTreeIndexGranuleMinMax::deserializeBinary(ReadBuffer & istr) readBinary(is_null, istr); if (!is_null) { - type->deserializeBinary(min_val, istr); - type->deserializeBinary(max_val, istr); + serialization->deserializeBinary(min_val, istr); + serialization->deserializeBinary(max_val, istr); } else { diff --git a/src/Storages/MergeTree/MergeTreeIndexSet.cpp b/src/Storages/MergeTree/MergeTreeIndexSet.cpp index b6706367bfa066fb8e8e28f884d403b5843946a0..4ab6ae01c8c0965054072609e3792e99f60782d0 100644 --- a/src/Storages/MergeTree/MergeTreeIndexSet.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexSet.cpp @@ -52,28 +52,31 @@ void MergeTreeIndexGranuleSet::serializeBinary(WriteBuffer & ostr) const "Attempt to write empty set index " + backQuote(index_name), ErrorCodes::LOGICAL_ERROR); const auto & size_type = DataTypePtr(std::make_shared<DataTypeUInt64>()); + auto size_serialization = size_type->getDefaultSerialization(); if (max_rows != 0 && size() > max_rows) { - size_type->serializeBinary(0, ostr); + size_serialization->serializeBinary(0, ostr); return; } - size_type->serializeBinary(size(), ostr); + size_serialization->serializeBinary(size(), ostr); for (size_t i = 0; i < index_sample_block.columns(); ++i) { const auto & type = index_sample_block.getByPosition(i).type; - IDataType::SerializeBinaryBulkSettings settings; - settings.getter = [&ostr](IDataType::SubstreamPath) -> WriteBuffer * { return &ostr; }; + ISerialization::SerializeBinaryBulkSettings settings; + settings.getter = [&ostr](ISerialization::SubstreamPath) -> WriteBuffer * { return &ostr; }; settings.position_independent_encoding = false; settings.low_cardinality_max_dictionary_size = 0; - IDataType::SerializeBinaryBulkStatePtr state; - type->serializeBinaryBulkStatePrefix(settings, state); - type->serializeBinaryBulkWithMultipleStreams(*block.getByPosition(i).column, 0, size(), settings, state); - type->serializeBinaryBulkStateSuffix(settings, state); + auto serialization = type->getDefaultSerialization(); + ISerialization::SerializeBinaryBulkStatePtr state; + + serialization->serializeBinaryBulkStatePrefix(settings, state); + serialization->serializeBinaryBulkWithMultipleStreams(*block.getByPosition(i).column, 0, size(), settings, state); + serialization->serializeBinaryBulkStateSuffix(settings, state); } } @@ -83,7 +86,7 @@ void MergeTreeIndexGranuleSet::deserializeBinary(ReadBuffer & istr) Field field_rows; const auto & size_type = DataTypePtr(std::make_shared<DataTypeUInt64>()); - size_type->deserializeBinary(field_rows, istr); + size_type->getDefaultSerialization()->deserializeBinary(field_rows, istr); size_t rows_to_read = field_rows.get<size_t>(); if (rows_to_read == 0) @@ -95,13 +98,16 @@ void MergeTreeIndexGranuleSet::deserializeBinary(ReadBuffer & istr) const auto & type = column.type; ColumnPtr new_column = type->createColumn(); - IDataType::DeserializeBinaryBulkSettings settings; - settings.getter = [&](IDataType::SubstreamPath) -> ReadBuffer * { return &istr; }; + + ISerialization::DeserializeBinaryBulkSettings settings; + settings.getter = [&](ISerialization::SubstreamPath) -> ReadBuffer * { return &istr; }; settings.position_independent_encoding = false; - IDataType::DeserializeBinaryBulkStatePtr state; - type->deserializeBinaryBulkStatePrefix(settings, state); - type->deserializeBinaryBulkWithMultipleStreams(new_column, rows_to_read, settings, state); + ISerialization::DeserializeBinaryBulkStatePtr state; + auto serialization = type->getDefaultSerialization(); + + serialization->deserializeBinaryBulkStatePrefix(settings, state); +
serialization->deserializeBinaryBulkWithMultipleStreams(new_column, rows_to_read, settings, state, nullptr); block.insert(ColumnWithTypeAndName(new_column, type, column.name)); } diff --git a/src/Storages/MergeTree/MergeTreePartition.cpp b/src/Storages/MergeTree/MergeTreePartition.cpp index 9b02b9f1fd8ed111540f9cfeb8cae8422331f77f..897b868db258f902eaa79154d916c52f0ef10daf 100644 --- a/src/Storages/MergeTree/MergeTreePartition.cpp +++ b/src/Storages/MergeTree/MergeTreePartition.cpp @@ -102,7 +102,7 @@ void MergeTreePartition::serializeText(const MergeTreeData & storage, WriteBuffe const DataTypePtr & type = partition_key_sample.getByPosition(0).type; auto column = type->createColumn(); column->insert(value[0]); - type->serializeAsText(*column, 0, out, format_settings); + type->getDefaultSerialization()->serializeText(*column, 0, out, format_settings); } else { @@ -117,9 +117,9 @@ void MergeTreePartition::serializeText(const MergeTreeData & storage, WriteBuffe columns.push_back(std::move(column)); } - DataTypeTuple tuple_type(types); + auto tuple_serialization = DataTypeTuple(types).getDefaultSerialization(); auto tuple_column = ColumnTuple::create(columns); - tuple_type.serializeText(*tuple_column, 0, out, format_settings); + tuple_serialization->serializeText(*tuple_column, 0, out, format_settings); } } @@ -134,7 +134,7 @@ void MergeTreePartition::load(const MergeTreeData & storage, const DiskPtr & dis auto file = openForReading(disk, partition_file_path); value.resize(partition_key_sample.columns()); for (size_t i = 0; i < partition_key_sample.columns(); ++i) - partition_key_sample.getByPosition(i).type->deserializeBinary(value[i], *file); + partition_key_sample.getByPosition(i).type->getDefaultSerialization()->deserializeBinary(value[i], *file); } void MergeTreePartition::store(const MergeTreeData & storage, const DiskPtr & disk, const String & part_path, MergeTreeDataPartChecksums & checksums) const @@ -152,7 +152,7 @@ void MergeTreePartition::store(const Block & partition_key_sample, const DiskPtr auto out = disk->writeFile(part_path + "partition.dat"); HashingWriteBuffer out_hashing(*out); for (size_t i = 0; i < value.size(); ++i) - partition_key_sample.getByPosition(i).type->serializeBinary(value[i], out_hashing); + partition_key_sample.getByPosition(i).type->getDefaultSerialization()->serializeBinary(value[i], out_hashing); out_hashing.next(); checksums.files["partition.dat"].file_size = out_hashing.count(); checksums.files["partition.dat"].file_hash = out_hashing.getHash(); diff --git a/src/Storages/MergeTree/MergeTreeReaderCompact.cpp b/src/Storages/MergeTree/MergeTreeReaderCompact.cpp index b135ea0403252913baac9b1e00edd0625202a55c..da28f75b57f7706b021ce4d423dc0ca45ea398a3 100644 --- a/src/Storages/MergeTree/MergeTreeReaderCompact.cpp +++ b/src/Storages/MergeTree/MergeTreeReaderCompact.cpp @@ -206,16 +206,16 @@ void MergeTreeReaderCompact::readData( if (!isContinuousReading(from_mark, column_position)) seekToMark(from_mark, column_position); - auto buffer_getter = [&](const IDataType::SubstreamPath & substream_path) -> ReadBuffer * + auto buffer_getter = [&](const ISerialization::SubstreamPath & substream_path) -> ReadBuffer * { - if (only_offsets && (substream_path.size() != 1 || substream_path[0].type != IDataType::Substream::ArraySizes)) + if (only_offsets && (substream_path.size() != 1 || substream_path[0].type != ISerialization::Substream::ArraySizes)) return nullptr; return data_buffer; }; - IDataType::DeserializeBinaryBulkStatePtr state; - 
IDataType::DeserializeBinaryBulkSettings deserialize_settings; + ISerialization::DeserializeBinaryBulkStatePtr state; + ISerialization::DeserializeBinaryBulkSettings deserialize_settings; deserialize_settings.getter = buffer_getter; deserialize_settings.avg_value_size_hint = avg_value_size_hints[name]; @@ -224,14 +224,16 @@ void MergeTreeReaderCompact::readData( auto type_in_storage = name_and_type.getTypeInStorage(); ColumnPtr temp_column = type_in_storage->createColumn(); - type_in_storage->deserializeBinaryBulkStatePrefix(deserialize_settings, state); - type_in_storage->deserializeBinaryBulkWithMultipleStreams(temp_column, rows_to_read, deserialize_settings, state); + auto serialization = type_in_storage->getDefaultSerialization(); + serialization->deserializeBinaryBulkStatePrefix(deserialize_settings, state); + serialization->deserializeBinaryBulkWithMultipleStreams(temp_column, rows_to_read, deserialize_settings, state, nullptr); column = type_in_storage->getSubcolumn(name_and_type.getSubcolumnName(), *temp_column); } else { - type->deserializeBinaryBulkStatePrefix(deserialize_settings, state); - type->deserializeBinaryBulkWithMultipleStreams(column, rows_to_read, deserialize_settings, state); + auto serialization = type->getDefaultSerialization(); + serialization->deserializeBinaryBulkStatePrefix(deserialize_settings, state); + serialization->deserializeBinaryBulkWithMultipleStreams(column, rows_to_read, deserialize_settings, state, nullptr); } /// The buffer is left in inconsistent state after reading single offsets diff --git a/src/Storages/MergeTree/MergeTreeReaderWide.cpp b/src/Storages/MergeTree/MergeTreeReaderWide.cpp index 30db54fc8e062804033867166347c0ca9f792faf..0da2f643eb01788703fd9836f685af8f3c6bf607 100644 --- a/src/Storages/MergeTree/MergeTreeReaderWide.cpp +++ b/src/Storages/MergeTree/MergeTreeReaderWide.cpp @@ -72,7 +72,7 @@ size_t MergeTreeReaderWide::readRows(size_t from_mark, bool continue_reading, si /// If append is true, then the value will be equal to nullptr and will be used only to /// check that the offsets column has been already read. 
OffsetColumns offset_columns; - std::unordered_map<String, IDataType::SubstreamsCache> caches; + std::unordered_map<String, ISerialization::SubstreamsCache> caches; auto name_and_type = columns.begin(); for (size_t pos = 0; pos < num_columns; ++pos, ++name_and_type) @@ -137,9 +137,9 @@ size_t MergeTreeReaderWide::readRows(size_t from_mark, bool continue_reading, si void MergeTreeReaderWide::addStreams(const NameAndTypePair & name_and_type, const ReadBufferFromFileBase::ProfileCallback & profile_callback, clockid_t clock_type) { - IDataType::StreamCallback callback = [&] (const IDataType::SubstreamPath & substream_path, const IDataType & /* substream_type */) + ISerialization::StreamCallback callback = [&] (const ISerialization::SubstreamPath & substream_path) { - String stream_name = IDataType::getFileNameForStream(name_and_type, substream_path); + String stream_name = ISerialization::getFileNameForStream(name_and_type, substream_path); if (streams.count(stream_name)) return; @@ -160,25 +160,26 @@ void MergeTreeReaderWide::addStreams(const NameAndTypePair & name_and_type, profile_callback, clock_type)); }; - IDataType::SubstreamPath substream_path; - name_and_type.type->enumerateStreams(callback, substream_path); + auto serialization = data_part->getSerializationForColumn(name_and_type); + serialization->enumerateStreams(callback); + serializations.emplace(name_and_type.name, std::move(serialization)); } void MergeTreeReaderWide::readData( const NameAndTypePair & name_and_type, ColumnPtr & column, size_t from_mark, bool continue_reading, size_t max_rows_to_read, - IDataType::SubstreamsCache & cache) + ISerialization::SubstreamsCache & cache) { - auto get_stream_getter = [&](bool stream_for_prefix) -> IDataType::InputStreamGetter + auto get_stream_getter = [&](bool stream_for_prefix) -> ISerialization::InputStreamGetter { - return [&, stream_for_prefix](const IDataType::SubstreamPath & substream_path) -> ReadBuffer * + return [&, stream_for_prefix](const ISerialization::SubstreamPath & substream_path) -> ReadBuffer * { /// If substream have already been read.
- if (cache.count(IDataType::getSubcolumnNameForStream(substream_path))) + if (cache.count(ISerialization::getSubcolumnNameForStream(substream_path))) return nullptr; - String stream_name = IDataType::getFileNameForStream(name_and_type, substream_path); + String stream_name = ISerialization::getFileNameForStream(name_and_type, substream_path); auto it = streams.find(stream_name); if (it == streams.end()) @@ -199,19 +200,23 @@ void MergeTreeReaderWide::readData( }; double & avg_value_size_hint = avg_value_size_hints[name_and_type.name]; - IDataType::DeserializeBinaryBulkSettings deserialize_settings; + ISerialization::DeserializeBinaryBulkSettings deserialize_settings; deserialize_settings.avg_value_size_hint = avg_value_size_hint; - if (deserialize_binary_bulk_state_map.count(name_and_type.name) == 0) + const auto & name = name_and_type.name; + auto serialization = serializations[name]; + + if (deserialize_binary_bulk_state_map.count(name) == 0) { deserialize_settings.getter = get_stream_getter(true); - name_and_type.type->deserializeBinaryBulkStatePrefix(deserialize_settings, deserialize_binary_bulk_state_map[name_and_type.name]); + serialization->deserializeBinaryBulkStatePrefix(deserialize_settings, deserialize_binary_bulk_state_map[name]); } deserialize_settings.getter = get_stream_getter(false); deserialize_settings.continuous_reading = continue_reading; - auto & deserialize_state = deserialize_binary_bulk_state_map[name_and_type.name]; - name_and_type.type->deserializeBinaryBulkWithMultipleStreams(column, max_rows_to_read, deserialize_settings, deserialize_state, &cache); + auto & deserialize_state = deserialize_binary_bulk_state_map[name]; + + serialization->deserializeBinaryBulkWithMultipleStreams(column, max_rows_to_read, deserialize_settings, deserialize_state, &cache); IDataType::updateAvgValueSizeHint(*column, avg_value_size_hint); } diff --git a/src/Storages/MergeTree/MergeTreeReaderWide.h b/src/Storages/MergeTree/MergeTreeReaderWide.h index bf9e97035d02ccda1ab1ed8cc07f940c406720cc..1afbca4bf41d28cc9a291fd66c024efc8bfd6e77 100644 --- a/src/Storages/MergeTree/MergeTreeReaderWide.h +++ b/src/Storages/MergeTree/MergeTreeReaderWide.h @@ -34,8 +34,10 @@ public: private: using FileStreams = std::map<std::string, std::unique_ptr<MergeTreeReaderStream>>; + using Serializations = std::map<std::string, SerializationPtr>; FileStreams streams; + Serializations serializations; void addStreams(const NameAndTypePair & name_and_type, const ReadBufferFromFileBase::ProfileCallback & profile_callback, clockid_t clock_type); @@ -43,7 +45,7 @@ private: void readData( const NameAndTypePair & name_and_type, ColumnPtr & column, size_t from_mark, bool continue_reading, size_t max_rows_to_read, - IDataType::SubstreamsCache & cache); + ISerialization::SubstreamsCache & cache); }; } diff --git a/src/Storages/MergeTree/MergedBlockOutputStream.cpp b/src/Storages/MergeTree/MergedBlockOutputStream.cpp index 1b852622efc74fc3fcb05d0a4426b96917e9e3de..6988d48b18ca8cde98cbd516984e4220f5661983 100644 --- a/src/Storages/MergeTree/MergedBlockOutputStream.cpp +++ b/src/Storages/MergeTree/MergedBlockOutputStream.cpp @@ -186,7 +186,6 @@ void MergedBlockOutputStream::writeImpl(const Block & block, const IColumn::Perm return; writer->write(block, permutation); - rows_count += rows; } diff --git a/src/Storages/MergeTree/checkDataPart.cpp b/src/Storages/MergeTree/checkDataPart.cpp index c9da156dc9751ce76fd58d4d21621cfdeba657b9..ac28f84db438db8a272eda8a66e79717efdf0a00 100644 --- a/src/Storages/MergeTree/checkDataPart.cpp +++ b/src/Storages/MergeTree/checkDataPart.cpp @@ -120,9 +120,15 @@
IMergeTreeDataPart::Checksums checkDataPart( { for (const auto & column : columns_list) { - column.type->enumerateStreams([&](const IDataType::SubstreamPath & substream_path, const IDataType & /* substream_type */) + auto serialization = IDataType::getSerialization(column, + [&](const String & stream_name) + { + return disk->exists(stream_name + IMergeTreeDataPart::DATA_FILE_EXTENSION); + }); + + serialization->enumerateStreams([&](const ISerialization::SubstreamPath & substream_path) { - String file_name = IDataType::getFileNameForStream(column, substream_path) + ".bin"; + String file_name = ISerialization::getFileNameForStream(column, substream_path) + ".bin"; checksums_data.files[file_name] = checksum_compressed_file(disk, path + file_name); }, {}); } diff --git a/src/Storages/RocksDB/EmbeddedRocksDBBlockInputStream.cpp b/src/Storages/RocksDB/EmbeddedRocksDBBlockInputStream.cpp index 35c41cabd8b3bfb7102cb8f1ee2e81ce5b3fe962..4900e17ad914a2b893fe3702e5fa3231c0c38a9a 100644 --- a/src/Storages/RocksDB/EmbeddedRocksDBBlockInputStream.cpp +++ b/src/Storages/RocksDB/EmbeddedRocksDBBlockInputStream.cpp @@ -48,7 +48,8 @@ Block EmbeddedRocksDBBlockInputStream::readImpl() size_t idx = 0; for (const auto & elem : sample_block) { - elem.type->deserializeBinary(*columns[idx], idx == primary_key_pos ? key_buffer : value_buffer); + auto serialization = elem.type->getDefaultSerialization(); + serialization->deserializeBinary(*columns[idx], idx == primary_key_pos ? key_buffer : value_buffer); ++idx; } } diff --git a/src/Storages/RocksDB/EmbeddedRocksDBBlockOutputStream.cpp b/src/Storages/RocksDB/EmbeddedRocksDBBlockOutputStream.cpp index 1edbdc25942e8422a9e428e23305fe38a67f9e5b..d7b125cb41f0dc0bc889a341a7c52f37125d580a 100644 --- a/src/Storages/RocksDB/EmbeddedRocksDBBlockOutputStream.cpp +++ b/src/Storages/RocksDB/EmbeddedRocksDBBlockOutputStream.cpp @@ -51,7 +51,7 @@ void EmbeddedRocksDBBlockOutputStream::write(const Block & block) size_t idx = 0; for (const auto & elem : block) { - elem.type->serializeBinary(*elem.column, i, idx == primary_key_pos ? wb_key : wb_value); + elem.type->getDefaultSerialization()->serializeBinary(*elem.column, i, idx == primary_key_pos ? wb_key : wb_value); ++idx; } status = batch.Put(wb_key.str(), wb_value.str()); diff --git a/src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp b/src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp index d745696646750d77e4e93335e305db8089ea5c14..9b0a0c36b453947864cffb9a6b9cfb055b4f3ed8 100644 --- a/src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp +++ b/src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp @@ -198,7 +198,7 @@ public: while (it < end && rows_processed < max_block_size) { WriteBufferFromString wb(serialized_keys[rows_processed]); - key_column.type->serializeBinary(*it, wb); + key_column.type->getDefaultSerialization()->serializeBinary(*it, wb); wb.finalize(); slices_keys[rows_processed] = std::move(serialized_keys[rows_processed]); @@ -219,7 +219,7 @@ public: size_t idx = 0; for (const auto & elem : sample_block) { - elem.type->deserializeBinary(*columns[idx], idx == primary_key_pos ? key_buffer : value_buffer); + elem.type->getDefaultSerialization()->deserializeBinary(*columns[idx], idx == primary_key_pos ? 
key_buffer : value_buffer); ++idx; } } diff --git a/src/Storages/StorageDistributed.cpp b/src/Storages/StorageDistributed.cpp index 3ac6dde836bdd5ded23b78109fa82831f141214a..28156458342a4932986b7c4d6b5e0c03b28a85e6 100644 --- a/src/Storages/StorageDistributed.cpp +++ b/src/Storages/StorageDistributed.cpp @@ -907,7 +907,24 @@ ClusterPtr StorageDistributed::skipUnusedShards( } replaceConstantExpressions(condition_ast, context, metadata_snapshot->getColumns().getAll(), shared_from_this(), metadata_snapshot); - const auto blocks = evaluateExpressionOverConstantCondition(condition_ast, sharding_key_expr); + + size_t limit = context.getSettingsRef().optimize_skip_unused_shards_limit; + if (!limit || limit > SSIZE_MAX) + { + throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "optimize_skip_unused_shards_limit out of range (0, {}]", SSIZE_MAX); + } + // Increment the limit so that limit == 0 after the call can be interpreted as "limit is reached" + ++limit; + const auto blocks = evaluateExpressionOverConstantCondition(condition_ast, sharding_key_expr, limit); + + if (!limit) + { + LOG_TRACE(log, + "Number of values for sharding key exceeds optimize_skip_unused_shards_limit={}, " + "try to increase it, but note that this may increase query processing time.", + context.getSettingsRef().optimize_skip_unused_shards_limit); + return nullptr; + } // Can't get definite answer if we can skip any shards if (!blocks) diff --git a/src/Storages/StorageLog.cpp b/src/Storages/StorageLog.cpp index 02172517eb1df5f1d40fa6012c08c69f5bc292aa..ddb0cadc1483f80c39b18e2229aea82b5c353b0a 100644 --- a/src/Storages/StorageLog.cpp +++ b/src/Storages/StorageLog.cpp @@ -87,6 +87,8 @@ private: size_t rows_read = 0; size_t max_read_buffer_size; + std::unordered_map<String, SerializationPtr> serializations; + struct Stream { Stream(const DiskPtr & disk, const String & data_path, size_t offset, size_t max_read_buffer_size_) @@ -104,11 +106,11 @@ private: using FileStreams = std::map<String, Stream>; FileStreams streams; - using DeserializeState = IDataType::DeserializeBinaryBulkStatePtr; + using DeserializeState = ISerialization::DeserializeBinaryBulkStatePtr; using DeserializeStates = std::map<String, DeserializeState>; DeserializeStates deserialize_states; - void readData(const NameAndTypePair & name_and_type, ColumnPtr & column, size_t max_rows_to_read, IDataType::SubstreamsCache & cache); + void readData(const NameAndTypePair & name_and_type, ColumnPtr & column, size_t max_rows_to_read, ISerialization::SubstreamsCache & cache); }; @@ -124,7 +126,7 @@ Chunk LogSource::generate() /// How many rows to read for the next block. size_t max_rows_to_read = std::min(block_size, rows_limit - rows_read); - std::unordered_map<String, IDataType::SubstreamsCache> caches; + std::unordered_map<String, ISerialization::SubstreamsCache> caches; for (const auto & name_type : columns) { @@ -162,19 +164,20 @@ Chunk LogSource::generate() void LogSource::readData(const NameAndTypePair & name_and_type, ColumnPtr & column, - size_t max_rows_to_read, IDataType::SubstreamsCache & cache) + size_t max_rows_to_read, ISerialization::SubstreamsCache & cache) { - IDataType::DeserializeBinaryBulkSettings settings; /// TODO Use avg_value_size_hint. + ISerialization::DeserializeBinaryBulkSettings settings; /// TODO Use avg_value_size_hint.
const auto & [name, type] = name_and_type; + auto serialization = IDataType::getSerialization(name_and_type); auto create_stream_getter = [&](bool stream_for_prefix) { - return [&, stream_for_prefix] (const IDataType::SubstreamPath & path) -> ReadBuffer * + return [&, stream_for_prefix] (const ISerialization::SubstreamPath & path) -> ReadBuffer * { - if (cache.count(IDataType::getSubcolumnNameForStream(path))) + if (cache.count(ISerialization::getSubcolumnNameForStream(path))) return nullptr; - String stream_name = IDataType::getFileNameForStream(name_and_type, path); + String stream_name = ISerialization::getFileNameForStream(name_and_type, path); const auto & file_it = storage.files.find(stream_name); if (storage.files.end() == file_it) throw Exception("Logical error: no information about file " + stream_name + " in StorageLog", ErrorCodes::LOGICAL_ERROR); @@ -193,11 +196,11 @@ void LogSource::readData(const NameAndTypePair & name_and_type, ColumnPtr & colu if (deserialize_states.count(name) == 0) { settings.getter = create_stream_getter(true); - type->deserializeBinaryBulkStatePrefix(settings, deserialize_states[name]); + serialization->deserializeBinaryBulkStatePrefix(settings, deserialize_states[name]); } settings.getter = create_stream_getter(false); - type->deserializeBinaryBulkWithMultipleStreams(column, max_rows_to_read, settings, deserialize_states[name], &cache); + serialization->deserializeBinaryBulkWithMultipleStreams(column, max_rows_to_read, settings, deserialize_states[name], &cache); } @@ -282,11 +285,11 @@ private: std::unique_ptr marks_stream; /// Declared below `lock` to make the file open when rwlock is captured. - using SerializeState = IDataType::SerializeBinaryBulkStatePtr; + using SerializeState = ISerialization::SerializeBinaryBulkStatePtr; using SerializeStates = std::map; SerializeStates serialize_states; - IDataType::OutputStreamGetter createStreamGetter(const NameAndTypePair & name_and_type, WrittenStreams & written_streams); + ISerialization::OutputStreamGetter createStreamGetter(const NameAndTypePair & name_and_type, WrittenStreams & written_streams); void writeData( const NameAndTypePair & name_and_type, @@ -324,14 +327,15 @@ void LogBlockOutputStream::writeSuffix() return; WrittenStreams written_streams; - IDataType::SerializeBinaryBulkSettings settings; + ISerialization::SerializeBinaryBulkSettings settings; for (const auto & column : getHeader()) { auto it = serialize_states.find(column.name); if (it != serialize_states.end()) { settings.getter = createStreamGetter(NameAndTypePair(column.name, column.type), written_streams); - column.type->serializeBinaryBulkStateSuffix(settings, it->second); + auto serialization = column.type->getDefaultSerialization(); + serialization->serializeBinaryBulkStateSuffix(settings, it->second); } } @@ -356,12 +360,12 @@ void LogBlockOutputStream::writeSuffix() } -IDataType::OutputStreamGetter LogBlockOutputStream::createStreamGetter(const NameAndTypePair & name_and_type, +ISerialization::OutputStreamGetter LogBlockOutputStream::createStreamGetter(const NameAndTypePair & name_and_type, WrittenStreams & written_streams) { - return [&] (const IDataType::SubstreamPath & path) -> WriteBuffer * + return [&] (const ISerialization::SubstreamPath & path) -> WriteBuffer * { - String stream_name = IDataType::getFileNameForStream(name_and_type, path); + String stream_name = ISerialization::getFileNameForStream(name_and_type, path); if (written_streams.count(stream_name)) return nullptr; @@ -377,12 +381,13 @@ 
IDataType::OutputStreamGetter LogBlockOutputStream::createStreamGetter(const Nam
 void LogBlockOutputStream::writeData(const NameAndTypePair & name_and_type, const IColumn & column,
     MarksForColumns & out_marks, WrittenStreams & written_streams)
 {
-    IDataType::SerializeBinaryBulkSettings settings;
+    ISerialization::SerializeBinaryBulkSettings settings;
     const auto & [name, type] = name_and_type;
+    auto serialization = type->getDefaultSerialization();
 
-    type->enumerateStreams([&] (const IDataType::SubstreamPath & path, const IDataType & /* substream_type */)
+    serialization->enumerateStreams([&] (const ISerialization::SubstreamPath & path)
     {
-        String stream_name = IDataType::getFileNameForStream(name_and_type, path);
+        String stream_name = ISerialization::getFileNameForStream(name_and_type, path);
         if (written_streams.count(stream_name))
             return;
@@ -398,11 +403,11 @@ void LogBlockOutputStream::writeData(const NameAndTypePair & name_and_type, cons
     settings.getter = createStreamGetter(name_and_type, written_streams);
 
     if (serialize_states.count(name) == 0)
-        type->serializeBinaryBulkStatePrefix(settings, serialize_states[name]);
+        serialization->serializeBinaryBulkStatePrefix(settings, serialize_states[name]);
 
-    type->enumerateStreams([&] (const IDataType::SubstreamPath & path, const IDataType & /* substream_type */)
+    serialization->enumerateStreams([&] (const ISerialization::SubstreamPath & path)
     {
-        String stream_name = IDataType::getFileNameForStream(name_and_type, path);
+        String stream_name = ISerialization::getFileNameForStream(name_and_type, path);
         if (written_streams.count(stream_name))
             return;
@@ -416,11 +421,11 @@ void LogBlockOutputStream::writeData(const NameAndTypePair & name_and_type, cons
         out_marks.emplace_back(file.column_index, mark);
     }, settings.path);
 
-    type->serializeBinaryBulkWithMultipleStreams(column, 0, 0, settings, serialize_states[name]);
+    serialization->serializeBinaryBulkWithMultipleStreams(column, 0, 0, settings, serialize_states[name]);
 
-    type->enumerateStreams([&] (const IDataType::SubstreamPath & path, const IDataType & /* substream_type */)
+    serialization->enumerateStreams([&] (const ISerialization::SubstreamPath & path)
     {
-        String stream_name = IDataType::getFileNameForStream(name_and_type, path);
+        String stream_name = ISerialization::getFileNameForStream(name_and_type, path);
         if (!written_streams.emplace(stream_name).second)
             return;
@@ -501,9 +506,9 @@ void StorageLog::addFiles(const NameAndTypePair & column)
         throw Exception("Duplicate column with name " + column.name + " in constructor of StorageLog.",
             ErrorCodes::DUPLICATE_COLUMN);
 
-    IDataType::StreamCallback stream_callback = [&] (const IDataType::SubstreamPath & substream_path, const IDataType & /* substream_type */)
+    ISerialization::StreamCallback stream_callback = [&] (const ISerialization::SubstreamPath & substream_path)
     {
-        String stream_name = IDataType::getFileNameForStream(column, substream_path);
+        String stream_name = ISerialization::getFileNameForStream(column, substream_path);
 
         if (!files.count(stream_name))
         {
@@ -516,8 +521,8 @@ void StorageLog::addFiles(const NameAndTypePair & column)
         }
     };
 
-    IDataType::SubstreamPath substream_path;
-    column.type->enumerateStreams(stream_callback, substream_path);
+    auto serialization = column.type->getDefaultSerialization();
+    serialization->enumerateStreams(stream_callback);
 }
 
@@ -607,11 +612,12 @@ const StorageLog::Marks & StorageLog::getMarksWithRealRowCount(const StorageMeta
      * If this is a data type with multiple streams, get the first stream that we assume has the real row count.
      * (Example: for the Array data type, the first stream contains the array sizes, and the number of array sizes is the number of arrays.)
      */
-    IDataType::SubstreamPath substream_root_path;
-    column.type->enumerateStreams([&](const IDataType::SubstreamPath & substream_path, const IDataType & /* substream_type */)
+    ISerialization::SubstreamPath substream_root_path;
+    auto serialization = column.type->getDefaultSerialization();
+    serialization->enumerateStreams([&](const ISerialization::SubstreamPath & substream_path)
     {
         if (filename.empty())
-            filename = IDataType::getFileNameForStream(column, substream_path);
+            filename = ISerialization::getFileNameForStream(column, substream_path);
     }, substream_root_path);
 
     Files::const_iterator it = files.find(filename);
diff --git a/src/Storages/StorageMaterializedView.cpp b/src/Storages/StorageMaterializedView.cpp
index f1fb0935047764f549f59208ff6bdc3acbad0fc5..c89187a46e2e6148db1f4c77c52c766dff774f84 100644
--- a/src/Storages/StorageMaterializedView.cpp
+++ b/src/Storages/StorageMaterializedView.cpp
@@ -85,7 +85,7 @@ StorageMaterializedView::StorageMaterializedView(
     else if (attach_)
     {
         /// If there is an ATTACH request, then the internal table must already be created.
-        target_table_id = StorageID(getStorageID().database_name, generateInnerTableName(getStorageID()));
+        target_table_id = StorageID(getStorageID().database_name, generateInnerTableName(getStorageID()), query.to_inner_uuid);
     }
     else
     {
@@ -94,6 +94,7 @@ StorageMaterializedView::StorageMaterializedView(
         auto manual_create_query = std::make_shared<ASTCreateQuery>();
         manual_create_query->database = getStorageID().database_name;
         manual_create_query->table = generateInnerTableName(getStorageID());
+        manual_create_query->uuid = query.to_inner_uuid;
 
         auto new_columns_list = std::make_shared<ASTColumns>();
         new_columns_list->set(new_columns_list->columns, query.columns_list->columns->ptr());
diff --git a/src/Storages/StoragePostgreSQL.cpp b/src/Storages/StoragePostgreSQL.cpp
index 564772ee4aa8afdadbc0a6ec733bd3205bb4aba8..e1b927027f9bd5beb690f282efc9ed79596d83ba 100644
--- a/src/Storages/StoragePostgreSQL.cpp
+++ b/src/Storages/StoragePostgreSQL.cpp
@@ -147,7 +147,7 @@ public:
             }
             else
             {
-                data_types[j]->serializeAsText(*columns[j], i, ostr, FormatSettings{});
+                data_types[j]->getDefaultSerialization()->serializeText(*columns[j], i, ostr, FormatSettings{});
             }
 
             row[j] = ostr.str();
@@ -214,7 +214,7 @@ public:
         auto array_column = ColumnArray::create(createNested(nested));
         array_column->insert(array_field);
         WriteBufferFromOwnString ostr;
-        data_type->serializeAsText(*array_column, 0, ostr, FormatSettings{});
+        data_type->getDefaultSerialization()->serializeText(*array_column, 0, ostr, FormatSettings{});
 
         /// ostr is guaranteed to be at least '[]', i.e.
size is at least 2 and 2 only if ostr.str() == '[]' assert(ostr.str().size() >= 2); diff --git a/src/Storages/StorageTinyLog.cpp b/src/Storages/StorageTinyLog.cpp index 06e2c21b1a8c80c6e8b1e2a1c86bc154e8a5768c..3cb4be50a8651e8bae71978d5bbff3ffb6446c8a 100644 --- a/src/Storages/StorageTinyLog.cpp +++ b/src/Storages/StorageTinyLog.cpp @@ -109,11 +109,11 @@ private: using FileStreams = std::map>; FileStreams streams; - using DeserializeState = IDataType::DeserializeBinaryBulkStatePtr; + using DeserializeState = ISerialization::DeserializeBinaryBulkStatePtr; using DeserializeStates = std::map; DeserializeStates deserialize_states; - void readData(const NameAndTypePair & name_and_type, ColumnPtr & column, UInt64 limit, IDataType::SubstreamsCache & cache); + void readData(const NameAndTypePair & name_and_type, ColumnPtr & column, UInt64 limit, ISerialization::SubstreamsCache & cache); }; @@ -132,7 +132,7 @@ Chunk TinyLogSource::generate() return {}; } - std::unordered_map caches; + std::unordered_map caches; for (const auto & name_type : columns) { ColumnPtr column; @@ -162,16 +162,18 @@ Chunk TinyLogSource::generate() void TinyLogSource::readData(const NameAndTypePair & name_and_type, - ColumnPtr & column, UInt64 limit, IDataType::SubstreamsCache & cache) + ColumnPtr & column, UInt64 limit, ISerialization::SubstreamsCache & cache) { - IDataType::DeserializeBinaryBulkSettings settings; /// TODO Use avg_value_size_hint. + ISerialization::DeserializeBinaryBulkSettings settings; /// TODO Use avg_value_size_hint. const auto & [name, type] = name_and_type; - settings.getter = [&] (const IDataType::SubstreamPath & path) -> ReadBuffer * + auto serialization = IDataType::getSerialization(name_and_type); + + settings.getter = [&] (const ISerialization::SubstreamPath & path) -> ReadBuffer * { - if (cache.count(IDataType::getSubcolumnNameForStream(path))) + if (cache.count(ISerialization::getSubcolumnNameForStream(path))) return nullptr; - String stream_name = IDataType::getFileNameForStream(name_and_type, path); + String stream_name = ISerialization::getFileNameForStream(name_and_type, path); auto & stream = streams[stream_name]; if (!stream) { @@ -184,9 +186,9 @@ void TinyLogSource::readData(const NameAndTypePair & name_and_type, }; if (deserialize_states.count(name) == 0) - type->deserializeBinaryBulkStatePrefix(settings, deserialize_states[name]); + serialization->deserializeBinaryBulkStatePrefix(settings, deserialize_states[name]); - type->deserializeBinaryBulkWithMultipleStreams(column, limit, settings, deserialize_states[name], &cache); + serialization->deserializeBinaryBulkWithMultipleStreams(column, limit, settings, deserialize_states[name], &cache); } @@ -261,24 +263,24 @@ private: using FileStreams = std::map>; FileStreams streams; - using SerializeState = IDataType::SerializeBinaryBulkStatePtr; + using SerializeState = ISerialization::SerializeBinaryBulkStatePtr; using SerializeStates = std::map; SerializeStates serialize_states; using WrittenStreams = std::set; - IDataType::OutputStreamGetter createStreamGetter(const NameAndTypePair & column, WrittenStreams & written_streams); + ISerialization::OutputStreamGetter createStreamGetter(const NameAndTypePair & column, WrittenStreams & written_streams); void writeData(const NameAndTypePair & name_and_type, const IColumn & column, WrittenStreams & written_streams); }; -IDataType::OutputStreamGetter TinyLogBlockOutputStream::createStreamGetter( +ISerialization::OutputStreamGetter TinyLogBlockOutputStream::createStreamGetter( const NameAndTypePair & 
column, WrittenStreams & written_streams) { - return [&] (const IDataType::SubstreamPath & path) -> WriteBuffer * + return [&] (const ISerialization::SubstreamPath & path) -> WriteBuffer * { - String stream_name = IDataType::getFileNameForStream(column, path); + String stream_name = ISerialization::getFileNameForStream(column, path); if (!written_streams.insert(stream_name).second) return nullptr; @@ -298,8 +300,9 @@ IDataType::OutputStreamGetter TinyLogBlockOutputStream::createStreamGetter( void TinyLogBlockOutputStream::writeData(const NameAndTypePair & name_and_type, const IColumn & column, WrittenStreams & written_streams) { - IDataType::SerializeBinaryBulkSettings settings; + ISerialization::SerializeBinaryBulkSettings settings; const auto & [name, type] = name_and_type; + auto serialization = type->getDefaultSerialization(); if (serialize_states.count(name) == 0) { @@ -307,11 +310,11 @@ void TinyLogBlockOutputStream::writeData(const NameAndTypePair & name_and_type, /// Use different WrittenStreams set, or we get nullptr for them in `serializeBinaryBulkWithMultipleStreams` WrittenStreams prefix_written_streams; settings.getter = createStreamGetter(name_and_type, prefix_written_streams); - type->serializeBinaryBulkStatePrefix(settings, serialize_states[name]); + serialization->serializeBinaryBulkStatePrefix(settings, serialize_states[name]); } settings.getter = createStreamGetter(name_and_type, written_streams); - type->serializeBinaryBulkWithMultipleStreams(column, 0, 0, settings, serialize_states[name]); + serialization->serializeBinaryBulkWithMultipleStreams(column, 0, 0, settings, serialize_states[name]); } @@ -328,14 +331,15 @@ void TinyLogBlockOutputStream::writeSuffix() } WrittenStreams written_streams; - IDataType::SerializeBinaryBulkSettings settings; + ISerialization::SerializeBinaryBulkSettings settings; for (const auto & column : getHeader()) { auto it = serialize_states.find(column.name); if (it != serialize_states.end()) { settings.getter = createStreamGetter(NameAndTypePair(column.name, column.type), written_streams); - column.type->serializeBinaryBulkStateSuffix(settings, it->second); + auto serialization = column.type->getDefaultSerialization(); + serialization->serializeBinaryBulkStateSuffix(settings, it->second); } } @@ -423,9 +427,9 @@ void StorageTinyLog::addFiles(const NameAndTypePair & column) throw Exception("Duplicate column with name " + name + " in constructor of StorageTinyLog.", ErrorCodes::DUPLICATE_COLUMN); - IDataType::StreamCallback stream_callback = [&] (const IDataType::SubstreamPath & substream_path, const IDataType & /* substream_type */) + ISerialization::StreamCallback stream_callback = [&] (const ISerialization::SubstreamPath & substream_path) { - String stream_name = IDataType::getFileNameForStream(column, substream_path); + String stream_name = ISerialization::getFileNameForStream(column, substream_path); if (!files.count(stream_name)) { ColumnData column_data; @@ -434,8 +438,9 @@ void StorageTinyLog::addFiles(const NameAndTypePair & column) } }; - IDataType::SubstreamPath substream_path; - type->enumerateStreams(stream_callback, substream_path); + ISerialization::SubstreamPath substream_path; + auto serialization = type->getDefaultSerialization(); + serialization->enumerateStreams(stream_callback, substream_path); } diff --git a/src/Storages/System/StorageSystemTables.cpp b/src/Storages/System/StorageSystemTables.cpp index 132ed234323118b652af2d47d564455cf58c3981..fffe909715ffaf17905480b724ffa1901e08ddf0 100644 --- 
a/src/Storages/System/StorageSystemTables.cpp +++ b/src/Storages/System/StorageSystemTables.cpp @@ -359,6 +359,7 @@ protected: { auto & create = ast->as(); create.uuid = UUIDHelpers::Nil; + create.to_inner_uuid = UUIDHelpers::Nil; } if (columns_mask[src_index++]) diff --git a/src/Storages/tests/gtest_transform_query_for_external_database.cpp b/src/Storages/tests/gtest_transform_query_for_external_database.cpp index 99dfc55ed696235f92004e940eb3ed44bfea1262..d40c62fef6025c5eace02e7ff997c2e2e245f321 100644 --- a/src/Storages/tests/gtest_transform_query_for_external_database.cpp +++ b/src/Storages/tests/gtest_transform_query_for_external_database.cpp @@ -23,15 +23,6 @@ struct State State(const State&) = delete; Context context; - NamesAndTypesList columns{ - {"column", std::make_shared()}, - {"apply_id", std::make_shared()}, - {"apply_type", std::make_shared()}, - {"apply_status", std::make_shared()}, - {"create_time", std::make_shared()}, - {"field", std::make_shared()}, - {"value", std::make_shared()}, - }; static const State & instance() { @@ -39,27 +30,83 @@ struct State return state; } + const NamesAndTypesList & getColumns() const + { + return tables[0].columns; + } + + std::vector getTables(size_t num = 0) const + { + std::vector res; + for (size_t i = 0; i < std::min(num, tables.size()); ++i) + res.push_back(tables[i]); + return res; + } + private: + + static DatabaseAndTableWithAlias createDBAndTable(String table_name) + { + DatabaseAndTableWithAlias res; + res.database = "test"; + res.table = table_name; + return res; + } + + const std::vector tables{ + TableWithColumnNamesAndTypes( + createDBAndTable("table"), + { + {"column", std::make_shared()}, + {"apply_id", std::make_shared()}, + {"apply_type", std::make_shared()}, + {"apply_status", std::make_shared()}, + {"create_time", std::make_shared()}, + {"field", std::make_shared()}, + {"value", std::make_shared()}, + }), + TableWithColumnNamesAndTypes( + createDBAndTable("table2"), + { + {"num", std::make_shared()}, + {"attr", std::make_shared()}, + }), + }; + explicit State() : context(getContext().context) { tryRegisterFunctions(); DatabasePtr database = std::make_shared("test", context); - database->attachTable("table", StorageMemory::create(StorageID("test", "table"), ColumnsDescription{columns}, ConstraintsDescription{})); - DatabaseCatalog::instance().attachDatabase("test", database); + + for (const auto & tab : tables) + { + const auto & table_name = tab.table.table; + const auto & db_name = tab.table.database; + database->attachTable( + table_name, + StorageMemory::create(StorageID(db_name, table_name), ColumnsDescription{getColumns()}, ConstraintsDescription{})); + } + DatabaseCatalog::instance().attachDatabase(database->getDatabaseName(), database); context.setCurrentDatabase("test"); } }; - -static void check(const std::string & query, const std::string & expected, const Context & context, const NamesAndTypesList & columns) +static void check( + const State & state, + size_t table_num, + const std::string & query, + const std::string & expected) { ParserSelectQuery parser; ASTPtr ast = parseQuery(parser, query, 1000, 1000); SelectQueryInfo query_info; - query_info.syntax_analyzer_result = TreeRewriter(context).analyzeSelect(ast, columns); + SelectQueryOptions select_options; + query_info.syntax_analyzer_result + = TreeRewriter(state.context).analyzeSelect(ast, state.getColumns(), select_options, state.getTables(table_num)); query_info.query = ast; - std::string transformed_query = 
transformQueryForExternalDatabase(query_info, columns, IdentifierQuotingStyle::DoubleQuotes, "test", "table", context); + std::string transformed_query = transformQueryForExternalDatabase( + query_info, state.getColumns(), IdentifierQuotingStyle::DoubleQuotes, "test", "table", state.context); EXPECT_EQ(transformed_query, expected); } @@ -69,82 +116,93 @@ TEST(TransformQueryForExternalDatabase, InWithSingleElement) { const State & state = State::instance(); - check("SELECT column FROM test.table WHERE 1 IN (1)", - R"(SELECT "column" FROM "test"."table" WHERE 1)", - state.context, state.columns); - check("SELECT column FROM test.table WHERE column IN (1, 2)", - R"(SELECT "column" FROM "test"."table" WHERE "column" IN (1, 2))", - state.context, state.columns); - check("SELECT column FROM test.table WHERE column NOT IN ('hello', 'world')", - R"(SELECT "column" FROM "test"."table" WHERE "column" NOT IN ('hello', 'world'))", - state.context, state.columns); + check(state, 1, + "SELECT column FROM test.table WHERE 1 IN (1)", + R"(SELECT "column" FROM "test"."table" WHERE 1)"); + check(state, 1, + "SELECT column FROM test.table WHERE column IN (1, 2)", + R"(SELECT "column" FROM "test"."table" WHERE "column" IN (1, 2))"); + check(state, 1, + "SELECT column FROM test.table WHERE column NOT IN ('hello', 'world')", + R"(SELECT "column" FROM "test"."table" WHERE "column" NOT IN ('hello', 'world'))"); } TEST(TransformQueryForExternalDatabase, InWithTable) { const State & state = State::instance(); - check("SELECT column FROM test.table WHERE 1 IN external_table", - R"(SELECT "column" FROM "test"."table")", - state.context, state.columns); - check("SELECT column FROM test.table WHERE 1 IN (x)", - R"(SELECT "column" FROM "test"."table")", - state.context, state.columns); - check("SELECT column, field, value FROM test.table WHERE column IN (field, value)", - R"(SELECT "column", "field", "value" FROM "test"."table" WHERE "column" IN ("field", "value"))", - state.context, state.columns); - check("SELECT column FROM test.table WHERE column NOT IN hello AND column = 123", - R"(SELECT "column" FROM "test"."table" WHERE ("column" = 123))", - state.context, state.columns); + check(state, 1, + "SELECT column FROM test.table WHERE 1 IN external_table", + R"(SELECT "column" FROM "test"."table")"); + check(state, 1, + "SELECT column FROM test.table WHERE 1 IN (x)", + R"(SELECT "column" FROM "test"."table")"); + check(state, 1, + "SELECT column, field, value FROM test.table WHERE column IN (field, value)", + R"(SELECT "column", "field", "value" FROM "test"."table" WHERE "column" IN ("field", "value"))"); + check(state, 1, + "SELECT column FROM test.table WHERE column NOT IN hello AND column = 123", + R"(SELECT "column" FROM "test"."table" WHERE "column" = 123)"); } TEST(TransformQueryForExternalDatabase, Like) { const State & state = State::instance(); - check("SELECT column FROM test.table WHERE column LIKE '%hello%'", - R"(SELECT "column" FROM "test"."table" WHERE "column" LIKE '%hello%')", - state.context, state.columns); - check("SELECT column FROM test.table WHERE column NOT LIKE 'w%rld'", - R"(SELECT "column" FROM "test"."table" WHERE "column" NOT LIKE 'w%rld')", - state.context, state.columns); + check(state, 1, + "SELECT column FROM test.table WHERE column LIKE '%hello%'", + R"(SELECT "column" FROM "test"."table" WHERE "column" LIKE '%hello%')"); + check(state, 1, + "SELECT column FROM test.table WHERE column NOT LIKE 'w%rld'", + R"(SELECT "column" FROM "test"."table" WHERE "column" NOT LIKE 'w%rld')"); } 
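// Usage sketch for the reworked `check` helper above (a hypothetical test, not part of
// this patch): the first argument is the shared State, the second is how many of the
// registered tables the query may reference; the expected string assumes the same
// double-quote identifier quoting as the cases above.
//
// TEST(TransformQueryForExternalDatabase, UsageSketch)
// {
//     const State & state = State::instance();
//     check(state, 1,
//         "SELECT field FROM test.table WHERE field = 'abc'",
//         R"(SELECT "field" FROM "test"."table" WHERE "field" = 'abc')");
// }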
TEST(TransformQueryForExternalDatabase, Substring) { const State & state = State::instance(); - check("SELECT column FROM test.table WHERE left(column, 10) = RIGHT(column, 10) AND SUBSTRING(column FROM 1 FOR 2) = 'Hello'", - R"(SELECT "column" FROM "test"."table")", - state.context, state.columns); + check(state, 1, + "SELECT column FROM test.table WHERE left(column, 10) = RIGHT(column, 10) AND SUBSTRING(column FROM 1 FOR 2) = 'Hello'", + R"(SELECT "column" FROM "test"."table")"); } TEST(TransformQueryForExternalDatabase, MultipleAndSubqueries) { const State & state = State::instance(); - check("SELECT column FROM test.table WHERE 1 = 1 AND toString(column) = '42' AND column = 42 AND left(column, 10) = RIGHT(column, 10) AND column IN (1, 42) AND SUBSTRING(column FROM 1 FOR 2) = 'Hello' AND column != 4", - R"(SELECT "column" FROM "test"."table" WHERE 1 AND ("column" = 42) AND ("column" IN (1, 42)) AND ("column" != 4))", - state.context, state.columns); - check("SELECT column FROM test.table WHERE toString(column) = '42' AND left(column, 10) = RIGHT(column, 10) AND column = 42", - R"(SELECT "column" FROM "test"."table" WHERE ("column" = 42))", - state.context, state.columns); + check(state, 1, + "SELECT column FROM test.table WHERE 1 = 1 AND toString(column) = '42' AND column = 42 AND left(column, 10) = RIGHT(column, 10) AND column IN (1, 42) AND SUBSTRING(column FROM 1 FOR 2) = 'Hello' AND column != 4", + R"(SELECT "column" FROM "test"."table" WHERE 1 AND ("column" = 42) AND ("column" IN (1, 42)) AND ("column" != 4))"); + check(state, 1, + "SELECT column FROM test.table WHERE toString(column) = '42' AND left(column, 10) = RIGHT(column, 10) AND column = 42", + R"(SELECT "column" FROM "test"."table" WHERE ("column" = 42))"); } TEST(TransformQueryForExternalDatabase, Issue7245) { const State & state = State::instance(); - check("select apply_id from test.table where apply_type = 2 and create_time > addDays(toDateTime('2019-01-01 01:02:03'),-7) and apply_status in (3,4)", - R"(SELECT "apply_id", "apply_type", "apply_status", "create_time" FROM "test"."table" WHERE ("apply_type" = 2) AND ("create_time" > '2018-12-25 01:02:03') AND ("apply_status" IN (3, 4)))", - state.context, state.columns); + check(state, 1, + "SELECT apply_id FROM test.table WHERE apply_type = 2 AND create_time > addDays(toDateTime('2019-01-01 01:02:03'),-7) AND apply_status IN (3,4)", + R"(SELECT "apply_id", "apply_type", "apply_status", "create_time" FROM "test"."table" WHERE ("apply_type" = 2) AND ("create_time" > '2018-12-25 01:02:03') AND ("apply_status" IN (3, 4)))"); } TEST(TransformQueryForExternalDatabase, Aliases) { const State & state = State::instance(); - check("SELECT field AS value, field AS display WHERE field NOT IN ('') AND display LIKE '%test%'", - R"(SELECT "field" FROM "test"."table" WHERE ("field" NOT IN ('')) AND ("field" LIKE '%test%'))", - state.context, state.columns); + check(state, 1, + "SELECT field AS value, field AS display WHERE field NOT IN ('') AND display LIKE '%test%'", + R"(SELECT "field" FROM "test"."table" WHERE ("field" NOT IN ('')) AND ("field" LIKE '%test%'))"); +} + +TEST(TransformQueryForExternalDatabase, ForeignColumnInWhere) +{ + const State & state = State::instance(); + + check(state, 2, + "SELECT column FROM test.table " + "JOIN test.table2 AS table2 ON (test.table.apply_id = table2.num) " + "WHERE column > 2 AND (apply_id = 1 OR table2.num = 1) AND table2.attr != ''", + R"(SELECT "column", "apply_id" FROM "test"."table" WHERE ("column" > 2) AND ("apply_id" = 1))"); } diff 
--git a/src/Storages/transformQueryForExternalDatabase.cpp b/src/Storages/transformQueryForExternalDatabase.cpp
index 42daf8cfc267c7c5273f9691823757d1a2bd8cbe..59d357f72e6d8d2d75115e3419b9938db3770d85 100644
--- a/src/Storages/transformQueryForExternalDatabase.cpp
+++ b/src/Storages/transformQueryForExternalDatabase.cpp
@@ -63,7 +63,7 @@ public:
             const IColumn & inner_column = assert_cast<const ColumnConst &>(*result.column).getDataColumn();
 
             WriteBufferFromOwnString out;
-            result.type->serializeAsText(inner_column, 0, out, FormatSettings());
+            result.type->getDefaultSerialization()->serializeText(inner_column, 0, out, FormatSettings());
             node = std::make_shared<ASTLiteral>(out.str());
         }
     }
@@ -160,8 +160,78 @@ bool isCompatible(const IAST & node)
     return node.as<ASTIdentifier>();
 }
 
+bool removeUnknownSubexpressions(ASTPtr & node, const NameSet & known_names);
+
+void removeUnknownChildren(ASTs & children, const NameSet & known_names)
+{
+
+    ASTs new_children;
+    for (auto & child : children)
+    {
+        bool leave_child = removeUnknownSubexpressions(child, known_names);
+        if (leave_child)
+            new_children.push_back(child);
+    }
+    children = std::move(new_children);
+}
+
+/// Returns `true` if the node should be left in the tree.
+bool removeUnknownSubexpressions(ASTPtr & node, const NameSet & known_names)
+{
+    if (const auto * ident = node->as<ASTIdentifier>())
+        return known_names.contains(ident->name());
+
+    if (node->as<ASTLiteral>() != nullptr)
+        return true;
+
+    auto * func = node->as<ASTFunction>();
+    if (func && (func->name == "and" || func->name == "or"))
+    {
+        removeUnknownChildren(func->arguments->children, known_names);
+        if (func->arguments->children.size() == 1)
+        {
+            /// If only one child is left, pull it up to the top level.
+            node = func->arguments->children[0];
+            return true;
+        }
+        /// If all children were removed, the current node can be removed too.
+        return !func->arguments->children.empty();
+    }
+
+    bool leave_child = true;
+    for (auto & child : node->children)
+    {
+        leave_child = leave_child && removeUnknownSubexpressions(child, known_names);
+        if (!leave_child)
+            break;
+    }
+    return leave_child;
}
 
+// When a query references an external table, such as a table from a MySQL database,
+// the corresponding table storage has to execute the relevant part of the query. We
+// send the query to the storage as an AST. Before that, we have to remove the conditions
+// that reference other tables from `WHERE`, so that the external engine is not confused
+// by the unknown columns.
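// Illustration (mirroring the ForeignColumnInWhere test in this patch): if the external
// table exposes the columns {column, apply_id}, the condition
//
//     column > 2 AND (apply_id = 1 OR table2.num = 1) AND table2.attr != ''
//
// is reduced to
//
//     column > 2 AND apply_id = 1
//
// The disjunction loses its unknown branch on table2.num and, with a single child left,
// is pulled up to replace the whole node; the conjunct on table2.attr is dropped entirely.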
+bool removeUnknownSubexpressionsFromWhere(ASTPtr & node, const NamesAndTypesList & available_columns)
+{
+    if (!node)
+        return false;
+
+    NameSet known_names;
+    for (const auto & col : available_columns)
+        known_names.insert(col.name);
+
+    if (auto * expr_list = node->as<ASTExpressionList>(); expr_list && !expr_list->children.empty())
+    {
+        /// Traverse the expression list on the top level.
+        removeUnknownChildren(expr_list->children, known_names);
+        return !expr_list->children.empty();
+    }
+    return removeUnknownSubexpressions(node, known_names);
+}
+
+}
 
 String transformQueryForExternalDatabase(
     const SelectQueryInfo & query_info,
@@ -191,7 +261,8 @@ String transformQueryForExternalDatabase(
      */
     ASTPtr original_where = clone_query->as<ASTSelectQuery &>().where();
-    if (original_where)
+    bool where_has_known_columns = removeUnknownSubexpressionsFromWhere(original_where, available_columns);
+    if (original_where && where_has_known_columns)
     {
         replaceConstantExpressions(original_where, context, available_columns);
diff --git a/tests/config/config.d/clusters.xml b/tests/config/config.d/clusters.xml index c0babf0ff89a551abd4977662daf145c4c4b4ab3..b783372f83b30afd3809f2d03d34d61a39d270ed 100644 --- a/tests/config/config.d/clusters.xml +++ b/tests/config/config.d/clusters.xml @@ -17,4 +17,4 @@ - \ No newline at end of file + diff --git a/tests/config/users.d/database_replicated.xml b/tests/config/users.d/database_replicated.xml index 23801d001548e46dab7bb245c0b22cd4f3e13a6a..903b8a64e225c79a1bdfc7b83c8ab03872a6251c 100644 --- a/tests/config/users.d/database_replicated.xml +++ b/tests/config/users.d/database_replicated.xml @@ -2,9 +2,11 @@ 1 - 0 + none 30 30 + 1 + 2 diff --git a/tests/integration/test_mysql_database_engine/test.py b/tests/integration/test_mysql_database_engine/test.py index 4d10e2ea6f5a3943ce2f880c0f4b67e1e98e9b0f..f4b0bb1b2fc4409c4cdd15c4e041248a8b3315c3 100644 --- a/tests/integration/test_mysql_database_engine/test.py +++ b/tests/integration/test_mysql_database_engine/test.py @@ -146,10 +146,14 @@ def test_clickhouse_join_for_mysql_database(started_cluster): "CREATE TABLE default.t1_remote_mysql AS mysql('mysql1:3306','test','t1_mysql_local','root','clickhouse')") clickhouse_node.query( "CREATE TABLE default.t2_remote_mysql AS mysql('mysql1:3306','test','t2_mysql_local','root','clickhouse')") + clickhouse_node.query("INSERT INTO `default`.`t1_remote_mysql` VALUES ('EN','A',''),('RU','B','AAA')") + clickhouse_node.query("INSERT INTO `default`.`t2_remote_mysql` VALUES ('A','AAA'),('Z','')") + assert clickhouse_node.query("SELECT s.pays " "FROM default.t1_remote_mysql AS s " "LEFT JOIN default.t1_remote_mysql AS s_ref " - "ON (s_ref.opco = s.opco AND s_ref.service = s.service)") == '' + "ON (s_ref.opco = s.opco AND s_ref.service = s.service) " + "WHERE s_ref.opco != '' AND s.opco != '' ").rstrip() == 'RU' mysql_node.query("DROP DATABASE test") diff --git a/tests/integration/test_replicated_database/configs/config.xml b/tests/integration/test_replicated_database/configs/config.xml index ebceee3aa5c867b933d4f5e19e012ab472730747..d751454437cdb082e031189f75a7c472689846a2 100644 --- a/tests/integration/test_replicated_database/configs/config.xml +++ b/tests/integration/test_replicated_database/configs/config.xml @@ -1,34 +1,3 @@ 10 - - - - - true - - main_node - 9000 - - - dummy_node - 9000 - - - competing_node - 9000 - - - - true - - snapshotting_node - 9000 - - - snapshot_recovering_node - 9000 - - - - diff --git a/tests/integration/test_replicated_database/configs/settings.xml
b/tests/integration/test_replicated_database/configs/settings.xml index e0f7e8691e663eedb9fec467826dbb2669163b61..7f45502e20d4063f1a9514a8750f8cfbd57c98b2 100644 --- a/tests/integration/test_replicated_database/configs/settings.xml +++ b/tests/integration/test_replicated_database/configs/settings.xml @@ -2,6 +2,7 @@ 1 + 1 diff --git a/tests/integration/test_replicated_database/test.py b/tests/integration/test_replicated_database/test.py index 99e7d6077f829742626d1f3570cb1ab737ba3c05..f02457b144adbd60e6a41d2d79886d338e883b0b 100644 --- a/tests/integration/test_replicated_database/test.py +++ b/tests/integration/test_replicated_database/test.py @@ -99,16 +99,20 @@ def test_alters_from_different_replicas(started_cluster): "(CounterID UInt32, StartDate Date, UserID UInt32, VisitID UInt32, NestedColumn Nested(A UInt8, S String), ToDrop UInt32) " "ENGINE = MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192);") - main_node.query("CREATE TABLE testdb.dist AS testdb.concurrent_test ENGINE = Distributed(cluster, testdb, concurrent_test, CounterID)") + main_node.query("CREATE TABLE testdb.dist AS testdb.concurrent_test ENGINE = Distributed(testdb, testdb, concurrent_test, CounterID)") dummy_node.stop_clickhouse(kill=True) - settings = {"distributed_ddl_task_timeout": 10} + settings = {"distributed_ddl_task_timeout": 5} assert "There are 1 unfinished hosts (0 of them are currently active)" in \ competing_node.query_and_get_error("ALTER TABLE testdb.concurrent_test ADD COLUMN Added0 UInt32;", settings=settings) + settings = {"distributed_ddl_task_timeout": 5, "distributed_ddl_output_mode": "null_status_on_timeout"} + assert "shard1|replica2\t\\N\t\\N" in \ + main_node.query("ALTER TABLE testdb.concurrent_test ADD COLUMN Added2 UInt32;", settings=settings) + settings = {"distributed_ddl_task_timeout": 5, "distributed_ddl_output_mode": "never_throw"} + assert "shard1|replica2\t\\N\t\\N" in \ + competing_node.query("ALTER TABLE testdb.concurrent_test ADD COLUMN Added1 UInt32 AFTER Added0;", settings=settings) dummy_node.start_clickhouse() - main_node.query("ALTER TABLE testdb.concurrent_test ADD COLUMN Added2 UInt32;") - competing_node.query("ALTER TABLE testdb.concurrent_test ADD COLUMN Added1 UInt32 AFTER Added0;") main_node.query("ALTER TABLE testdb.concurrent_test ADD COLUMN AddedNested1 Nested(A UInt32, B UInt64) AFTER Added2;") competing_node.query("ALTER TABLE testdb.concurrent_test ADD COLUMN AddedNested1.C Array(String) AFTER AddedNested1.B;") main_node.query("ALTER TABLE testdb.concurrent_test ADD COLUMN AddedNested2 Nested(A UInt32, B UInt64) AFTER AddedNested1;") @@ -198,8 +202,14 @@ def test_recover_staled_replica(started_cluster): dummy_node.query("CREATE TABLE recover.rmt2 (n int) ENGINE=ReplicatedMergeTree order by n", settings=settings) main_node.query("CREATE TABLE recover.rmt3 (n int) ENGINE=ReplicatedMergeTree order by n", settings=settings) dummy_node.query("CREATE TABLE recover.rmt5 (n int) ENGINE=ReplicatedMergeTree order by n", settings=settings) - main_node.query("CREATE DICTIONARY recover.d1 (n int DEFAULT 0, m int DEFAULT 1) PRIMARY KEY n SOURCE(CLICKHOUSE(HOST 'localhost' PORT 9000 USER 'default' TABLE 'rmt1' PASSWORD '' DB 'recover')) LIFETIME(MIN 1 MAX 10) LAYOUT(FLAT())") - dummy_node.query("CREATE DICTIONARY recover.d2 (n int DEFAULT 0, m int DEFAULT 1) PRIMARY KEY n SOURCE(CLICKHOUSE(HOST 'localhost' PORT 9000 USER 'default' TABLE 'rmt2' PASSWORD '' DB 'recover')) LIFETIME(MIN 1 MAX 10) LAYOUT(FLAT())") + 
main_node.query("CREATE MATERIALIZED VIEW recover.mv1 (n int) ENGINE=ReplicatedMergeTree order by n AS SELECT n FROM recover.rmt1", settings=settings) + dummy_node.query("CREATE MATERIALIZED VIEW recover.mv2 (n int) ENGINE=ReplicatedMergeTree order by n AS SELECT n FROM recover.rmt2", settings=settings) + main_node.query("CREATE DICTIONARY recover.d1 (n int DEFAULT 0, m int DEFAULT 1) PRIMARY KEY n " + "SOURCE(CLICKHOUSE(HOST 'localhost' PORT 9000 USER 'default' TABLE 'rmt1' PASSWORD '' DB 'recover')) " + "LIFETIME(MIN 1 MAX 10) LAYOUT(FLAT())") + dummy_node.query("CREATE DICTIONARY recover.d2 (n int DEFAULT 0, m int DEFAULT 1) PRIMARY KEY n " + "SOURCE(CLICKHOUSE(HOST 'localhost' PORT 9000 USER 'default' TABLE 'rmt2' PASSWORD '' DB 'recover')) " + "LIFETIME(MIN 1 MAX 10) LAYOUT(FLAT())") for table in ['t1', 't2', 'mt1', 'mt2', 'rmt1', 'rmt2', 'rmt3', 'rmt5']: main_node.query("INSERT INTO recover.{} VALUES (42)".format(table)) @@ -217,35 +227,44 @@ def test_recover_staled_replica(started_cluster): main_node.query("RENAME TABLE recover.rmt3 TO recover.rmt4", settings=settings) main_node.query("DROP TABLE recover.rmt5", settings=settings) main_node.query("DROP DICTIONARY recover.d2", settings=settings) - main_node.query("CREATE DICTIONARY recover.d2 (n int DEFAULT 0, m int DEFAULT 1) PRIMARY KEY n SOURCE(CLICKHOUSE(HOST 'localhost' PORT 9000 USER 'default' TABLE 'rmt1' PASSWORD '' DB 'recover')) LIFETIME(MIN 1 MAX 10) LAYOUT(FLAT());", settings=settings) + main_node.query("CREATE DICTIONARY recover.d2 (n int DEFAULT 0, m int DEFAULT 1) PRIMARY KEY n " + "SOURCE(CLICKHOUSE(HOST 'localhost' PORT 9000 USER 'default' TABLE 'rmt1' PASSWORD '' DB 'recover')) " + "LIFETIME(MIN 1 MAX 10) LAYOUT(FLAT());", settings=settings) + + inner_table = ".inner_id." + dummy_node.query("SELECT uuid FROM system.tables WHERE database='recover' AND name='mv1'").strip() + main_node.query("ALTER TABLE recover.`{}` MODIFY COLUMN n int DEFAULT 42".format(inner_table), settings=settings) + main_node.query("ALTER TABLE recover.mv1 MODIFY QUERY SELECT m FROM recover.rmt1".format(inner_table), settings=settings) + main_node.query("RENAME TABLE recover.mv2 TO recover.mv3".format(inner_table), settings=settings) - main_node.query("CREATE TABLE recover.tmp AS recover.m1", settings=settings) - main_node.query("DROP TABLE recover.tmp", settings=settings) main_node.query("CREATE TABLE recover.tmp AS recover.m1", settings=settings) main_node.query("DROP TABLE recover.tmp", settings=settings) main_node.query("CREATE TABLE recover.tmp AS recover.m1", settings=settings) main_node.query("DROP TABLE recover.tmp", settings=settings) main_node.query("CREATE TABLE recover.tmp AS recover.m1", settings=settings) - assert main_node.query("SELECT name FROM system.tables WHERE database='recover' ORDER BY name") == "d1\nd2\nm1\nmt1\nmt2\nrmt1\nrmt2\nrmt4\nt2\ntmp\n" - query = "SELECT name, uuid, create_table_query FROM system.tables WHERE database='recover' ORDER BY name" + assert main_node.query("SELECT name FROM system.tables WHERE database='recover' AND name NOT LIKE '.inner_id.%' ORDER BY name") == \ + "d1\nd2\nm1\nmt1\nmt2\nmv1\nmv3\nrmt1\nrmt2\nrmt4\nt2\ntmp\n" + query = "SELECT name, uuid, create_table_query FROM system.tables WHERE database='recover' AND name NOT LIKE '.inner_id.%' " \ + "ORDER BY name SETTINGS show_table_uuid_in_table_create_query_if_not_nil=1" expected = main_node.query(query) assert_eq_with_retry(dummy_node, query, expected) + assert main_node.query("SELECT count() FROM system.tables WHERE database='recover' AND 
name LIKE '.inner_id.%'") == "2\n" + assert dummy_node.query("SELECT count() FROM system.tables WHERE database='recover' AND name LIKE '.inner_id.%'") == "2\n" - for table in ['m1', 't2', 'mt1', 'mt2', 'rmt1', 'rmt2', 'rmt4', 'd1', 'd2']: + for table in ['m1', 't2', 'mt1', 'mt2', 'rmt1', 'rmt2', 'rmt4', 'd1', 'd2', 'mv1', 'mv3']: assert main_node.query("SELECT (*,).1 FROM recover.{}".format(table)) == "42\n" - for table in ['t2', 'rmt1', 'rmt2', 'rmt4', 'd1', 'd2', 'mt2']: + for table in ['t2', 'rmt1', 'rmt2', 'rmt4', 'd1', 'd2', 'mt2', 'mv1', 'mv3']: assert dummy_node.query("SELECT (*,).1 FROM recover.{}".format(table)) == "42\n" for table in ['m1', 'mt1']: assert dummy_node.query("SELECT count() FROM recover.{}".format(table)) == "0\n" assert dummy_node.query("SELECT count() FROM system.tables WHERE database='recover_broken_tables'") == "2\n" - table = dummy_node.query("SHOW TABLES FROM recover_broken_tables LIKE 'mt1_26_%'").strip() + table = dummy_node.query("SHOW TABLES FROM recover_broken_tables LIKE 'mt1_29_%'").strip() assert dummy_node.query("SELECT (*,).1 FROM recover_broken_tables.{}".format(table)) == "42\n" - table = dummy_node.query("SHOW TABLES FROM recover_broken_tables LIKE 'rmt5_26_%'").strip() + table = dummy_node.query("SHOW TABLES FROM recover_broken_tables LIKE 'rmt5_29_%'").strip() assert dummy_node.query("SELECT (*,).1 FROM recover_broken_tables.{}".format(table)) == "42\n" - expected = "Cleaned 4 outdated objects: dropped 1 dictionaries and 1 tables, moved 2 tables" + expected = "Cleaned 6 outdated objects: dropped 1 dictionaries and 3 tables, moved 2 tables" assert_logs_contain(dummy_node, expected) dummy_node.query("DROP TABLE recover.tmp") diff --git a/tests/queries/0_stateless/00189_time_zones.reference b/tests/queries/0_stateless/00189_time_zones_long.reference similarity index 98% rename from tests/queries/0_stateless/00189_time_zones.reference rename to tests/queries/0_stateless/00189_time_zones_long.reference index e7e5a71782aa8836c7366409bdb274dbf6386098..df42e8f1b6ed51297307d917473f020be398016d 100644 --- a/tests/queries/0_stateless/00189_time_zones.reference +++ b/tests/queries/0_stateless/00189_time_zones_long.reference @@ -148,9 +148,9 @@ toStartOfInterval 2019-02-05 00:00:00 2019-02-03 00:00:00 2019-02-06 22:00:00 -2019-02-06 21:00:00 -2019-02-06 21:00:00 -2019-02-06 03:00:00 +2019-02-06 22:00:00 +2019-02-06 18:00:00 +2019-02-06 00:00:00 2019-02-06 22:57:00 2019-02-06 22:56:00 2019-02-06 22:55:00 diff --git a/tests/queries/0_stateless/00189_time_zones.sql b/tests/queries/0_stateless/00189_time_zones_long.sql similarity index 100% rename from tests/queries/0_stateless/00189_time_zones.sql rename to tests/queries/0_stateless/00189_time_zones_long.sql diff --git a/tests/queries/0_stateless/01175_distributed_ddl_output_mode_long.reference b/tests/queries/0_stateless/01175_distributed_ddl_output_mode_long.reference new file mode 100644 index 0000000000000000000000000000000000000000..0cc8c788fed189aeacdbb5c354acc5e8974d4e51 --- /dev/null +++ b/tests/queries/0_stateless/01175_distributed_ddl_output_mode_long.reference @@ -0,0 +1,25 @@ +none +Received exception from server: +Code: 57. Error: Received from localhost:9000. Error: There was an error on [localhost:9000]: Code: 57, e.displayText() = Error: Table default.throw already exists +Received exception from server: +Code: 159. Error: Received from localhost:9000. Error: Watching task is executing longer than distributed_ddl_task_timeout (=8) seconds. 
There are 1 unfinished hosts (0 of them are currently active), they are going to execute the query in background. +throw +localhost 9000 0 0 0 +localhost 9000 57 Code: 57, e.displayText() = Error: Table default.throw already exists. 0 0 +Received exception from server: +Code: 57. Error: Received from localhost:9000. Error: There was an error on [localhost:9000]: Code: 57, e.displayText() = Error: Table default.throw already exists +localhost 9000 0 1 0 +Received exception from server: +Code: 159. Error: Received from localhost:9000. Error: Watching task is executing longer than distributed_ddl_task_timeout (=8) seconds. There are 1 unfinished hosts (0 of them are currently active), they are going to execute the query in background. +null_status_on_timeout +localhost 9000 0 0 0 +localhost 9000 57 Code: 57, e.displayText() = Error: Table default.null_status already exists. 0 0 +Received exception from server: +Code: 57. Error: Received from localhost:9000. Error: There was an error on [localhost:9000]: Code: 57, e.displayText() = Error: Table default.null_status already exists +localhost 9000 0 1 0 +localhost 1 \N \N 1 0 +never_throw +localhost 9000 0 0 0 +localhost 9000 57 Code: 57, e.displayText() = Error: Table default.never_throw already exists. 0 0 +localhost 9000 0 1 0 +localhost 1 \N \N 1 0 diff --git a/tests/queries/0_stateless/01175_distributed_ddl_output_mode_long.sh b/tests/queries/0_stateless/01175_distributed_ddl_output_mode_long.sh new file mode 100755 index 0000000000000000000000000000000000000000..66ceef2168271849a188efc810dc167e88bf4910 --- /dev/null +++ b/tests/queries/0_stateless/01175_distributed_ddl_output_mode_long.sh @@ -0,0 +1,39 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + + +$CLICKHOUSE_CLIENT -q "drop table if exists throw;" +$CLICKHOUSE_CLIENT -q "drop table if exists null_status;" +$CLICKHOUSE_CLIENT -q "drop table if exists never_throw;" + +CLICKHOUSE_CLIENT_OPT=$(echo ${CLICKHOUSE_CLIENT_OPT} | sed 's/'"--send_logs_level=${CLICKHOUSE_CLIENT_SERVER_LOGS_LEVEL}"'/--send_logs_level=fatal/g') + +CLIENT="$CLICKHOUSE_CLIENT_BINARY $CLICKHOUSE_CLIENT_OPT --distributed_ddl_task_timeout=8 --distributed_ddl_output_mode=none" +$CLIENT -q "select value from system.settings where name='distributed_ddl_output_mode';" +# Ok +$CLIENT -q "create table throw on cluster test_shard_localhost (n int) engine=Memory;" +# Table exists +$CLIENT -q "create table throw on cluster test_shard_localhost (n int) engine=Memory;" 2>&1| grep -Fv "@ 0x" | sed "s/DB::Exception/Error/g" | sed "s/ (version.*)//" | sed "s/exists.. /exists/" +# Timeout +$CLIENT -q "drop table throw on cluster test_unavailable_shard;" 2>&1| grep -Fv "@ 0x" | sed "s/DB::Exception/Error/g" | sed "s/ (version.*)//" | sed "s/Watching task .* is executing longer/Watching task is executing longer/" | sed "s/background. /background./" + +CLIENT="$CLICKHOUSE_CLIENT_BINARY $CLICKHOUSE_CLIENT_OPT --distributed_ddl_task_timeout=8 --distributed_ddl_output_mode=throw" +$CLIENT -q "select value from system.settings where name='distributed_ddl_output_mode';" +$CLIENT -q "create table throw on cluster test_shard_localhost (n int) engine=Memory;" +$CLIENT -q "create table throw on cluster test_shard_localhost (n int) engine=Memory;" 2>&1| grep -Fv "@ 0x" | sed "s/DB::Exception/Error/g" | sed "s/ (version.*)//" | sed "s/exists.. 
/exists/" +$CLIENT -q "drop table throw on cluster test_unavailable_shard;" 2>&1| grep -Fv "@ 0x" | sed "s/DB::Exception/Error/g" | sed "s/ (version.*)//" | sed "s/Watching task .* is executing longer/Watching task is executing longer/" | sed "s/background. /background./" + +CLIENT="$CLICKHOUSE_CLIENT_BINARY $CLICKHOUSE_CLIENT_OPT --distributed_ddl_task_timeout=8 --distributed_ddl_output_mode=null_status_on_timeout" +$CLIENT -q "select value from system.settings where name='distributed_ddl_output_mode';" +$CLIENT -q "create table null_status on cluster test_shard_localhost (n int) engine=Memory;" +$CLIENT -q "create table null_status on cluster test_shard_localhost (n int) engine=Memory;" 2>&1| grep -Fv "@ 0x" | sed "s/DB::Exception/Error/g" | sed "s/ (version.*)//" | sed "s/exists.. /exists/" +$CLIENT -q "drop table null_status on cluster test_unavailable_shard;" + +CLIENT="$CLICKHOUSE_CLIENT_BINARY $CLICKHOUSE_CLIENT_OPT --distributed_ddl_task_timeout=8 --distributed_ddl_output_mode=never_throw" +$CLIENT -q "select value from system.settings where name='distributed_ddl_output_mode';" +$CLIENT -q "create table never_throw on cluster test_shard_localhost (n int) engine=Memory;" +$CLIENT -q "create table never_throw on cluster test_shard_localhost (n int) engine=Memory;" 2>&1| sed "s/DB::Exception/Error/g" | sed "s/ (version.*)//" +$CLIENT -q "drop table never_throw on cluster test_unavailable_shard;" diff --git a/tests/queries/0_stateless/01566_negate_formatting.reference b/tests/queries/0_stateless/01566_negate_formatting.reference new file mode 100644 index 0000000000000000000000000000000000000000..b955d4cbbc5fac2c5bfd5e07c091b15fbebd7835 --- /dev/null +++ b/tests/queries/0_stateless/01566_negate_formatting.reference @@ -0,0 +1,20 @@ +-- { echo } +explain syntax select negate(1), negate(-1), - -1, -(-1), (-1) in (-1); +SELECT + -1, + 1, + 1, + 1, + -1 IN (-1) +explain syntax select negate(1.), negate(-1.), - -1., -(-1.), (-1.) in (-1.); +SELECT + -1., + 1, + 1, + 1, + -1. IN (-1.) +explain syntax select negate(-9223372036854775808), -(-9223372036854775808), - -9223372036854775808; +SELECT + -9223372036854775808, + -9223372036854775808, + -9223372036854775808 diff --git a/tests/queries/0_stateless/01566_negate_formatting.sql b/tests/queries/0_stateless/01566_negate_formatting.sql new file mode 100644 index 0000000000000000000000000000000000000000..035ff80e8d86d95da2c619712253cd905ab9ebef --- /dev/null +++ b/tests/queries/0_stateless/01566_negate_formatting.sql @@ -0,0 +1,4 @@ +-- { echo } +explain syntax select negate(1), negate(-1), - -1, -(-1), (-1) in (-1); +explain syntax select negate(1.), negate(-1.), - -1., -(-1.), (-1.) 
in (-1.); +explain syntax select negate(-9223372036854775808), -(-9223372036854775808), - -9223372036854775808; diff --git a/tests/queries/0_stateless/01666_blns.sql b/tests/queries/0_stateless/01666_blns.sql index 5d7cc4881bb8cb3e94295632628ac88e45d46b43..be9632092bc48523a5aa5a2a79c8f2f3af91f117 100644 --- a/tests/queries/0_stateless/01666_blns.sql +++ b/tests/queries/0_stateless/01666_blns.sql @@ -554,9 +554,9 @@ SELECT count() FROM test; DROP TABLE IF EXISTS test_r1; DROP TABLE IF EXISTS test_r2; -CREATE TABLE test_r1 AS test ENGINE = ReplicatedMergeTree('/clickhouse/test/01666_blns', 'r1') ORDER BY "\\" SETTINGS min_bytes_for_wide_part = '100G'; +CREATE TABLE test_r1 AS test ENGINE = ReplicatedMergeTree('/clickhouse/test_01666', 'r1') ORDER BY "\\" SETTINGS min_bytes_for_wide_part = '100G'; INSERT INTO test_r1 SELECT * FROM test; -CREATE TABLE test_r2 AS test ENGINE = ReplicatedMergeTree('/clickhouse/test/01666_blns', 'r2') ORDER BY "\\" SETTINGS min_bytes_for_wide_part = '100G'; +CREATE TABLE test_r2 AS test ENGINE = ReplicatedMergeTree('/clickhouse/test_01666', 'r2') ORDER BY "\\" SETTINGS min_bytes_for_wide_part = '100G'; SYSTEM SYNC REPLICA test_r2; diff --git a/tests/queries/0_stateless/01757_optimize_skip_unused_shards_limit.reference b/tests/queries/0_stateless/01757_optimize_skip_unused_shards_limit.reference new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tests/queries/0_stateless/01757_optimize_skip_unused_shards_limit.sql b/tests/queries/0_stateless/01757_optimize_skip_unused_shards_limit.sql new file mode 100644 index 0000000000000000000000000000000000000000..68247dbfbe5dcba9954cf89dd0398b936f810517 --- /dev/null +++ b/tests/queries/0_stateless/01757_optimize_skip_unused_shards_limit.sql @@ -0,0 +1,33 @@ +drop table if exists dist_01757; +create table dist_01757 as system.one engine=Distributed(test_cluster_two_shards, system, one, dummy); + +set optimize_skip_unused_shards=1; +set force_optimize_skip_unused_shards=2; + +-- in +select * from dist_01757 where dummy in (0,) format Null; +select * from dist_01757 where dummy in (0, 1) format Null settings optimize_skip_unused_shards_limit=2; + +-- in negative +select * from dist_01757 where dummy in (0, 1) settings optimize_skip_unused_shards_limit=1; -- { serverError 507 } + +-- or negative +select * from dist_01757 where dummy = 0 or dummy = 1 settings optimize_skip_unused_shards_limit=1; -- { serverError 507 } + +-- or +select * from dist_01757 where dummy = 0 or dummy = 1 format Null settings optimize_skip_unused_shards_limit=2; + +-- and negative +select * from dist_01757 where dummy = 0 and dummy = 1 settings optimize_skip_unused_shards_limit=1; -- { serverError 507 } +select * from dist_01757 where dummy = 0 and dummy = 2 and dummy = 3 settings optimize_skip_unused_shards_limit=1; -- { serverError 507 } +select * from dist_01757 where dummy = 0 and dummy = 2 and dummy = 3 settings optimize_skip_unused_shards_limit=2; -- { serverError 507 } + +-- and +select * from dist_01757 where dummy = 0 and dummy = 1 settings optimize_skip_unused_shards_limit=2; +select * from dist_01757 where dummy = 0 and dummy = 1 and dummy = 3 settings optimize_skip_unused_shards_limit=3; + +-- ARGUMENT_OUT_OF_BOUND error +select * from dist_01757 where dummy in (0, 1) settings optimize_skip_unused_shards_limit=0; -- { serverError 69 } +select * from dist_01757 where dummy in (0, 1) settings optimize_skip_unused_shards_limit=9223372036854775808; -- { serverError 69 } + 
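-- Code-to-name mapping, assumed from the error codes used above: 69 is
-- ARGUMENT_OUT_OF_BOUND (the range check on the setting in StorageDistributed.cpp)
-- and 507 is taken here to be UNABLE_TO_SKIP_UNUSED_SHARDS, raised because
-- force_optimize_skip_unused_shards=2 demands that shards actually be skipped.
-- A sketch of the boundary behavior under the same assumptions: a query that infers
-- exactly as many sharding-key values as the limit should still pass, e.g.
--   select * from dist_01757 where dummy in (1) format Null settings optimize_skip_unused_shards_limit=1;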
+drop table dist_01757; diff --git a/tests/queries/0_stateless/01772_to_start_of_hour_align.reference b/tests/queries/0_stateless/01772_to_start_of_hour_align.reference new file mode 100644 index 0000000000000000000000000000000000000000..f130df3bef5599f52fd43f998e38aab44d5dd6ca --- /dev/null +++ b/tests/queries/0_stateless/01772_to_start_of_hour_align.reference @@ -0,0 +1,86 @@ +2021-03-23 00:00:00 +2021-03-23 11:00:00 +2021-03-23 22:00:00 +2021-03-23 13:00:00 +2021-03-23 12:00:00 +2021-03-23 00:00:00 +2010-03-28 00:00:00 2010-03-28 00:00:00 1269723600 +2010-03-28 00:15:00 2010-03-28 00:00:00 1269724500 +2010-03-28 00:30:00 2010-03-28 00:00:00 1269725400 +2010-03-28 00:45:00 2010-03-28 00:00:00 1269726300 +2010-03-28 01:00:00 2010-03-28 00:00:00 1269727200 +2010-03-28 01:15:00 2010-03-28 00:00:00 1269728100 +2010-03-28 01:30:00 2010-03-28 00:00:00 1269729000 +2010-03-28 01:45:00 2010-03-28 00:00:00 1269729900 +2010-03-28 03:00:00 2010-03-28 03:00:00 1269730800 +2010-03-28 03:15:00 2010-03-28 03:00:00 1269731700 +2010-03-28 03:30:00 2010-03-28 03:00:00 1269732600 +2010-03-28 03:45:00 2010-03-28 03:00:00 1269733500 +2010-03-28 04:00:00 2010-03-28 04:00:00 1269734400 +2010-03-28 04:15:00 2010-03-28 04:00:00 1269735300 +2010-03-28 04:30:00 2010-03-28 04:00:00 1269736200 +2010-03-28 04:45:00 2010-03-28 04:00:00 1269737100 +2010-03-28 05:00:00 2010-03-28 04:00:00 1269738000 +2010-03-28 05:15:00 2010-03-28 04:00:00 1269738900 +2010-03-28 05:30:00 2010-03-28 04:00:00 1269739800 +2010-03-28 05:45:00 2010-03-28 04:00:00 1269740700 +2010-10-31 00:00:00 2010-10-31 00:00:00 1288468800 +2010-10-31 00:15:00 2010-10-31 00:00:00 1288469700 +2010-10-31 00:30:00 2010-10-31 00:00:00 1288470600 +2010-10-31 00:45:00 2010-10-31 00:00:00 1288471500 +2010-10-31 01:00:00 2010-10-31 00:00:00 1288472400 +2010-10-31 01:15:00 2010-10-31 00:00:00 1288473300 +2010-10-31 01:30:00 2010-10-31 00:00:00 1288474200 +2010-10-31 01:45:00 2010-10-31 00:00:00 1288475100 +2010-10-31 02:00:00 2010-10-31 02:00:00 1288476000 +2010-10-31 02:15:00 2010-10-31 02:00:00 1288476900 +2010-10-31 02:30:00 2010-10-31 02:00:00 1288477800 +2010-10-31 02:45:00 2010-10-31 02:00:00 1288478700 +2010-10-31 02:00:00 2010-10-31 02:00:00 1288479600 +2010-10-31 02:15:00 2010-10-31 02:00:00 1288480500 +2010-10-31 02:30:00 2010-10-31 02:00:00 1288481400 +2010-10-31 02:45:00 2010-10-31 02:00:00 1288482300 +2010-10-31 03:00:00 2010-10-31 02:00:00 1288483200 +2010-10-31 03:15:00 2010-10-31 02:00:00 1288484100 +2010-10-31 03:30:00 2010-10-31 02:00:00 1288485000 +2010-10-31 03:45:00 2010-10-31 02:00:00 1288485900 +2020-04-05 00:00:00 2020-04-05 00:00:00 1586005200 +2020-04-05 00:15:00 2020-04-05 00:00:00 1586006100 +2020-04-05 00:30:00 2020-04-05 00:00:00 1586007000 +2020-04-05 00:45:00 2020-04-05 00:00:00 1586007900 +2020-04-05 01:00:00 2020-04-05 00:00:00 1586008800 +2020-04-05 01:15:00 2020-04-05 00:00:00 1586009700 +2020-04-05 01:30:00 2020-04-05 00:00:00 1586010600 +2020-04-05 01:45:00 2020-04-05 00:00:00 1586011500 +2020-04-05 01:30:00 2020-04-05 00:00:00 1586012400 +2020-04-05 01:45:00 2020-04-05 00:00:00 1586013300 +2020-04-05 02:00:00 2020-04-05 02:00:00 1586014200 +2020-04-05 02:15:00 2020-04-05 02:00:00 1586015100 +2020-04-05 02:30:00 2020-04-05 02:00:00 1586016000 +2020-04-05 02:45:00 2020-04-05 02:00:00 1586016900 +2020-04-05 03:00:00 2020-04-05 02:00:00 1586017800 +2020-04-05 03:15:00 2020-04-05 02:00:00 1586018700 +2020-04-05 03:30:00 2020-04-05 02:00:00 1586019600 +2020-04-05 03:45:00 2020-04-05 02:00:00 1586020500 +2020-04-05 04:00:00 2020-04-05 
+2020-04-05 04:15:00 2020-04-05 04:00:00 1586022300
+2020-10-04 00:00:00 2020-10-04 00:00:00 1601731800
+2020-10-04 00:15:00 2020-10-04 00:00:00 1601732700
+2020-10-04 00:30:00 2020-10-04 00:00:00 1601733600
+2020-10-04 00:45:00 2020-10-04 00:00:00 1601734500
+2020-10-04 01:00:00 2020-10-04 00:00:00 1601735400
+2020-10-04 01:15:00 2020-10-04 00:00:00 1601736300
+2020-10-04 01:30:00 2020-10-04 00:00:00 1601737200
+2020-10-04 01:45:00 2020-10-04 00:00:00 1601738100
+2020-10-04 02:30:00 2020-10-04 02:30:00 1601739000
+2020-10-04 02:45:00 2020-10-04 02:30:00 1601739900
+2020-10-04 03:00:00 2020-10-04 02:30:00 1601740800
+2020-10-04 03:15:00 2020-10-04 02:30:00 1601741700
+2020-10-04 03:30:00 2020-10-04 02:30:00 1601742600
+2020-10-04 03:45:00 2020-10-04 02:30:00 1601743500
+2020-10-04 04:00:00 2020-10-04 04:00:00 1601744400
+2020-10-04 04:15:00 2020-10-04 04:00:00 1601745300
+2020-10-04 04:30:00 2020-10-04 04:00:00 1601746200
+2020-10-04 04:45:00 2020-10-04 04:00:00 1601747100
+2020-10-04 05:00:00 2020-10-04 04:00:00 1601748000
+2020-10-04 05:15:00 2020-10-04 04:00:00 1601748900
diff --git a/tests/queries/0_stateless/01772_to_start_of_hour_align.sql b/tests/queries/0_stateless/01772_to_start_of_hour_align.sql
new file mode 100644
index 0000000000000000000000000000000000000000..6d1bb460f905677753ac45cc8a30b43be0de15ee
--- /dev/null
+++ b/tests/queries/0_stateless/01772_to_start_of_hour_align.sql
@@ -0,0 +1,21 @@
+-- Rounding down to hour intervals is aligned to midnight, even if the interval length does not divide the day evenly.
+SELECT toStartOfInterval(toDateTime('2021-03-23 03:58:00'), INTERVAL 11 HOUR);
+SELECT toStartOfInterval(toDateTime('2021-03-23 13:58:00'), INTERVAL 11 HOUR);
+SELECT toStartOfInterval(toDateTime('2021-03-23 23:58:00'), INTERVAL 11 HOUR);
+
+-- It should work correctly even in timezones with a non-whole-hour offset. India has +05:30.
+SELECT toStartOfHour(toDateTime('2021-03-23 13:58:00', 'Asia/Kolkata'));
+SELECT toStartOfInterval(toDateTime('2021-03-23 13:58:00', 'Asia/Kolkata'), INTERVAL 6 HOUR);
+
+-- Specifying an interval longer than 24 hours is not correct, but it works as expected by simply rounding down to midnight.
+SELECT toStartOfInterval(toDateTime('2021-03-23 13:58:00', 'Asia/Kolkata'), INTERVAL 66 HOUR);
+
+-- In case of timezone shifts, rounding is performed on the hour number in "wall clock" time.
+-- The intervals may become shorter or longer due to time shifts: for example, a three-hour interval may actually last only two hours.
+-- If the same "wall clock" hour number corresponds to multiple time points due to shifting backwards, an unspecified time point is selected among the candidates.
+SELECT toDateTime('2010-03-28 00:00:00', 'Europe/Moscow') + INTERVAL 15 * number MINUTE AS src, toStartOfInterval(src, INTERVAL 2 HOUR) AS rounded, toUnixTimestamp(src) AS t FROM numbers(20);
+SELECT toDateTime('2010-10-31 00:00:00', 'Europe/Moscow') + INTERVAL 15 * number MINUTE AS src, toStartOfInterval(src, INTERVAL 2 HOUR) AS rounded, toUnixTimestamp(src) AS t FROM numbers(20);
+
+-- And this should work even for shifts by a non-whole number of hours.
+SELECT toDateTime('2020-04-05 00:00:00', 'Australia/Lord_Howe') + INTERVAL 15 * number MINUTE AS src, toStartOfInterval(src, INTERVAL 2 HOUR) AS rounded, toUnixTimestamp(src) AS t FROM numbers(20);
+SELECT toDateTime('2020-10-04 00:00:00', 'Australia/Lord_Howe') + INTERVAL 15 * number MINUTE AS src, toStartOfInterval(src, INTERVAL 2 HOUR) AS rounded, toUnixTimestamp(src) AS t FROM numbers(20);
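The reference output above is easier to read with the rounding rule from the test's first comment in mind: an N-hour interval is counted in whole intervals from local midnight, so for INTERVAL 11 HOUR the boundaries within a day fall at 00:00, 11:00 and 22:00 (the first three reference rows). A minimal C++ sketch of that arithmetic, deliberately ignoring daylight-saving adjustments; the function name and the day_start input are hypothetical, not part of this patch:

#include <cstdint>
#include <ctime>

// Sketch only: round a local time down to an N-hour interval aligned to local
// midnight. 'day_start' is assumed to be the timestamp of local midnight for
// the day containing 't'; DST transitions are not handled here.
time_t toStartOfHourIntervalSketch(time_t t, time_t day_start, uint64_t hours)
{
    const uint64_t interval = hours * 3600;                 // interval length in seconds
    uint64_t since_midnight = static_cast<uint64_t>(t - day_start); // seconds elapsed since local midnight
    since_midnight = since_midnight / interval * interval;  // round down to an interval boundary
    return day_start + static_cast<time_t>(since_midnight);
}

With day_start at 2021-03-23 00:00:00 and hours = 11, the test's inputs 03:58, 13:58 and 23:58 round down to 00:00, 11:00 and 22:00 respectively, matching the reference.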
diff --git a/tests/queries/0_stateless/01773_case_sensitive_version.reference b/tests/queries/0_stateless/01773_case_sensitive_version.reference
new file mode 100644
index 0000000000000000000000000000000000000000..72749c905a314fb1c4bdabd91f28aef935074b97
--- /dev/null
+++ b/tests/queries/0_stateless/01773_case_sensitive_version.reference
@@ -0,0 +1 @@
+1 1 1
diff --git a/tests/queries/0_stateless/01773_case_sensitive_version.sql b/tests/queries/0_stateless/01773_case_sensitive_version.sql
new file mode 100644
index 0000000000000000000000000000000000000000..27fa1c27b2ac71779e4997c3076ef45b0f8a4803
--- /dev/null
+++ b/tests/queries/0_stateless/01773_case_sensitive_version.sql
@@ -0,0 +1 @@
+SELECT version()=Version(), VERSION()=Version(), vErSiOn()=VeRsIoN();
diff --git a/tests/queries/0_stateless/01774_case_sensitive_connection_id.reference b/tests/queries/0_stateless/01774_case_sensitive_connection_id.reference
new file mode 100644
index 0000000000000000000000000000000000000000..95bea1a178ce7609a4a4e9c04a2d2d3e526ca6f6
--- /dev/null
+++ b/tests/queries/0_stateless/01774_case_sensitive_connection_id.reference
@@ -0,0 +1 @@
+0 0 0 0 0 0
diff --git a/tests/queries/0_stateless/01774_case_sensitive_connection_id.sql b/tests/queries/0_stateless/01774_case_sensitive_connection_id.sql
new file mode 100644
index 0000000000000000000000000000000000000000..5a4f2b5853a07e5e8b675c43e4546bf23944a825
--- /dev/null
+++ b/tests/queries/0_stateless/01774_case_sensitive_connection_id.sql
@@ -0,0 +1 @@
+SELECT connection_id(), CONNECTION_ID(), CoNnEcTiOn_Id(), connectionid(), CONNECTIONID(), CoNnEcTiOnId();
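The two one-line tests above pin down that version() and connection_id() resolve case-insensitively, including the connectionId/connectionid alias spellings. As a rough illustration of the mechanism — a sketch, not ClickHouse's actual FunctionFactory code — a registry can normalize names to lower case at both registration and lookup:

#include <algorithm>
#include <cctype>
#include <functional>
#include <map>
#include <string>

// Hypothetical registry for illustration only; the real factory differs.
class CaseInsensitiveFunctionRegistry
{
public:
    void registerFunction(const std::string & name, std::function<std::string()> impl)
    {
        functions[toLower(name)] = std::move(impl);  // key normalized at registration
    }

    const std::function<std::string()> * tryGet(const std::string & name) const
    {
        auto it = functions.find(toLower(name));     // and normalized again at lookup,
        return it == functions.end() ? nullptr : &it->second; // so vErSiOn() finds version()
    }

private:
    static std::string toLower(std::string s)
    {
        std::transform(s.begin(), s.end(), s.begin(),
                       [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
        return s;
    }

    std::map<std::string, std::function<std::string()>> functions;
};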
diff --git a/tests/queries/0_stateless/helpers/httpechoserver.py b/tests/queries/0_stateless/helpers/httpechoserver.py
deleted file mode 100644
index a1176c5e72d5c3667aa05a54e293f891b1e35e1c..0000000000000000000000000000000000000000
--- a/tests/queries/0_stateless/helpers/httpechoserver.py
+++ /dev/null
@@ -1,99 +0,0 @@
-#!/usr/bin/env python3
-
-import sys
-import os
-import time
-import subprocess
-import threading
-from io import StringIO, SEEK_END
-from http.server import BaseHTTPRequestHandler, HTTPServer
-
-CLICKHOUSE_HOST = os.environ.get('CLICKHOUSE_HOST', '127.0.0.1')
-CLICKHOUSE_PORT_HTTP = os.environ.get('CLICKHOUSE_PORT_HTTP', '8123')
-
-# IP-address of this host accessible from outside world.
-HTTP_SERVER_HOST = os.environ.get('HTTP_SERVER_HOST', subprocess.check_output(['hostname', '-i']).decode('utf-8').strip())
-HTTP_SERVER_PORT = int(os.environ.get('CLICKHOUSE_TEST_HOST_EXPOSED_PORT', 51234))
-
-# IP address and port of the HTTP server started from this script.
-HTTP_SERVER_ADDRESS = (HTTP_SERVER_HOST, HTTP_SERVER_PORT)
-HTTP_SERVER_URL_STR = 'http://' + ':'.join(str(s) for s in HTTP_SERVER_ADDRESS) + "/"
-
-ostream = StringIO()
-istream = sys.stdout
-
-class EchoCSVHTTPServer(BaseHTTPRequestHandler):
-    def _set_headers(self):
-        self.send_response(200)
-        self.send_header('Content-type', 'text/plain')
-        self.end_headers()
-
-    def do_GET(self):
-        self._set_headers()
-        with open(CSV_DATA, 'r') as fl:
-            ostream.seek(0)
-            for row in ostream:
-                self.wfile.write(row + '\n')
-        return
-
-    def read_chunk(self):
-        msg = ''
-        while True:
-            sym = self.rfile.read(1)
-            if sym == '':
-                break
-            msg += sym.decode('utf-8')
-            if msg.endswith('\r\n'):
-                break
-        length = int(msg[:-2], 16)
-        if length == 0:
-            return ''
-        content = self.rfile.read(length)
-        self.rfile.read(2) # read sep \r\n
-        return content.decode('utf-8')
-
-    def do_POST(self):
-        while True:
-            chunk = self.read_chunk()
-            if not chunk:
-                break
-            istream.write(chunk)
-            istream.flush()
-        text = ""
-        self._set_headers()
-        self.wfile.write("ok")
-
-    def log_message(self, format, *args):
-        return
-
-def start_server(requests_amount, test_data="Hello,2,-2,7.7\nWorld,2,-5,8.8"):
-    ostream = StringIO(test_data.decode("utf-8"))
-
-    httpd = HTTPServer(HTTP_SERVER_ADDRESS, EchoCSVHTTPServer)
-
-    def real_func():
-        for i in range(requests_amount):
-            httpd.handle_request()
-
-    t = threading.Thread(target=real_func)
-    return t
-
-def run(requests_amount=1):
-    t = start_server(requests_amount)
-    t.start()
-    t.join()
-
-if __name__ == "__main__":
-    exception_text = ''
-    for i in range(1, 5):
-        try:
-            run(int(sys.argv[1]) if len(sys.argv) > 1 else 1)
-            break
-        except Exception as ex:
-            exception_text = str(ex)
-            time.sleep(1)
-
-    if exception_text:
-        print("Exception: {}".format(exception_text), file=sys.stderr)
-        os._exit(1)
-
diff --git a/tests/queries/skip_list.json b/tests/queries/skip_list.json
index 3b92dc6c0645e055b02d8c513671786471baf5ee..3df639b73cf20deaa562502943db9be07095a834 100644
--- a/tests/queries/skip_list.json
+++ b/tests/queries/skip_list.json
@@ -45,6 +45,7 @@
         "capnproto",
         "query_profiler",
         "memory_profiler",
+        "01175_distributed_ddl_output_mode_long", /// issue 21600
         "01103_check_cpu_instructions_at_startup",
         "01086_odbc_roundtrip", /// can't pass because odbc libraries are not instrumented
         "00877_memory_limit_for_new_delete", /// memory limits don't work correctly under msan because it replaces malloc/free
@@ -107,166 +108,50 @@
         "00738_lock_for_inner_table"
     ],
     "database-replicated": [
-        /// Tests with DETACH TABLE (it's not allowed)
-        /// and tests with SET (session and query settings are not supported)
         "memory_tracking",
         "memory_usage",
         "live_view",
-        "01761_alter_decimal_zookeeper",
-        "01560_optimize_on_insert_zookeeper",
-        "01720_type_map_and_casts",
-        "01413_alter_update_supertype",
-        "01149_zookeeper_mutation_stuck_after_replace_partition",
-        "00836_indices_alter_replicated_zookeeper",
-        "00652_mutations_alter_update",
-        "01715_tuple_insert_null_as_default",
-        "00825_protobuf_format_map",
-        "00152_insert_different_granularity",
-        "01715_background_checker_blather_zookeeper",
-        "01714_alter_drop_version",
-        "01114_materialize_clear_index_compact_parts",
-        "00814_replicated_minimalistic_part_header_zookeeper",
-        "01188_attach_table_from_pat",
+        "01181_db_atomic_drop_on_cluster",
+        "01175_distributed_ddl_output_mode",
         "01415_sticking_mutations",
-        "01130_in_memory_parts",
-        "01110_dictionary_layout_without_arguments",
-        "01018_ddl_dictionaries_create",
-        "01018_ddl_dictionaries_select",
"01414_freeze_does_not_prevent_alters", + "00980_zookeeper_merge_tree_alter_settings", + "01148_zookeeper_path_macros_unfolding", + "01294_system_distributed_on_cluster", + "01269_create_with_null", + /// grep -c "01018_ddl_dictionaries_bad_queries", - "01686_rocksdb", - "01550_mutation_subquery", - "01070_mutations_with_dependencies", - "01070_materialize_ttl", - "01055_compact_parts", - "01017_mutations_with_nondeterministic_functions_zookeeper", - "00926_adaptive_index_granularity_pk", - "00910_zookeeper_test_alter_compression_codecs", "00908_bloom_filter_index", - "00616_final_single_part", - "00446_clear_column_in_partition_zookeeper", - "01533_multiple_nested", - "01213_alter_rename_column_zookeeper", - "01575_disable_detach_table_of_dictionary", - "01457_create_as_table_function_structure", - "01415_inconsistent_merge_tree_settings", - "01413_allow_non_metadata_alters", - "01378_alter_rename_with_ttl_zookeeper", - "01349_mutation_datetime_key", - "01325_freeze_mutation_stuck", - "01272_suspicious_codecs", - "01181_db_atomic_drop_on_cluster", - "00957_delta_diff_bug", - "00910_zookeeper_custom_compression_codecs_replicated", - "00899_long_attach_memory_limit", - "00804_test_custom_compression_codes_log_storages", - "00804_test_alter_compression_codecs", - "00804_test_delta_codec_no_type_alter", - "00804_test_custom_compression_codecs", - "00753_alter_attach", - "00715_fetch_merged_or_mutated_part_zookeeper", - "00688_low_cardinality_serialization", - "01575_disable_detach_table_of_dictionary", - "00738_lock_for_inner_table", - "01666_blns", - "01652_ignore_and_low_cardinality", - "01651_map_functions", + /// Unsupported type of ALTER query "01650_fetch_patition_with_macro_in_zk_path", - "01648_mutations_and_escaping", - "01640_marks_corruption_regression", - "01622_byte_size", - "01611_string_to_low_cardinality_key_alter", - "01602_show_create_view", - "01600_log_queries_with_extensive_info", - "01560_ttl_remove_empty_parts", - "01554_bloom_filter_index_big_integer_uuid", - "01550_type_map_formats_input", - "01550_type_map_formats", - "01550_create_map_type", - "01532_primary_key_without_order_by_zookeeper", - "01511_alter_version_versioned_collapsing_merge_tree_zookeeper", - "01509_parallel_quorum_insert_no_replicas", - "01504_compression_multiple_streams", - "01494_storage_join_persistency", - "01493_storage_set_persistency", - "01493_alter_remove_properties_zookeeper", - "01475_read_subcolumns_storages", - "01475_read_subcolumns", - "01451_replicated_detach_drop_part", "01451_detach_drop_part", - "01440_big_int_exotic_casts", - "01430_modify_sample_by_zookeeper", - "01417_freeze_partition_verbose_zookeeper", + "01451_replicated_detach_drop_part", "01417_freeze_partition_verbose", - "01396_inactive_replica_cleanup_nodes_zookeeper", - "01375_compact_parts_codecs", - "01357_version_collapsing_attach_detach_zookeeper", - "01355_alter_column_with_order", - "01291_geo_types", - "01270_optimize_skip_unused_shards_low_cardinality", - "01182_materialized_view_different_structure", - "01150_ddl_guard_rwr", - "01148_zookeeper_path_macros_unfolding", - "01135_default_and_alter_zookeeper", + "01417_freeze_partition_verbose_zookeeper", "01130_in_memory_parts_partitons", - "01127_month_partitioning_consistency_select", - "01114_database_atomic", - "01083_expressions_in_engine_arguments", - "01073_attach_if_not_exists", - "01072_optimize_skip_unused_shards_const_expr_eval", - "01071_prohibition_secondary_index_with_old_format_merge_tree", - "01062_alter_on_mutataion_zookeeper", 
"01060_shutdown_table_after_detach", - "01056_create_table_as", - "01035_avg", "01021_only_tuple_columns", - "01019_alter_materialized_view_query", - "01019_alter_materialized_view_consistent", - "01019_alter_materialized_view_atomic", "01015_attach_part", - "00989_parallel_parts_loading", - "00980_zookeeper_merge_tree_alter_settings", - "00980_merge_alter_settings", "00955_test_final_mark", - "00933_reserved_word", - "00926_zookeeper_adaptive_index_granularity_replicated_merge_tree", - "00926_adaptive_index_granularity_replacing_merge_tree", - "00926_adaptive_index_granularity_merge_tree", + "00753_alter_attach", + "00626_replace_partition_from_table_zookeeper", + "00626_replace_partition_from_table", + "00152_insert_different_granularity", + /// Old syntax is not allowed + "01062_alter_on_mutataion_zookeeper", "00925_zookeeper_empty_replicated_merge_tree_optimize_final", - "00800_low_cardinality_distinct_numeric", "00754_alter_modify_order_by_replicated_zookeeper", - "00751_low_cardinality_nullable_group_by", - "00751_default_databasename_for_view", - "00719_parallel_ddl_table", - "00718_low_cardinaliry_alter", - "00717_low_cardinaliry_distributed_group_by", - "00688_low_cardinality_syntax", - "00688_low_cardinality_nullable_cast", - "00688_low_cardinality_in", "00652_replicated_mutations_zookeeper", - "00634_rename_view", - "00626_replace_partition_from_table", - "00625_arrays_in_nested", "00623_replicated_truncate_table_zookeeper", - "00619_union_highlite", - "00599_create_view_with_subquery", - "00571_non_exist_database_when_create_materializ_view", - "00553_buff_exists_materlized_column", "00516_deduplication_after_drop_partition_zookeeper", - "00508_materialized_view_to", "00446_clear_column_in_partition_concurrent_zookeeper", - "00423_storage_log_single_thread", - "00311_array_primary_key", "00236_replicated_drop_on_non_leader_zookeeper", "00226_zookeeper_deduplication_and_unexpected_parts", "00215_primary_key_order_zookeeper", - "00180_attach_materialized_view", "00121_drop_column_zookeeper", - "00116_storage_set", "00083_create_merge_tree_zookeeper", "00062_replicated_merge_tree_alter_zookeeper", - "01720_constraints_complex_types", - "01747_alter_partition_key_enum_zookeeper" + /// Does not support renaming of multiple tables in single query + "00634_rename_view" ], "polymorphic-parts": [ "01508_partition_pruning_long", /// bug, shoud be fixed @@ -753,6 +638,7 @@ "01601_detach_permanently", "01602_show_create_view", "01603_rename_overwrite_bug", + "01666_blns", "01646_system_restart_replicas_smoke", // system restart replicas is a global query "01656_test_query_log_factories_info", "01658_read_file_to_stringcolumn", diff --git a/utils/check-marks/main.cpp b/utils/check-marks/main.cpp index 2b244dcf0b6f55ab6a7c882858737798d1f42376..e9e0bbf11348a3128ca5dd1a232cd6f9083ea3df 100644 --- a/utils/check-marks/main.cpp +++ b/utils/check-marks/main.cpp @@ -19,7 +19,7 @@ static void checkByCompressedReadBuffer(const std::string & mrk_path, const std::string & bin_path) { DB::ReadBufferFromFile mrk_in(mrk_path); - DB::CompressedReadBufferFromFile bin_in(bin_path, 0, 0, 0); + DB::CompressedReadBufferFromFile bin_in(bin_path, 0, 0, 0, nullptr); DB::WriteBufferFromFileDescriptor out(STDOUT_FILENO); bool mrk2_format = boost::algorithm::ends_with(mrk_path, ".mrk2");