Post-PR fixes

Renamed settings, updated docs.

Post-PR fixes
Renamed settings, updated docs.
c2fc71b5 · Vasily Nemkov · f98c4888 · c2fc71b5 · c2fc71b5 · c2fc71b5
5 changed files
--- a/dbms/src/Core/Settings.h
+++ b/dbms/src/Core/Settings.h
@@ -345,8 +345,8 @@ struct Settings : public SettingsCollection<Settings>
    M(SettingBool, check_query_single_value_result, true, "Return check query result as single 1/0 value") \
    M(SettingBool, allow_drop_detached, false, "Allow ALTER TABLE ... DROP DETACHED PART[ITION] ... queries") \
    \
-    M(SettingSeconds, replica_error_decrease_period, DBMS_CONNECTION_POOL_WITH_FAILOVER_DEFAULT_DECREASE_ERROR_PERIOD, "Time period reduces replica error counter by 2 times.") \
-    M(SettingUInt64, replica_error_max_count, DBMS_CONNECTION_POOL_WITH_FAILOVER_MAX_ERROR_COUNT, "Max number of errors per replica, prevents piling up increadible amount of errors if replica was offline for some time and allows it to be reconsidered in a shorter amount of time.") \
+    M(SettingSeconds, distributed_replica_error_half_life, DBMS_CONNECTION_POOL_WITH_FAILOVER_DEFAULT_DECREASE_ERROR_PERIOD, "Time period reduces replica error counter by 2 times.") \
+    M(SettingUInt64, distributed_replica_error_cap, DBMS_CONNECTION_POOL_WITH_FAILOVER_MAX_ERROR_COUNT, "Max number of errors per replica, prevents piling up increadible amount of errors if replica was offline for some time and allows it to be reconsidered in a shorter amount of time.") \
    \
    M(SettingBool, allow_experimental_live_view, false, "Enable LIVE VIEW. Not mature enough.") \
    M(SettingSeconds, live_view_heartbeat_interval, DEFAULT_LIVE_VIEW_HEARTBEAT_INTERVAL_SEC, "The heartbeat interval in seconds to indicate live query is alive.") \

--- a/dbms/src/Interpreters/Cluster.cpp
+++ b/dbms/src/Interpreters/Cluster.cpp
@@ -348,7 +348,7 @@ Cluster::Cluster(const Poco::Util::AbstractConfiguration & config, const Setting

            ConnectionPoolWithFailoverPtr shard_pool = std::make_shared<ConnectionPoolWithFailover>(
                        all_replicas_pools, settings.load_balancing,
-                        settings.replica_error_decrease_period.totalSeconds(), settings.replica_error_max_count);
+                        settings.distributed_replica_error_half_life.totalSeconds(), settings.distributed_replica_error_cap);

            if (weight)
                slot_to_shard.insert(std::end(slot_to_shard), weight, shards_info.size());
@@ -400,7 +400,7 @@ Cluster::Cluster(const Settings & settings, const std::vector<std::vector<String

        ConnectionPoolWithFailoverPtr shard_pool = std::make_shared<ConnectionPoolWithFailover>(
                all_replicas, settings.load_balancing,
-                settings.replica_error_decrease_period.totalSeconds(), settings.replica_error_max_count);
+                settings.distributed_replica_error_half_life.totalSeconds(), settings.distributed_replica_error_cap);

        slot_to_shard.insert(std::end(slot_to_shard), default_weight, shards_info.size());
        shards_info.push_back({{}, current_shard_num, default_weight, std::move(shard_local_addresses), std::move(shard_pool),

--- a/dbms/src/Storages/Distributed/DirectoryMonitor.cpp
+++ b/dbms/src/Storages/Distributed/DirectoryMonitor.cpp
@@ -191,7 +191,7 @@ ConnectionPoolPtr StorageDistributedDirectoryMonitor::createPool(const std::stri

    const auto settings = storage.global_context.getSettings();
    return pools.size() == 1 ? pools.front() : std::make_shared<ConnectionPoolWithFailover>(pools, LoadBalancing::RANDOM,
-        settings.replica_error_decrease_period.totalSeconds(), settings.replica_error_max_count);
+        settings.distributed_replica_error_half_life.totalSeconds(), settings.distributed_replica_error_cap);
 }



--- a/docs/en/operations/settings/settings.md
+++ b/docs/en/operations/settings/settings.md
@@ -861,4 +861,29 @@ Possible values:

 Default value: 0.

+## distributed_replica_error_half_life {#settings-distributed_replica_error_half_life}
+
+- Type: seconds
+- Default value: 60 seconds
+
+Controls how fast errors of distributed tables are zeroed. For example, if a replica was unavailable for some time and accumulated 5 errors, and `distributed_replica_error_half_life` is set to 1 second, then the replica is considered back to normal 3 seconds after the last error.
+
+**See also**
+
+- [Table engine Distributed](../../operations/table_engines/distributed.md)
+- [`distributed_replica_error_cap`](#settings-distributed_replica_error_cap)
+
+
+## distributed_replica_error_cap {#settings-distributed_replica_error_cap}
+
+- Type: unsigned int
+- Default value: 1000
+
+The error count of each replica is capped at this value, preventing a single replica from accumulating too many errors.
+
+**See also**
+
+- [Table engine Distributed](../../operations/table_engines/distributed.md)
+- [`distributed_replica_error_half_life`](#settings-distributed_replica_error_half_life)
+
 [Original article](https://clickhouse.yandex/docs/en/operations/settings/settings/) <!-- hide -->
--- a/docs/en/operations/system_tables.md
+++ b/docs/en/operations/system_tables.md
@@ -45,18 +45,28 @@ SELECT * FROM system.asynchronous_metrics LIMIT 10
 ## system.clusters

 Contains information about clusters available in the config file and the servers in them.
+
 Columns:

-```
-cluster String — The cluster name.
-shard_num UInt32 — The shard number in the cluster, starting from 1.
-shard_weight UInt32 — The relative weight of the shard when writing data.
-replica_num UInt32 — The replica number in the shard, starting from 1.
-host_name String — The host name, as specified in the config.
-String host_address — The host IP address obtained from DNS.
-port UInt16 — The port to use for connecting to the server.
-user String — The name of the user for connecting to the server.
-```
+- `cluster` (String) — The cluster name.
+- `shard_num` (UInt32) — The shard number in the cluster, starting from 1.
+- `shard_weight` (UInt32) — The relative weight of the shard when writing data.
+- `replica_num` (UInt32) — The replica number in the shard, starting from 1.
+- `host_name` (String) — The host name, as specified in the config.
+- `host_address` (String) — The host IP address obtained from DNS.
+- `port` (UInt16) — The port to use for connecting to the server.
+- `user` (String) — The name of the user for connecting to the server.
+- `errors_count` (UInt32) — The number of times this host failed to reach the replica.
+- `estimated_recovery_time` (UInt32) — Seconds left until the replica error count is zeroed and the replica is considered to be back to normal.
+
+
+Please note that `errors_count` is updated once per query to the cluster, but `estimated_recovery_time` is recalculated on demand. So there could be a case of non-zero `errors_count` and zero `estimated_recovery_time`; in that case the next query will zero `errors_count` and try to use the replica as if it had no errors.
+
+**See also**
+
+- [Table engine Distributed](../../operations/table_engines/distributed.md)
+- [distributed_replica_error_cap setting](../settings/settings.md#settings-distributed_replica_error_cap)
+- [distributed_replica_error_half_life setting](../settings/settings.md#settings-distributed_replica_error_half_life)

 ## system.columns