StorageReplicatedMergeTree.h 22.2 KB
Newer Older
M
Merge  
Michael Kolupaev 已提交
1 2
#pragma once

3
#include <ext/shared_ptr_helper.h>
4
#include <atomic>
5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
#include <Storages/IStorage.h>
#include <Storages/MergeTree/MergeTreeData.h>
#include <Storages/MergeTree/MergeTreeDataMerger.h>
#include <Storages/MergeTree/MergeTreeDataWriter.h>
#include <Storages/MergeTree/MergeTreeDataSelectExecutor.h>
#include <Storages/MergeTree/ReplicatedMergeTreeLogEntry.h>
#include <Storages/MergeTree/ReplicatedMergeTreeQueue.h>
#include <Storages/MergeTree/ReplicatedMergeTreeCleanupThread.h>
#include <Storages/MergeTree/ReplicatedMergeTreeRestartingThread.h>
#include <Storages/MergeTree/ReplicatedMergeTreePartCheckThread.h>
#include <Storages/MergeTree/ReplicatedMergeTreeAlterThread.h>
#include <Storages/MergeTree/AbandonableLockInZooKeeper.h>
#include <Storages/MergeTree/BackgroundProcessingPool.h>
#include <Storages/MergeTree/DataPartsExchange.h>
#include <Storages/MergeTree/RemoteDiskSpaceMonitor.h>
#include <Storages/MergeTree/ShardedPartitionUploader.h>
#include <Storages/MergeTree/RemoteQueryExecutor.h>
#include <Storages/MergeTree/RemotePartChecker.h>
#include <DataTypes/DataTypesNumber.h>
24 25
#include <Common/ZooKeeper/ZooKeeper.h>
#include <Common/ZooKeeper/LeaderElection.h>
26

M
Merge  
Michael Kolupaev 已提交
27 28 29 30

namespace DB
{

F
f1yegor 已提交
31
/** The engine that uses the merge tree (see MergeTreeData) and replicated through ZooKeeper.
32
  *
F
f1yegor 已提交
33 34 35 36 37 38 39 40
  * ZooKeeper is used for the following things:
  * - the structure of the table (/metadata, /columns);
  * - action log with data (/log/log-...,/replicas/replica_name/queue/queue-...);
  * - a replica list (/replicas), and replica activity tag (/replicas/replica_name/is_active), replica addresses (/replicas/replica_name/host);
  * - select the leader replica (/leader_election) - this is the replica that assigns the merge;
  * - a set of parts of data on each replica (/replicas/replica_name/parts);
  * - list of the last N blocks of data with checksum, for deduplication (/blocks);
  * - the list of incremental block numbers (/block_numbers) that we are about to insert,
41
  *   or that were unused (/nonincrement_block_numbers)
F
f1yegor 已提交
42 43
  *   to ensure the linear order of data insertion and data merge only on the intervals in this sequence;
  * - coordinates writes with quorum (/quorum).
M
Merge  
Michael Kolupaev 已提交
44
  */
45

F
f1yegor 已提交
46 47 48 49 50 51 52
/** The replicated tables have a common log (/log/log-...).
  * Log - a sequence of entries (LogEntry) about what to do.
  * Each entry is one of:
  * - normal data insertion (GET),
  * - merge (MERGE),
  * - slightly less common data insertion (ATTACH),
  * - delete the partition (DROP).
53
  *
F
f1yegor 已提交
54 55 56 57 58 59 60
  * Each replica copies (queueUpdatingThread, pullLogsToQueue) entries from the log to its queue (/replicas/replica_name/queue/queue-...)
  *  and then executes them (queueTask).
  * Despite the name of the "queue", execution can be reordered, if necessary (shouldExecuteLogEntry, executeLogEntry).
  * In addition, the records in the queue can be generated independently (not from the log), in the following cases:
  * - when creating a new replica, actions are put on GET from other replicas (createReplica);
  * - if the part is corrupt (removePartAndEnqueueFetch) or absent during the check (at start - checkParts, while running - searchForMissingPart),
  *   actions are put on GET from other replicas;
61
  *
F
f1yegor 已提交
62 63
  * The replica on which the INSERT was made will also have a GET entry for this data in its queue.
  * Such an entry is considered to be executed as soon as the queue handler sees it.
64
  *
F
f1yegor 已提交
65 66
  * The log entry has a creation time. This time is generated by the clock of server that created entry
  * - the one on which the corresponding INSERT or ALTER query came.
67
  *
F
f1yegor 已提交
68 69
  * For the entries in the queue that the replica made for itself,
  * the creation time of the corresponding part on any of the replicas is used as the entry time.
70 71
  */

72
class StorageReplicatedMergeTree : public ext::shared_ptr_helper<StorageReplicatedMergeTree>, public IStorage
M
Merge  
Michael Kolupaev 已提交
73
{
74
friend class ext::shared_ptr_helper<StorageReplicatedMergeTree>;
75

M
Merge  
Michael Kolupaev 已提交
76
public:
77
    /** If not 'attach', either creates a new table in ZK, or adds a replica to an existing table.
78 79 80 81 82 83 84 85 86 87 88 89 90
      */
    static StoragePtr create(
        const String & zookeeper_path_,
        const String & replica_name_,
        bool attach,
        const String & path_, const String & database_name_, const String & name_,
        NamesAndTypesListPtr columns_,
        const NamesAndTypesList & materialized_columns_,
        const NamesAndTypesList & alias_columns_,
        const ColumnDefaults & column_defaults_,
        Context & context_,
        ASTPtr & primary_expr_ast_,
        const String & date_column_name_,
F
f1yegor 已提交
91
        const ASTPtr & sampling_expression_, /// nullptr, if sampling is not supported.
92 93 94 95 96
        size_t index_granularity_,
        const MergeTreeData::MergingParams & merging_params_,
        bool has_force_restore_data_flag,
        const MergeTreeSettings & settings_);

97
    void startup() override;
98 99 100 101 102 103 104 105 106 107 108 109 110
    void shutdown() override;
    ~StorageReplicatedMergeTree() override;

    /// Engine name: "Replicated", then the merging mode name of the underlying data, then "MergeTree".
    std::string getName() const override
    {
        std::string engine_name = "Replicated";
        engine_name += data.merging_params.getModeName();
        engine_name += "MergeTree";
        return engine_name;
    }

    /// Returns the value stored in table_name.
    std::string getTableName() const override
    {
        return table_name;
    }
    /// Sampling support is reported by the underlying MergeTreeData.
    bool supportsSampling() const override
    {
        return data.supportsSampling();
    }
    /// FINAL support is reported by the underlying MergeTreeData.
    bool supportsFinal() const override
    {
        return data.supportsFinal();
    }
    /// PREWHERE support is reported by the underlying MergeTreeData.
    bool supportsPrewhere() const override
    {
        return data.supportsPrewhere();
    }
    /// Always reports support for parallel replicas.
    bool supportsParallelReplicas() const override
    {
        return true;
    }
111
    /// Always reports support for replication.
    bool supportsReplication() const override
    {
        return true;
    }
112 113 114 115 116 117 118 119 120 121 122 123 124 125 126

    /// Returns the non-materialized column list of the underlying MergeTreeData.
    const NamesAndTypesList & getColumnsListImpl() const override
    {
        return data.getColumnsListNonMaterialized();
    }

    /// Looks the column up in the underlying MergeTreeData.
    NameAndTypePair getColumn(const String & column_name) const override { return data.getColumn(column_name); }

    /// Asks the underlying MergeTreeData whether it has the column.
    bool hasColumn(const String & column_name) const override { return data.hasColumn(column_name); }

    BlockInputStreams read(
        const Names & column_names,
127
        const ASTPtr & query,
128 129
        const Context & context,
        QueryProcessingStage::Enum & processed_stage,
130 131
        size_t max_block_size,
        unsigned num_streams) override;
132

A
Alexey Milovidov 已提交
133
    BlockOutputStreamPtr write(const ASTPtr & query, const Settings & settings) override;
134

135
    bool optimize(const String & partition, bool final, bool deduplicate, const Settings & settings) override;
136 137 138

    void alter(const AlterCommands & params, const String & database_name, const String & table_name, const Context & context) override;

139 140
    void dropPartition(const ASTPtr & query, const Field & partition, bool detach, const Settings & settings) override;
    void attachPartition(const ASTPtr & query, const Field & partition, bool part, const Settings & settings) override;
141 142 143
    void fetchPartition(const Field & partition, const String & from, const Settings & settings) override;
    void freezePartition(const Field & partition, const String & with_name, const Settings & settings) override;

A
Alexey Milovidov 已提交
144 145
    void reshardPartitions(
        const ASTPtr & query, const String & database_name,
146 147 148
        const Field & first_partition, const Field & last_partition,
        const WeightedZooKeeperPaths & weighted_zookeeper_paths,
        const ASTPtr & sharding_key_expr, bool do_copy, const Field & coordinator,
A
Alexey Milovidov 已提交
149
        Context & context) override;
150

F
f1yegor 已提交
151
    /** Removes a replica from ZooKeeper. If there are no other replicas, it deletes the entire table from ZooKeeper.
152 153 154 155 156 157 158 159 160 161 162 163 164
      */
    void drop() override;

    void rename(const String & new_path_to_db, const String & new_database_name, const String & new_table_name) override;

    /// Always reports that the index can be used for the IN operator.
    bool supportsIndexForIn() const override
    {
        return true;
    }

    bool checkTableCanBeDropped() const override;

    /// Mutable access to the underlying MergeTreeData.
    MergeTreeData & getData()
    {
        return data;
    }

    /// Read-only access to the underlying MergeTreeData.
    const MergeTreeData & getData() const
    {
        return data;
    }


F
f1yegor 已提交
165
    /** For the system table replicas. */
166 167 168 169 170 171 172 173 174 175 176 177 178
    /// Snapshot of the replica state, filled in by getStatus().
    /// Every scalar field has a default value, so a field that getStatus() does not fill
    /// (e.g. the ZooKeeper-dependent ones when with_zk_fields = false) is never left indeterminate.
    struct Status
    {
        bool is_leader = false;
        bool is_readonly = false;
        bool is_session_expired = false;
        ReplicatedMergeTreeQueue::Status queue;
        UInt32 parts_to_check = 0;
        String zookeeper_path;
        String replica_name;
        String replica_path;
        Int32 columns_version = -1;     /// Same "unknown" default as the storage's own columns_version.
        UInt64 log_max_index = 0;
        UInt64 log_pointer = 0;
        UInt64 absolute_delay = 0;
        UInt8 total_replicas = 0;
        UInt8 active_replicas = 0;
    };

F
f1yegor 已提交
184
    /// Get the status of the table. If with_zk_fields = false - do not fill in the fields that require queries to ZK.
185 186 187 188 189
    void getStatus(Status & res, bool with_zk_fields = true);

    using LogEntriesData = std::vector<ReplicatedMergeTreeLogEntryData>;
    void getQueue(LogEntriesData & res, String & replica_name);

190 191 192 193 194 195
    /// Get replica delay relative to current time.
    time_t getAbsoluteDelay() const;

    /// If the absolute delay is greater than min_relative_delay_to_yield_leadership,
    /// will also calculate the difference from the unprocessed time of the best replica.
    /// NOTE: Will communicate to ZooKeeper to calculate relative delay.
196 197
    void getReplicaDelays(time_t & out_absolute_delay, time_t & out_relative_delay);

F
f1yegor 已提交
198
    /// Add a part to the queue of parts whose data you want to check in the background thread.
199 200 201 202
    /** Hands the part over to part_check_thread.
      * part_name - name of the part to check;
      * delay_to_check_seconds - passed through to enqueuePart unchanged (0 by default).
      */
    void enqueuePartForCheck(const String & part_name, time_t delay_to_check_seconds = 0)
    {
        part_check_thread.enqueuePart(part_name, delay_to_check_seconds);
    }
203

M
Merge  
Michael Kolupaev 已提交
204
private:
205 206 207
    /// Delete old chunks from disk and from ZooKeeper.
    void clearOldPartsAndRemoveFromZK(Logger * log_ = nullptr);

208 209 210 211 212 213 214 215
    /// Classes that are granted access to the private members of this storage.
    friend class ReplicatedMergeTreeBlockOutputStream;
    friend class ReplicatedMergeTreeRestartingThread;
    friend class ReplicatedMergeTreePartCheckThread;
    friend class ReplicatedMergeTreeCleanupThread;
    friend class ReplicatedMergeTreeAlterThread;
    friend struct ReplicatedMergeTreeLogEntry;
    friend class ScopedPartitionMergeLock;
A
Merge  
Alexey Milovidov 已提交
216

217 218 219
    friend class ReshardingWorker;
    friend class ShardedPartitionUploader::Client;
    friend class ShardedPartitionUploader::Service;
M
Merge  
Michael Kolupaev 已提交
220

221 222
    using LogEntry = ReplicatedMergeTreeLogEntry;
    using LogEntryPtr = LogEntry::Ptr;
M
Merge  
Michael Kolupaev 已提交
223

224
    Context & context;
A
Merge  
Alexey Milovidov 已提交
225

F
f1yegor 已提交
226 227
    zkutil::ZooKeeperPtr current_zookeeper;        /// Use only the methods below.
    std::mutex current_zookeeper_mutex;            /// To recreate the session in the background thread.
A
Merge  
Alexey Milovidov 已提交
228

229 230 231
    zkutil::ZooKeeperPtr tryGetZooKeeper();
    zkutil::ZooKeeperPtr getZooKeeper();
    void setZooKeeper(zkutil::ZooKeeperPtr zookeeper);
M
Merge  
Michael Kolupaev 已提交
232

F
f1yegor 已提交
233
    /// If true, the table is offline and cannot be written to.
234 235 236 237 238 239 240 241 242 243
    bool is_readonly = false;

    String database_name;
    String table_name;
    String full_path;

    String zookeeper_path;
    String replica_name;
    String replica_path;

F
f1yegor 已提交
244 245
    /** The queue of what needs to be done on this replica to catch up with everyone. It is taken from ZooKeeper (/replicas/me/queue/).
      * In ZK entries in chronological order. Here it is not necessary.
246 247
      */
    ReplicatedMergeTreeQueue queue;
248 249
    std::atomic<time_t> last_queue_update_start_time{0};
    std::atomic<time_t> last_queue_update_finish_time{0};
250 251 252 253 254

    /** /replicas/me/is_active.
      */
    zkutil::EphemeralNodeHolderPtr replica_is_active_node;

F
f1yegor 已提交
255 256
    /** Version node /columns in ZooKeeper corresponding to the current data.columns.
      * Read and modify along with the data.columns - under TableStructureLock.
257 258 259
      */
    int columns_version = -1;

F
f1yegor 已提交
260
    /** Is this replica "master". The master replica selects the parts to merge.
261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283
      */
    bool is_leader_node = false;
    std::mutex leader_node_mutex;

    InterserverIOEndpointHolderPtr endpoint_holder;
    InterserverIOEndpointHolderPtr disk_space_monitor_endpoint_holder;
    InterserverIOEndpointHolderPtr sharded_partition_uploader_endpoint_holder;
    InterserverIOEndpointHolderPtr remote_query_executor_endpoint_holder;
    InterserverIOEndpointHolderPtr remote_part_checker_endpoint_holder;

    MergeTreeData data;
    MergeTreeDataSelectExecutor reader;
    MergeTreeDataWriter writer;
    MergeTreeDataMerger merger;

    DataPartsExchange::Fetcher fetcher;
    RemoteDiskSpaceMonitor::Client disk_space_monitor_client;
    ShardedPartitionUploader::Client sharded_partition_uploader_client;
    RemoteQueryExecutor::Client remote_query_executor_client;
    RemotePartChecker::Client remote_part_checker_client;

    zkutil::LeaderElectionPtr leader_election;

F
f1yegor 已提交
284
    /// Whether background threads (except restarting_thread) need to be stopped.
285 286 287 288 289 290
    std::atomic<bool> shutdown_called {false};
    Poco::Event shutdown_event;

    /// Limiting parallel fetches per one table
    std::atomic_uint current_table_fetches {0};

F
f1yegor 已提交
291
    /// Threads.
292

F
f1yegor 已提交
293
    /// A thread that keeps track of the updates in the logs of all replicas and loads them into the queue.
294 295 296
    std::thread queue_updating_thread;
    zkutil::EventPtr queue_updating_event = std::make_shared<Poco::Event>();

F
f1yegor 已提交
297
    /// A task that performs actions from the queue.
298 299
    BackgroundProcessingPool::TaskHandle queue_task_handle;

F
f1yegor 已提交
300
    /// A thread that selects parts to merge.
301 302
    std::thread merge_selecting_thread;
    Poco::Event merge_selecting_event;
F
f1yegor 已提交
303
    std::mutex merge_selecting_mutex; /// It is taken for each iteration of the selection of parts to merge.
304

F
f1yegor 已提交
305
    /// A thread that removes old parts, log entries, and blocks.
306 307
    std::unique_ptr<ReplicatedMergeTreeCleanupThread> cleanup_thread;

F
f1yegor 已提交
308
    /// A thread that processes reconnection to ZooKeeper when the session expires.
309 310
    std::unique_ptr<ReplicatedMergeTreeRestartingThread> restarting_thread;

F
f1yegor 已提交
311
    /// A thread monitoring changes to the column list in ZooKeeper and updating the parts in accordance with these changes.
312 313
    std::unique_ptr<ReplicatedMergeTreeAlterThread> alter_thread;

F
f1yegor 已提交
314
    /// A thread that checks the data of the parts, as well as the queue of the parts to be checked.
315 316
    ReplicatedMergeTreePartCheckThread part_check_thread;

F
f1yegor 已提交
317
    /// An event that awakens `alter` method from waiting for the completion of the ALTER query.
318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339
    zkutil::EventPtr alter_query_event = std::make_shared<Poco::Event>();

    Logger * log;

    StorageReplicatedMergeTree(
        const String & zookeeper_path_,
        const String & replica_name_,
        bool attach,
        const String & path_, const String & database_name_, const String & name_,
        NamesAndTypesListPtr columns_,
        const NamesAndTypesList & materialized_columns_,
        const NamesAndTypesList & alias_columns_,
        const ColumnDefaults & column_defaults_,
        Context & context_,
        ASTPtr & primary_expr_ast_,
        const String & date_column_name_,
        const ASTPtr & sampling_expression_,
        size_t index_granularity_,
        const MergeTreeData::MergingParams & merging_params_,
        bool has_force_restore_data_flag,
        const MergeTreeSettings & settings_);

F
f1yegor 已提交
340
    /// Initialization.
341

F
f1yegor 已提交
342
    /** Creates the minimum set of nodes in ZooKeeper.
343 344 345
      */
    void createTableIfNotExists();

F
f1yegor 已提交
346
    /** Creates a replica in ZooKeeper and adds to the queue all that it takes to catch up with the rest of the replicas.
347 348 349
      */
    void createReplica();

F
f1yegor 已提交
350
    /** Create nodes in the ZK, which must always be, but which might not exist when older versions of the server are running.
351 352 353
      */
    void createNewZooKeeperNodes();

F
f1yegor 已提交
354 355
    /** Verify that the list of columns and table settings match those specified in ZK (/metadata).
      * If not, throw an exception.
356 357 358
      */
    void checkTableStructure(bool skip_sanity_checks, bool allow_alter);

F
f1yegor 已提交
359 360 361 362
    /** Check that the set of parts corresponds to that in ZK (/replicas/me/parts/).
      * If any parts described in ZK are missing locally, throw an exception.
      * If any local parts are not mentioned in ZK, remove them.
      *  But if there are too many, throw an exception just in case - it's probably a configuration error.
363 364 365
      */
    void checkParts(bool skip_sanity_checks);

F
f1yegor 已提交
366 367 368 369 370
    /** Check that the part's checksum is the same as the checksum of the same part on some other replica.
      * If no other replica has such a part, nothing is checked.
      * Not very reliable: if two replicas add a part almost at the same time, no checks will occur.
      * Adds actions to `ops` that add data about the part into ZooKeeper.
      * Call under TableStructureLock.
371 372 373
      */
    void checkPartAndAddToZooKeeper(const MergeTreeData::DataPartPtr & part, zkutil::Ops & ops, String name_override = "");

F
f1yegor 已提交
374 375 376
    /** Based on the assumption that there is no such part anywhere else (this is guaranteed if the part number was allocated with AbandonableLock).
      * Adds actions to `ops` that add data about the part into ZooKeeper.
      * Call under TableStructureLock.
377 378 379
      */
    void addNewPartToZooKeeper(const MergeTreeData::DataPartPtr & part, zkutil::Ops & ops, String name_override = "");

F
f1yegor 已提交
380
    /// Adds actions to `ops` that remove a part from ZooKeeper.
381 382
    void removePartFromZooKeeper(const String & part_name, zkutil::Ops & ops);

383
    /// Like removePartFromZooKeeper, but handles absence of some nodes and remove other nodes anyway, see CLICKHOUSE-3040
384 385 386
    /// Use it only in non-critical places for cleaning.
    void removePossiblyIncompletePartNodeFromZooKeeper(const String & part_name, zkutil::Ops & ops, const zkutil::ZooKeeperPtr & zookeeper);

F
f1yegor 已提交
387
    /// Removes a part from ZooKeeper and adds a task to the queue to download it. It is supposed to do this with broken parts.
388 389
    void removePartAndEnqueueFetch(const String & part_name);

F
f1yegor 已提交
390
    /// Running jobs from the queue.
391

F
f1yegor 已提交
392 393
    /** Copies the new entries from the logs of all replicas to the queue of this replica.
      * If next_update_event != nullptr, calls this event when new entries appear in the log.
394 395 396
      */
    void pullLogsToQueue(zkutil::EventPtr next_update_event = nullptr);

F
f1yegor 已提交
397 398
    /** Execute the action from the queue. Throws an exception if something is wrong.
      * Returns whether or not it succeeds. If it did not work, write it to the end of the queue.
399 400 401 402
      */
    bool executeLogEntry(const LogEntry & entry);

    void executeDropRange(const LogEntry & entry);
F
f1yegor 已提交
403
    bool executeAttachPart(const LogEntry & entry); /// Returns false if the part is absent, and it needs to be picked up from another replica.
404

F
f1yegor 已提交
405
    /** Updates the queue.
406 407 408
      */
    void queueUpdatingThread();

F
f1yegor 已提交
409
    /** Performs actions from the queue.
410 411 412
      */
    bool queueTask();

F
f1yegor 已提交
413
    /// Select the parts to merge.
414 415 416

    void becomeLeader();

F
f1yegor 已提交
417
    /** Selects the parts to merge and writes to the log.
418 419 420 421
      */
    void mergeSelectingThread();

    using MemoizedPartsThatCouldBeMerged = std::set<std::pair<std::string, std::string>>;
F
f1yegor 已提交
422
    /// Is it possible to merge parts in the specified range? `memo` is an optional parameter.
423 424 425 426 427
    bool canMergeParts(
        const MergeTreeData::DataPartPtr & left,
        const MergeTreeData::DataPartPtr & right,
        MemoizedPartsThatCouldBeMerged * memo);

F
f1yegor 已提交
428 429 430
    /** Write the selected parts to merge into the log,
      * Call when merge_selecting_mutex is locked.
      * Returns false if any part is not in ZK.
431 432 433 434
      */
    bool createLogEntryToMergeParts(
        const MergeTreeData::DataPartsVector & parts,
        const String & merged_name,
Y
Yuri Dyachenko 已提交
435
        bool deduplicate,
436 437
        ReplicatedMergeTreeLogEntryData * out_log_entry = nullptr);

F
f1yegor 已提交
438
    /// Exchange parts.
439

F
f1yegor 已提交
440
    /** Returns an empty string if no one has a part.
441 442 443 444 445
      */
    String findReplicaHavingPart(const String & part_name, bool active);

    /** Find replica having specified part or any part that covers it.
      * If active = true, consider only active replicas.
446
      * If found, returns replica name and set 'entry->actual_new_part_name' to name of found largest covering part.
447 448
      * If not found, returns empty string.
      */
449
    String findReplicaHavingCoveringPart(const LogEntry & entry, bool active);
450

F
f1yegor 已提交
451 452 453
    /** Download the specified part from the specified replica.
      * If `to_detached`, the part is placed in the `detached` directory.
      * If quorum != 0, then the node for tracking the quorum is updated.
454 455 456 457
      * Returns false if part is already fetching right now.
      */
    bool fetchPart(const String & part_name, const String & replica_path, bool to_detached, size_t quorum);

458
    /// Required only to avoid races between executeLogEntry and fetchPartition
459 460 461
    std::unordered_set<String> currently_fetching_parts;
    std::mutex currently_fetching_parts_mutex;

462
    /// With the quorum being tracked, add a replica to the quorum for the part.
463 464 465 466
    void updateQuorum(const String & part_name);

    AbandonableLockInZooKeeper allocateBlockNumber(const String & month_name);

F
f1yegor 已提交
467 468
    /** Wait until all replicas, including this, execute the specified action from the log.
      * If replicas are added at the same time, it may not wait for the newly added replica.
469 470 471
      */
    void waitForAllReplicasToProcessLogEntry(const ReplicatedMergeTreeLogEntryData & entry);

F
f1yegor 已提交
472
    /** Wait until the specified replica executes the specified action from the log.
473 474 475
      */
    void waitForReplicaToProcessLogEntry(const String & replica_name, const ReplicatedMergeTreeLogEntryData & entry);

F
f1yegor 已提交
476
    /// Throw an exception if the table is readonly.
477
    void assertNotReadonly() const;
A
Merge  
Alexey Arno 已提交
478

F
f1yegor 已提交
479 480
    /** Get a lock that protects the specified partition from the merge task.
      * The lock is recursive.
481 482
      */
    std::string acquirePartitionMergeLock(const std::string & partition_name);
A
Merge  
Alexey Milovidov 已提交
483

F
f1yegor 已提交
484 485
    /** Declare that we no longer refer to the lock corresponding to the specified
      * partition. If there are no more links, the lock is destroyed.
486 487 488 489
      */
    void releasePartitionMergeLock(const std::string & partition_name);


F
f1yegor 已提交
490
    /// Check for a node in ZK. If it is, remember this information, and then immediately answer true.
491 492 493 494 495
    std::unordered_set<std::string> existing_nodes_cache;
    std::mutex existing_nodes_cache_mutex;
    bool existsNodeCached(const std::string & path);


F
f1yegor 已提交
496
    /// Resharding.
497 498 499 500 501
    /// Per-replica space information; both fields start at zero.
    struct ReplicaSpaceInfo
    {
        long double factor{0.0L};
        size_t available_size{0};
    };
A
Merge  
Alexey Milovidov 已提交
502

503 504
    using ReplicaToSpaceInfo = std::map<std::string, ReplicaSpaceInfo>;

F
f1yegor 已提交
505
    /** Checks that the structures of the local and replicated tables are the same.
506 507 508
      */
    void enforceShardsConsistency(const WeightedZooKeeperPaths & weighted_zookeeper_paths);

F
f1yegor 已提交
509 510
    /** Get information about free space on replicas + additional information
      * for the function checkSpaceForResharding.
511 512 513
      */
    ReplicaToSpaceInfo gatherReplicaSpaceInfo(const WeightedZooKeeperPaths & weighted_zookeeper_paths);

F
f1yegor 已提交
514
    /** Checks that there is enough free space locally and on all replicas.
515 516
      */
    bool checkSpaceForResharding(const ReplicaToSpaceInfo & replica_to_space_info, size_t partition_size) const;
A
Merge  
Alexey Arno 已提交
517 518
};

519

520
extern const Int64 RESERVED_BLOCK_NUMBERS;
521 522
extern const int MAX_AGE_OF_LOCAL_PART_THAT_WASNT_ADDED_TO_ZOOKEEPER;

M
Merge  
Michael Kolupaev 已提交
523
}