column_family.cc 30.9 KB
Newer Older
1 2 3 4 5 6 7 8 9
//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
//  This source code is licensed under the BSD-style license found in the
//  LICENSE file in the root directory of this source tree. An additional grant
//  of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

I
Igor Canadi 已提交
10
#include "db/column_family.h"
11

12 13 14 15 16
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS
#endif

#include <inttypes.h>
17 18 19
#include <vector>
#include <string>
#include <algorithm>
I
Igor Canadi 已提交
20
#include <limits>
21

I
Igor Canadi 已提交
22
#include "db/compaction_picker.h"
23
#include "db/db_impl.h"
24 25 26
#include "db/job_context.h"
#include "db/version_set.h"
#include "db/writebuffer.h"
27
#include "db/internal_stats.h"
I
Igor Canadi 已提交
28
#include "db/job_context.h"
29
#include "db/table_properties_collector.h"
I
Igor Canadi 已提交
30
#include "db/version_set.h"
31
#include "db/write_controller.h"
32
#include "util/autovector.h"
33
#include "util/hash_skiplist_rep.h"
34
#include "util/options_helper.h"
35
#include "util/xfunc.h"
I
Igor Canadi 已提交
36 37 38

namespace rocksdb {

39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68
namespace {
// This function computes the amount of time in microseconds by which a write
// should be delayed based on the number of level-0 files according to the
// following formula:
// if n < bottom, return 0;
// if n >= top, return 1000;
// otherwise, let r = (n - bottom) /
//                    (top - bottom)
//  and return r^2 * 1000.
// The goal of this formula is to gradually increase the rate at which writes
// are slowed. We also tried linear delay (r * 1000), but it seemed to do
// slightly worse. There is no other particular reason for choosing quadratic.
uint64_t SlowdownAmount(int n, double bottom, double top) {
  uint64_t delay;
  if (n >= top) {
    delay = 1000;
  } else if (n < bottom) {
    delay = 0;
  } else {
    // If we are here, we know that:
    //   level0_start_slowdown <= n < level0_slowdown
    // since the previous two conditions are false.
    double how_much = static_cast<double>(n - bottom) / (top - bottom);
    delay = std::max(how_much * how_much * 1000, 100.0);
  }
  assert(delay <= 1000);
  return delay;
}
}  // namespace

I
Igor Canadi 已提交
69
ColumnFamilyHandleImpl::ColumnFamilyHandleImpl(
70
    ColumnFamilyData* column_family_data, DBImpl* db, InstrumentedMutex* mutex)
I
Igor Canadi 已提交
71
    : cfd_(column_family_data), db_(db), mutex_(mutex) {
72 73 74 75 76 77 78
  if (cfd_ != nullptr) {
    cfd_->Ref();
  }
}

ColumnFamilyHandleImpl::~ColumnFamilyHandleImpl() {
  if (cfd_ != nullptr) {
79 80 81
    // Job id == 0 means that this is not our background process, but rather
    // user thread
    JobContext job_context(0);
82 83 84 85
    mutex_->Lock();
    if (cfd_->Unref()) {
      delete cfd_;
    }
I
Igor Canadi 已提交
86
    db_->FindObsoleteFiles(&job_context, false, true);
87
    mutex_->Unlock();
I
Igor Canadi 已提交
88 89
    if (job_context.HaveSomethingToDelete()) {
      db_->PurgeObsoleteFiles(job_context);
90
    }
91 92 93
  }
}

94 95
uint32_t ColumnFamilyHandleImpl::GetID() const { return cfd()->GetID(); }

96 97 98 99
const std::string& ColumnFamilyHandleImpl::GetName() const {
  return cfd()->GetName();
}

100 101 102 103
const Comparator* ColumnFamilyHandleImpl::user_comparator() const {
  return cfd()->user_comparator();
}

104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119
void GetIntTblPropCollectorFactory(
    const ColumnFamilyOptions& cf_options,
    std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
        int_tbl_prop_collector_factories) {
  auto& collector_factories = cf_options.table_properties_collector_factories;
  for (size_t i = 0; i < cf_options.table_properties_collector_factories.size();
       ++i) {
    assert(collector_factories[i]);
    int_tbl_prop_collector_factories->emplace_back(
        new UserKeyTablePropertiesCollectorFactory(collector_factories[i]));
  }
  // Add collector to collect internal key statistics
  int_tbl_prop_collector_factories->emplace_back(
      new InternalKeyPropertiesCollectorFactory);
}

120 121 122
ColumnFamilyOptions SanitizeOptions(const DBOptions& db_options,
                                    const InternalKeyComparator* icmp,
                                    const ColumnFamilyOptions& src) {
123 124
  ColumnFamilyOptions result = src;
  result.comparator = icmp;
I
Igor Canadi 已提交
125 126 127 128
#ifdef OS_MACOSX
  // TODO(icanadi) make write_buffer_size uint64_t instead of size_t
  ClipToRange(&result.write_buffer_size, ((size_t)64) << 10, ((size_t)1) << 30);
#else
129 130
  ClipToRange(&result.write_buffer_size,
              ((size_t)64) << 10, ((size_t)64) << 30);
I
Igor Canadi 已提交
131
#endif
132 133 134 135 136 137 138 139
  // if user sets arena_block_size, we trust user to use this value. Otherwise,
  // calculate a proper value from writer_buffer_size;
  if (result.arena_block_size <= 0) {
    result.arena_block_size = result.write_buffer_size / 10;
  }
  result.min_write_buffer_number_to_merge =
      std::min(result.min_write_buffer_number_to_merge,
               result.max_write_buffer_number - 1);
140 141 142 143 144 145 146
  if (result.num_levels < 1) {
    result.num_levels = 1;
  }
  if (result.compaction_style == kCompactionStyleLevel &&
      result.num_levels < 2) {
    result.num_levels = 2;
  }
147 148 149 150 151 152
  if (result.max_mem_compaction_level >= result.num_levels) {
    result.max_mem_compaction_level = result.num_levels - 1;
  }
  if (result.soft_rate_limit > result.hard_rate_limit) {
    result.soft_rate_limit = result.hard_rate_limit;
  }
153 154 155
  if (result.max_write_buffer_number < 2) {
    result.max_write_buffer_number = 2;
  }
156 157 158 159 160 161 162 163 164 165 166 167
  if (result.max_write_buffer_number_to_maintain < 0) {
    result.max_write_buffer_number_to_maintain = result.max_write_buffer_number;
  }
  XFUNC_TEST("memtablelist_history", "transaction_xftest_SanitizeOptions",
             xf_transaction_set_memtable_history1,
             xf_transaction_set_memtable_history,
             &result.max_write_buffer_number_to_maintain);
  XFUNC_TEST("memtablelist_history_clear", "transaction_xftest_SanitizeOptions",
             xf_transaction_clear_memtable_history1,
             xf_transaction_clear_memtable_history,
             &result.max_write_buffer_number_to_maintain);

168 169 170 171 172
  if (!result.prefix_extractor) {
    assert(result.memtable_factory);
    Slice name = result.memtable_factory->Name();
    if (name.compare("HashSkipListRepFactory") == 0 ||
        name.compare("HashLinkListRepFactory") == 0) {
173 174 175 176
      result.memtable_factory = std::make_shared<SkipListFactory>();
    }
  }

I
Igor Canadi 已提交
177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194
  if (!src.compression_per_level.empty()) {
    for (size_t level = 0; level < src.compression_per_level.size(); ++level) {
      if (!CompressionTypeSupported(src.compression_per_level[level])) {
        Log(InfoLogLevel::WARN_LEVEL, db_options.info_log,
            "Compression type chosen for level %zu is not supported: %s. "
            "RocksDB "
            "will not compress data on level %zu.",
            level, CompressionTypeToString(src.compression_per_level[level]),
            level);
      }
    }
  } else if (!CompressionTypeSupported(src.compression)) {
    Log(InfoLogLevel::WARN_LEVEL, db_options.info_log,
        "Compression type chosen is not supported: %s. RocksDB will not "
        "compress data.",
        CompressionTypeToString(src.compression));
  }

I
Igor Canadi 已提交
195 196 197 198 199 200 201 202 203
  if (result.compaction_style == kCompactionStyleFIFO) {
    result.num_levels = 1;
    // since we delete level0 files in FIFO compaction when there are too many
    // of them, these options don't really mean anything
    result.level0_file_num_compaction_trigger = std::numeric_limits<int>::max();
    result.level0_slowdown_writes_trigger = std::numeric_limits<int>::max();
    result.level0_stop_writes_trigger = std::numeric_limits<int>::max();
  }

204 205 206 207
  if (result.level0_stop_writes_trigger <
          result.level0_slowdown_writes_trigger ||
      result.level0_slowdown_writes_trigger <
          result.level0_file_num_compaction_trigger) {
208
    Warn(db_options.info_log.get(),
209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224
         "This condition must be satisfied: "
         "level0_stop_writes_trigger(%d) >= "
         "level0_slowdown_writes_trigger(%d) >= "
         "level0_file_num_compaction_trigger(%d)",
         result.level0_stop_writes_trigger,
         result.level0_slowdown_writes_trigger,
         result.level0_file_num_compaction_trigger);
    if (result.level0_slowdown_writes_trigger <
        result.level0_file_num_compaction_trigger) {
      result.level0_slowdown_writes_trigger =
          result.level0_file_num_compaction_trigger;
    }
    if (result.level0_stop_writes_trigger <
        result.level0_slowdown_writes_trigger) {
      result.level0_stop_writes_trigger = result.level0_slowdown_writes_trigger;
    }
225
    Warn(db_options.info_log.get(),
226 227 228 229 230 231 232 233
         "Adjust the value to "
         "level0_stop_writes_trigger(%d)"
         "level0_slowdown_writes_trigger(%d)"
         "level0_file_num_compaction_trigger(%d)",
         result.level0_stop_writes_trigger,
         result.level0_slowdown_writes_trigger,
         result.level0_file_num_compaction_trigger);
  }
234 235 236 237 238 239 240 241 242 243
  if (result.level_compaction_dynamic_level_bytes) {
    if (result.compaction_style != kCompactionStyleLevel ||
        db_options.db_paths.size() > 1U) {
      // 1. level_compaction_dynamic_level_bytes only makes sense for
      //    level-based compaction.
      // 2. we don't yet know how to make both of this feature and multiple
      //    DB path work.
      result.level_compaction_dynamic_level_bytes = false;
    }
  }
244

245 246 247
  return result;
}

248 249 250
int SuperVersion::dummy = 0;
void* const SuperVersion::kSVInUse = &SuperVersion::dummy;
void* const SuperVersion::kSVObsolete = nullptr;
251

252 253 254 255 256 257 258 259 260 261 262 263 264
SuperVersion::~SuperVersion() {
  for (auto td : to_delete) {
    delete td;
  }
}

SuperVersion* SuperVersion::Ref() {
  refs.fetch_add(1, std::memory_order_relaxed);
  return this;
}

bool SuperVersion::Unref() {
  // fetch_sub returns the previous value of ref
265
  uint32_t previous_refs = refs.fetch_sub(1);
266 267
  assert(previous_refs > 0);
  return previous_refs == 1;
268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290
}

void SuperVersion::Cleanup() {
  assert(refs.load(std::memory_order_relaxed) == 0);
  imm->Unref(&to_delete);
  MemTable* m = mem->Unref();
  if (m != nullptr) {
    to_delete.push_back(m);
  }
  current->Unref();
}

void SuperVersion::Init(MemTable* new_mem, MemTableListVersion* new_imm,
                        Version* new_current) {
  mem = new_mem;
  imm = new_imm;
  current = new_current;
  mem->Ref();
  imm->Ref();
  current->Ref();
  refs.store(1, std::memory_order_relaxed);
}

291 292
namespace {
void SuperVersionUnrefHandle(void* ptr) {
293 294 295 296
  // UnrefHandle is called when a thread exists or a ThreadLocalPtr gets
  // destroyed. When former happens, the thread shouldn't see kSVInUse.
  // When latter happens, we are in ~ColumnFamilyData(), no get should happen as
  // well.
297 298 299 300 301 302 303 304 305 306
  SuperVersion* sv = static_cast<SuperVersion*>(ptr);
  if (sv->Unref()) {
    sv->db_mutex->Lock();
    sv->Cleanup();
    sv->db_mutex->Unlock();
    delete sv;
  }
}
}  // anonymous namespace

307 308 309 310 311
ColumnFamilyData::ColumnFamilyData(
    uint32_t id, const std::string& name, Version* _dummy_versions,
    Cache* _table_cache, WriteBuffer* write_buffer,
    const ColumnFamilyOptions& cf_options, const DBOptions* db_options,
    const EnvOptions& env_options, ColumnFamilySet* column_family_set)
312 313
    : id_(id),
      name_(name),
I
Igor Canadi 已提交
314
      dummy_versions_(_dummy_versions),
315
      current_(nullptr),
316 317
      refs_(0),
      dropped_(false),
318
      internal_comparator_(cf_options.comparator),
319 320
      options_(*db_options,
               SanitizeOptions(*db_options, &internal_comparator_, cf_options)),
L
Lei Jin 已提交
321
      ioptions_(options_),
322
      mutable_cf_options_(options_, ioptions_),
323
      write_buffer_(write_buffer),
324
      mem_(nullptr),
325 326
      imm_(options_.min_write_buffer_number_to_merge,
           options_.max_write_buffer_number_to_maintain),
327 328
      super_version_(nullptr),
      super_version_number_(0),
329
      local_sv_(new ThreadLocalPtr(&SuperVersionUnrefHandle)),
330 331
      next_(nullptr),
      prev_(nullptr),
332
      log_number_(0),
333 334 335
      column_family_set_(column_family_set),
      pending_flush_(false),
      pending_compaction_(false) {
336 337
  Ref();

338 339 340
  // Convert user defined table properties collector factories to internal ones.
  GetIntTblPropCollectorFactory(options_, &int_tbl_prop_collector_factories_);

I
Igor Canadi 已提交
341 342
  // if _dummy_versions is nullptr, then this is a dummy column family.
  if (_dummy_versions != nullptr) {
343
    internal_stats_.reset(
344
        new InternalStats(ioptions_.num_levels, db_options->env, this));
I
Igor Canadi 已提交
345
    table_cache_.reset(new TableCache(ioptions_, env_options, _table_cache));
346
    if (ioptions_.compaction_style == kCompactionStyleLevel) {
I
Igor Canadi 已提交
347
      compaction_picker_.reset(
348
          new LevelCompactionPicker(ioptions_, &internal_comparator_));
349 350 351 352
#ifndef ROCKSDB_LITE
    } else if (ioptions_.compaction_style == kCompactionStyleUniversal) {
      compaction_picker_.reset(
          new UniversalCompactionPicker(ioptions_, &internal_comparator_));
353
    } else if (ioptions_.compaction_style == kCompactionStyleFIFO) {
I
Igor Canadi 已提交
354
      compaction_picker_.reset(
355
          new FIFOCompactionPicker(ioptions_, &internal_comparator_));
356 357 358 359 360 361 362
    } else if (ioptions_.compaction_style == kCompactionStyleNone) {
      compaction_picker_.reset(new NullCompactionPicker(
          ioptions_, &internal_comparator_));
      Log(InfoLogLevel::WARN_LEVEL, ioptions_.info_log,
          "Column family %s does not use any background compaction. "
          "Compactions can only be done via CompactFiles\n",
          GetName().c_str());
363
#endif  // !ROCKSDB_LITE
364 365 366 367 368 369 370
    } else {
      Log(InfoLogLevel::ERROR_LEVEL, ioptions_.info_log,
          "Unable to recognize the specified compaction style %d. "
          "Column family %s will use kCompactionStyleLevel.\n",
          ioptions_.compaction_style, GetName().c_str());
      compaction_picker_.reset(
          new LevelCompactionPicker(ioptions_, &internal_comparator_));
371
    }
372

373 374 375 376 377 378 379 380
    if (column_family_set_->NumberOfColumnFamilies() < 10) {
      Log(InfoLogLevel::INFO_LEVEL, ioptions_.info_log,
          "--------------- Options for column family [%s]:\n", name.c_str());
      options_.Dump(ioptions_.info_log);
    } else {
      Log(InfoLogLevel::INFO_LEVEL, ioptions_.info_log,
          "\t(skipping printing options)\n");
    }
381
  }
382

383
  RecalculateWriteStallConditions(mutable_cf_options_);
384
}
I
Igor Canadi 已提交
385

386
// DB mutex held
I
Igor Canadi 已提交
387
ColumnFamilyData::~ColumnFamilyData() {
I
Igor Canadi 已提交
388
  assert(refs_.load(std::memory_order_relaxed) == 0);
389 390 391 392 393 394
  // remove from linked list
  auto prev = prev_;
  auto next = next_;
  prev->next_ = next;
  next->prev_ = prev;

I
Igor Canadi 已提交
395 396 397 398
  if (!dropped_ && column_family_set_ != nullptr) {
    // If it's dropped, it's already removed from column family set
    // If column_family_set_ == nullptr, this is dummy CFD and not in
    // ColumnFamilySet
I
Igor Canadi 已提交
399
    column_family_set_->RemoveColumnFamily(this);
400 401 402 403 404 405
  }

  if (current_ != nullptr) {
    current_->Unref();
  }

406 407 408 409 410
  // It would be wrong if this ColumnFamilyData is in flush_queue_ or
  // compaction_queue_ and we destroyed it
  assert(!pending_flush_);
  assert(!pending_compaction_);

I
Igor Canadi 已提交
411 412 413 414 415 416 417 418 419 420 421 422 423 424
  if (super_version_ != nullptr) {
    // Release SuperVersion reference kept in ThreadLocalPtr.
    // This must be done outside of mutex_ since unref handler can lock mutex.
    super_version_->db_mutex->Unlock();
    local_sv_.reset();
    super_version_->db_mutex->Lock();

    bool is_last_reference __attribute__((unused));
    is_last_reference = super_version_->Unref();
    assert(is_last_reference);
    super_version_->Cleanup();
    delete super_version_;
    super_version_ = nullptr;
  }
425

426 427
  if (dummy_versions_ != nullptr) {
    // List must be empty
428 429 430
    assert(dummy_versions_->TEST_Next() == dummy_versions_);
    bool deleted __attribute__((unused)) = dummy_versions_->Unref();
    assert(deleted);
431
  }
432

433 434
  if (mem_ != nullptr) {
    delete mem_->Unref();
435
  }
436
  autovector<MemTable*> to_delete;
437
  imm_.current()->Unref(&to_delete);
438 439 440 441 442
  for (MemTable* m : to_delete) {
    delete m;
  }
}

I
Igor Canadi 已提交
443 444 445 446 447 448 449 450 451 452
void ColumnFamilyData::SetDropped() {
  // can't drop default CF
  assert(id_ != 0);
  dropped_ = true;
  write_controller_token_.reset();

  // remove from column_family_set
  column_family_set_->RemoveColumnFamily(this);
}

453 454
void ColumnFamilyData::RecalculateWriteStallConditions(
      const MutableCFOptions& mutable_cf_options) {
455
  if (current_ != nullptr) {
S
sdong 已提交
456
    auto* vstorage = current_->storage_info();
457 458
    const double score = vstorage->max_compaction_score();
    const int max_level = vstorage->max_compaction_score_level();
459 460 461

    auto write_controller = column_family_set_->write_controller_;

462
    if (imm()->NumNotFlushed() >= mutable_cf_options.max_write_buffer_number) {
463 464
      write_controller_token_ = write_controller->GetStopToken();
      internal_stats_->AddCFStats(InternalStats::MEMTABLE_COMPACTION, 1);
465
      Log(InfoLogLevel::WARN_LEVEL, ioptions_.info_log,
466
          "[%s] Stopping writes because we have %d immutable memtables "
L
Lei Jin 已提交
467
          "(waiting for flush), max_write_buffer_number is set to %d",
468
          name_.c_str(), imm()->NumNotFlushed(),
L
Lei Jin 已提交
469
          mutable_cf_options.max_write_buffer_number);
S
sdong 已提交
470
    } else if (vstorage->l0_delay_trigger_count() >=
471
               mutable_cf_options.level0_stop_writes_trigger) {
472 473
      write_controller_token_ = write_controller->GetStopToken();
      internal_stats_->AddCFStats(InternalStats::LEVEL0_NUM_FILES, 1);
474
      Log(InfoLogLevel::WARN_LEVEL, ioptions_.info_log,
475
          "[%s] Stopping writes because we have %d level-0 files",
S
sdong 已提交
476
          name_.c_str(), vstorage->l0_delay_trigger_count());
477
    } else if (mutable_cf_options.level0_slowdown_writes_trigger >= 0 &&
S
sdong 已提交
478
               vstorage->l0_delay_trigger_count() >=
479
                   mutable_cf_options.level0_slowdown_writes_trigger) {
S
sdong 已提交
480
      uint64_t slowdown =
S
sdong 已提交
481
          SlowdownAmount(vstorage->l0_delay_trigger_count(),
S
sdong 已提交
482 483
                         mutable_cf_options.level0_slowdown_writes_trigger,
                         mutable_cf_options.level0_stop_writes_trigger);
484 485
      write_controller_token_ = write_controller->GetDelayToken(slowdown);
      internal_stats_->AddCFStats(InternalStats::LEVEL0_SLOWDOWN, slowdown);
486
      Log(InfoLogLevel::WARN_LEVEL, ioptions_.info_log,
487 488
          "[%s] Stalling writes because we have %d level-0 files (%" PRIu64
          "us)",
S
sdong 已提交
489
          name_.c_str(), vstorage->l0_delay_trigger_count(), slowdown);
490 491
    } else if (mutable_cf_options.hard_rate_limit > 1.0 &&
               score > mutable_cf_options.hard_rate_limit) {
492 493 494
      uint64_t kHardLimitSlowdown = 1000;
      write_controller_token_ =
          write_controller->GetDelayToken(kHardLimitSlowdown);
495
      internal_stats_->RecordLevelNSlowdown(max_level, false);
496
      Log(InfoLogLevel::WARN_LEVEL, ioptions_.info_log,
497 498 499
          "[%s] Stalling writes because we hit hard limit on level %d. "
          "(%" PRIu64 "us)",
          name_.c_str(), max_level, kHardLimitSlowdown);
500 501 502 503 504
    } else if (mutable_cf_options.soft_rate_limit > 0.0 &&
               score > mutable_cf_options.soft_rate_limit) {
      uint64_t slowdown = SlowdownAmount(score,
          mutable_cf_options.soft_rate_limit,
          mutable_cf_options.hard_rate_limit);
505
      write_controller_token_ = write_controller->GetDelayToken(slowdown);
506
      internal_stats_->RecordLevelNSlowdown(max_level, true);
507
      Log(InfoLogLevel::WARN_LEVEL, ioptions_.info_log,
508 509 510 511 512 513
          "[%s] Stalling writes because we hit soft limit on level %d (%" PRIu64
          "us)",
          name_.c_str(), max_level, slowdown);
    } else {
      write_controller_token_.reset();
    }
514 515 516
  }
}

L
Lei Jin 已提交
517
const EnvOptions* ColumnFamilyData::soptions() const {
L
Lei Jin 已提交
518
  return &(column_family_set_->env_options_);
L
Lei Jin 已提交
519 520
}

I
Igor Canadi 已提交
521 522 523
void ColumnFamilyData::SetCurrent(Version* current_version) {
  current_ = current_version;
}
524

525 526 527 528
uint64_t ColumnFamilyData::GetNumLiveVersions() const {
  return VersionSet::GetNumLiveVersions(dummy_versions_);
}

529
MemTable* ColumnFamilyData::ConstructNewMemtable(
A
agiardullo 已提交
530
    const MutableCFOptions& mutable_cf_options, SequenceNumber earliest_seq) {
531
  assert(current_ != nullptr);
A
agiardullo 已提交
532 533
  return new MemTable(internal_comparator_, ioptions_, mutable_cf_options,
                      write_buffer_, earliest_seq);
534 535 536
}

void ColumnFamilyData::CreateNewMemtable(
A
agiardullo 已提交
537
    const MutableCFOptions& mutable_cf_options, SequenceNumber earliest_seq) {
538 539
  if (mem_ != nullptr) {
    delete mem_->Unref();
540
  }
A
agiardullo 已提交
541
  SetMemtable(ConstructNewMemtable(mutable_cf_options, earliest_seq));
542 543 544
  mem_->Ref();
}

545 546 547 548
bool ColumnFamilyData::NeedsCompaction() const {
  return compaction_picker_->NeedsCompaction(current_->storage_info());
}

549 550
Compaction* ColumnFamilyData::PickCompaction(
    const MutableCFOptions& mutable_options, LogBuffer* log_buffer) {
S
sdong 已提交
551
  auto* result = compaction_picker_->PickCompaction(
S
sdong 已提交
552
      GetName(), mutable_options, current_->storage_info(), log_buffer);
S
sdong 已提交
553 554 555
  if (result != nullptr) {
    result->SetInputVersion(current_);
  }
556
  return result;
557 558
}

559
const int ColumnFamilyData::kCompactAllLevels = -1;
560
const int ColumnFamilyData::kCompactToBaseLevel = -2;
561

562 563 564 565 566
Compaction* ColumnFamilyData::CompactRange(
    const MutableCFOptions& mutable_cf_options,
    int input_level, int output_level, uint32_t output_path_id,
    const InternalKey* begin, const InternalKey* end,
    InternalKey** compaction_end) {
S
sdong 已提交
567
  auto* result = compaction_picker_->CompactRange(
S
sdong 已提交
568
      GetName(), mutable_cf_options, current_->storage_info(), input_level,
S
sdong 已提交
569 570 571 572 573
      output_level, output_path_id, begin, end, compaction_end);
  if (result != nullptr) {
    result->SetInputVersion(current_);
  }
  return result;
574 575
}

576
SuperVersion* ColumnFamilyData::GetReferencedSuperVersion(
577
    InstrumentedMutex* db_mutex) {
578
  SuperVersion* sv = nullptr;
I
Igor Canadi 已提交
579 580 581 582
  sv = GetThreadLocalSuperVersion(db_mutex);
  sv->Ref();
  if (!ReturnThreadLocalSuperVersion(sv)) {
    sv->Unref();
583 584 585 586 587
  }
  return sv;
}

SuperVersion* ColumnFamilyData::GetThreadLocalSuperVersion(
588
    InstrumentedMutex* db_mutex) {
589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610
  SuperVersion* sv = nullptr;
  // The SuperVersion is cached in thread local storage to avoid acquiring
  // mutex when SuperVersion does not change since the last use. When a new
  // SuperVersion is installed, the compaction or flush thread cleans up
  // cached SuperVersion in all existing thread local storage. To avoid
  // acquiring mutex for this operation, we use atomic Swap() on the thread
  // local pointer to guarantee exclusive access. If the thread local pointer
  // is being used while a new SuperVersion is installed, the cached
  // SuperVersion can become stale. In that case, the background thread would
  // have swapped in kSVObsolete. We re-check the value at when returning
  // SuperVersion back to thread local, with an atomic compare and swap.
  // The superversion will need to be released if detected to be stale.
  void* ptr = local_sv_->Swap(SuperVersion::kSVInUse);
  // Invariant:
  // (1) Scrape (always) installs kSVObsolete in ThreadLocal storage
  // (2) the Swap above (always) installs kSVInUse, ThreadLocal storage
  // should only keep kSVInUse before ReturnThreadLocalSuperVersion call
  // (if no Scrape happens).
  assert(ptr != SuperVersion::kSVInUse);
  sv = static_cast<SuperVersion*>(ptr);
  if (sv == SuperVersion::kSVObsolete ||
      sv->version_number != super_version_number_.load()) {
611
    RecordTick(ioptions_.statistics, NUMBER_SUPERVERSION_ACQUIRES);
612 613 614
    SuperVersion* sv_to_delete = nullptr;

    if (sv && sv->Unref()) {
615
      RecordTick(ioptions_.statistics, NUMBER_SUPERVERSION_CLEANUPS);
616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638
      db_mutex->Lock();
      // NOTE: underlying resources held by superversion (sst files) might
      // not be released until the next background job.
      sv->Cleanup();
      sv_to_delete = sv;
    } else {
      db_mutex->Lock();
    }
    sv = super_version_->Ref();
    db_mutex->Unlock();

    delete sv_to_delete;
  }
  assert(sv != nullptr);
  return sv;
}

bool ColumnFamilyData::ReturnThreadLocalSuperVersion(SuperVersion* sv) {
  assert(sv != nullptr);
  // Put the SuperVersion back
  void* expected = SuperVersion::kSVInUse;
  if (local_sv_->CompareAndSwap(static_cast<void*>(sv), expected)) {
    // When we see kSVInUse in the ThreadLocal, we are sure ThreadLocal
C
clark.kang 已提交
639
    // storage has not been altered and no Scrape has happened. The
640 641 642 643 644 645 646 647 648 649 650
    // SuperVersion is still current.
    return true;
  } else {
    // ThreadLocal scrape happened in the process of this GetImpl call (after
    // thread local Swap() at the beginning and before CompareAndSwap()).
    // This means the SuperVersion it holds is obsolete.
    assert(expected == SuperVersion::kSVObsolete);
  }
  return false;
}

651
SuperVersion* ColumnFamilyData::InstallSuperVersion(
652
    SuperVersion* new_superversion, InstrumentedMutex* db_mutex) {
653 654 655 656 657
  db_mutex->AssertHeld();
  return InstallSuperVersion(new_superversion, db_mutex, mutable_cf_options_);
}

SuperVersion* ColumnFamilyData::InstallSuperVersion(
658
    SuperVersion* new_superversion, InstrumentedMutex* db_mutex,
659
    const MutableCFOptions& mutable_cf_options) {
660
  new_superversion->db_mutex = db_mutex;
661
  new_superversion->mutable_cf_options = mutable_cf_options;
662 663 664 665
  new_superversion->Init(mem_, imm_.current(), current_);
  SuperVersion* old_superversion = super_version_;
  super_version_ = new_superversion;
  ++super_version_number_;
666
  super_version_->version_number = super_version_number_;
667
  // Reset SuperVersions cached in thread local storage
I
Igor Canadi 已提交
668
  ResetThreadLocalSuperVersions();
669

670
  RecalculateWriteStallConditions(mutable_cf_options);
671

672 673 674 675 676
  if (old_superversion != nullptr && old_superversion->Unref()) {
    old_superversion->Cleanup();
    return old_superversion;  // will let caller delete outside of mutex
  }
  return nullptr;
I
Igor Canadi 已提交
677 678
}

679 680
void ColumnFamilyData::ResetThreadLocalSuperVersions() {
  autovector<void*> sv_ptrs;
681
  local_sv_->Scrape(&sv_ptrs, SuperVersion::kSVObsolete);
682 683
  for (auto ptr : sv_ptrs) {
    assert(ptr);
684 685 686
    if (ptr == SuperVersion::kSVInUse) {
      continue;
    }
687 688 689 690 691 692 693 694
    auto sv = static_cast<SuperVersion*>(ptr);
    if (sv->Unref()) {
      sv->Cleanup();
      delete sv;
    }
  }
}

I
Igor Canadi 已提交
695
#ifndef ROCKSDB_LITE
696
Status ColumnFamilyData::SetOptions(
697 698
      const std::unordered_map<std::string, std::string>& options_map) {
  MutableCFOptions new_mutable_cf_options;
699 700 701
  Status s = GetMutableOptionsFromStrings(mutable_cf_options_, options_map,
                                          &new_mutable_cf_options);
  if (s.ok()) {
702
    mutable_cf_options_ = new_mutable_cf_options;
703
    mutable_cf_options_.RefreshDerivedOptions(ioptions_);
704
  }
705
  return s;
706
}
I
Igor Canadi 已提交
707
#endif  // ROCKSDB_LITE
708

I
Igor Canadi 已提交
709
ColumnFamilySet::ColumnFamilySet(const std::string& dbname,
I
Igor Canadi 已提交
710
                                 const DBOptions* db_options,
L
Lei Jin 已提交
711
                                 const EnvOptions& env_options,
712
                                 Cache* table_cache,
713
                                 WriteBuffer* write_buffer,
714
                                 WriteController* write_controller)
715
    : max_column_family_(0),
716
      dummy_cfd_(new ColumnFamilyData(0, "", nullptr, nullptr, nullptr,
717
                                      ColumnFamilyOptions(), db_options,
718
                                      env_options, nullptr)),
I
Igor Canadi 已提交
719
      default_cfd_cache_(nullptr),
I
Igor Canadi 已提交
720 721
      db_name_(dbname),
      db_options_(db_options),
L
Lei Jin 已提交
722
      env_options_(env_options),
723
      table_cache_(table_cache),
724
      write_buffer_(write_buffer),
I
Igor Canadi 已提交
725
      write_controller_(write_controller) {
726
  // initialize linked list
727 728
  dummy_cfd_->prev_ = dummy_cfd_;
  dummy_cfd_->next_ = dummy_cfd_;
729
}
I
Igor Canadi 已提交
730 731

ColumnFamilySet::~ColumnFamilySet() {
732 733 734 735
  while (column_family_data_.size() > 0) {
    // cfd destructor will delete itself from column_family_data_
    auto cfd = column_family_data_.begin()->second;
    cfd->Unref();
I
Igor Canadi 已提交
736 737
    delete cfd;
  }
738
  dummy_cfd_->Unref();
739
  delete dummy_cfd_;
I
Igor Canadi 已提交
740 741 742
}

ColumnFamilyData* ColumnFamilySet::GetDefault() const {
I
Igor Canadi 已提交
743 744
  assert(default_cfd_cache_ != nullptr);
  return default_cfd_cache_;
I
Igor Canadi 已提交
745 746 747 748 749 750 751 752 753 754 755
}

ColumnFamilyData* ColumnFamilySet::GetColumnFamily(uint32_t id) const {
  auto cfd_iter = column_family_data_.find(id);
  if (cfd_iter != column_family_data_.end()) {
    return cfd_iter->second;
  } else {
    return nullptr;
  }
}

756 757 758
ColumnFamilyData* ColumnFamilySet::GetColumnFamily(const std::string& name)
    const {
  auto cfd_iter = column_families_.find(name);
I
Igor Canadi 已提交
759 760 761 762 763
  if (cfd_iter != column_families_.end()) {
    auto cfd = GetColumnFamily(cfd_iter->second);
    assert(cfd != nullptr);
    return cfd;
  } else {
764 765
    return nullptr;
  }
I
Igor Canadi 已提交
766 767 768 769 770 771
}

uint32_t ColumnFamilySet::GetNextColumnFamilyID() {
  return ++max_column_family_;
}

772 773 774 775 776 777
uint32_t ColumnFamilySet::GetMaxColumnFamily() { return max_column_family_; }

void ColumnFamilySet::UpdateMaxColumnFamily(uint32_t new_max_column_family) {
  max_column_family_ = std::max(new_max_column_family, max_column_family_);
}

778 779 780 781
size_t ColumnFamilySet::NumberOfColumnFamilies() const {
  return column_families_.size();
}

I
Igor Canadi 已提交
782
// under a DB mutex AND write thread
I
Igor Canadi 已提交
783 784 785 786 787
ColumnFamilyData* ColumnFamilySet::CreateColumnFamily(
    const std::string& name, uint32_t id, Version* dummy_versions,
    const ColumnFamilyOptions& options) {
  assert(column_families_.find(name) == column_families_.end());
  ColumnFamilyData* new_cfd =
788 789 790
      new ColumnFamilyData(id, name, dummy_versions, table_cache_,
                           write_buffer_, options, db_options_,
                           env_options_, this);
791
  column_families_.insert({name, id});
I
Igor Canadi 已提交
792 793
  column_family_data_.insert({id, new_cfd});
  max_column_family_ = std::max(max_column_family_, id);
794
  // add to linked list
795 796 797 798 799
  new_cfd->next_ = dummy_cfd_;
  auto prev = dummy_cfd_->prev_;
  new_cfd->prev_ = prev;
  prev->next_ = new_cfd;
  dummy_cfd_->prev_ = new_cfd;
I
Igor Canadi 已提交
800 801 802
  if (id == 0) {
    default_cfd_cache_ = new_cfd;
  }
I
Igor Canadi 已提交
803 804 805
  return new_cfd;
}

806 807 808 809
// REQUIRES: DB mutex held
void ColumnFamilySet::FreeDeadColumnFamilies() {
  autovector<ColumnFamilyData*> to_delete;
  for (auto cfd = dummy_cfd_->next_; cfd != dummy_cfd_; cfd = cfd->next_) {
I
Igor Canadi 已提交
810
    if (cfd->refs_.load(std::memory_order_relaxed) == 0) {
811 812 813 814 815 816 817 818 819
      to_delete.push_back(cfd);
    }
  }
  for (auto cfd : to_delete) {
    // this is very rare, so it's not a problem that we do it under a mutex
    delete cfd;
  }
}

I
Igor Canadi 已提交
820
// under a DB mutex AND from a write thread
I
Igor Canadi 已提交
821
void ColumnFamilySet::RemoveColumnFamily(ColumnFamilyData* cfd) {
822
  auto cfd_iter = column_family_data_.find(cfd->GetID());
823 824
  assert(cfd_iter != column_family_data_.end());
  column_family_data_.erase(cfd_iter);
825
  column_families_.erase(cfd->GetName());
I
Igor Canadi 已提交
826 827
}

I
Igor Canadi 已提交
828
// under a DB mutex OR from a write thread
829
bool ColumnFamilyMemTablesImpl::Seek(uint32_t column_family_id) {
I
Igor Canadi 已提交
830 831 832 833 834 835
  if (column_family_id == 0) {
    // optimization for common case
    current_ = column_family_set_->GetDefault();
  } else {
    current_ = column_family_set_->GetColumnFamily(column_family_id);
  }
836
  handle_.SetCFD(current_);
837 838
  return current_ != nullptr;
}
839

840 841 842 843 844 845 846 847 848 849
uint64_t ColumnFamilyMemTablesImpl::GetLogNumber() const {
  assert(current_ != nullptr);
  return current_->GetLogNumber();
}

MemTable* ColumnFamilyMemTablesImpl::GetMemTable() const {
  assert(current_ != nullptr);
  return current_->mem();
}

850
ColumnFamilyHandle* ColumnFamilyMemTablesImpl::GetColumnFamilyHandle() {
851
  assert(current_ != nullptr);
852
  return &handle_;
853 854
}

I
Igor Canadi 已提交
855 856 857 858 859 860 861
void ColumnFamilyMemTablesImpl::CheckMemtableFull() {
  if (current_ != nullptr && current_->mem()->ShouldScheduleFlush()) {
    flush_scheduler_->ScheduleFlush(current_);
    current_->mem()->MarkFlushScheduled();
  }
}

862 863 864 865 866 867 868 869 870
uint32_t GetColumnFamilyID(ColumnFamilyHandle* column_family) {
  uint32_t column_family_id = 0;
  if (column_family != nullptr) {
    auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
    column_family_id = cfh->GetID();
  }
  return column_family_id;
}

871 872 873 874 875 876 877 878 879
const Comparator* GetColumnFamilyUserComparator(
    ColumnFamilyHandle* column_family) {
  if (column_family != nullptr) {
    auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
    return cfh->user_comparator();
  }
  return nullptr;
}

I
Igor Canadi 已提交
880
}  // namespace rocksdb