//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
//  This source code is licensed under the BSD-style license found in the
//  LICENSE file in the root directory of this source tree. An additional grant
//  of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#include "db/flush_job.h"

#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS
#endif

#include <inttypes.h>

#include <algorithm>
#include <vector>

#include "db/builder.h"
#include "db/db_iter.h"
#include "db/dbformat.h"
#include "db/event_helpers.h"
#include "db/filename.h"
#include "db/log_reader.h"
#include "db/log_writer.h"
#include "db/memtable.h"
#include "db/memtable_list.h"
#include "db/merge_context.h"
#include "db/version_set.h"
#include "port/likely.h"
#include "port/port.h"
#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "rocksdb/statistics.h"
#include "rocksdb/status.h"
#include "rocksdb/table.h"
#include "table/block.h"
#include "table/block_based_table_factory.h"
#include "table/merger.h"
#include "table/table_builder.h"
#include "table/two_level_iterator.h"
#include "util/coding.h"
#include "util/event_logger.h"
#include "util/file_util.h"
#include "util/iostats_context_imp.h"
#include "util/log_buffer.h"
#include "util/logging.h"
#include "util/mutexlock.h"
#include "util/perf_context_imp.h"
#include "util/stop_watch.h"
#include "util/sync_point.h"
#include "util/thread_status_util.h"

namespace rocksdb {

// Builds a flush job for column family `cfd`.
//
// Every argument is stored in the member of the same name (with a trailing
// underscore); `existing_snapshots` is taken by value and moved into the job.
// Once the members are set, the constructor reports the start of the flush
// through ReportStartedFlush() and hits the "FlushJob::FlushJob()" sync point.
FlushJob::FlushJob(const std::string& dbname, ColumnFamilyData* cfd,
                   const DBOptions& db_options,
                   const MutableCFOptions& mutable_cf_options,
                   const EnvOptions& env_options, VersionSet* versions,
                   InstrumentedMutex* db_mutex,
                   std::atomic<bool>* shutting_down,
                   std::vector<SequenceNumber> existing_snapshots,
                   JobContext* job_context, LogBuffer* log_buffer,
                   Directory* db_directory, Directory* output_file_directory,
                   CompressionType output_compression, Statistics* stats,
                   EventLogger* event_logger)
    : dbname_(dbname),
      cfd_(cfd),
      db_options_(db_options),
      mutable_cf_options_(mutable_cf_options),
      env_options_(env_options),
      versions_(versions),
      db_mutex_(db_mutex),
      shutting_down_(shutting_down),
      existing_snapshots_(std::move(existing_snapshots)),
      job_context_(job_context),
      log_buffer_(log_buffer),
      db_directory_(db_directory),
      output_file_directory_(output_file_directory),
      output_compression_(output_compression),
      stats_(stats),
      event_logger_(event_logger) {
  // Update the thread status to indicate flush.
  ReportStartedFlush();
  TEST_SYNC_POINT("FlushJob::FlushJob()");
}

// Resets the thread status that ReportStartedFlush() set when the job was
// constructed.
FlushJob::~FlushJob() { ThreadStatusUtil::ResetThreadStatus(); }
// Publishes the start of this flush to the thread-status utility: sets the
// column family, marks the thread operation as OP_FLUSH, records the job id
// under the COMPACTION_JOB_ID property, and resets the bytes_written I/O
// stats counter.
void FlushJob::ReportStartedFlush() {
  ThreadStatusUtil::SetColumnFamily(cfd_);
  ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_FLUSH);
  ThreadStatusUtil::SetThreadOperationProperty(ThreadStatus::COMPACTION_JOB_ID,
                                               job_context_->job_id);
  IOSTATS_RESET(bytes_written);
}

void FlushJob::ReportFlushInputSize(const autovector<MemTable*>& mems) {
  uint64_t input_size = 0;
  for (auto* mem : mems) {
    input_size += mem->ApproximateMemoryUsage();
  }
  ThreadStatusUtil::IncreaseThreadOperationProperty(
      ThreadStatus::FLUSH_BYTES_MEMTABLES,
      input_size);
}

void FlushJob::RecordFlushIOStats() {
114
  ThreadStatusUtil::SetThreadOperationProperty(
115 116 117
      ThreadStatus::FLUSH_BYTES_WRITTEN, IOSTATS(bytes_written));
}

// Flushes the memtables picked from the column family's immutable memtable
// list into a new level-0 table file and installs (or rolls back) the result.
//
// Flow:
//   1. Pick the memtables to flush. If none are picked, log a note to the log
//      buffer and return OK.
//   2. Prepare the VersionEdit of the first picked memtable and call
//      WriteLevel0Table(), which asserts that the DB mutex is held and
//      releases/re-acquires it while the table is built.
//   3. If the write succeeded but the DB is shutting down or the column family
//      was dropped, turn the result into ShutdownInProgress.
//   4. On failure roll the memtable flush back; on success install the flush
//      results.
//   5. Record flush I/O stats and emit a "flush_finished" event (with the
//      per-level file counts) into the log buffer. This happens for failed
//      flushes too.
//
// file_meta: optional; when non-null and the flush succeeded it receives a
//            copy of the new file's metadata.
// Returns the status of the write/install steps described above.
Status FlushJob::Run(FileMetaData* file_meta) {
  AutoThreadOperationStageUpdater stage_run(
      ThreadStatus::STAGE_FLUSH_RUN);
  // Save the contents of the earliest memtable as a new Table
  FileMetaData meta;
  autovector<MemTable*> mems;
  cfd_->imm()->PickMemtablesToFlush(&mems);
  if (mems.empty()) {
    LogToBuffer(log_buffer_, "[%s] Nothing in memtable to flush",
                cfd_->GetName().c_str());
    return Status::OK();
  }

  ReportFlushInputSize(mems);

  // The entries in mems are (implicitly) sorted in ascending order by their
  // creation time. We will use the first memtable's `edit` to keep the meta
  // info for this flush.
  MemTable* m = mems[0];
  VersionEdit* edit = m->GetEdits();
  edit->SetPrevLogNumber(0);
  // SetLogNumber(log_num) indicates logs with number smaller than log_num
  // will no longer be picked up for recovery.
  edit->SetLogNumber(mems.back()->GetNextLogNumber());
  edit->SetColumnFamily(cfd_->GetID());

  // This will release and re-acquire the mutex.
  Status s = WriteLevel0Table(mems, edit, &meta);

  if (s.ok() &&
      (shutting_down_->load(std::memory_order_acquire) || cfd_->IsDropped())) {
    s = Status::ShutdownInProgress(
        "Database shutdown or Column family drop during flush");
  }

  if (!s.ok()) {
    cfd_->imm()->RollbackMemtableFlush(mems, meta.fd.GetNumber());
  } else {
    TEST_SYNC_POINT("FlushJob::InstallResults");
    // Replace immutable memtable with the generated Table
    s = cfd_->imm()->InstallMemtableFlushResults(
        cfd_, mutable_cf_options_, mems, versions_, db_mutex_,
        meta.fd.GetNumber(), &job_context_->memtables_to_free, db_directory_,
        log_buffer_);
  }

  if (s.ok() && file_meta != nullptr) {
    *file_meta = meta;
  }
  RecordFlushIOStats();

  // Emit the "flush_finished" event together with the number of files at each
  // level of the current version.
  auto stream = event_logger_->LogToBuffer(log_buffer_);
  stream << "job" << job_context_->job_id << "event"
         << "flush_finished";
  stream << "lsm_state";
  stream.StartArray();
  auto vstorage = cfd_->current()->storage_info();
  for (int level = 0; level < vstorage->num_levels(); ++level) {
    stream << vstorage->NumLevelFiles(level);
  }
  stream.EndArray();

  return s;
}

// Builds one level-0 table file from the given memtables and, on success,
// records it in `edit`.
//
// Must be called with the DB mutex held (asserted). The mutex is released
// while the table is built and re-acquired before returning.
//
// mems: memtables to flush; must be non-empty, because the merging iterator
//       is created from the address of the first memtable iterator.
// edit: version edit that receives the new file via AddFile() when the build
//       succeeded and produced a non-empty file.
// meta: out-param; its file descriptor is initialised with a freshly
//       allocated file number and is passed to BuildTable().
//
// Returns the status of BuildTable(), or the status of the output directory
// Fsync() if the build succeeded but the directory sync failed.
// Flush statistics are recorded regardless of the outcome.
Status FlushJob::WriteLevel0Table(const autovector<MemTable*>& mems,
                                  VersionEdit* edit, FileMetaData* meta) {
  AutoThreadOperationStageUpdater stage_updater(
      ThreadStatus::STAGE_FLUSH_WRITE_L0);
  db_mutex_->AssertHeld();
  const uint64_t start_micros = db_options_.env->NowMicros();
  // path 0 for level 0 file.
  meta->fd = FileDescriptor(versions_->NewFileNumber(), 0, 0);

  Version* base = cfd_->current();
  base->Ref();  // it is likely that we do not need this reference
  Status s;
  {
    // Everything in this scope runs without the DB mutex.
    db_mutex_->Unlock();
    if (log_buffer_) {
      log_buffer_->FlushBufferToLog();
    }
    std::vector<Iterator*> memtables;
    memtables.reserve(mems.size());
    ReadOptions ro;
    ro.total_order_seek = true;
    Arena arena;
    uint64_t total_num_entries = 0, total_num_deletes = 0;
    size_t total_memory_usage = 0;
    for (MemTable* m : mems) {
      Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
          "[%s] [JOB %d] Flushing memtable with next log file: %" PRIu64 "\n",
          cfd_->GetName().c_str(), job_context_->job_id, m->GetNextLogNumber());
      memtables.push_back(m->NewIterator(ro, &arena));
      total_num_entries += m->num_entries();
      total_num_deletes += m->num_deletes();
      total_memory_usage += m->ApproximateMemoryUsage();
    }

    event_logger_->Log() << "job" << job_context_->job_id << "event"
                         << "flush_started"
                         << "num_memtables" << mems.size() << "num_entries"
                         << total_num_entries << "num_deletes"
                         << total_num_deletes << "memory_usage"
                         << total_memory_usage;

    TableFileCreationInfo info;
    {
      // The merging iterator lives only for the duration of the table build.
      ScopedArenaIterator iter(
          NewMergingIterator(&cfd_->internal_comparator(), &memtables[0],
                             static_cast<int>(memtables.size()), &arena));
      Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
          "[%s] [JOB %d] Level-0 flush table #%" PRIu64 ": started",
          cfd_->GetName().c_str(), job_context_->job_id, meta->fd.GetNumber());

      TEST_SYNC_POINT_CALLBACK("FlushJob::WriteLevel0Table:output_compression",
                               &output_compression_);
      s = BuildTable(dbname_, db_options_.env, *cfd_->ioptions(), env_options_,
                     cfd_->table_cache(), iter.get(), meta,
                     cfd_->internal_comparator(),
                     cfd_->int_tbl_prop_collector_factories(), cfd_->GetID(),
                     existing_snapshots_, output_compression_,
                     cfd_->ioptions()->compression_opts,
                     mutable_cf_options_.paranoid_file_checks,
                     cfd_->internal_stats(), Env::IO_HIGH, &table_properties_);
      info.table_properties = table_properties_;
      LogFlush(db_options_.info_log);
    }
    Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
        "[%s] [JOB %d] Level-0 flush table #%" PRIu64 ": %" PRIu64
        " bytes %s"
        "%s",
        cfd_->GetName().c_str(), job_context_->job_id, meta->fd.GetNumber(),
        meta->fd.GetFileSize(), s.ToString().c_str(),
        meta->marked_for_compaction ? " (needs compaction)" : "");

    // output to event logger
    if (s.ok()) {
      info.db_name = dbname_;
      info.cf_name = cfd_->GetName();
      info.file_path = TableFileName(db_options_.db_paths,
                                     meta->fd.GetNumber(),
                                     meta->fd.GetPathId());
      info.file_size = meta->fd.GetFileSize();
      info.job_id = job_context_->job_id;
      EventHelpers::LogAndNotifyTableFileCreation(
          event_logger_, db_options_.listeners,
          meta->fd, info);
      TEST_SYNC_POINT("FlushJob::LogAndNotifyTableFileCreation()");
    }

    // Sync the output directory only after a successful build, and surface a
    // failed sync to the caller instead of discarding it: the file must not
    // be added to the edit below if the directory sync did not succeed.
    if (s.ok() && !db_options_.disableDataSync &&
        output_file_directory_ != nullptr) {
      s = output_file_directory_->Fsync();
    }
    db_mutex_->Lock();
  }
  base->Unref();

  // Note that if file_size is zero, the file has been deleted and
  // should not be added to the manifest.
  if (s.ok() && meta->fd.GetFileSize() > 0) {
    // if we have more than 1 background thread, then we cannot
    // insert files directly into higher levels because some other
    // threads could be concurrently producing compacted files for
    // that key range.
    // Add file to L0
    edit->AddFile(0 /* level */, meta->fd.GetNumber(), meta->fd.GetPathId(),
                  meta->fd.GetFileSize(), meta->smallest, meta->largest,
                  meta->smallest_seqno, meta->largest_seqno,
                  meta->marked_for_compaction);
  }

  // Account for the time spent and the bytes written by this flush.
  InternalStats::CompactionStats stats(1);
  stats.micros = db_options_.env->NowMicros() - start_micros;
  stats.bytes_written = meta->fd.GetFileSize();
  cfd_->internal_stats()->AddCompactionStats(0 /* level */, stats);
  cfd_->internal_stats()->AddCFStats(InternalStats::BYTES_FLUSHED,
                                     meta->fd.GetFileSize());
  RecordTick(stats_, COMPACT_WRITE_BYTES, meta->fd.GetFileSize());
  return s;
}

}  // namespace rocksdb