//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
//  This source code is licensed under both the GPLv2 (found in the
//  COPYING file in the root directory) and Apache 2.0 License
//  (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#include "db/builder.h"

I
Igor Canadi 已提交
12
#include <algorithm>
13
#include <deque>
14
#include <vector>
15

16
#include "db/blob/blob_file_builder.h"
17
#include "db/compaction/compaction_iterator.h"
J
jorlow@chromium.org 已提交
18
#include "db/dbformat.h"
19
#include "db/event_helpers.h"
20
#include "db/internal_stats.h"
21
#include "db/merge_helper.h"
22
#include "db/output_validator.h"
23
#include "db/range_del_aggregator.h"
J
jorlow@chromium.org 已提交
24 25
#include "db/table_cache.h"
#include "db/version_edit.h"
26
#include "file/file_util.h"
27
#include "file/filename.h"
28 29
#include "file/read_write_util.h"
#include "file/writable_file_writer.h"
30 31
#include "monitoring/iostats_context_imp.h"
#include "monitoring/thread_status_util.h"
32
#include "options/options_helper.h"
33 34 35
#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "rocksdb/iterator.h"
S
Siying Dong 已提交
36
#include "rocksdb/options.h"
K
kailiu 已提交
37
#include "rocksdb/table.h"
38
#include "table/block_based/block_based_table_builder.h"
39
#include "table/format.h"
S
sdong 已提交
40
#include "table/internal_iterator.h"
41
#include "test_util/sync_point.h"
42
#include "util/stop_watch.h"
J
jorlow@chromium.org 已提交
43

44
namespace ROCKSDB_NAMESPACE {
J
jorlow@chromium.org 已提交
45

S
Siying Dong 已提交
46 47
// Forward declaration of TableFactory; no definition is provided in this file.
class TableFactory;

48
// Creates a table builder for `file` by delegating to the table factory
// configured in `ioptions`.
//
// All table-level settings (comparator, property collector factories,
// compression settings, level, timestamps, target file size, db/session ids)
// are bundled into a TableBuilderOptions object and handed to the factory
// together with `column_family_id` and `file`.
//
// The column family id and name must agree: the id is kUnknownColumnFamily
// if and only if the name is empty (checked by assert).
//
// Returns whatever raw pointer the factory returns; BuildTable in this file
// takes ownership of it and destroys it.
TableBuilder* NewTableBuilder(
    const ImmutableCFOptions& ioptions, const MutableCFOptions& moptions,
    const InternalKeyComparator& internal_comparator,
    const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
        int_tbl_prop_collector_factories,
    uint32_t column_family_id, const std::string& column_family_name,
    WritableFileWriter* file, const CompressionType compression_type,
    uint64_t sample_for_compression, const CompressionOptions& compression_opts,
    int level, const bool skip_filters, const uint64_t creation_time,
    const uint64_t oldest_key_time, const uint64_t target_file_size,
    const uint64_t file_creation_time, const std::string& db_id,
    const std::string& db_session_id) {
  // An empty name is only legal together with the "unknown" id, and vice
  // versa.
  assert(column_family_name.empty() ==
         (column_family_id ==
          TablePropertiesCollectorFactory::Context::kUnknownColumnFamily));

  // Gather every per-table setting into the options object the factory
  // expects.
  const TableBuilderOptions builder_options(
      ioptions, moptions, internal_comparator,
      int_tbl_prop_collector_factories, compression_type,
      sample_for_compression, compression_opts, skip_filters,
      column_family_name, level, creation_time, oldest_key_time,
      target_file_size, file_creation_time, db_id, db_session_id);

  return ioptions.table_factory->NewTableBuilder(builder_options,
                                                 column_family_id, file);
}

73
// Builds one table file from the entries produced by `iter` plus the range
// tombstones in `range_del_iters`.
//
// Outline of what the function does:
//   1. Collects the range tombstones into a CompactionRangeDelAggregator and
//      announces that table file creation has started.
//   2. If there is any input (a valid `iter` or at least one tombstone),
//      creates the output file named after meta->fd, wraps it in a
//      WritableFileWriter and obtains a TableBuilder via NewTableBuilder().
//   3. Feeds `iter` through a CompactionIterator into the builder, updating
//      the key boundaries in `meta`, then appends the range tombstones.
//   4. Finishes (or abandons) the builder, syncs and closes the file, and
//      records the file checksum in `meta`.
//   5. Finishes (or abandons) the blob file builder, if one was created.
//   6. Re-opens the new table through `table_cache` to verify it is usable;
//      with `paranoid_file_checks` the content is re-read and compared with
//      the validator state accumulated while writing.
//   7. On any error, or when the resulting file size is 0, deletes the table
//      file and every blob file path recorded during this call.
//   8. Logs and notifies listeners that table file creation has finished.
//
// Outputs:
//   - meta:                file size, boundaries, compaction mark and
//                          checksum fields are updated as described above.
//   - blob_file_additions: passed to the BlobFileBuilder; may be nullptr, in
//                          which case no blob file builder is created.
//   - io_status:           receives the IOStatus of file creation, building,
//                          sync and close. Must not be nullptr.
//   - table_properties:    if non-null, receives the builder's table
//                          properties once the table was finished.
//
// Returns the overall Status; a non-OK status of `iter` takes precedence
// over any earlier status.
Status BuildTable(
    const std::string& dbname, VersionSet* versions,
    const ImmutableDBOptions& db_options, const ImmutableCFOptions& ioptions,
    const MutableCFOptions& mutable_cf_options, const FileOptions& file_options,
    TableCache* table_cache, InternalIterator* iter,
    std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
        range_del_iters,
    FileMetaData* meta, std::vector<BlobFileAddition>* blob_file_additions,
    const InternalKeyComparator& internal_comparator,
    const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
        int_tbl_prop_collector_factories,
    uint32_t column_family_id, const std::string& column_family_name,
    std::vector<SequenceNumber> snapshots,
    SequenceNumber earliest_write_conflict_snapshot,
    SnapshotChecker* snapshot_checker, const CompressionType compression,
    uint64_t sample_for_compression, const CompressionOptions& compression_opts,
    bool paranoid_file_checks, InternalStats* internal_stats,
    TableFileCreationReason reason, IOStatus* io_status,
    const std::shared_ptr<IOTracer>& io_tracer, EventLogger* event_logger,
    int job_id, const Env::IOPriority io_priority,
    TableProperties* table_properties, int level, const uint64_t creation_time,
    const uint64_t oldest_key_time, Env::WriteLifeTimeHint write_hint,
    const uint64_t file_creation_time, const std::string& db_id,
    const std::string& db_session_id, const std::string* full_history_ts_low,
    BlobFileCompletionCallback* blob_callback) {
  // The column family id is "unknown" if and only if the name is empty.
  assert((column_family_id ==
          TablePropertiesCollectorFactory::Context::kUnknownColumnFamily) ==
         column_family_name.empty());
  // Reports the IOStats for flush for every following bytes.
  const size_t kReportFlushIOStatsEvery = 1048576;
  // Accumulates order/hash information about everything written so it can be
  // compared with what is read back during the paranoid check below.
  OutputValidator output_validator(
      internal_comparator,
      /*enable_order_check=*/
      mutable_cf_options.check_flush_compaction_key_order,
      /*enable_hash=*/paranoid_file_checks);
  Status s;
  // A file size of 0 is what the cleanup code at the end treats as "no file
  // was produced".
  meta->fd.file_size = 0;
  iter->SeekToFirst();
  std::unique_ptr<CompactionRangeDelAggregator> range_del_agg(
      new CompactionRangeDelAggregator(&internal_comparator, snapshots));
  for (auto& range_del_iter : range_del_iters) {
    range_del_agg->AddTombstones(std::move(range_del_iter));
  }

  // Path of the output table file, derived from the file number and path id
  // stored in meta->fd.
  std::string fname = TableFileName(ioptions.cf_paths, meta->fd.GetNumber(),
                                    meta->fd.GetPathId());
  // Filled in by the blob file builder; used for cleanup on failure.
  std::vector<std::string> blob_file_paths;
  std::string file_checksum = kUnknownFileChecksum;
  std::string file_checksum_func_name = kUnknownFileChecksumFuncName;
#ifndef ROCKSDB_LITE
  EventHelpers::NotifyTableFileCreationStarted(
      ioptions.listeners, dbname, column_family_name, fname, job_id, reason);
#endif  // !ROCKSDB_LITE
  Env* env = db_options.env;
  assert(env);
  FileSystem* fs = db_options.fs.get();
  assert(fs);

  TableProperties tp;
  if (iter->Valid() || !range_del_agg->IsEmpty()) {
    // `file_writer` is declared before `builder` so that, should the two ever
    // be destroyed implicitly, the builder goes away before the writer it was
    // created with. In the normal flow the builder is released explicitly
    // before the file is synced and closed.
    std::unique_ptr<WritableFileWriter> file_writer;
    std::unique_ptr<TableBuilder> builder;
    {
      std::unique_ptr<FSWritableFile> file;
#ifndef NDEBUG
      bool use_direct_writes = file_options.use_direct_writes;
      TEST_SYNC_POINT_CALLBACK("BuildTable:create_file", &use_direct_writes);
#endif  // !NDEBUG
      IOStatus io_s = NewWritableFile(fs, fname, &file, file_options);
      assert(s.ok());
      s = io_s;
      // Do not overwrite an error the caller already recorded.
      if (io_status->ok()) {
        *io_status = io_s;
      }
      if (!s.ok()) {
        // File creation failed: report the failure and bail out. Nothing has
        // been written, so there is nothing to clean up.
        EventHelpers::LogAndNotifyTableFileCreationFinished(
            event_logger, ioptions.listeners, dbname, column_family_name, fname,
            job_id, meta->fd, kInvalidBlobFileNumber, tp, reason, s,
            file_checksum, file_checksum_func_name);
        return s;
      }
      FileTypeSet tmp_set = ioptions.checksum_handoff_file_types;
      file->SetIOPriority(io_priority);
      file->SetWriteLifeTimeHint(write_hint);
      file_writer.reset(new WritableFileWriter(
          std::move(file), fname, file_options, ioptions.clock, io_tracer,
          ioptions.statistics, ioptions.listeners,
          ioptions.file_checksum_gen_factory,
          tmp_set.Contains(FileType::kTableFile)));

      builder.reset(NewTableBuilder(
          ioptions, mutable_cf_options, internal_comparator,
          int_tbl_prop_collector_factories, column_family_id,
          column_family_name, file_writer.get(), compression,
          sample_for_compression, compression_opts, level,
          false /* skip_filters */, creation_time, oldest_key_time,
          0 /*target_file_size*/, file_creation_time, db_id, db_session_id));
    }

    MergeHelper merge(env, internal_comparator.user_comparator(),
                      ioptions.merge_operator, nullptr, ioptions.info_log,
                      true /* internal key corruption is not ok */,
                      snapshots.empty() ? 0 : snapshots.back(),
                      snapshot_checker);

    // A blob file builder is only created when blob files are enabled and the
    // caller supplied somewhere to record the resulting additions.
    std::unique_ptr<BlobFileBuilder> blob_file_builder(
        (mutable_cf_options.enable_blob_files && blob_file_additions)
            ? new BlobFileBuilder(versions, fs, &ioptions, &mutable_cf_options,
                                  &file_options, job_id, column_family_id,
                                  column_family_name, io_priority, write_hint,
                                  io_tracer, blob_callback, &blob_file_paths,
                                  blob_file_additions)
            : nullptr);

    CompactionIterator c_iter(
        iter, internal_comparator.user_comparator(), &merge, kMaxSequenceNumber,
        &snapshots, earliest_write_conflict_snapshot, snapshot_checker, env,
        ShouldReportDetailedTime(env, ioptions.statistics),
        true /* internal key corruption is not ok */, range_del_agg.get(),
        blob_file_builder.get(), ioptions.allow_data_in_errors,
        /*compaction=*/nullptr,
        /*compaction_filter=*/nullptr, /*shutting_down=*/nullptr,
        /*preserve_deletes_seqnum=*/0, /*manual_compaction_paused=*/nullptr,
        db_options.info_log, full_history_ts_low);

    c_iter.SeekToFirst();
    for (; c_iter.Valid(); c_iter.Next()) {
      const Slice& key = c_iter.key();
      const Slice& value = c_iter.value();
      const ParsedInternalKey& ikey = c_iter.ikey();
      // Generate a rolling 64-bit hash of the key and values
      s = output_validator.Add(key, value);
      if (!s.ok()) {
        break;
      }
      builder->Add(key, value);
      meta->UpdateBoundaries(key, value, ikey.sequence, ikey.type);

      // TODO(noetzli): Update stats after flush, too.
      if (io_priority == Env::IO_HIGH &&
          IOSTATS(bytes_written) >= kReportFlushIOStatsEvery) {
        ThreadStatusUtil::SetThreadOperationProperty(
            ThreadStatus::FLUSH_BYTES_WRITTEN, IOSTATS(bytes_written));
      }
    }
    if (!s.ok()) {
      // The validator error is the one reported; the iterator status is
      // deliberately left unchecked.
      c_iter.status().PermitUncheckedError();
    } else if (!c_iter.status().ok()) {
      s = c_iter.status();
    }

    if (s.ok()) {
      // Append the range tombstones and extend the file boundaries so they
      // cover them.
      auto range_del_it = range_del_agg->NewIterator();
      for (range_del_it->SeekToFirst(); range_del_it->Valid();
           range_del_it->Next()) {
        auto tombstone = range_del_it->Tombstone();
        auto kv = tombstone.Serialize();
        builder->Add(kv.first.Encode(), kv.second);
        meta->UpdateBoundariesForRange(kv.first, tombstone.SerializeEndKey(),
                                       tombstone.seq_, internal_comparator);
      }
    }

    TEST_SYNC_POINT("BuildTable:BeforeFinishBuildTable");
    const bool empty = builder->IsEmpty();
    if (!s.ok() || empty) {
      builder->Abandon();
    } else {
      s = builder->Finish();
    }
    if (io_status->ok()) {
      *io_status = builder->io_status();
    }

    if (s.ok() && !empty) {
      uint64_t file_size = builder->FileSize();
      meta->fd.file_size = file_size;
      meta->marked_for_compaction = builder->NeedCompact();
      assert(meta->fd.GetFileSize() > 0);
      tp = builder->GetTableProperties(); // refresh now that builder is finished
      if (table_properties) {
        *table_properties = tp;
      }
    }
    // Release the builder here, before the file is synced and closed.
    builder.reset();

    // Finish and check for file errors
    TEST_SYNC_POINT("BuildTable:BeforeSyncTable");
    if (s.ok() && !empty) {
      StopWatch sw(ioptions.clock, ioptions.statistics, TABLE_SYNC_MICROS);
      *io_status = file_writer->Sync(ioptions.use_fsync);
    }
    TEST_SYNC_POINT("BuildTable:BeforeCloseTableFile");
    if (s.ok() && io_status->ok() && !empty) {
      *io_status = file_writer->Close();
    }
    if (s.ok() && io_status->ok() && !empty) {
      // Add the checksum information to file metadata.
      meta->file_checksum = file_writer->GetFileChecksum();
      meta->file_checksum_func_name = file_writer->GetFileChecksumFuncName();
      file_checksum = meta->file_checksum;
      file_checksum_func_name = meta->file_checksum_func_name;
    }

    // Fold any IO error from sync/close into the overall status.
    if (s.ok()) {
      s = *io_status;
    }

    if (blob_file_builder) {
      if (s.ok()) {
        s = blob_file_builder->Finish();
      } else {
        blob_file_builder->Abandon();
      }
      blob_file_builder.reset();
    }

    // TODO Also check the IO status when create the Iterator.

    if (s.ok() && !empty) {
      // Verify that the table is usable
      // We set for_compaction to false and don't OptimizeForCompactionTableRead
      // here because this is a special case after we finish the table building
      // No matter whether use_direct_io_for_flush_and_compaction is true,
      // we will regard this verification as user reads since the goal is
      // to cache it here for further user reads
      ReadOptions read_options;
      std::unique_ptr<InternalIterator> it(table_cache->NewIterator(
          read_options, file_options, internal_comparator, *meta,
          nullptr /* range_del_agg */,
          mutable_cf_options.prefix_extractor.get(), nullptr,
          (internal_stats == nullptr) ? nullptr
                                      : internal_stats->GetFileReadHist(0),
          TableReaderCaller::kFlush, /*arena=*/nullptr,
          /*skip_filter=*/false, level,
          MaxFileSizeForL0MetaPin(mutable_cf_options),
          /*smallest_compaction_key=*/nullptr,
          /*largest_compaction_key*/ nullptr,
          /*allow_unprepared_value*/ false));
      s = it->status();
      if (s.ok() && paranoid_file_checks) {
        // Re-read the whole file and compare its validator state with the one
        // accumulated while writing.
        OutputValidator file_validator(internal_comparator,
                                       /*enable_order_check=*/true,
                                       /*enable_hash=*/true);
        for (it->SeekToFirst(); it->Valid(); it->Next()) {
          // Generate a rolling 64-bit hash of the key and values
          file_validator.Add(it->key(), it->value()).PermitUncheckedError();
        }
        s = it->status();
        if (s.ok() && !output_validator.CompareValidator(file_validator)) {
          s = Status::Corruption("Paranoid checksums do not match");
        }
      }
    }
  }

  // Check for input iterator errors
  if (!iter->status().ok()) {
    s = iter->status();
  }

  if (!s.ok() || meta->fd.GetFileSize() == 0) {
    // Best-effort cleanup of the table file and of any blob files recorded
    // during this call; deletion errors are intentionally ignored.
    TEST_SYNC_POINT("BuildTable:BeforeDeleteFile");

    constexpr IODebugContext* dbg = nullptr;

    Status ignored = fs->DeleteFile(fname, IOOptions(), dbg);
    ignored.PermitUncheckedError();

    assert(blob_file_additions || blob_file_paths.empty());

    if (blob_file_additions) {
      for (const std::string& blob_file_path : blob_file_paths) {
        ignored = DeleteDBFile(&db_options, blob_file_path, dbname,
                               /*force_bg=*/false, /*force_fg=*/false);
        ignored.PermitUncheckedError();
        TEST_SYNC_POINT("BuildTable::AfterDeleteFile");
      }
    }
  }

  // When no table file was produced, report a placeholder name in the event.
  if (meta->fd.GetFileSize() == 0) {
    fname = "(nil)";
  }
  // Output to event logger and fire events.
  EventHelpers::LogAndNotifyTableFileCreationFinished(
      event_logger, ioptions.listeners, dbname, column_family_name, fname,
      job_id, meta->fd, meta->oldest_blob_file_number, tp, reason, s,
      file_checksum, file_checksum_func_name);

  return s;
}

366
}  // namespace ROCKSDB_NAMESPACE