//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
//  This source code is licensed under both the GPLv2 (found in the
//  COPYING file in the root directory) and Apache 2.0 License
//  (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#include "db/builder.h"

I
Igor Canadi 已提交
12
#include <algorithm>
13
#include <deque>
14
#include <vector>
15

16
#include "db/compaction_iterator.h"
J
jorlow@chromium.org 已提交
17
#include "db/dbformat.h"
18
#include "db/event_helpers.h"
19
#include "db/internal_stats.h"
20
#include "db/merge_helper.h"
J
jorlow@chromium.org 已提交
21 22
#include "db/table_cache.h"
#include "db/version_edit.h"
23 24
#include "monitoring/iostats_context_imp.h"
#include "monitoring/thread_status_util.h"
25 26 27
#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "rocksdb/iterator.h"
S
Siying Dong 已提交
28
#include "rocksdb/options.h"
K
kailiu 已提交
29
#include "rocksdb/table.h"
S
Siying Dong 已提交
30
#include "table/block_based_table_builder.h"
31
#include "table/format.h"
S
sdong 已提交
32
#include "table/internal_iterator.h"
33
#include "util/file_reader_writer.h"
34
#include "util/filename.h"
35
#include "util/stop_watch.h"
36
#include "util/sync_point.h"
J
jorlow@chromium.org 已提交
37

38
namespace rocksdb {
J
jorlow@chromium.org 已提交
39

S
Siying Dong 已提交
40 41
class TableFactory;

42
// Creates a TableBuilder through the table factory configured in `ioptions`.
//
// The remaining arguments are bundled into a TableBuilderOptions and handed,
// together with `column_family_id` and `file`, to
// ioptions.table_factory->NewTableBuilder(); its result is returned as-is.
//
// In debug builds this asserts that `column_family_id` equals
// TablePropertiesCollectorFactory::Context::kUnknownColumnFamily exactly when
// `column_family_name` is empty.
//
// The builder is returned as a raw pointer; BuildTable() in this file takes
// ownership of it and destroys it once the table is finished or abandoned.
TableBuilder* NewTableBuilder(
    const ImmutableCFOptions& ioptions, const MutableCFOptions& moptions,
    const InternalKeyComparator& internal_comparator,
    const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
        int_tbl_prop_collector_factories,
    uint32_t column_family_id, const std::string& column_family_name,
    WritableFileWriter* file, const CompressionType compression_type,
    const CompressionOptions& compression_opts, int level,
    const std::string* compression_dict, const bool skip_filters,
    const uint64_t creation_time, const uint64_t oldest_key_time) {
  // An unknown column family id and an empty name must go together.
  assert((column_family_id ==
          TablePropertiesCollectorFactory::Context::kUnknownColumnFamily) ==
         column_family_name.empty());
  return ioptions.table_factory->NewTableBuilder(
      TableBuilderOptions(ioptions, moptions, internal_comparator,
                          int_tbl_prop_collector_factories, compression_type,
                          compression_opts, compression_dict, skip_filters,
                          column_family_name, level, creation_time,
                          oldest_key_time),
      column_family_id, file);
}

64 65
// Builds a table file from the entries of `iter` and the range tombstones of
// `range_del_iter`, recording the result in `*meta`.
//
// Outline:
//   1. Reset meta->fd.file_size to 0, position `iter` at its first entry and
//      load the range tombstones into a RangeDelAggregator. A failure while
//      adding tombstones returns immediately, before any listener is notified.
//   2. Notify listeners that file creation started (skipped in ROCKSDB_LITE).
//   3. If there is any input (point entries or range tombstones), create the
//      file, feed every entry produced by a CompactionIterator and every
//      aggregated range tombstone to a TableBuilder, then Finish() it (or
//      Abandon() it on error / when nothing was added).
//   4. On success, Sync() and Close() the file and open it through
//      `table_cache` to verify it is usable; with `paranoid_file_checks` the
//      whole table is iterated as well.
//   5. An error reported by `iter` overrides the status so far. If the final
//      status is not OK, or the recorded file size is 0, the file is deleted.
//   6. Log and notify listeners that file creation finished.
//
// `*meta` has its boundaries updated for every key and tombstone added;
// file_size and marked_for_compaction are set only after a successful
// Finish(). `*table_properties`, when non-null, receives the finished table's
// properties in that same case.
//
// Returns the resulting status. If the output file cannot be created, the
// "creation finished" event is still reported before returning.
Status BuildTable(
    const std::string& dbname, Env* env, const ImmutableCFOptions& ioptions,
    const MutableCFOptions& mutable_cf_options, const EnvOptions& env_options,
    TableCache* table_cache, InternalIterator* iter,
    std::unique_ptr<InternalIterator> range_del_iter, FileMetaData* meta,
    const InternalKeyComparator& internal_comparator,
    const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
        int_tbl_prop_collector_factories,
    uint32_t column_family_id, const std::string& column_family_name,
    std::vector<SequenceNumber> snapshots,
    SequenceNumber earliest_write_conflict_snapshot,
    SnapshotChecker* snapshot_checker, const CompressionType compression,
    const CompressionOptions& compression_opts, bool paranoid_file_checks,
    InternalStats* internal_stats, TableFileCreationReason reason,
    EventLogger* event_logger, int job_id, const Env::IOPriority io_priority,
    TableProperties* table_properties, int level, const uint64_t creation_time,
    const uint64_t oldest_key_time, Env::WriteLifeTimeHint write_hint) {
  // An unknown column family id and an empty name must go together.
  assert((column_family_id ==
          TablePropertiesCollectorFactory::Context::kUnknownColumnFamily) ==
         column_family_name.empty());
  // Reports the IOStats for flush for every following bytes.
  const size_t kReportFlushIOStatsEvery = 1048576;
  Status s;
  meta->fd.file_size = 0;
  iter->SeekToFirst();
  std::unique_ptr<RangeDelAggregator> range_del_agg(
      new RangeDelAggregator(internal_comparator, snapshots));
  s = range_del_agg->AddTombstones(std::move(range_del_iter));
  if (!s.ok()) {
    // may be non-ok if a range tombstone key is unparsable
    return s;
  }

  std::string fname = TableFileName(ioptions.cf_paths, meta->fd.GetNumber(),
                                    meta->fd.GetPathId());
#ifndef ROCKSDB_LITE
  EventHelpers::NotifyTableFileCreationStarted(
      ioptions.listeners, dbname, column_family_name, fname, job_id, reason);
#endif  // !ROCKSDB_LITE
  TableProperties tp;

  if (iter->Valid() || !range_del_agg->IsEmpty()) {
    // The builder writes through `file_writer`, so the writer is declared
    // first and therefore outlives the builder on every path out of this
    // scope.
    std::unique_ptr<WritableFileWriter> file_writer;
    std::unique_ptr<TableBuilder> builder;
    {
      std::unique_ptr<WritableFile> file;
#ifndef NDEBUG
      bool use_direct_writes = env_options.use_direct_writes;
      TEST_SYNC_POINT_CALLBACK("BuildTable:create_file", &use_direct_writes);
#endif  // !NDEBUG
      s = NewWritableFile(env, fname, &file, env_options);
      if (!s.ok()) {
        // The "started" event has already been sent, so report the failure
        // before bailing out.
        EventHelpers::LogAndNotifyTableFileCreationFinished(
            event_logger, ioptions.listeners, dbname, column_family_name, fname,
            job_id, meta->fd, tp, reason, s);
        return s;
      }
      file->SetIOPriority(io_priority);
      file->SetWriteLifeTimeHint(write_hint);

      file_writer.reset(new WritableFileWriter(std::move(file), fname,
                                               env_options, ioptions.statistics,
                                               ioptions.listeners));
      builder.reset(NewTableBuilder(
          ioptions, mutable_cf_options, internal_comparator,
          int_tbl_prop_collector_factories, column_family_id,
          column_family_name, file_writer.get(), compression, compression_opts,
          level, nullptr /* compression_dict */, false /* skip_filters */,
          creation_time, oldest_key_time));
    }

    MergeHelper merge(env, internal_comparator.user_comparator(),
                      ioptions.merge_operator, nullptr, ioptions.info_log,
                      true /* internal key corruption is not ok */,
                      snapshots.empty() ? 0 : snapshots.back(),
                      snapshot_checker);

    CompactionIterator c_iter(
        iter, internal_comparator.user_comparator(), &merge, kMaxSequenceNumber,
        &snapshots, earliest_write_conflict_snapshot, snapshot_checker, env,
        ShouldReportDetailedTime(env, ioptions.statistics),
        true /* internal key corruption is not ok */, range_del_agg.get());
    c_iter.SeekToFirst();
    for (; c_iter.Valid(); c_iter.Next()) {
      const Slice& key = c_iter.key();
      const Slice& value = c_iter.value();
      builder->Add(key, value);
      meta->UpdateBoundaries(key, c_iter.ikey().sequence);

      // TODO(noetzli): Update stats after flush, too.
      if (io_priority == Env::IO_HIGH &&
          IOSTATS(bytes_written) >= kReportFlushIOStatsEvery) {
        ThreadStatusUtil::SetThreadOperationProperty(
            ThreadStatus::FLUSH_BYTES_WRITTEN, IOSTATS(bytes_written));
      }
    }

    // Append the aggregated range tombstones after the point entries.
    for (auto it = range_del_agg->NewIterator(); it->Valid(); it->Next()) {
      auto tombstone = it->Tombstone();
      auto kv = tombstone.Serialize();
      builder->Add(kv.first.Encode(), kv.second);
      meta->UpdateBoundariesForRange(kv.first, tombstone.SerializeEndKey(),
                                     tombstone.seq_, internal_comparator);
    }

    // Finish and check for builder errors
    tp = builder->GetTableProperties();
    bool empty = builder->NumEntries() == 0 && tp.num_range_deletions == 0;
    s = c_iter.status();
    if (!s.ok() || empty) {
      builder->Abandon();
    } else {
      s = builder->Finish();
    }

    if (s.ok() && !empty) {
      uint64_t file_size = builder->FileSize();
      meta->fd.file_size = file_size;
      meta->marked_for_compaction = builder->NeedCompact();
      assert(meta->fd.GetFileSize() > 0);
      tp = builder->GetTableProperties();  // refresh now that builder is finished
      if (table_properties) {
        *table_properties = tp;
      }
    }
    // Destroy the builder here, before the file is synced and closed.
    builder.reset();

    // Finish and check for file errors
    if (s.ok() && !empty) {
      StopWatch sw(env, ioptions.statistics, TABLE_SYNC_MICROS);
      s = file_writer->Sync(ioptions.use_fsync);
    }
    if (s.ok() && !empty) {
      s = file_writer->Close();
    }

    if (s.ok() && !empty) {
      // Verify that the table is usable
      // We set for_compaction to false and don't OptimizeForCompactionTableRead
      // here because this is a special case after we finish the table building
      // No matter whether use_direct_io_for_flush_and_compaction is true,
      // we will regard this verification as user reads since the goal is
      // to cache it here for further user reads
      std::unique_ptr<InternalIterator> it(table_cache->NewIterator(
          ReadOptions(), env_options, internal_comparator, *meta,
          nullptr /* range_del_agg */,
          mutable_cf_options.prefix_extractor.get(), nullptr,
          (internal_stats == nullptr) ? nullptr
                                      : internal_stats->GetFileReadHist(0),
          false /* for_compaction */, nullptr /* arena */,
          false /* skip_filter */, level));
      s = it->status();
      if (s.ok() && paranoid_file_checks) {
        // Walk the whole table so that any read error surfaces in status().
        for (it->SeekToFirst(); it->Valid(); it->Next()) {
        }
        s = it->status();
      }
    }
  }

  // Check for input iterator errors
  if (!iter->status().ok()) {
    s = iter->status();
  }

  if (!s.ok() || meta->fd.GetFileSize() == 0) {
    // Cleanup of a failed or empty output; the result of the deletion is
    // not checked.
    env->DeleteFile(fname);
  }

  // Output to event logger and fire events.
  EventHelpers::LogAndNotifyTableFileCreationFinished(
      event_logger, ioptions.listeners, dbname, column_family_name, fname,
      job_id, meta->fd, tp, reason, s);

  return s;
}

241
}  // namespace rocksdb