file_reader_writer.h 14.2 KB
Newer Older
1
//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
S
Siying Dong 已提交
2 3 4
//  This source code is licensed under both the GPLv2 (found in the
//  COPYING file in the root directory) and Apache 2.0 License
//  (found in the LICENSE.Apache file in the root directory).
5 6 7 8
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
9

10
#pragma once
A
Aaron Gao 已提交
11
#include <atomic>
Z
Zhichao Cao 已提交
12
#include <sstream>
S
sdong 已提交
13
#include <string>
A
Aaron Gao 已提交
14
#include "port/port.h"
15
#include "rocksdb/env.h"
16
#include "rocksdb/listener.h"
17
#include "rocksdb/rate_limiter.h"
18
#include "test_util/sync_point.h"
19
#include "util/aligned_buffer.h"
20 21

namespace rocksdb {
22 23

class Statistics;
24
class HistogramImpl;
25

26 27 28 29 30 31 32 33 34 35
// This file provides the following main abstractions:
// SequentialFileReader : wrapper over Env::SequentialFile
// RandomAccessFileReader : wrapper over Env::RandomAccessFile
// WritableFileWriter : wrapper over Env::WritableFile
// In addition, it also exposes NewReadaheadRandomAccessFile, NewWritableFile,
// and ReadOneLine primitives.

// NewReadaheadRandomAccessFile provides a wrapper over RandomAccessFile to
// always prefetch additional data with every read. This is mainly used in
// Compaction Table Readers.
36
std::unique_ptr<RandomAccessFile> NewReadaheadRandomAccessFile(
37
  std::unique_ptr<RandomAccessFile>&& file, size_t readahead_size);
38

39 40 41
// SequentialFileReader is a wrapper on top of Env::SequentialFile. It handles
// Buffered (i.e. when page cache is enabled) and Direct (with O_DIRECT / page
// cache disabled) reads appropriately, and also updates the IO stats.
42 43 44
class SequentialFileReader {
 private:
  std::unique_ptr<SequentialFile> file_;
45
  std::string file_name_;
A
Aaron Gao 已提交
46
  std::atomic<size_t> offset_;  // read offset
47 48

 public:
49 50 51
  explicit SequentialFileReader(std::unique_ptr<SequentialFile>&& _file,
                                const std::string& _file_name)
      : file_(std::move(_file)), file_name_(_file_name), offset_(0) {}
52 53 54 55 56 57 58 59 60 61

  SequentialFileReader(SequentialFileReader&& o) ROCKSDB_NOEXCEPT {
    *this = std::move(o);
  }

  SequentialFileReader& operator=(SequentialFileReader&& o) ROCKSDB_NOEXCEPT {
    file_ = std::move(o.file_);
    return *this;
  }

S
sdong 已提交
62 63
  SequentialFileReader(const SequentialFileReader&) = delete;
  SequentialFileReader& operator=(const SequentialFileReader&) = delete;
64

65 66 67 68
  Status Read(size_t n, Slice* result, char* scratch);

  Status Skip(uint64_t n);

A
Anirban Rahut 已提交
69 70
  void Rewind();

71
  SequentialFile* file() { return file_.get(); }
A
Aaron Gao 已提交
72

73 74
  std::string file_name() { return file_name_; }

75
  bool use_direct_io() const { return file_->use_direct_io(); }
76 77
};

78 79 80 81 82 83
// RandomAccessFileReader is a wrapper on top of Env::RandomAccessFile. It is
// responsible for:
// - Handling Buffered and Direct reads appropriately.
// - Rate limiting compaction reads.
// - Notifying any interested listeners on the completion of a read.
// - Updating IO stats.
84
class RandomAccessFileReader {
85
 private:
86
#ifndef ROCKSDB_LITE
87 88 89
  void NotifyOnFileReadFinish(uint64_t offset, size_t length,
                              const FileOperationInfo::TimePoint& start_ts,
                              const FileOperationInfo::TimePoint& finish_ts,
90
                              const Status& status) const {
91
    FileOperationInfo info(file_name_, start_ts, finish_ts);
92 93 94 95 96 97 98 99 100 101 102 103
    info.offset = offset;
    info.length = length;
    info.status = status;

    for (auto& listener : listeners_) {
      listener->OnFileReadFinish(info);
    }
  }
#endif  // ROCKSDB_LITE

  bool ShouldNotifyListeners() const { return !listeners_.empty(); }

104
  std::unique_ptr<RandomAccessFile> file_;
105
  std::string     file_name_;
106 107 108 109
  Env*            env_;
  Statistics*     stats_;
  uint32_t        hist_type_;
  HistogramImpl*  file_read_hist_;
110 111
  RateLimiter* rate_limiter_;
  bool for_compaction_;
112
  std::vector<std::shared_ptr<EventListener>> listeners_;
113 114

 public:
115 116 117 118 119 120
  explicit RandomAccessFileReader(
      std::unique_ptr<RandomAccessFile>&& raf, std::string _file_name,
      Env* env = nullptr, Statistics* stats = nullptr, uint32_t hist_type = 0,
      HistogramImpl* file_read_hist = nullptr,
      RateLimiter* rate_limiter = nullptr, bool for_compaction = false,
      const std::vector<std::shared_ptr<EventListener>>& listeners = {})
121
      : file_(std::move(raf)),
122
        file_name_(std::move(_file_name)),
123 124
        env_(env),
        stats_(stats),
125
        hist_type_(hist_type),
126 127
        file_read_hist_(file_read_hist),
        rate_limiter_(rate_limiter),
128 129 130 131 132 133 134 135 136 137 138 139 140
        for_compaction_(for_compaction),
        listeners_() {
#ifndef ROCKSDB_LITE
    std::for_each(listeners.begin(), listeners.end(),
                  [this](const std::shared_ptr<EventListener>& e) {
                    if (e->ShouldBeNotifiedOnFileIO()) {
                      listeners_.emplace_back(e);
                    }
                  });
#else  // !ROCKSDB_LITE
    (void)listeners;
#endif
  }
141

142 143 144 145
  RandomAccessFileReader(RandomAccessFileReader&& o) ROCKSDB_NOEXCEPT {
    *this = std::move(o);
  }

146 147
  RandomAccessFileReader& operator=(RandomAccessFileReader&& o)
      ROCKSDB_NOEXCEPT {
148 149 150 151 152
    file_ = std::move(o.file_);
    env_ = std::move(o.env_);
    stats_ = std::move(o.stats_);
    hist_type_ = std::move(o.hist_type_);
    file_read_hist_ = std::move(o.file_read_hist_);
153 154
    rate_limiter_ = std::move(o.rate_limiter_);
    for_compaction_ = std::move(o.for_compaction_);
155 156 157 158 159 160
    return *this;
  }

  RandomAccessFileReader(const RandomAccessFileReader&) = delete;
  RandomAccessFileReader& operator=(const RandomAccessFileReader&) = delete;

161 162
  Status Read(uint64_t offset, size_t n, Slice* result, char* scratch,
              bool for_compaction = false) const;
163

A
anand76 已提交
164 165
  Status MultiRead(ReadRequest* reqs, size_t num_reqs) const;

A
Aaron Gao 已提交
166 167 168 169
  Status Prefetch(uint64_t offset, size_t n) const {
    return file_->Prefetch(offset, n);
  }

170
  RandomAccessFile* file() { return file_.get(); }
A
Aaron Gao 已提交
171

172 173
  std::string file_name() const { return file_name_; }

174
  bool use_direct_io() const { return file_->use_direct_io(); }
175 176
};

177 178 179 180 181 182 183
// WritableFileWriter is a wrapper on top of Env::WritableFile. It provides
// facilities to:
// - Handle Buffered and Direct writes.
// - Rate limit writes.
// - Flush and Sync the data to the underlying filesystem.
// - Notify any interested listeners on the completion of a write.
// - Update IO stats.
184 185
class WritableFileWriter {
 private:
186
#ifndef ROCKSDB_LITE
187 188 189
  void NotifyOnFileWriteFinish(uint64_t offset, size_t length,
                               const FileOperationInfo::TimePoint& start_ts,
                               const FileOperationInfo::TimePoint& finish_ts,
190
                               const Status& status) {
191
    FileOperationInfo info(file_name_, start_ts, finish_ts);
192 193 194 195 196 197 198 199 200 201 202 203
    info.offset = offset;
    info.length = length;
    info.status = status;

    for (auto& listener : listeners_) {
      listener->OnFileWriteFinish(info);
    }
  }
#endif  // ROCKSDB_LITE

  bool ShouldNotifyListeners() const { return !listeners_.empty(); }

204
  std::unique_ptr<WritableFile> writable_file_;
205
  std::string file_name_;
206
  Env* env_;
207
  AlignedBuffer           buf_;
208
  size_t                  max_buffer_size_;
209 210 211
  // Actually written data size can be used for truncate
  // not counting padding data
  uint64_t                filesize_;
S
Siying Dong 已提交
212
#ifndef ROCKSDB_LITE
213 214 215 216
  // This is necessary when we use unbuffered access
  // and writes must happen on aligned offsets
  // so we need to go back and write that page again
  uint64_t                next_write_offset_;
S
Siying Dong 已提交
217
#endif  // ROCKSDB_LITE
218 219 220 221
  bool                    pending_sync_;
  uint64_t                last_sync_size_;
  uint64_t                bytes_per_sync_;
  RateLimiter*            rate_limiter_;
222
  Statistics* stats_;
223
  std::vector<std::shared_ptr<EventListener>> listeners_;
224 225

 public:
226 227
  WritableFileWriter(
      std::unique_ptr<WritableFile>&& file, const std::string& _file_name,
228 229
      const EnvOptions& options, Env* env = nullptr,
      Statistics* stats = nullptr,
230
      const std::vector<std::shared_ptr<EventListener>>& listeners = {})
231
      : writable_file_(std::move(file)),
232
        file_name_(_file_name),
233
        env_(env),
234
        buf_(),
235
        max_buffer_size_(options.writable_file_max_buffer_size),
236
        filesize_(0),
S
Siying Dong 已提交
237
#ifndef ROCKSDB_LITE
238
        next_write_offset_(0),
S
Siying Dong 已提交
239
#endif  // ROCKSDB_LITE
240 241 242
        pending_sync_(false),
        last_sync_size_(0),
        bytes_per_sync_(options.bytes_per_sync),
243
        rate_limiter_(options.rate_limiter),
244 245
        stats_(stats),
        listeners_() {
246 247
    TEST_SYNC_POINT_CALLBACK("WritableFileWriter::WritableFileWriter:0",
                             reinterpret_cast<void*>(max_buffer_size_));
248
    buf_.Alignment(writable_file_->GetRequiredBufferAlignment());
249
    buf_.AllocateNewBuffer(std::min((size_t)65536, max_buffer_size_));
250 251 252 253 254 255 256 257 258 259
#ifndef ROCKSDB_LITE
    std::for_each(listeners.begin(), listeners.end(),
                  [this](const std::shared_ptr<EventListener>& e) {
                    if (e->ShouldBeNotifiedOnFileIO()) {
                      listeners_.emplace_back(e);
                    }
                  });
#else  // !ROCKSDB_LITE
    (void)listeners;
#endif
260 261 262 263 264 265 266
  }

  WritableFileWriter(const WritableFileWriter&) = delete;

  WritableFileWriter& operator=(const WritableFileWriter&) = delete;

  ~WritableFileWriter() { Close(); }
267

268 269
  std::string file_name() const { return file_name_; }

270 271
  Status Append(const Slice& data);

272 273
  Status Pad(const size_t pad_bytes);

274 275 276 277 278 279
  Status Flush();

  Status Close();

  Status Sync(bool use_fsync);

280 281 282 283 284
  // Sync only the data that was already Flush()ed. Safe to call concurrently
  // with Append() and Flush(). If !writable_file_->IsSyncThreadSafe(),
  // returns NotSupported status.
  Status SyncWithoutFlush(bool use_fsync);

285
  uint64_t GetFileSize() const { return filesize_; }
286 287 288 289 290 291 292

  Status InvalidateCache(size_t offset, size_t length) {
    return writable_file_->InvalidateCache(offset, length);
  }

  WritableFile* writable_file() const { return writable_file_.get(); }

293
  bool use_direct_io() { return writable_file_->use_direct_io(); }
A
Aaron Gao 已提交
294

295 296
  bool TEST_BufferIsEmpty() { return buf_.CurrentSize() == 0; }

297
 private:
298
  // Used when os buffering is OFF and we are writing
A
Aaron Gao 已提交
299
  // DMA such as in Direct I/O mode
A
Aaron Gao 已提交
300
#ifndef ROCKSDB_LITE
A
Aaron Gao 已提交
301
  Status WriteDirect();
A
Aaron Gao 已提交
302
#endif  // !ROCKSDB_LITE
303 304
  // Normal write
  Status WriteBuffered(const char* data, size_t size);
305
  Status RangeSync(uint64_t offset, uint64_t nbytes);
306
  Status SyncInternal(bool use_fsync);
307
};
S
sdong 已提交
308

309
// FilePrefetchBuffer is a smart buffer to store and read data from a file.
310 311
class FilePrefetchBuffer {
 public:
312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333
  // Constructor.
  //
  // All arguments are optional.
  // file_reader        : the file reader to use. Can be a nullptr.
  // readahead_size     : the initial readahead size.
  // max_readahead_size : the maximum readahead size.
  //   If max_readahead_size > readahead_size, the readahead size will be
  //   doubled on every IO until max_readahead_size is hit.
  //   Typically this is set as a multiple of readahead_size.
  //   max_readahead_size should be greater than equal to readahead_size.
  // enable : controls whether reading from the buffer is enabled.
  //   If false, TryReadFromCache() always return false, and we only take stats
  //   for the minimum offset if track_min_offset = true.
  // track_min_offset : Track the minimum offset ever read and collect stats on
  //   it. Used for adaptable readahead of the file footer/metadata.
  //
  // Automatic readhead is enabled for a file if file_reader, readahead_size,
  // and max_readahead_size are passed in.
  // If file_reader is a nullptr, setting readadhead_size and max_readahead_size
  // does not make any sense. So it does nothing.
  // A user can construct a FilePrefetchBuffer without any arguments, but use
  // `Prefetch` to load data into the buffer.
334
  FilePrefetchBuffer(RandomAccessFileReader* file_reader = nullptr,
335 336
                     size_t readadhead_size = 0, size_t max_readahead_size = 0,
                     bool enable = true, bool track_min_offset = false)
337 338 339
      : buffer_offset_(0),
        file_reader_(file_reader),
        readahead_size_(readadhead_size),
340 341 342 343
        max_readahead_size_(max_readahead_size),
        min_offset_read_(port::kMaxSizet),
        enable_(enable),
        track_min_offset_(track_min_offset) {}
344 345 346 347 348

  // Load data into the buffer from a file.
  // reader : the file reader.
  // offset : the file offset to start reading from.
  // n      : the number of bytes to read.
349 350 351
  // for_compaction : if prefetch is done for compaction read.
  Status Prefetch(RandomAccessFileReader* reader, uint64_t offset, size_t n,
                  bool for_compaction = false);
352 353 354 355 356 357 358 359 360 361

  // Tries returning the data for a file raed from this buffer, if that data is
  // in the buffer.
  // It handles tracking the minimum read offset if track_min_offset = true.
  // It also does the exponential readahead when readadhead_size is set as part
  // of the constructor.
  //
  // offset : the file offset.
  // n      : the number of bytes.
  // result : output buffer to put the data into.
362 363 364
  // for_compaction : if cache read is done for compaction read.
  bool TryReadFromCache(uint64_t offset, size_t n, Slice* result,
                        bool for_compaction = false);
365

366 367
  // The minimum `offset` ever passed to TryReadFromCache(). This will nly be
  // tracked if track_min_offset = true.
368 369
  size_t min_offset_read() const { return min_offset_read_; }

370 371 372
 private:
  AlignedBuffer buffer_;
  uint64_t buffer_offset_;
373 374 375
  RandomAccessFileReader* file_reader_;
  size_t readahead_size_;
  size_t max_readahead_size_;
376 377 378 379 380 381 382 383
  // The minimum `offset` ever passed to TryReadFromCache().
  size_t min_offset_read_;
  // if false, TryReadFromCache() always return false, and we only take stats
  // for track_min_offset_ if track_min_offset_ = true
  bool enable_;
  // If true, track minimum `offset` ever passed to TryReadFromCache(), which
  // can be fetched from min_offset_read().
  bool track_min_offset_;
384 385
};

386 387 388 389 390 391
// Returns a WritableFile.
//
// env     : the Env.
// fname   : the file name.
// result  : output arg. A WritableFile based on `fname` is returned.
// options : the Env Options.
S
sdong 已提交
392
extern Status NewWritableFile(Env* env, const std::string& fname,
393
                              std::unique_ptr<WritableFile>* result,
S
sdong 已提交
394
                              const EnvOptions& options);
395 396

// Read a single line from a file.
Z
Zhichao Cao 已提交
397 398 399
bool ReadOneLine(std::istringstream* iss, SequentialFile* seq_file,
                 std::string* output, bool* has_data, Status* result);

400
}  // namespace rocksdb