diff --git a/HISTORY.md b/HISTORY.md
index bbf5bb5a19a8644b8db3c41e460069ebbb4b56a3..5ee1b72487efb74929ac15e4834a37c1408829ba 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -5,12 +5,14 @@
 * Replace `Options::max_background_flushes`, `Options::max_background_compactions`, and `Options::base_background_compactions` all with `Options::max_background_jobs`, which automatically decides how many threads to allocate towards flush/compaction.
 * options.delayed_write_rate by default take the value of options.rate_limiter rate.
 * Replace global variable `IOStatsContext iostats_context` with `IOStatsContext* get_iostats_context()`; replace global variable `PerfContext perf_context` with `PerfContext* get_perf_context()`.
+* DB property "rocksdb.sstables" now prints keys in hex form.
 
 ### New Features
 * Change ticker/histogram statistics implementations to use core-local storage. This improves aggregation speed compared to our previous thread-local approach, particularly for applications with many threads.
 * Users can pass a cache object to write buffer manager, so that they can cap memory usage for memtable and block cache using one single limit.
 * Flush will be triggered when 7/8 of the limit introduced by write_buffer_manager or db_write_buffer_size is triggered, so that the hard threshold is hard to hit.
 * Introduce WriteOptions.low_pri. If it is true, low priority writes will be throttled if the compaction is behind.
+* Measure the estimated number of reads per file. The information can be accessed through DB::GetColumnFamilyMetaData or the "rocksdb.sstables" DB property.
 
 ## 5.5.0 (05/17/2017)
 ### New Features
diff --git a/db/compaction.h b/db/compaction.h
index 954a90a8350ea3e20c2d4e137b815e9fc7f3c85f..457c2cd075dd39b1daf5e9acd50025fa5ab7fd12 100644
--- a/db/compaction.h
+++ b/db/compaction.h
@@ -101,7 +101,8 @@ class Compaction {
   // input level.
   // REQUIREMENT: "compaction_input_level" must be >= 0 and
   //              < "input_levels()"
-  const std::vector<FileMetaData*>* inputs(size_t compaction_input_level) {
+  const std::vector<FileMetaData*>* inputs(
+      size_t compaction_input_level) const {
     assert(compaction_input_level < inputs_.size());
     return &inputs_[compaction_input_level].files;
   }
diff --git a/db/internal_stats.cc b/db/internal_stats.cc
index c23efd565137638db4a330690833c355e7ca57d2..c2a528e831ce5c19f5359100e58c64f2a71511e7 100644
--- a/db/internal_stats.cc
+++ b/db/internal_stats.cc
@@ -544,7 +544,7 @@ bool InternalStats::HandleDBStats(std::string* value, Slice suffix) {
 
 bool InternalStats::HandleSsTables(std::string* value, Slice suffix) {
   auto* current = cfd_->current();
-  *value = current->DebugString();
+  *value = current->DebugString(true, true);
   return true;
 }
diff --git a/db/version_edit.h b/db/version_edit.h
index bdfe81533ebd31d2d9234982e9f347ac00afe90e..72b0522773a9f7d9f3c28b43a8e0120db52ea5c9 100644
--- a/db/version_edit.h
+++ b/db/version_edit.h
@@ -63,18 +63,30 @@ struct FileDescriptor {
   uint64_t GetFileSize() const { return file_size; }
 };
 
+struct FileSampledStats {
+  FileSampledStats() : num_reads_sampled(0) {}
+  FileSampledStats(const FileSampledStats& other) { *this = other; }
+  FileSampledStats& operator=(const FileSampledStats& other) {
+    num_reads_sampled = other.num_reads_sampled.load();
+    return *this;
+  }
+
+  // Number of user reads to this file.
+  mutable std::atomic<uint64_t> num_reads_sampled;
+};
+
 struct FileMetaData {
-  int refs;
   FileDescriptor fd;
   InternalKey smallest;            // Smallest internal key served by table
   InternalKey largest;             // Largest internal key served by table
-  bool being_compacted;            // Is this file undergoing compaction?
   SequenceNumber smallest_seqno;   // The smallest seqno in this file
   SequenceNumber largest_seqno;    // The largest seqno in this file
 
   // Needs to be disposed when refs becomes 0.
   Cache::Handle* table_reader_handle;
 
+  FileSampledStats stats;
+
   // Stats for compensating deletion entries during compaction
 
   // File size compensated by deletion entry.
@@ -87,6 +99,10 @@
   uint64_t num_deletions;          // the number of deletion entries.
   uint64_t raw_key_size;           // total uncompressed key size.
   uint64_t raw_value_size;         // total uncompressed value size.
+
+  int refs;  // Reference count
+
+  bool being_compacted;  // Is this file undergoing compaction?
   bool init_stats_from_file;   // true if the data-entry stats of this file
                                // has initialized from file.
 
@@ -94,9 +110,7 @@
                                // file.
 
   FileMetaData()
-      : refs(0),
-        being_compacted(false),
-        smallest_seqno(kMaxSequenceNumber),
+      : smallest_seqno(kMaxSequenceNumber),
         largest_seqno(0),
         table_reader_handle(nullptr),
         compensated_file_size(0),
@@ -104,6 +118,8 @@
         num_deletions(0),
         raw_key_size(0),
         raw_value_size(0),
+        refs(0),
+        being_compacted(false),
         init_stats_from_file(false),
         marked_for_compaction(false) {}
 
@@ -119,10 +135,12 @@
   }
 };
 
-// A compressed copy of file meta data that just contain
-// smallest and largest key's slice
+// A compressed copy of file meta data that just contains the minimum data
+// needed to serve read operations, while still keeping a pointer to the full
+// metadata of the file in case it is needed.
 struct FdWithKeyRange {
   FileDescriptor fd;
+  FileMetaData* file_metadata;  // Points to all metadata
   Slice smallest_key;    // slice that contain smallest key
   Slice largest_key;     // slice that contain largest key
 
@@ -132,8 +150,12 @@
         largest_key() {
   }
 
-  FdWithKeyRange(FileDescriptor _fd, Slice _smallest_key, Slice _largest_key)
-      : fd(_fd), smallest_key(_smallest_key), largest_key(_largest_key) {}
+  FdWithKeyRange(FileDescriptor _fd, Slice _smallest_key, Slice _largest_key,
+                 FileMetaData* _file_metadata)
+      : fd(_fd),
+        file_metadata(_file_metadata),
+        smallest_key(_smallest_key),
+        largest_key(_largest_key) {}
 };
 
 // Data structure to store an array of FdWithKeyRange in one level
diff --git a/db/version_set.cc b/db/version_set.cc
index ae284c0850f134a99d57dc43f9c9ed2cff03c7f5..cfd905c34db9176454719d95a761dfa9db4a481c 100644
--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -34,6 +34,7 @@
 #include "db/pinned_iterators_manager.h"
 #include "db/table_cache.h"
 #include "db/version_builder.h"
+#include "monitoring/file_read_sample.h"
 #include "monitoring/perf_context_imp.h"
 #include "rocksdb/env.h"
 #include "rocksdb/merge_operator.h"
@@ -122,7 +123,7 @@ class FilePicker {
     }
   }
 
-  int GetCurrentLevel() { return returned_file_level_; }
+  int GetCurrentLevel() const { return curr_level_; }
 
   FdWithKeyRange* GetNextFile() {
     while (!search_ended_) {  // Loops over different levels.
@@ -227,9 +228,7 @@ class FilePicker {
   unsigned int hit_file_level_;
   int32_t search_left_bound_;
   int32_t search_right_bound_;
-#ifndef NDEBUG
   std::vector<FileMetaData*>* files_;
-#endif
   autovector<LevelFilesBrief>* level_files_brief_;
   bool search_ended_;
   bool is_hit_file_last_in_level_;
@@ -370,6 +369,7 @@ void DoGenerateLevelFilesBrief(LevelFilesBrief* file_level,
 
     FdWithKeyRange& f = file_level->files[i];
     f.fd = files[i]->fd;
+    f.file_metadata = files[i];
     f.smallest_key = Slice(mem, smallest_size);
     f.largest_key = Slice(mem + smallest_size, largest_size);
   }
@@ -437,12 +437,12 @@ namespace {
 class LevelFileNumIterator : public InternalIterator {
  public:
   LevelFileNumIterator(const InternalKeyComparator& icmp,
-                       const LevelFilesBrief* flevel)
+                       const LevelFilesBrief* flevel, bool should_sample)
       : icmp_(icmp),
         flevel_(flevel),
         index_(static_cast<uint32_t>(flevel->num_files)),
-        current_value_(0, 0, 0) {  // Marks as invalid
-  }
+        current_value_(0, 0, 0),  // Marks as invalid
+        should_sample_(should_sample) {}
   virtual bool Valid() const override { return index_ < flevel_->num_files; }
   virtual void Seek(const Slice& target) override {
     index_ = FindFile(icmp_, *flevel_, target);
@@ -477,6 +477,9 @@ class LevelFileNumIterator : public InternalIterator {
     assert(Valid());
 
     auto file_meta = flevel_->files[index_];
+    if (should_sample_) {
+      sample_file_read_inc(file_meta.file_metadata);
+    }
     current_value_ = file_meta.fd;
     return Slice(reinterpret_cast<const char*>(&current_value_),
                  sizeof(FileDescriptor));
@@ -488,6 +491,7 @@ class LevelFileNumIterator : public InternalIterator {
   const LevelFilesBrief* flevel_;
   uint32_t index_;
   mutable FileDescriptor current_value_;
+  bool should_sample_;
 };
 
 class LevelFileIteratorState : public TwoLevelIteratorState {
@@ -745,13 +749,11 @@ void Version::GetColumnFamilyMetaData(ColumnFamilyMetaData* cf_meta) {
         file_path = ioptions->db_paths.back().path;
       }
       files.emplace_back(
-          MakeTableFileName("", file->fd.GetNumber()),
-          file_path,
-          file->fd.GetFileSize(),
-          file->smallest_seqno,
-          file->largest_seqno,
+          MakeTableFileName("", file->fd.GetNumber()), file_path,
+          file->fd.GetFileSize(), file->smallest_seqno, file->largest_seqno,
           file->smallest.user_key().ToString(),
           file->largest.user_key().ToString(),
+          file->stats.num_reads_sampled.load(std::memory_order_relaxed),
           file->being_compacted);
       level_size += file->fd.GetFileSize();
     }
@@ -835,6 +837,8 @@ void Version::AddIteratorsForLevel(const ReadOptions& read_options,
     return;
   }
 
+  bool should_sample = should_sample_file_read();
+
   auto* arena = merge_iter_builder->GetArena();
   if (level == 0) {
     // Merge all level zero files together since they may overlap
@@ -845,6 +849,15 @@ void Version::AddIteratorsForLevel(const ReadOptions& read_options,
           range_del_agg, nullptr, cfd_->internal_stats()->GetFileReadHist(0),
           false, arena, false /* skip_filters */, 0 /* level */));
     }
+    if (should_sample) {
+      // Count one sample for each L0 file. This is done per iterator
+      // creation rather than per Seek(), while files in other levels are
+      // recorded per Seek(). If users execute one range query per iterator,
+      // there may be some discrepancy here.
+      for (FileMetaData* meta : storage_info_.LevelFiles(0)) {
+        sample_file_read_inc(meta);
+      }
+    }
   } else {
     // For levels > 0, we can use a concatenating iterator that sequentially
     // walks through the non-overlapping files in the level, opening them
@@ -859,7 +872,8 @@ void Version::AddIteratorsForLevel(const ReadOptions& read_options,
         IsFilterSkipped(level), level, range_del_agg);
     mem = arena->AllocateAligned(sizeof(LevelFileNumIterator));
     auto* first_level_iter = new (mem) LevelFileNumIterator(
-        cfd_->internal_comparator(), &storage_info_.LevelFilesBrief(level));
+        cfd_->internal_comparator(), &storage_info_.LevelFilesBrief(level),
+        should_sample_file_read());
     merge_iter_builder->AddIterator(
         NewTwoLevelIterator(state, first_level_iter, arena, false));
   }
@@ -984,6 +998,9 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k,
                    user_comparator(), internal_comparator());
   FdWithKeyRange* f = fp.GetNextFile();
   while (f != nullptr) {
+    if (get_context.sample()) {
+      sample_file_read_inc(f->file_metadata);
+    }
     *status = table_cache_->Get(
         read_options, *internal_comparator(), f->fd, ikey, &get_context,
         cfd_->internal_stats()->GetFileReadHist(fp.GetHitFileLevel()),
@@ -2201,13 +2218,16 @@ void Version::AddLiveFiles(std::vector<FileDescriptor>* live) {
   }
 }
 
-std::string Version::DebugString(bool hex) const {
+std::string Version::DebugString(bool hex, bool print_stats) const {
   std::string r;
   for (int level = 0; level < storage_info_.num_levels_; level++) {
     // E.g.,
     //   --- level 1 ---
     //   17:123['a' .. 'd']
     //   20:43['e' .. 'g']
+    //
+    // if print_stats=true:
+    //   17:123['a' .. 'd'](4096)
     r.append("--- level ");
     AppendNumberTo(&r, level);
     r.append(" --- version# ");
@@ -2223,7 +2243,14 @@ std::string Version::DebugString(bool hex, bool print_stats) const {
       r.append(files[i]->smallest.DebugString(hex));
       r.append(" .. ");
       r.append(files[i]->largest.DebugString(hex));
-      r.append("]\n");
+      r.append("]");
+      if (print_stats) {
+        r.append("(");
+        r.append(ToString(
+            files[i]->stats.num_reads_sampled.load(std::memory_order_relaxed)));
+        r.append(")");
+      }
+      r.append("\n");
     }
   }
   return r;
@@ -3533,7 +3560,8 @@ InternalIterator* VersionSet::MakeInputIterator(
             false /* skip_filters */, (int)which /* level */, range_del_agg),
         new LevelFileNumIterator(cfd->internal_comparator(),
-                                 c->input_levels(which)));
+                                 c->input_levels(which),
+                                 false /* don't sample compaction */));
       }
     }
   }
 
diff --git a/db/version_set.h b/db/version_set.h
index 7d94d692c1cacdf55a3e907980abc0431d889f0d..f1f0dcb6428dde1faaf66330f0e20ba047753abf 100644
--- a/db/version_set.h
+++ b/db/version_set.h
@@ -504,7 +504,7 @@ class Version {
   void AddLiveFiles(std::vector<FileDescriptor>* live);
 
   // Return a human readable string that describes this version's contents.
-  std::string DebugString(bool hex = false) const;
+  std::string DebugString(bool hex = false, bool print_stats = false) const;
 
   // Returns the version nuber of this version
   uint64_t GetVersionNumber() const { return version_number_; }
diff --git a/include/rocksdb/metadata.h b/include/rocksdb/metadata.h
index 5425146d7b9a611ed4d9f6dbf258e0151a93d8bc..0b83826330869e6cb1fced54f28fb6b09c7d3892 100644
--- a/include/rocksdb/metadata.h
+++ b/include/rocksdb/metadata.h
@@ -55,17 +55,21 @@ struct LevelMetaData {
 // The metadata that describes a SST file.
 struct SstFileMetaData {
   SstFileMetaData() {}
-  SstFileMetaData(const std::string& _file_name,
-                  const std::string& _path, uint64_t _size,
-                  SequenceNumber _smallest_seqno,
+  SstFileMetaData(const std::string& _file_name, const std::string& _path,
+                  uint64_t _size, SequenceNumber _smallest_seqno,
                   SequenceNumber _largest_seqno,
                   const std::string& _smallestkey,
-                  const std::string& _largestkey,
-                  bool _being_compacted) :
-    size(_size), name(_file_name),
-    db_path(_path), smallest_seqno(_smallest_seqno), largest_seqno(_largest_seqno),
-    smallestkey(_smallestkey), largestkey(_largestkey),
-    being_compacted(_being_compacted) {}
+                  const std::string& _largestkey, uint64_t _num_reads_sampled,
+                  bool _being_compacted)
+      : size(_size),
+        name(_file_name),
+        db_path(_path),
+        smallest_seqno(_smallest_seqno),
+        largest_seqno(_largest_seqno),
+        smallestkey(_smallestkey),
+        largestkey(_largestkey),
+        num_reads_sampled(_num_reads_sampled),
+        being_compacted(_being_compacted) {}
 
   // File size in bytes.
   uint64_t size;
@@ -78,6 +82,7 @@ struct SstFileMetaData {
   SequenceNumber largest_seqno;   // Largest sequence number in file.
   std::string smallestkey;        // Smallest user defined key in the file.
   std::string largestkey;         // Largest user defined key in the file.
+  uint64_t num_reads_sampled;     // How many times the file is read.
   bool being_compacted;  // true if the file is currently being compacted.
 };
 
@@ -86,7 +91,4 @@ struct LiveFileMetaData : SstFileMetaData {
   std::string column_family_name;  // Name of the column family
   int level;                       // Level at which this file resides.
 };
-
-
-
 }  // namespace rocksdb
diff --git a/monitoring/file_read_sample.h b/monitoring/file_read_sample.h
new file mode 100644
index 0000000000000000000000000000000000000000..2cefe5522684ce2144fba6370ff7d6be55342138
--- /dev/null
+++ b/monitoring/file_read_sample.h
@@ -0,0 +1,25 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+// This source code is also licensed under the GPLv2 license found in the
+// COPYING file in the root directory of this source tree.
+//
+#pragma once
+#include "db/version_edit.h"
+#include "util/random.h"
+
+namespace rocksdb {
+static const uint32_t kFileReadSampleRate = 1024;
+extern bool should_sample_file_read();
+extern void sample_file_read_inc(FileMetaData*);
+
+inline bool should_sample_file_read() {
+  return (Random::GetTLSInstance()->Next() % kFileReadSampleRate == 307);
+}
+
+inline void sample_file_read_inc(FileMetaData* meta) {
+  meta->stats.num_reads_sampled.fetch_add(kFileReadSampleRate,
+                                          std::memory_order_relaxed);
+}
+}
diff --git a/table/get_context.cc b/table/get_context.cc
index 9532f36542351f3dcec1932c29e62d8fbda11e3b..060bd62b96d64ed8de31fd40c882173cf408f84e 100644
--- a/table/get_context.cc
+++ b/table/get_context.cc
@@ -8,6 +8,7 @@
 #include "table/get_context.h"
 #include "db/merge_helper.h"
 #include "db/pinned_iterators_manager.h"
+#include "monitoring/file_read_sample.h"
 #include "monitoring/perf_context_imp.h"
 #include "monitoring/statistics.h"
 #include "rocksdb/env.h"
@@ -59,6 +60,7 @@ GetContext::GetContext(const Comparator* ucmp,
   if (seq_) {
     *seq_ = kMaxSequenceNumber;
   }
+  sample_ = should_sample_file_read();
 }
 
 // Called from TableCache::Get and Table::Get when file/block in which
diff --git a/table/get_context.h b/table/get_context.h
index ec33368aa96f87faefef05bb94c3b77bf7af94d9..d58753346e6a7d0e66c1d6dd2c8a6d9ee7f69fd0 100644
--- a/table/get_context.h
+++ b/table/get_context.h
@@ -62,6 +62,8 @@ class GetContext {
   // Do we need to fetch the SequenceNumber for this key?
   bool NeedToReadSequence() const { return (seq_ != nullptr); }
 
+  bool sample() const { return sample_; }
+
  private:
   const Comparator* ucmp_;
   const MergeOperator* merge_operator_;
@@ -82,6 +84,7 @@ class GetContext {
   std::string* replay_log_;
   // Used to temporarily pin blocks when state_ == GetContext::kMerge
   PinnedIteratorsManager* pinned_iters_mgr_;
+  bool sample_;
 };
 
 void replayGetContextLog(const Slice& replay_log, const Slice& user_key,