提交 0143abdb 编写于 作者: I Igor Canadi

Merge branch 'master' into columnfamilies

Conflicts:
	HISTORY.md
	db/db_impl.cc
	db/db_impl.h
	db/db_iter.cc
	db/db_test.cc
	db/dbformat.h
	db/memtable.cc
	db/memtable_list.cc
	db/memtable_list.h
	db/table_cache.cc
	db/table_cache.h
	db/version_edit.h
	db/version_set.cc
	db/version_set.h
	db/write_batch.cc
	db/write_batch_test.cc
	include/rocksdb/options.h
	util/options.cc
......@@ -6,8 +6,12 @@
executed in high priority thread pool.
## Unreleased (will be relased in 2.8)
* By default, checksums are verified on every read from database
## Unreleased
### Public API changes
* Removed arena.h from public header files.
* By default, checksums are verified on every read from database
## 2.7.0 (01/28/2014)
......
......@@ -6,11 +6,7 @@
INSTALL_PATH ?= $(CURDIR)
#-----------------------------------------------
# Uncomment exactly one of the lines labelled (A), (B), and (C) below
# to switch between compilation modes.
# OPT ?= -DNDEBUG # (A) Production use (optimized mode)
OPT += -O2 -fno-omit-frame-pointer -momit-leaf-frame-pointer
OPT += -fno-omit-frame-pointer -momit-leaf-frame-pointer
#-----------------------------------------------
# detect what platform we're building on
......@@ -57,6 +53,7 @@ TESTS = \
auto_roll_logger_test \
block_test \
bloom_test \
dynamic_bloom_test \
c_test \
cache_test \
coding_test \
......@@ -75,6 +72,7 @@ TESTS = \
merge_test \
redis_test \
reduce_levels_test \
plain_table_db_test \
simple_table_db_test \
skiplist_test \
stringappend_test \
......@@ -93,6 +91,7 @@ TOOLS = \
db_repl_stress \
blob_store_bench
PROGRAMS = db_bench signal_test $(TESTS) $(TOOLS)
BENCHMARKS = db_bench_sqlite3 db_bench_tree_db table_reader_bench
......@@ -143,11 +142,11 @@ all: $(LIBRARY) $(PROGRAMS)
# Will also generate shared libraries.
release:
$(MAKE) clean
OPT=-DNDEBUG $(MAKE) all -j32
OPT="-DNDEBUG -O2" $(MAKE) all -j32
coverage:
$(MAKE) clean
COVERAGEFLAGS="-fprofile-arcs -ftest-coverage" LDFLAGS+="-lgcov" $(MAKE) all check
COVERAGEFLAGS="-fprofile-arcs -ftest-coverage" LDFLAGS+="-lgcov" $(MAKE) all check -j32
(cd coverage; ./coverage_test.sh)
# Delete intermediate files
find . -type f -regex ".*\.\(\(gcda\)\|\(gcno\)\)" -exec rm {} \;
......@@ -248,6 +247,9 @@ table_properties_collector_test: db/table_properties_collector_test.o $(LIBOBJEC
bloom_test: util/bloom_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) util/bloom_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
dynamic_bloom_test: util/dynamic_bloom_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) util/dynamic_bloom_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
c_test: db/c_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) db/c_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
......@@ -278,11 +280,14 @@ crc32c_test: util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS)
db_test: db/db_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) db/db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
plain_table_db_test: db/plain_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) db/plain_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
simple_table_db_test: db/simple_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) db/simple_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
table_reader_bench: table/table_reader_bench.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) table/table_reader_bench.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
$(CXX) table/table_reader_bench.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) -pg
perf_context_test: db/perf_context_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) db/perf_context_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS)
......
......@@ -47,7 +47,6 @@ fi
# ln -s `git rev-parse --show-toplevel`/build_tools/format-diff.sh $PRE_COMMIT_SCRIPT_PATH
# fi
# fi
set -e
uncommitted_code=`git diff HEAD`
......@@ -55,7 +54,6 @@ uncommitted_code=`git diff HEAD`
# If there's no uncommitted changes, we assume user are doing post-commit
# format check, in which case we'll check the modified lines from latest commit.
# Otherwise, we'll check format of the uncommitted code only.
format_last_commit=0
if [ -z "$uncommitted_code" ]
then
# Check the format of last commit
......
......@@ -44,6 +44,11 @@ $GCOV --preserve-paths --relative-only --no-output $GCNO_FILES 2>/dev/null |
tee -a $RECENT_REPORT &&
echo -e "Generated coverage report for recently updated files: $RECENT_REPORT\n"
# Unless otherwise specified, we'll not generate html report by default
if [ -z "$HTML" ]; then
exit 0
fi
# Generate the html report. If we cannot find lcov in this machine, we'll simply
# skip this step.
echo "Generating the html coverage report..."
......
......@@ -9,16 +9,16 @@
#include "db/builder.h"
#include "db/filename.h"
#include "db/dbformat.h"
#include "db/filename.h"
#include "db/merge_helper.h"
#include "db/table_cache.h"
#include "db/version_edit.h"
#include "rocksdb/db.h"
#include "rocksdb/table.h"
#include "rocksdb/env.h"
#include "rocksdb/iterator.h"
#include "rocksdb/options.h"
#include "rocksdb/table.h"
#include "table/block_based_table_builder.h"
#include "util/stop_watch.h"
......@@ -26,20 +26,18 @@ namespace rocksdb {
class TableFactory;
TableBuilder* GetTableBuilder(const Options& options, WritableFile* file,
TableBuilder* NewTableBuilder(const Options& options,
const InternalKeyComparator& internal_comparator,
WritableFile* file,
CompressionType compression_type) {
return options.table_factory->GetTableBuilder(options, file,
compression_type);
return options.table_factory->NewTableBuilder(options, internal_comparator,
file, compression_type);
}
Status BuildTable(const std::string& dbname,
Env* env,
const Options& options,
const EnvOptions& soptions,
TableCache* table_cache,
Iterator* iter,
FileMetaData* meta,
const Comparator* user_comparator,
Status BuildTable(const std::string& dbname, Env* env, const Options& options,
const EnvOptions& soptions, TableCache* table_cache,
Iterator* iter, FileMetaData* meta,
const InternalKeyComparator& internal_comparator,
const SequenceNumber newest_snapshot,
const SequenceNumber earliest_seqno_in_memtable,
const CompressionType compression) {
......@@ -64,8 +62,8 @@ Status BuildTable(const std::string& dbname,
return s;
}
TableBuilder* builder = GetTableBuilder(options, file.get(),
compression);
TableBuilder* builder =
NewTableBuilder(options, internal_comparator, file.get(), compression);
// the first key is the smallest key
Slice key = iter->key();
......@@ -73,8 +71,8 @@ Status BuildTable(const std::string& dbname,
meta->smallest_seqno = GetInternalKeySeqno(key);
meta->largest_seqno = meta->smallest_seqno;
MergeHelper merge(user_comparator, options.merge_operator.get(),
options.info_log.get(),
MergeHelper merge(internal_comparator.user_comparator(),
options.merge_operator.get(), options.info_log.get(),
true /* internal key corruption is not ok */);
if (purge) {
......@@ -103,8 +101,8 @@ Status BuildTable(const std::string& dbname,
// If the key is the same as the previous key (and it is not the
// first key), then we skip it, since it is an older version.
// Otherwise we output the key and mark it as the "new" previous key.
if (!is_first_key && !user_comparator->Compare(prev_ikey.user_key,
this_ikey.user_key)) {
if (!is_first_key && !internal_comparator.user_comparator()->Compare(
prev_ikey.user_key, this_ikey.user_key)) {
// seqno within the same key are in decreasing order
assert(this_ikey.sequence < prev_ikey.sequence);
} else {
......@@ -202,10 +200,8 @@ Status BuildTable(const std::string& dbname,
if (s.ok()) {
// Verify that the table is usable
Iterator* it = table_cache->NewIterator(ReadOptions(),
soptions,
meta->number,
meta->file_size);
Iterator* it = table_cache->NewIterator(ReadOptions(), soptions,
internal_comparator, *meta);
s = it->status();
delete it;
}
......
......@@ -24,23 +24,20 @@ class VersionEdit;
class TableBuilder;
class WritableFile;
extern TableBuilder* GetTableBuilder(const Options& options, WritableFile* file,
CompressionType compression_type);
extern TableBuilder* NewTableBuilder(
const Options& options, const InternalKeyComparator& internal_comparator,
WritableFile* file, CompressionType compression_type);
// Build a Table file from the contents of *iter. The generated file
// will be named according to meta->number. On success, the rest of
// *meta will be filled with metadata about the generated table.
// If no data is present in *iter, meta->file_size will be set to
// zero, and no Table file will be produced.
extern Status BuildTable(const std::string& dbname,
Env* env,
const Options& options,
const EnvOptions& soptions,
TableCache* table_cache,
Iterator* iter,
extern Status BuildTable(const std::string& dbname, Env* env,
const Options& options, const EnvOptions& soptions,
TableCache* table_cache, Iterator* iter,
FileMetaData* meta,
const Comparator* user_comparator,
const InternalKeyComparator& internal_comparator,
const SequenceNumber newest_snapshot,
const SequenceNumber earliest_seqno_in_memtable,
const CompressionType compression);
......
......@@ -17,6 +17,7 @@
#include "db/internal_stats.h"
#include "db/compaction_picker.h"
#include "db/table_properties_collector.h"
#include "util/autovector.h"
#include "util/hash_skiplist_rep.h"
namespace rocksdb {
......@@ -184,7 +185,7 @@ ColumnFamilyData::~ColumnFamilyData() {
if (mem_ != nullptr) {
delete mem_->Unref();
}
std::vector<MemTable*> to_delete;
autovector<MemTable*> to_delete;
imm_.current()->Unref(&to_delete);
for (MemTable* m : to_delete) {
delete m;
......
......@@ -16,7 +16,7 @@
#include "rocksdb/options.h"
#include "rocksdb/env.h"
#include "db/memtablelist.h"
#include "db/memtable_list.h"
#include "db/write_batch_internal.h"
#include "db/table_cache.h"
......@@ -40,7 +40,7 @@ struct SuperVersion {
// We need to_delete because during Cleanup(), imm->Unref() returns
// all memtables that we need to free through this vector. We then
// delete all those memtables outside of mutex, during destruction
std::vector<MemTable*> to_delete;
autovector<MemTable*> to_delete;
// should be called outside the mutex
SuperVersion();
......
......@@ -24,6 +24,7 @@
#include "rocksdb/slice.h"
#include "rocksdb/slice_transform.h"
#include "rocksdb/statistics.h"
#include "rocksdb/perf_context.h"
#include "port/port.h"
#include "util/bit_set.h"
#include "util/crc32c.h"
......@@ -389,6 +390,8 @@ DEFINE_int64(stats_interval, 0, "Stats are reported every N operations when "
DEFINE_int32(stats_per_interval, 0, "Reports additional stats per interval when"
" this is greater than 0.");
DEFINE_int32(perf_level, 0, "Level of perf collection");
static bool ValidateRateLimit(const char* flagname, double value) {
static constexpr double EPSILON = 1e-10;
if ( value < -EPSILON ) {
......@@ -728,6 +731,7 @@ struct SharedState {
port::Mutex mu;
port::CondVar cv;
int total;
int perf_level;
// Each thread goes through the following states:
// (1) initializing
......@@ -739,7 +743,7 @@ struct SharedState {
long num_done;
bool start;
SharedState() : cv(&mu) { }
SharedState() : cv(&mu), perf_level(FLAGS_perf_level) { }
};
// Per-thread state for concurrent executions of the same benchmark.
......@@ -847,6 +851,7 @@ class Benchmark {
fprintf(stdout, "Memtablerep: vector\n");
break;
}
fprintf(stdout, "Perf Level: %d\n", FLAGS_perf_level);
PrintWarnings();
fprintf(stdout, "------------------------------------------------\n");
......@@ -1202,6 +1207,7 @@ class Benchmark {
}
}
SetPerfLevel(static_cast<PerfLevel> (shared->perf_level));
thread->stats.Start(thread->tid);
(arg->bm->*(arg->method))(thread);
thread->stats.Stop();
......
此差异已折叠。
......@@ -7,24 +7,26 @@
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once
#include <atomic>
#include <deque>
#include <set>
#include <utility>
#include <vector>
#include "db/dbformat.h"
#include "db/log_writer.h"
#include "db/snapshot.h"
#include "db/column_family.h"
#include "db/version_edit.h"
#include "memtable_list.h"
#include "port/port.h"
#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "rocksdb/memtablerep.h"
#include "rocksdb/transaction_log.h"
#include "port/port.h"
#include "util/stats_logger.h"
#include "memtablelist.h"
#include "util/autovector.h"
#include "util/stats_logger.h"
#include "db/internal_stats.h"
namespace rocksdb {
......@@ -178,7 +180,7 @@ class DBImpl : public DB {
// needed for CleanupIteratorState
struct DeletionState {
inline bool HaveSomethingToDelete() const {
return all_files.size() ||
return candidate_files.size() ||
sst_delete_files.size() ||
log_delete_files.size();
}
......@@ -186,7 +188,7 @@ class DBImpl : public DB {
// a list of all files that we'll consider deleting
// (every once in a while this is filled up with all files
// in the DB directory)
std::vector<std::string> all_files;
std::vector<std::string> candidate_files;
// the list of all live sst files that cannot be deleted
std::vector<uint64_t> sst_live;
......@@ -198,7 +200,7 @@ class DBImpl : public DB {
std::vector<uint64_t> log_delete_files;
// a list of memtables to be free
std::vector<MemTable *> memtables_to_free;
autovector<MemTable*> memtables_to_free;
SuperVersion* superversion_to_free; // if nullptr nothing to free
......@@ -208,12 +210,10 @@ class DBImpl : public DB {
// that corresponds to the set of files in 'live'.
uint64_t manifest_file_number, log_number, prev_log_number;
explicit DeletionState(const int num_memtables = 0,
bool create_superversion = false) {
explicit DeletionState(bool create_superversion = false) {
manifest_file_number = 0;
log_number = 0;
prev_log_number = 0;
memtables_to_free.reserve(num_memtables);
superversion_to_free = nullptr;
new_superversion = create_superversion ? new SuperVersion() : nullptr;
}
......@@ -232,7 +232,7 @@ class DBImpl : public DB {
};
// Returns the list of live files in 'live' and the list
// of all files in the filesystem in 'all_files'.
// of all files in the filesystem in 'candidate_files'.
// If force == false and the last call was less than
// options_.delete_obsolete_files_period_micros microseconds ago,
// it will not fill up the deletion_state
......@@ -291,7 +291,7 @@ class DBImpl : public DB {
// concurrent flush memtables to storage.
Status WriteLevel0TableForRecovery(ColumnFamilyData* cfd, MemTable* mem,
VersionEdit* edit);
Status WriteLevel0Table(ColumnFamilyData* cfd, std::vector<MemTable*>& mems,
Status WriteLevel0Table(ColumnFamilyData* cfd, autovector<MemTable*>& mems,
VersionEdit* edit, uint64_t* filenumber);
uint64_t SlowdownAmount(int n, double bottom, double top);
......
......@@ -102,7 +102,8 @@ class DBIter: public Iterator {
virtual void SeekToLast();
private:
void FindNextUserEntry(bool skipping);
inline void FindNextUserEntry(bool skipping);
void FindNextUserEntryInternal(bool skipping);
void FindPrevUserEntry();
bool ParseKey(ParsedInternalKey* key);
void MergeValuesNewToOld();
......@@ -191,7 +192,15 @@ void DBIter::Next() {
//
// NOTE: In between, saved_key_ can point to a user key that has
// a delete marker
void DBIter::FindNextUserEntry(bool skipping) {
inline void DBIter::FindNextUserEntry(bool skipping) {
StopWatchNano timer(env_, false);
StartPerfTimer(&timer);
FindNextUserEntryInternal(skipping);
BumpPerfTime(&perf_context.find_next_user_entry_time, &timer);
}
// Actual implementation of DBIter::FindNextUserEntry()
void DBIter::FindNextUserEntryInternal(bool skipping) {
// Loop until we hit an acceptable entry to yield
assert(iter_->Valid());
assert(direction_ == kForward);
......@@ -226,10 +235,7 @@ void DBIter::FindNextUserEntry(bool skipping) {
valid_ = true;
MergeValuesNewToOld(); // Go to a different state machine
return;
case kTypeColumnFamilyDeletion:
case kTypeColumnFamilyValue:
case kTypeColumnFamilyMerge:
case kTypeLogData:
default:
assert(false);
break;
}
......@@ -429,13 +435,16 @@ void DBIter::FindPrevUserEntry() {
}
void DBIter::Seek(const Slice& target) {
direction_ = kForward;
ClearSavedValue();
saved_key_.clear();
AppendInternalKey(
&saved_key_, ParsedInternalKey(target, sequence_, kValueTypeForSeek));
StopWatchNano internal_seek_timer(env_, false);
StartPerfTimer(&internal_seek_timer);
iter_->Seek(saved_key_);
BumpPerfTime(&perf_context.seek_internal_seek_time, &internal_seek_timer);
if (iter_->Valid()) {
direction_ = kForward;
ClearSavedValue();
FindNextUserEntry(false /*not skipping */);
} else {
valid_ = false;
......@@ -445,7 +454,10 @@ void DBIter::Seek(const Slice& target) {
void DBIter::SeekToFirst() {
direction_ = kForward;
ClearSavedValue();
StopWatchNano internal_seek_timer(env_, false);
StartPerfTimer(&internal_seek_timer);
iter_->SeekToFirst();
BumpPerfTime(&perf_context.seek_internal_seek_time, &internal_seek_timer);
if (iter_->Valid()) {
FindNextUserEntry(false /* not skipping */);
} else {
......@@ -464,7 +476,10 @@ void DBIter::SeekToLast() {
direction_ = kReverse;
ClearSavedValue();
StopWatchNano internal_seek_timer(env_, false);
StartPerfTimer(&internal_seek_timer);
iter_->SeekToLast();
BumpPerfTime(&perf_context.seek_internal_seek_time, &internal_seek_timer);
FindPrevUserEntry();
}
......
此差异已折叠。
......@@ -6,9 +6,9 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/dbformat.h"
#include <stdio.h>
#include "db/dbformat.h"
#include "port/port.h"
#include "util/coding.h"
#include "util/perf_context_imp.h"
......@@ -72,6 +72,28 @@ int InternalKeyComparator::Compare(const Slice& akey, const Slice& bkey) const {
return r;
}
int InternalKeyComparator::Compare(const ParsedInternalKey& a,
const ParsedInternalKey& b) const {
// Order by:
// increasing user key (according to user-supplied comparator)
// decreasing sequence number
// decreasing type (though sequence# should be enough to disambiguate)
int r = user_comparator_->Compare(a.user_key, b.user_key);
BumpPerfCount(&perf_context.user_key_comparison_count);
if (r == 0) {
if (a.sequence > b.sequence) {
r = -1;
} else if (a.sequence < b.sequence) {
r = +1;
} else if (a.type > b.type) {
r = -1;
} else if (a.type < b.type) {
r = +1;
}
}
return r;
}
void InternalKeyComparator::FindShortestSeparator(
std::string* start,
const Slice& limit) const {
......
......@@ -25,7 +25,9 @@ class InternalKey;
// Value types encoded as the last component of internal keys.
// DO NOT CHANGE THESE ENUM VALUES: they are embedded in the on-disk
// data structures.
enum ValueType {
// The highest bit of the value type needs to be reserved to SST tables
// for them to do more flexible encoding.
enum ValueType : unsigned char {
kTypeDeletion = 0x0,
kTypeValue = 0x1,
kTypeMerge = 0x2,
......@@ -33,7 +35,9 @@ enum ValueType {
kTypeColumnFamilyDeletion = 0x4,
kTypeColumnFamilyValue = 0x5,
kTypeColumnFamilyMerge = 0x6,
kMaxValue = 0x7F
};
// kValueTypeForSeek defines the ValueType that should be passed when
// constructing a ParsedInternalKey object for seeking to a particular
// sequence number (since we sort sequence numbers in decreasing order
......@@ -99,6 +103,7 @@ class InternalKeyComparator : public Comparator {
name_("rocksdb.InternalKeyComparator:" +
std::string(user_comparator_->Name())) {
}
virtual ~InternalKeyComparator() {}
virtual const char* Name() const;
virtual int Compare(const Slice& a, const Slice& b) const;
......@@ -110,6 +115,7 @@ class InternalKeyComparator : public Comparator {
const Comparator* user_comparator() const { return user_comparator_; }
int Compare(const InternalKey& a, const InternalKey& b) const;
int Compare(const ParsedInternalKey& a, const ParsedInternalKey& b) const;
};
// Filter policy wrapper that converts from internal keys to user keys
......@@ -166,6 +172,7 @@ inline bool ParseInternalKey(const Slice& internal_key,
unsigned char c = num & 0xff;
result->sequence = num >> 8;
result->type = static_cast<ValueType>(c);
assert(result->type <= ValueType::kMaxValue);
result->user_key = Slice(internal_key.data(), n - 8);
return (c <= static_cast<unsigned char>(kValueTypeForSeek));
}
......
......@@ -17,7 +17,6 @@ namespace log {
enum RecordType {
// Zero is reserved for preallocated files
kZeroType = 0,
kFullType = 1,
// For fragments
......
......@@ -17,10 +17,14 @@
#include "rocksdb/env.h"
#include "rocksdb/iterator.h"
#include "rocksdb/merge_operator.h"
#include "rocksdb/slice_transform.h"
#include "util/arena.h"
#include "util/coding.h"
#include "util/mutexlock.h"
#include "util/murmurhash.h"
#include "util/mutexlock.h"
#include "util/perf_context_imp.h"
#include "util/statistics.h"
#include "util/stop_watch.h"
namespace std {
template <>
......@@ -37,9 +41,8 @@ MemTable::MemTable(const InternalKeyComparator& cmp,
const ColumnFamilyOptions& options)
: comparator_(cmp),
refs_(0),
arena_impl_(options.arena_block_size),
table_(options.memtable_factory->CreateMemTableRep(comparator_,
&arena_impl_)),
arena_(options.arena_block_size),
table_(options.memtable_factory->CreateMemTableRep(comparator_, &arena_)),
flush_in_progress_(false),
flush_completed_(false),
file_number_(0),
......@@ -47,23 +50,36 @@ MemTable::MemTable(const InternalKeyComparator& cmp,
mem_next_logfile_number_(0),
mem_logfile_number_(0),
locks_(options.inplace_update_support ? options.inplace_update_num_locks
: 0) {}
: 0),
prefix_extractor_(options.prefix_extractor) {
if (prefix_extractor_ && options.memtable_prefix_bloom_bits > 0) {
prefix_bloom_.reset(new DynamicBloom(options.memtable_prefix_bloom_bits,
options.memtable_prefix_bloom_probes));
}
}
MemTable::~MemTable() {
assert(refs_ == 0);
}
size_t MemTable::ApproximateMemoryUsage() {
return arena_impl_.ApproximateMemoryUsage() +
table_->ApproximateMemoryUsage();
return arena_.ApproximateMemoryUsage() + table_->ApproximateMemoryUsage();
}
int MemTable::KeyComparator::operator()(const char* aptr, const char* bptr)
int MemTable::KeyComparator::operator()(const char* prefix_len_key1,
const char* prefix_len_key2) const {
// Internal keys are encoded as length-prefixed strings.
Slice k1 = GetLengthPrefixedSlice(prefix_len_key1);
Slice k2 = GetLengthPrefixedSlice(prefix_len_key2);
return comparator.Compare(k1, k2);
}
int MemTable::KeyComparator::operator()(const char* prefix_len_key,
const Slice& key)
const {
// Internal keys are encoded as length-prefixed strings.
Slice a = GetLengthPrefixedSlice(aptr);
Slice b = GetLengthPrefixedSlice(bptr);
return comparator.Compare(a, b);
Slice a = GetLengthPrefixedSlice(prefix_len_key);
return comparator.Compare(a, key);
}
Slice MemTableRep::UserKey(const char* key) const {
......@@ -74,7 +90,7 @@ Slice MemTableRep::UserKey(const char* key) const {
// Encode a suitable internal key target for "target" and return it.
// Uses *scratch as scratch space, and the returned pointer will point
// into this scratch space.
static const char* EncodeKey(std::string* scratch, const Slice& target) {
const char* EncodeKey(std::string* scratch, const Slice& target) {
scratch->clear();
PutVarint32(scratch, target.size());
scratch->append(target.data(), target.size());
......@@ -83,27 +99,53 @@ static const char* EncodeKey(std::string* scratch, const Slice& target) {
class MemTableIterator: public Iterator {
public:
MemTableIterator(MemTableRep* table, const ReadOptions& options)
: iter_() {
MemTableIterator(const MemTable& mem, const ReadOptions& options)
: mem_(mem), iter_(), dynamic_prefix_seek_(false), valid_(false) {
if (options.prefix) {
iter_.reset(table->GetPrefixIterator(*options.prefix));
iter_.reset(mem_.table_->GetPrefixIterator(*options.prefix));
} else if (options.prefix_seek) {
iter_.reset(table->GetDynamicPrefixIterator());
dynamic_prefix_seek_ = true;
iter_.reset(mem_.table_->GetDynamicPrefixIterator());
} else {
iter_.reset(table->GetIterator());
iter_.reset(mem_.table_->GetIterator());
}
}
virtual bool Valid() const { return iter_->Valid(); }
virtual void Seek(const Slice& k) { iter_->Seek(EncodeKey(&tmp_, k)); }
virtual void SeekToFirst() { iter_->SeekToFirst(); }
virtual void SeekToLast() { iter_->SeekToLast(); }
virtual void Next() { iter_->Next(); }
virtual void Prev() { iter_->Prev(); }
virtual bool Valid() const { return valid_; }
virtual void Seek(const Slice& k) {
if (dynamic_prefix_seek_ && mem_.prefix_bloom_ &&
!mem_.prefix_bloom_->MayContain(
mem_.prefix_extractor_->Transform(ExtractUserKey(k)))) {
valid_ = false;
return;
}
iter_->Seek(k, nullptr);
valid_ = iter_->Valid();
}
virtual void SeekToFirst() {
iter_->SeekToFirst();
valid_ = iter_->Valid();
}
virtual void SeekToLast() {
iter_->SeekToLast();
valid_ = iter_->Valid();
}
virtual void Next() {
assert(Valid());
iter_->Next();
valid_ = iter_->Valid();
}
virtual void Prev() {
assert(Valid());
iter_->Prev();
valid_ = iter_->Valid();
}
virtual Slice key() const {
assert(Valid());
return GetLengthPrefixedSlice(iter_->key());
}
virtual Slice value() const {
assert(Valid());
Slice key_slice = GetLengthPrefixedSlice(iter_->key());
return GetLengthPrefixedSlice(key_slice.data() + key_slice.size());
}
......@@ -111,8 +153,10 @@ class MemTableIterator: public Iterator {
virtual Status status() const { return Status::OK(); }
private:
std::unique_ptr<MemTableRep::Iterator> iter_;
std::string tmp_; // For passing to EncodeKey
const MemTable& mem_;
std::shared_ptr<MemTableRep::Iterator> iter_;
bool dynamic_prefix_seek_;
bool valid_;
// No copying allowed
MemTableIterator(const MemTableIterator&);
......@@ -120,7 +164,7 @@ class MemTableIterator: public Iterator {
};
Iterator* MemTable::NewIterator(const ReadOptions& options) {
return new MemTableIterator(table_.get(), options);
return new MemTableIterator(*this, options);
}
port::RWMutex* MemTable::GetLock(const Slice& key) {
......@@ -128,7 +172,7 @@ port::RWMutex* MemTable::GetLock(const Slice& key) {
}
void MemTable::Add(SequenceNumber s, ValueType type,
const Slice& key,
const Slice& key, /* user key */
const Slice& value) {
// Format of an entry is concatenation of:
// key_size : varint32 of internal_key.size()
......@@ -141,7 +185,7 @@ void MemTable::Add(SequenceNumber s, ValueType type,
const size_t encoded_len =
VarintLength(internal_key_size) + internal_key_size +
VarintLength(val_size) + val_size;
char* buf = arena_impl_.Allocate(encoded_len);
char* buf = arena_.Allocate(encoded_len);
char* p = EncodeVarint32(buf, internal_key_size);
memcpy(p, key.data(), key_size);
p += key_size;
......@@ -152,6 +196,11 @@ void MemTable::Add(SequenceNumber s, ValueType type,
assert((unsigned)(p + val_size - buf) == (unsigned)encoded_len);
table_->Insert(buf);
if (prefix_bloom_) {
assert(prefix_extractor_);
prefix_bloom_->Add(prefix_extractor_->Transform(key));
}
// The first sequence number inserted into the memtable
assert(first_seqno_ == 0 || s > first_seqno_);
if (first_seqno_ == 0) {
......@@ -161,17 +210,28 @@ void MemTable::Add(SequenceNumber s, ValueType type,
bool MemTable::Get(const LookupKey& key, std::string* value, Status* s,
MergeContext& merge_context, const Options& options) {
Slice memkey = key.memtable_key();
std::unique_ptr<MemTableRep::Iterator> iter(
table_->GetIterator(key.user_key()));
iter->Seek(memkey.data());
StopWatchNano memtable_get_timer(options.env, false);
StartPerfTimer(&memtable_get_timer);
Slice mem_key = key.memtable_key();
Slice user_key = key.user_key();
std::unique_ptr<MemTableRep::Iterator> iter;
if (prefix_bloom_ &&
!prefix_bloom_->MayContain(prefix_extractor_->Transform(user_key))) {
// iter is null if prefix bloom says the key does not exist
} else {
iter.reset(table_->GetIterator(user_key));
iter->Seek(key.internal_key(), mem_key.data());
}
bool merge_in_progress = s->IsMergeInProgress();
auto merge_operator = options.merge_operator.get();
auto logger = options.info_log;
std::string merge_result;
for (; iter->Valid(); iter->Next()) {
bool found_final_value = false;
for (; !found_final_value && iter && iter->Valid(); iter->Next()) {
// entry format is:
// klength varint32
// userkey char[klength-8]
......@@ -182,7 +242,7 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s,
// sequence number since the Seek() call above should have skipped
// all entries with overly large sequence numbers.
const char* entry = iter->key();
uint32_t key_length;
uint32_t key_length = 0;
const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
if (comparator_.comparator.user_comparator()->Compare(
Slice(key_ptr, key_length - 8), key.user_key()) == 0) {
......@@ -209,7 +269,8 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s,
if (options.inplace_update_support) {
GetLock(key.user_key())->Unlock();
}
return true;
found_final_value = true;
break;
}
case kTypeDeletion: {
if (merge_in_progress) {
......@@ -224,7 +285,8 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s,
} else {
*s = Status::NotFound();
}
return true;
found_final_value = true;
break;
}
case kTypeMerge: {
Slice v = GetLengthPrefixedSlice(key_ptr + key_length);
......@@ -244,10 +306,7 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s,
}
break;
}
case kTypeColumnFamilyDeletion:
case kTypeColumnFamilyValue:
case kTypeColumnFamilyMerge:
case kTypeLogData:
default:
assert(false);
break;
}
......@@ -259,25 +318,27 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s,
// No change to value, since we have not yet found a Put/Delete
if (merge_in_progress) {
if (!found_final_value && merge_in_progress) {
*s = Status::MergeInProgress("");
}
return false;
BumpPerfTime(&perf_context.get_from_memtable_time, &memtable_get_timer);
BumpPerfCount(&perf_context.get_from_memtable_count);
return found_final_value;
}
bool MemTable::Update(SequenceNumber seq, ValueType type,
void MemTable::Update(SequenceNumber seq,
const Slice& key,
const Slice& value) {
LookupKey lkey(key, seq);
Slice memkey = lkey.memtable_key();
Slice mem_key = lkey.memtable_key();
std::unique_ptr<MemTableRep::Iterator> iter(
table_->GetIterator(lkey.user_key()));
iter->Seek(memkey.data());
table_->GetIterator(lkey.user_key()));
iter->Seek(lkey.internal_key(), mem_key.data());
if (iter->Valid()) {
// entry format is:
// klength varint32
// key_length varint32
// userkey char[klength-8]
// tag uint64
// vlength varint32
......@@ -286,7 +347,7 @@ bool MemTable::Update(SequenceNumber seq, ValueType type,
// sequence number since the Seek() call above should have skipped
// all entries with overly large sequence numbers.
const char* entry = iter->key();
uint32_t key_length;
uint32_t key_length = 0;
const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
if (comparator_.comparator.user_comparator()->Compare(
Slice(key_ptr, key_length - 8), lkey.user_key()) == 0) {
......@@ -294,32 +355,105 @@ bool MemTable::Update(SequenceNumber seq, ValueType type,
const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
switch (static_cast<ValueType>(tag & 0xff)) {
case kTypeValue: {
uint32_t vlength;
GetVarint32Ptr(key_ptr + key_length,
key_ptr + key_length+5, &vlength);
// Update value, if newValue size <= curValue size
if (value.size() <= vlength) {
Slice prev_value = GetLengthPrefixedSlice(key_ptr + key_length);
uint32_t prev_size = prev_value.size();
uint32_t new_size = value.size();
// Update value, if new value size <= previous value size
if (new_size <= prev_size ) {
char* p = EncodeVarint32(const_cast<char*>(key_ptr) + key_length,
value.size());
new_size);
WriteLock wl(GetLock(lkey.user_key()));
memcpy(p, value.data(), value.size());
assert((unsigned)((p + value.size()) - entry) ==
(unsigned)(VarintLength(key_length) + key_length +
VarintLength(value.size()) + value.size()));
return true;
return;
}
}
default:
// If the latest value is kTypeDeletion, kTypeMerge or kTypeLogData
// then we probably don't have enough space to update in-place
// Maybe do something later
// Return false, and do normal Add()
return false;
// we don't have enough space for update inplace
Add(seq, kTypeValue, key, value);
return;
}
}
}
// Key doesn't exist
// key doesn't exist
Add(seq, kTypeValue, key, value);
}
bool MemTable::UpdateCallback(SequenceNumber seq,
const Slice& key,
const Slice& delta,
const Options& options) {
LookupKey lkey(key, seq);
Slice memkey = lkey.memtable_key();
std::shared_ptr<MemTableRep::Iterator> iter(
table_->GetIterator(lkey.user_key()));
iter->Seek(lkey.internal_key(), memkey.data());
if (iter->Valid()) {
// entry format is:
// key_length varint32
// userkey char[klength-8]
// tag uint64
// vlength varint32
// value char[vlength]
// Check that it belongs to same user key. We do not check the
// sequence number since the Seek() call above should have skipped
// all entries with overly large sequence numbers.
const char* entry = iter->key();
uint32_t key_length = 0;
const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
if (comparator_.comparator.user_comparator()->Compare(
Slice(key_ptr, key_length - 8), lkey.user_key()) == 0) {
// Correct user key
const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
switch (static_cast<ValueType>(tag & 0xff)) {
case kTypeValue: {
Slice prev_value = GetLengthPrefixedSlice(key_ptr + key_length);
uint32_t prev_size = prev_value.size();
char* prev_buffer = const_cast<char*>(prev_value.data());
uint32_t new_prev_size = prev_size;
std::string str_value;
WriteLock wl(GetLock(lkey.user_key()));
auto status = options.inplace_callback(prev_buffer, &new_prev_size,
delta, &str_value);
if (status == UpdateStatus::UPDATED_INPLACE) {
// Value already updated by callback.
assert(new_prev_size <= prev_size);
if (new_prev_size < prev_size) {
// overwrite the new prev_size
char* p = EncodeVarint32(const_cast<char*>(key_ptr) + key_length,
new_prev_size);
if (VarintLength(new_prev_size) < VarintLength(prev_size)) {
// shift the value buffer as well.
memcpy(p, prev_buffer, new_prev_size);
}
}
RecordTick(options.statistics.get(), NUMBER_KEYS_UPDATED);
return true;
} else if (status == UpdateStatus::UPDATED) {
Add(seq, kTypeValue, key, Slice(str_value));
RecordTick(options.statistics.get(), NUMBER_KEYS_WRITTEN);
return true;
} else if (status == UpdateStatus::UPDATE_FAILED) {
// No action required. Return.
return true;
}
}
default:
break;
}
}
}
// If the latest value is not kTypeValue
// or key doesn't exist
return false;
}
......@@ -331,13 +465,13 @@ size_t MemTable::CountSuccessiveMergeEntries(const LookupKey& key) {
// The iterator only needs to be ordered within the same user key.
std::unique_ptr<MemTableRep::Iterator> iter(
table_->GetIterator(key.user_key()));
iter->Seek(memkey.data());
iter->Seek(key.internal_key(), memkey.data());
size_t num_successive_merges = 0;
for (; iter->Valid(); iter->Next()) {
const char* entry = iter->key();
uint32_t key_length;
uint32_t key_length = 0;
const char* iter_key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
if (!comparator_.comparator.user_comparator()->Compare(
Slice(iter_key_ptr, key_length - 8), key.user_key()) == 0) {
......
......@@ -16,7 +16,8 @@
#include "db/version_edit.h"
#include "rocksdb/db.h"
#include "rocksdb/memtablerep.h"
#include "util/arena_impl.h"
#include "util/arena.h"
#include "util/dynamic_bloom.h"
namespace rocksdb {
......@@ -29,7 +30,10 @@ class MemTable {
struct KeyComparator : public MemTableRep::KeyComparator {
const InternalKeyComparator comparator;
explicit KeyComparator(const InternalKeyComparator& c) : comparator(c) { }
virtual int operator()(const char* a, const char* b) const;
virtual int operator()(const char* prefix_len_key1,
const char* prefix_len_key2) const;
virtual int operator()(const char* prefix_len_key,
const Slice& key) const override;
};
// MemTables are reference counted. The initial reference count
......@@ -94,16 +98,31 @@ class MemTable {
bool Get(const LookupKey& key, std::string* value, Status* s,
MergeContext& merge_context, const Options& options);
// Update the value and return status ok,
// if key exists in current memtable
// if new sizeof(new_value) <= sizeof(old_value) &&
// old_value for that key is a put i.e. kTypeValue
// else return false, and status - NotUpdatable()
// else return false, and status - NotFound()
bool Update(SequenceNumber seq, ValueType type,
// Attempts to update the new_value inplace, else does normal Add
// Pseudocode
// if key exists in current memtable && prev_value is of type kTypeValue
// if new sizeof(new_value) <= sizeof(prev_value)
// update inplace
// else add(key, new_value)
// else add(key, new_value)
void Update(SequenceNumber seq,
const Slice& key,
const Slice& value);
// If prev_value for key exits, attempts to update it inplace.
// else returns false
// Pseudocode
// if key exists in current memtable && prev_value is of type kTypeValue
// new_value = delta(prev_value)
// if sizeof(new_value) <= sizeof(prev_value)
// update inplace
// else add(key, new_value)
// else return false
bool UpdateCallback(SequenceNumber seq,
const Slice& key,
const Slice& delta,
const Options& options);
// Returns the number of successive merge entries starting from the newest
// entry for the key up to the last non-merge entry or last entry for the
// key in the memtable.
......@@ -142,7 +161,7 @@ class MemTable {
KeyComparator comparator_;
int refs_;
ArenaImpl arena_impl_;
Arena arena_;
unique_ptr<MemTableRep> table_;
// These are used to manage memtable flushes to storage
......@@ -150,7 +169,7 @@ class MemTable {
bool flush_completed_; // finished the flush
uint64_t file_number_; // filled up after flush is complete
// The udpates to be applied to the transaction log when this
// The updates to be applied to the transaction log when this
// memtable is flushed to storage.
VersionEdit edit_;
......@@ -173,6 +192,11 @@ class MemTable {
// Get the lock associated for the key
port::RWMutex* GetLock(const Slice& key);
const SliceTransform* const prefix_extractor_;
std::unique_ptr<DynamicBloom> prefix_bloom_;
};
extern const char* EncodeKey(std::string* scratch, const Slice& target);
} // namespace rocksdb
......@@ -3,7 +3,7 @@
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
#include "db/memtablelist.h"
#include "db/memtable_list.h"
#include <string>
#include "rocksdb/db.h"
......@@ -31,7 +31,7 @@ MemTableListVersion::MemTableListVersion(MemTableListVersion* old) {
void MemTableListVersion::Ref() { ++refs_; }
void MemTableListVersion::Unref(std::vector<MemTable*>* to_delete) {
void MemTableListVersion::Unref(autovector<MemTable*>* to_delete) {
assert(refs_ >= 1);
--refs_;
if (refs_ == 0) {
......@@ -103,7 +103,7 @@ bool MemTableList::IsFlushPending() {
}
// Returns the memtables that need to be flushed.
void MemTableList::PickMemtablesToFlush(std::vector<MemTable*>* ret) {
void MemTableList::PickMemtablesToFlush(autovector<MemTable*>* ret) {
const auto& memlist = current_->memlist_;
for (auto it = memlist.rbegin(); it != memlist.rend(); ++it) {
MemTable* m = *it;
......@@ -113,18 +113,18 @@ void MemTableList::PickMemtablesToFlush(std::vector<MemTable*>* ret) {
if (num_flush_not_started_ == 0) {
imm_flush_needed.Release_Store(nullptr);
}
m->flush_in_progress_ = true; // flushing will start very soon
m->flush_in_progress_ = true; // flushing will start very soon
ret->push_back(m);
}
}
flush_requested_ = false; // start-flush request is complete
flush_requested_ = false; // start-flush request is complete
}
// Record a successful flush in the manifest file
Status MemTableList::InstallMemtableFlushResults(
ColumnFamilyData* cfd, const std::vector<MemTable*>& mems, VersionSet* vset,
ColumnFamilyData* cfd, const autovector<MemTable*>& mems, VersionSet* vset,
Status flushStatus, port::Mutex* mu, Logger* info_log, uint64_t file_number,
std::set<uint64_t>& pending_outputs, std::vector<MemTable*>* to_delete,
std::set<uint64_t>& pending_outputs, autovector<MemTable*>* to_delete,
Directory* db_directory) {
mu->AssertHeld();
......
......@@ -3,18 +3,25 @@
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
#pragma once
#include <string>
#include <list>
#include <vector>
#include <set>
#include <deque>
#include "rocksdb/db.h"
#include "rocksdb/options.h"
#include "rocksdb/iterator.h"
#include "db/dbformat.h"
#include "db/memtable.h"
#include "db/skiplist.h"
#include "db/memtable.h"
#include "rocksdb/db.h"
#include "rocksdb/iterator.h"
#include "rocksdb/options.h"
#include "util/autovector.h"
namespace rocksdb {
......@@ -30,7 +37,7 @@ class MemTableListVersion {
explicit MemTableListVersion(MemTableListVersion* old = nullptr);
void Ref();
void Unref(std::vector<MemTable*>* to_delete = nullptr);
void Unref(autovector<MemTable*>* to_delete = nullptr);
int size() const;
......@@ -89,14 +96,14 @@ class MemTableList {
// Returns the earliest memtables that needs to be flushed. The returned
// memtables are guaranteed to be in the ascending order of created time.
void PickMemtablesToFlush(std::vector<MemTable*>* mems);
void PickMemtablesToFlush(autovector<MemTable*>* mems);
// Commit a successful flush in the manifest file
Status InstallMemtableFlushResults(
ColumnFamilyData* cfd, const std::vector<MemTable*>& m, VersionSet* vset,
ColumnFamilyData* cfd, const autovector<MemTable*>& m, VersionSet* vset,
Status flushStatus, port::Mutex* mu, Logger* info_log,
uint64_t file_number, std::set<uint64_t>& pending_outputs,
std::vector<MemTable*>* to_delete, Directory* db_directory);
autovector<MemTable*>* to_delete, Directory* db_directory);
// New memtables are inserted at the front of the list.
// Takes ownership of the referenced held on *m by the caller of Add().
......
......@@ -174,6 +174,13 @@ void ProfileKeyComparison() {
HistogramImpl hist_put;
HistogramImpl hist_get;
HistogramImpl hist_get_snapshot;
HistogramImpl hist_get_memtable;
HistogramImpl hist_get_post_process;
HistogramImpl hist_num_memtable_checked;
HistogramImpl hist_write_pre_post;
HistogramImpl hist_write_wal_time;
HistogramImpl hist_write_memtable_time;
std::cout << "Inserting " << FLAGS_total_keys << " key/value pairs\n...\n";
......@@ -192,16 +199,37 @@ void ProfileKeyComparison() {
perf_context.Reset();
db->Put(write_options, key, value);
hist_write_pre_post.Add(perf_context.write_pre_and_post_process_time);
hist_write_wal_time.Add(perf_context.write_wal_time);
hist_write_memtable_time.Add(perf_context.write_memtable_time);
hist_put.Add(perf_context.user_key_comparison_count);
perf_context.Reset();
db->Get(read_options, key, &value);
hist_get_snapshot.Add(perf_context.get_snapshot_time);
hist_get_memtable.Add(perf_context.get_from_memtable_time);
hist_num_memtable_checked.Add(perf_context.get_from_memtable_count);
hist_get_post_process.Add(perf_context.get_post_process_time);
hist_get.Add(perf_context.user_key_comparison_count);
}
std::cout << "Put uesr key comparison: \n" << hist_put.ToString()
<< "Get uesr key comparison: \n" << hist_get.ToString();
std::cout << "Put(): Pre and Post Process Time: \n"
<< hist_write_pre_post.ToString()
<< " Writing WAL time: \n"
<< hist_write_wal_time.ToString() << "\n"
<< " Writing Mem Table time: \n"
<< hist_write_memtable_time.ToString() << "\n";
std::cout << "Get(): Time to get snapshot: \n"
<< hist_get_snapshot.ToString()
<< " Time to get value from memtables: \n"
<< hist_get_memtable.ToString() << "\n"
<< " Number of memtables checked: \n"
<< hist_num_memtable_checked.ToString() << "\n"
<< " Time to post process: \n"
<< hist_get_post_process.ToString() << "\n";
}
TEST(PerfContextTest, KeyComparisonCount) {
......@@ -259,8 +287,8 @@ TEST(PerfContextTest, SeekKeyComparison) {
db->Put(write_options, key, value);
auto put_time = timer.ElapsedNanos();
hist_put_time.Add(put_time);
hist_wal_time.Add(perf_context.wal_write_time);
hist_time_diff.Add(put_time - perf_context.wal_write_time);
hist_wal_time.Add(perf_context.write_wal_time);
hist_time_diff.Add(put_time - perf_context.write_wal_time);
}
std::cout << "Put time:\n" << hist_put_time.ToString()
......
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include <algorithm>
#include <set>
#include "db/db_impl.h"
#include "db/filename.h"
#include "db/version_set.h"
#include "db/write_batch_internal.h"
#include "rocksdb/cache.h"
#include "rocksdb/compaction_filter.h"
#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "rocksdb/filter_policy.h"
#include "rocksdb/slice_transform.h"
#include "rocksdb/table.h"
#include "table/plain_table_factory.h"
#include "util/hash.h"
#include "util/logging.h"
#include "util/mutexlock.h"
#include "util/testharness.h"
#include "util/testutil.h"
#include "utilities/merge_operators.h"
using std::unique_ptr;
namespace rocksdb {
class PlainTableDBTest {
protected:
private:
std::string dbname_;
Env* env_;
DB* db_;
Options last_options_;
static std::unique_ptr<const SliceTransform> prefix_transform;
public:
PlainTableDBTest() : env_(Env::Default()) {
dbname_ = test::TmpDir() + "/plain_table_db_test";
ASSERT_OK(DestroyDB(dbname_, Options()));
db_ = nullptr;
Reopen();
}
~PlainTableDBTest() {
delete db_;
ASSERT_OK(DestroyDB(dbname_, Options()));
}
// Return the current option configuration.
Options CurrentOptions() {
Options options;
options.table_factory.reset(new PlainTableFactory(16, 2, 0.8));
options.prefix_extractor = prefix_transform.get();
options.allow_mmap_reads = true;
return options;
}
DBImpl* dbfull() {
return reinterpret_cast<DBImpl*>(db_);
}
void Reopen(Options* options = nullptr) {
ASSERT_OK(TryReopen(options));
}
void Close() {
delete db_;
db_ = nullptr;
}
void DestroyAndReopen(Options* options = nullptr) {
//Destroy using last options
Destroy(&last_options_);
ASSERT_OK(TryReopen(options));
}
void Destroy(Options* options) {
delete db_;
db_ = nullptr;
ASSERT_OK(DestroyDB(dbname_, *options));
}
Status PureReopen(Options* options, DB** db) {
return DB::Open(*options, dbname_, db);
}
Status TryReopen(Options* options = nullptr) {
delete db_;
db_ = nullptr;
Options opts;
if (options != nullptr) {
opts = *options;
} else {
opts = CurrentOptions();
opts.create_if_missing = true;
}
last_options_ = opts;
return DB::Open(opts, dbname_, &db_);
}
Status Put(const Slice& k, const Slice& v) {
return db_->Put(WriteOptions(), k, v);
}
Status Delete(const std::string& k) {
return db_->Delete(WriteOptions(), k);
}
std::string Get(const std::string& k, const Snapshot* snapshot = nullptr) {
ReadOptions options;
options.snapshot = snapshot;
std::string result;
Status s = db_->Get(options, k, &result);
if (s.IsNotFound()) {
result = "NOT_FOUND";
} else if (!s.ok()) {
result = s.ToString();
}
return result;
}
int NumTableFilesAtLevel(int level) {
std::string property;
ASSERT_TRUE(
db_->GetProperty("rocksdb.num-files-at-level" + NumberToString(level),
&property));
return atoi(property.c_str());
}
// Return spread of files per level
std::string FilesPerLevel() {
std::string result;
int last_non_zero_offset = 0;
for (int level = 0; level < db_->NumberLevels(); level++) {
int f = NumTableFilesAtLevel(level);
char buf[100];
snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f);
result += buf;
if (f > 0) {
last_non_zero_offset = result.size();
}
}
result.resize(last_non_zero_offset);
return result;
}
std::string IterStatus(Iterator* iter) {
std::string result;
if (iter->Valid()) {
result = iter->key().ToString() + "->" + iter->value().ToString();
} else {
result = "(invalid)";
}
return result;
}
};
std::unique_ptr<const SliceTransform> PlainTableDBTest::prefix_transform(
NewFixedPrefixTransform(8));
TEST(PlainTableDBTest, Empty) {
ASSERT_TRUE(dbfull() != nullptr);
ASSERT_EQ("NOT_FOUND", Get("0000000000000foo"));
}
TEST(PlainTableDBTest, ReadWrite) {
ASSERT_OK(Put("1000000000000foo", "v1"));
ASSERT_EQ("v1", Get("1000000000000foo"));
ASSERT_OK(Put("0000000000000bar", "v2"));
ASSERT_OK(Put("1000000000000foo", "v3"));
ASSERT_EQ("v3", Get("1000000000000foo"));
ASSERT_EQ("v2", Get("0000000000000bar"));
}
TEST(PlainTableDBTest, Flush) {
ASSERT_OK(Put("1000000000000foo", "v1"));
ASSERT_OK(Put("0000000000000bar", "v2"));
ASSERT_OK(Put("1000000000000foo", "v3"));
dbfull()->TEST_FlushMemTable();
ASSERT_EQ("v3", Get("1000000000000foo"));
ASSERT_EQ("v2", Get("0000000000000bar"));
}
TEST(PlainTableDBTest, Iterator) {
ASSERT_OK(Put("1000000000foo002", "v_2"));
ASSERT_OK(Put("0000000000000bar", "random"));
ASSERT_OK(Put("1000000000foo001", "v1"));
ASSERT_OK(Put("3000000000000bar", "bar_v"));
ASSERT_OK(Put("1000000000foo003", "v__3"));
ASSERT_OK(Put("1000000000foo004", "v__4"));
ASSERT_OK(Put("1000000000foo005", "v__5"));
ASSERT_OK(Put("1000000000foo007", "v__7"));
ASSERT_OK(Put("1000000000foo008", "v__8"));
dbfull()->TEST_FlushMemTable();
ASSERT_EQ("v1", Get("1000000000foo001"));
ASSERT_EQ("v__3", Get("1000000000foo003"));
ReadOptions ro;
Iterator* iter = dbfull()->NewIterator(ro);
iter->Seek("1000000000foo001");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("1000000000foo001", iter->key().ToString());
ASSERT_EQ("v1", iter->value().ToString());
iter->Next();
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("1000000000foo002", iter->key().ToString());
ASSERT_EQ("v_2", iter->value().ToString());
iter->Next();
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("1000000000foo003", iter->key().ToString());
ASSERT_EQ("v__3", iter->value().ToString());
iter->Next();
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("1000000000foo004", iter->key().ToString());
ASSERT_EQ("v__4", iter->value().ToString());
iter->Seek("3000000000000bar");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("3000000000000bar", iter->key().ToString());
ASSERT_EQ("bar_v", iter->value().ToString());
iter->Seek("1000000000foo000");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("1000000000foo001", iter->key().ToString());
ASSERT_EQ("v1", iter->value().ToString());
iter->Seek("1000000000foo005");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("1000000000foo005", iter->key().ToString());
ASSERT_EQ("v__5", iter->value().ToString());
iter->Seek("1000000000foo006");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("1000000000foo007", iter->key().ToString());
ASSERT_EQ("v__7", iter->value().ToString());
iter->Seek("1000000000foo008");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("1000000000foo008", iter->key().ToString());
ASSERT_EQ("v__8", iter->value().ToString());
iter->Seek("1000000000foo009");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("3000000000000bar", iter->key().ToString());
delete iter;
}
TEST(PlainTableDBTest, Flush2) {
ASSERT_OK(Put("0000000000000bar", "b"));
ASSERT_OK(Put("1000000000000foo", "v1"));
dbfull()->TEST_FlushMemTable();
ASSERT_OK(Put("1000000000000foo", "v2"));
dbfull()->TEST_FlushMemTable();
ASSERT_EQ("v2", Get("1000000000000foo"));
ASSERT_OK(Put("0000000000000eee", "v3"));
dbfull()->TEST_FlushMemTable();
ASSERT_EQ("v3", Get("0000000000000eee"));
ASSERT_OK(Delete("0000000000000bar"));
dbfull()->TEST_FlushMemTable();
ASSERT_EQ("NOT_FOUND", Get("0000000000000bar"));
ASSERT_OK(Put("0000000000000eee", "v5"));
dbfull()->TEST_FlushMemTable();
ASSERT_EQ("v5", Get("0000000000000eee"));
}
static std::string Key(int i) {
char buf[100];
snprintf(buf, sizeof(buf), "key_______%06d", i);
return std::string(buf);
}
static std::string RandomString(Random* rnd, int len) {
std::string r;
test::RandomString(rnd, len, &r);
return r;
}
TEST(PlainTableDBTest, CompactionTrigger) {
Options options = CurrentOptions();
options.write_buffer_size = 100 << 10; //100KB
options.num_levels = 3;
options.max_mem_compaction_level = 0;
options.level0_file_num_compaction_trigger = 3;
Reopen(&options);
Random rnd(301);
for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
num++) {
std::vector<std::string> values;
// Write 120KB (12 values, each 10K)
for (int i = 0; i < 12; i++) {
values.push_back(RandomString(&rnd, 10000));
ASSERT_OK(Put(Key(i), values[i]));
}
dbfull()->TEST_WaitForFlushMemTable();
ASSERT_EQ(NumTableFilesAtLevel(0), num + 1);
}
//generate one more file in level-0, and should trigger level-0 compaction
std::vector<std::string> values;
for (int i = 0; i < 12; i++) {
values.push_back(RandomString(&rnd, 10000));
ASSERT_OK(Put(Key(i), values[i]));
}
dbfull()->TEST_WaitForCompact();
ASSERT_EQ(NumTableFilesAtLevel(0), 0);
ASSERT_EQ(NumTableFilesAtLevel(1), 1);
}
} // namespace rocksdb
int main(int argc, char** argv) {
return rocksdb::test::RunAllTests();
}
......@@ -16,11 +16,15 @@ DEFINE_bool(trigger_deadlock, false,
DEFINE_uint64(bucket_count, 100000, "number of buckets");
DEFINE_uint64(num_locks, 10001, "number of locks");
DEFINE_bool(random_prefix, false, "randomize prefix");
DEFINE_uint64(total_prefixes, 1000, "total number of prefixes");
DEFINE_uint64(items_per_prefix, 10, "total number of values per prefix");
DEFINE_int64(write_buffer_size, 1000000000, "");
DEFINE_int64(max_write_buffer_number, 8, "");
DEFINE_int64(min_write_buffer_number_to_merge, 7, "");
DEFINE_uint64(total_prefixes, 100000, "total number of prefixes");
DEFINE_uint64(items_per_prefix, 1, "total number of values per prefix");
DEFINE_int64(write_buffer_size, 33554432, "");
DEFINE_int64(max_write_buffer_number, 2, "");
DEFINE_int64(min_write_buffer_number_to_merge, 1, "");
DEFINE_int32(skiplist_height, 4, "");
DEFINE_int32(memtable_prefix_bloom_bits, 10000000, "");
DEFINE_int32(memtable_prefix_bloom_probes, 10, "");
DEFINE_int32(value_size, 40, "");
// Path to the database on file system
const std::string kDbName = rocksdb::test::TmpDir() + "/prefix_test";
......@@ -104,218 +108,265 @@ class PrefixTest {
options.min_write_buffer_number_to_merge =
FLAGS_min_write_buffer_number_to_merge;
options.comparator = new TestKeyComparator();
if (FLAGS_use_prefix_hash_memtable) {
auto prefix_extractor = NewFixedPrefixTransform(8);
options.prefix_extractor = prefix_extractor;
options.memtable_factory.reset(NewHashSkipListRepFactory(
prefix_extractor, FLAGS_bucket_count));
}
options.memtable_prefix_bloom_bits = FLAGS_memtable_prefix_bloom_bits;
options.memtable_prefix_bloom_probes = FLAGS_memtable_prefix_bloom_probes;
Status s = DB::Open(options, kDbName, &db);
ASSERT_OK(s);
return std::shared_ptr<DB>(db);
}
bool NextOptions() {
// skip some options
option_config_++;
if (option_config_ < kEnd) {
auto prefix_extractor = NewFixedPrefixTransform(8);
options.prefix_extractor = prefix_extractor;
switch(option_config_) {
case kHashSkipList:
options.memtable_factory.reset(
NewHashSkipListRepFactory(options.prefix_extractor,
FLAGS_bucket_count,
FLAGS_skiplist_height));
return true;
case kHashLinkList:
options.memtable_factory.reset(
NewHashLinkListRepFactory(options.prefix_extractor,
FLAGS_bucket_count));
return true;
default:
return false;
}
}
return false;
}
PrefixTest() : option_config_(kBegin) {
options.comparator = new TestKeyComparator();
}
~PrefixTest() {
delete options.comparator;
}
protected:
enum OptionConfig {
kBegin,
kHashSkipList,
kHashLinkList,
kEnd
};
int option_config_;
Options options;
};
TEST(PrefixTest, DynamicPrefixIterator) {
while (NextOptions()) {
std::cout << "*** Mem table: " << options.memtable_factory->Name()
<< std::endl;
DestroyDB(kDbName, Options());
auto db = OpenDb();
WriteOptions write_options;
ReadOptions read_options;
std::vector<uint64_t> prefixes;
for (uint64_t i = 0; i < FLAGS_total_prefixes; ++i) {
prefixes.push_back(i);
}
DestroyDB(kDbName, Options());
auto db = OpenDb();
WriteOptions write_options;
ReadOptions read_options;
if (FLAGS_random_prefix) {
std::random_shuffle(prefixes.begin(), prefixes.end());
}
std::vector<uint64_t> prefixes;
for (uint64_t i = 0; i < FLAGS_total_prefixes; ++i) {
prefixes.push_back(i);
}
HistogramImpl hist_put_time;
HistogramImpl hist_put_comparison;
if (FLAGS_random_prefix) {
std::random_shuffle(prefixes.begin(), prefixes.end());
}
// insert x random prefix, each with y continuous element.
for (auto prefix : prefixes) {
for (uint64_t sorted = 0; sorted < FLAGS_items_per_prefix; sorted++) {
TestKey test_key(prefix, sorted);
// insert x random prefix, each with y continuous element.
for (auto prefix : prefixes) {
for (uint64_t sorted = 0; sorted < FLAGS_items_per_prefix; sorted++) {
TestKey test_key(prefix, sorted);
Slice key = TestKeyToSlice(test_key);
std::string value(FLAGS_value_size, 0);
Slice key = TestKeyToSlice(test_key);
std::string value = "v" + std::to_string(sorted);
perf_context.Reset();
StopWatchNano timer(Env::Default(), true);
ASSERT_OK(db->Put(write_options, key, value));
hist_put_time.Add(timer.ElapsedNanos());
hist_put_comparison.Add(perf_context.user_key_comparison_count);
}
}
std::cout << "Put key comparison: \n" << hist_put_comparison.ToString()
<< "Put time: \n" << hist_put_time.ToString();
ASSERT_OK(db->Put(write_options, key, value));
// test seek existing keys
HistogramImpl hist_seek_time;
HistogramImpl hist_seek_comparison;
if (FLAGS_use_prefix_hash_memtable) {
read_options.prefix_seek = true;
}
}
std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
// test seek existing keys
HistogramImpl hist_seek_time;
HistogramImpl hist_seek_comparison;
for (auto prefix : prefixes) {
TestKey test_key(prefix, FLAGS_items_per_prefix / 2);
Slice key = TestKeyToSlice(test_key);
std::string value = "v" + std::to_string(0);
if (FLAGS_use_prefix_hash_memtable) {
read_options.prefix_seek = true;
}
std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
for (auto prefix : prefixes) {
TestKey test_key(prefix, FLAGS_items_per_prefix / 2);
Slice key = TestKeyToSlice(test_key);
std::string value = "v" + std::to_string(0);
perf_context.Reset();
StopWatchNano timer(Env::Default(), true);
uint64_t total_keys = 0;
for (iter->Seek(key); iter->Valid(); iter->Next()) {
if (FLAGS_trigger_deadlock) {
std::cout << "Behold the deadlock!\n";
db->Delete(write_options, iter->key());
perf_context.Reset();
StopWatchNano timer(Env::Default(), true);
uint64_t total_keys = 0;
for (iter->Seek(key); iter->Valid(); iter->Next()) {
if (FLAGS_trigger_deadlock) {
std::cout << "Behold the deadlock!\n";
db->Delete(write_options, iter->key());
}
auto test_key = SliceToTestKey(iter->key());
if (test_key->prefix != prefix) break;
total_keys++;
}
auto test_key = SliceToTestKey(iter->key());
if (test_key->prefix != prefix) break;
total_keys++;
hist_seek_time.Add(timer.ElapsedNanos());
hist_seek_comparison.Add(perf_context.user_key_comparison_count);
ASSERT_EQ(total_keys, FLAGS_items_per_prefix - FLAGS_items_per_prefix/2);
}
hist_seek_time.Add(timer.ElapsedNanos());
hist_seek_comparison.Add(perf_context.user_key_comparison_count);
ASSERT_EQ(total_keys, FLAGS_items_per_prefix - FLAGS_items_per_prefix/2);
}
std::cout << "Seek key comparison: \n"
<< hist_seek_comparison.ToString()
<< "Seek time: \n"
<< hist_seek_time.ToString();
// test non-existing keys
HistogramImpl hist_no_seek_time;
HistogramImpl hist_no_seek_comparison;
for (auto prefix = FLAGS_total_prefixes;
prefix < FLAGS_total_prefixes + 100;
prefix++) {
TestKey test_key(prefix, 0);
Slice key = TestKeyToSlice(test_key);
perf_context.Reset();
StopWatchNano timer(Env::Default(), true);
iter->Seek(key);
hist_no_seek_time.Add(timer.ElapsedNanos());
hist_no_seek_comparison.Add(perf_context.user_key_comparison_count);
ASSERT_TRUE(!iter->Valid());
}
std::cout << "Seek key comparison: \n"
<< hist_seek_comparison.ToString()
<< "Seek time: \n"
<< hist_seek_time.ToString();
std::cout << "non-existing Seek key comparison: \n"
<< hist_no_seek_comparison.ToString()
<< "non-existing Seek time: \n"
<< hist_no_seek_time.ToString();
}
// test non-existing keys
HistogramImpl hist_no_seek_time;
HistogramImpl hist_no_seek_comparison;
TEST(PrefixTest, PrefixHash) {
for (auto prefix = FLAGS_total_prefixes;
prefix < FLAGS_total_prefixes + 10000;
prefix++) {
TestKey test_key(prefix, 0);
Slice key = TestKeyToSlice(test_key);
DestroyDB(kDbName, Options());
auto db = OpenDb();
WriteOptions write_options;
ReadOptions read_options;
perf_context.Reset();
StopWatchNano timer(Env::Default(), true);
iter->Seek(key);
hist_no_seek_time.Add(timer.ElapsedNanos());
hist_no_seek_comparison.Add(perf_context.user_key_comparison_count);
ASSERT_TRUE(!iter->Valid());
}
std::vector<uint64_t> prefixes;
for (uint64_t i = 0; i < FLAGS_total_prefixes; ++i) {
prefixes.push_back(i);
std::cout << "non-existing Seek key comparison: \n"
<< hist_no_seek_comparison.ToString()
<< "non-existing Seek time: \n"
<< hist_no_seek_time.ToString();
}
}
if (FLAGS_random_prefix) {
std::random_shuffle(prefixes.begin(), prefixes.end());
}
TEST(PrefixTest, PrefixHash) {
while (NextOptions()) {
std::cout << "*** Mem table: " << options.memtable_factory->Name()
<< std::endl;
DestroyDB(kDbName, Options());
auto db = OpenDb();
WriteOptions write_options;
ReadOptions read_options;
std::vector<uint64_t> prefixes;
for (uint64_t i = 0; i < FLAGS_total_prefixes; ++i) {
prefixes.push_back(i);
}
// insert x random prefix, each with y continuous element.
HistogramImpl hist_put_time;
HistogramImpl hist_put_comparison;
if (FLAGS_random_prefix) {
std::random_shuffle(prefixes.begin(), prefixes.end());
}
for (auto prefix : prefixes) {
for (uint64_t sorted = 0; sorted < FLAGS_items_per_prefix; sorted++) {
TestKey test_key(prefix, sorted);
// insert x random prefix, each with y continuous element.
HistogramImpl hist_put_time;
HistogramImpl hist_put_comparison;
Slice key = TestKeyToSlice(test_key);
std::string value = "v" + std::to_string(sorted);
for (auto prefix : prefixes) {
for (uint64_t sorted = 0; sorted < FLAGS_items_per_prefix; sorted++) {
TestKey test_key(prefix, sorted);
perf_context.Reset();
StopWatchNano timer(Env::Default(), true);
ASSERT_OK(db->Put(write_options, key, value));
hist_put_time.Add(timer.ElapsedNanos());
hist_put_comparison.Add(perf_context.user_key_comparison_count);
Slice key = TestKeyToSlice(test_key);
std::string value = "v" + std::to_string(sorted);
perf_context.Reset();
StopWatchNano timer(Env::Default(), true);
ASSERT_OK(db->Put(write_options, key, value));
hist_put_time.Add(timer.ElapsedNanos());
hist_put_comparison.Add(perf_context.user_key_comparison_count);
}
}
}
std::cout << "Put key comparison: \n" << hist_put_comparison.ToString()
<< "Put time: \n" << hist_put_time.ToString();
std::cout << "Put key comparison: \n" << hist_put_comparison.ToString()
<< "Put time: \n" << hist_put_time.ToString();
// test seek existing keys
HistogramImpl hist_seek_time;
HistogramImpl hist_seek_comparison;
// test seek existing keys
HistogramImpl hist_seek_time;
HistogramImpl hist_seek_comparison;
for (auto prefix : prefixes) {
TestKey test_key(prefix, 0);
Slice key = TestKeyToSlice(test_key);
std::string value = "v" + std::to_string(0);
for (auto prefix : prefixes) {
TestKey test_key(prefix, 0);
Slice key = TestKeyToSlice(test_key);
std::string value = "v" + std::to_string(0);
Slice key_prefix;
if (FLAGS_use_prefix_hash_memtable) {
key_prefix = options.prefix_extractor->Transform(key);
read_options.prefix = &key_prefix;
}
std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
Slice key_prefix;
if (FLAGS_use_prefix_hash_memtable) {
key_prefix = options.prefix_extractor->Transform(key);
read_options.prefix = &key_prefix;
}
std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
perf_context.Reset();
StopWatchNano timer(Env::Default(), true);
uint64_t total_keys = 0;
for (iter->Seek(key); iter->Valid(); iter->Next()) {
if (FLAGS_trigger_deadlock) {
std::cout << "Behold the deadlock!\n";
db->Delete(write_options, iter->key());
perf_context.Reset();
StopWatchNano timer(Env::Default(), true);
uint64_t total_keys = 0;
for (iter->Seek(key); iter->Valid(); iter->Next()) {
if (FLAGS_trigger_deadlock) {
std::cout << "Behold the deadlock!\n";
db->Delete(write_options, iter->key());
}
auto test_key = SliceToTestKey(iter->key());
if (test_key->prefix != prefix) break;
total_keys++;
}
auto test_key = SliceToTestKey(iter->key());
if (test_key->prefix != prefix) break;
total_keys++;
hist_seek_time.Add(timer.ElapsedNanos());
hist_seek_comparison.Add(perf_context.user_key_comparison_count);
ASSERT_EQ(total_keys, FLAGS_items_per_prefix);
}
hist_seek_time.Add(timer.ElapsedNanos());
hist_seek_comparison.Add(perf_context.user_key_comparison_count);
ASSERT_EQ(total_keys, FLAGS_items_per_prefix);
}
std::cout << "Seek key comparison: \n"
<< hist_seek_comparison.ToString()
<< "Seek time: \n"
<< hist_seek_time.ToString();
std::cout << "Seek key comparison: \n"
<< hist_seek_comparison.ToString()
<< "Seek time: \n"
<< hist_seek_time.ToString();
// test non-existing keys
HistogramImpl hist_no_seek_time;
HistogramImpl hist_no_seek_comparison;
// test non-existing keys
HistogramImpl hist_no_seek_time;
HistogramImpl hist_no_seek_comparison;
for (auto prefix = FLAGS_total_prefixes;
prefix < FLAGS_total_prefixes + 100;
prefix++) {
TestKey test_key(prefix, 0);
Slice key = TestKeyToSlice(test_key);
for (auto prefix = FLAGS_total_prefixes;
prefix < FLAGS_total_prefixes + 100;
prefix++) {
TestKey test_key(prefix, 0);
Slice key = TestKeyToSlice(test_key);
if (FLAGS_use_prefix_hash_memtable) {
Slice key_prefix = options.prefix_extractor->Transform(key);
read_options.prefix = &key_prefix;
if (FLAGS_use_prefix_hash_memtable) {
Slice key_prefix = options.prefix_extractor->Transform(key);
read_options.prefix = &key_prefix;
}
std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
perf_context.Reset();
StopWatchNano timer(Env::Default(), true);
iter->Seek(key);
hist_no_seek_time.Add(timer.ElapsedNanos());
hist_no_seek_comparison.Add(perf_context.user_key_comparison_count);
ASSERT_TRUE(!iter->Valid());
}
std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
perf_context.Reset();
StopWatchNano timer(Env::Default(), true);
iter->Seek(key);
hist_no_seek_time.Add(timer.ElapsedNanos());
hist_no_seek_comparison.Add(perf_context.user_key_comparison_count);
ASSERT_TRUE(!iter->Valid());
std::cout << "non-existing Seek key comparison: \n"
<< hist_no_seek_comparison.ToString()
<< "non-existing Seek time: \n"
<< hist_no_seek_time.ToString();
}
std::cout << "non-existing Seek key comparison: \n"
<< hist_no_seek_comparison.ToString()
<< "non-existing Seek time: \n"
<< hist_no_seek_time.ToString();
}
}
......
......@@ -231,10 +231,8 @@ class Repairer {
FileMetaData meta;
meta.number = next_file_number_++;
Iterator* iter = mem->NewIterator();
status = BuildTable(dbname_, env_, options_, storage_options_,
table_cache_, iter, &meta,
icmp_.user_comparator(), 0, 0,
kNoCompression);
status = BuildTable(dbname_, env_, options_, storage_options_, table_cache_,
iter, &meta, icmp_, 0, 0, kNoCompression);
delete iter;
delete mem->Unref();
delete cf_mems_default;
......@@ -275,8 +273,9 @@ class Repairer {
int counter = 0;
Status status = env_->GetFileSize(fname, &t->meta.file_size);
if (status.ok()) {
FileMetaData dummy_meta(t->meta.number, t->meta.file_size);
Iterator* iter = table_cache_->NewIterator(
ReadOptions(), storage_options_, t->meta.number, t->meta.file_size);
ReadOptions(), storage_options_, icmp_, dummy_meta);
bool empty = true;
ParsedInternalKey parsed;
t->min_sequence = 0;
......
......@@ -22,6 +22,8 @@
#include "rocksdb/compaction_filter.h"
#include "rocksdb/env.h"
#include "rocksdb/table.h"
#include "rocksdb/table_properties.h"
#include "table/table_builder.h"
#include "util/hash.h"
#include "util/logging.h"
#include "util/mutexlock.h"
......@@ -31,6 +33,7 @@
using std::unique_ptr;
// IS THIS FILE STILL NEEDED?
namespace rocksdb {
// SimpleTable is a simple table format for UNIT TEST ONLY. It is not built
......@@ -84,15 +87,13 @@ public:
Iterator* NewIterator(const ReadOptions&) override;
Status Get(
const ReadOptions&, const Slice& key, void* arg,
bool (*handle_result)(void* arg, const Slice& k, const Slice& v, bool),
void (*mark_key_may_exist)(void*) = nullptr) override;
Status Get(const ReadOptions&, const Slice& key, void* arg,
bool (*handle_result)(void* arg, const ParsedInternalKey& k,
const Slice& v, bool),
void (*mark_key_may_exist)(void*) = nullptr) override;
uint64_t ApproximateOffsetOf(const Slice& key) override;
bool TEST_KeyInCache(const ReadOptions& options, const Slice& key) override;
void SetupForCompaction() override;
TableProperties& GetTableProperties() override;
......@@ -244,7 +245,8 @@ Status SimpleTableReader::GetOffset(const Slice& target, uint64_t* offset) {
return s;
}
int compare_result = rep_->options.comparator->Compare(tmp_slice, target);
InternalKeyComparator ikc(rep_->options.comparator);
int compare_result = ikc.Compare(tmp_slice, target);
if (compare_result < 0) {
if (left == right) {
......@@ -279,14 +281,20 @@ Status SimpleTableReader::GetOffset(const Slice& target, uint64_t* offset) {
return s;
}
Status SimpleTableReader::Get(
const ReadOptions& options, const Slice& k, void* arg,
bool (*saver)(void*, const Slice&, const Slice&, bool),
void (*mark_key_may_exist)(void*)) {
Status SimpleTableReader::Get(const ReadOptions& options, const Slice& k,
void* arg,
bool (*saver)(void*, const ParsedInternalKey&,
const Slice&, bool),
void (*mark_key_may_exist)(void*)) {
Status s;
SimpleTableIterator* iter = new SimpleTableIterator(this);
for (iter->Seek(k); iter->Valid(); iter->Next()) {
if (!(*saver)(arg, iter->key(), iter->value(), true)) {
ParsedInternalKey parsed_key;
if (!ParseInternalKey(iter->key(), &parsed_key)) {
return Status::Corruption(Slice());
}
if (!(*saver)(arg, parsed_key, iter->value(), true)) {
break;
}
}
......@@ -295,11 +303,6 @@ Status SimpleTableReader::Get(
return s;
}
bool SimpleTableReader::TEST_KeyInCache(const ReadOptions& options,
const Slice& key) {
return false;
}
uint64_t SimpleTableReader::ApproximateOffsetOf(const Slice& key) {
return 0;
}
......@@ -540,27 +543,30 @@ public:
const char* Name() const override {
return "SimpleTable";
}
Status GetTableReader(const Options& options, const EnvOptions& soptions,
unique_ptr<RandomAccessFile> && file,
uint64_t file_size,
Status NewTableReader(const Options& options, const EnvOptions& soptions,
const InternalKeyComparator& internal_key,
unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
unique_ptr<TableReader>* table_reader) const;
TableBuilder* GetTableBuilder(const Options& options, WritableFile* file,
TableBuilder* NewTableBuilder(const Options& options,
const InternalKeyComparator& internal_key,
WritableFile* file,
CompressionType compression_type) const;
};
Status SimpleTableFactory::GetTableReader(
Status SimpleTableFactory::NewTableReader(
const Options& options, const EnvOptions& soptions,
unique_ptr<RandomAccessFile> && file, uint64_t file_size,
const InternalKeyComparator& internal_key,
unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
unique_ptr<TableReader>* table_reader) const {
return SimpleTableReader::Open(options, soptions, std::move(file), file_size,
table_reader);
}
TableBuilder* SimpleTableFactory::GetTableBuilder(
const Options& options, WritableFile* file,
CompressionType compression_type) const {
TableBuilder* SimpleTableFactory::NewTableBuilder(
const Options& options, const InternalKeyComparator& internal_key,
WritableFile* file, CompressionType compression_type) const {
return new SimpleTableBuilder(options, file, compression_type);
}
......
......@@ -34,8 +34,8 @@
#include <assert.h>
#include <stdlib.h>
#include "port/port.h"
#include "util/arena.h"
#include "util/random.h"
#include "rocksdb/arena.h"
namespace rocksdb {
......@@ -48,7 +48,8 @@ class SkipList {
// Create a new SkipList object that will use "cmp" for comparing keys,
// and will allocate memory using "*arena". Objects allocated in the arena
// must remain allocated for the lifetime of the skiplist object.
explicit SkipList(Comparator cmp, Arena* arena);
explicit SkipList(Comparator cmp, Arena* arena,
int32_t max_height = 12, int32_t branching_factor = 4);
// Insert key into the list.
// REQUIRES: nothing that compares equal to key is currently in the list.
......@@ -102,7 +103,8 @@ class SkipList {
};
private:
enum { kMaxHeight = 12 };
const int32_t kMaxHeight_;
const int32_t kBranching_;
// Immutable after construction
Comparator const compare_;
......@@ -115,8 +117,8 @@ class SkipList {
port::AtomicPointer max_height_; // Height of the entire list
// Used for optimizing sequential insert patterns
Node* prev_[kMaxHeight];
int prev_height_;
Node** prev_;
int32_t prev_height_;
inline int GetMaxHeight() const {
return static_cast<int>(
......@@ -258,13 +260,12 @@ inline void SkipList<Key,Comparator>::Iterator::SeekToLast() {
template<typename Key, class Comparator>
int SkipList<Key,Comparator>::RandomHeight() {
// Increase height with probability 1 in kBranching
static const unsigned int kBranching = 4;
int height = 1;
while (height < kMaxHeight && ((rnd_.Next() % kBranching) == 0)) {
while (height < kMaxHeight_ && ((rnd_.Next() % kBranching_) == 0)) {
height++;
}
assert(height > 0);
assert(height <= kMaxHeight);
assert(height <= kMaxHeight_);
return height;
}
......@@ -354,14 +355,24 @@ typename SkipList<Key,Comparator>::Node* SkipList<Key,Comparator>::FindLast()
}
template<typename Key, class Comparator>
SkipList<Key,Comparator>::SkipList(Comparator cmp, Arena* arena)
: compare_(cmp),
SkipList<Key,Comparator>::SkipList(Comparator cmp, Arena* arena,
int32_t max_height,
int32_t branching_factor)
: kMaxHeight_(max_height),
kBranching_(branching_factor),
compare_(cmp),
arena_(arena),
head_(NewNode(0 /* any key will do */, kMaxHeight)),
head_(NewNode(0 /* any key will do */, max_height)),
max_height_(reinterpret_cast<void*>(1)),
prev_height_(1),
rnd_(0xdeadbeef) {
for (int i = 0; i < kMaxHeight; i++) {
assert(kMaxHeight_ > 0);
assert(kBranching_ > 0);
// Allocate the prev_ Node* array, directly from the passed-in arena.
// prev_ does not need to be freed, as its life cycle is tied up with
// the arena as a whole.
prev_ = (Node**) arena_->AllocateAligned(sizeof(Node*) * kMaxHeight_);
for (int i = 0; i < kMaxHeight_; i++) {
head_->SetNext(i, nullptr);
prev_[i] = head_;
}
......
......@@ -10,7 +10,7 @@
#include "db/skiplist.h"
#include <set>
#include "rocksdb/env.h"
#include "util/arena_impl.h"
#include "util/arena.h"
#include "util/hash.h"
#include "util/random.h"
#include "util/testharness.h"
......@@ -34,9 +34,9 @@ struct TestComparator {
class SkipTest { };
TEST(SkipTest, Empty) {
ArenaImpl arena_impl;
Arena arena;
TestComparator cmp;
SkipList<Key, TestComparator> list(cmp, &arena_impl);
SkipList<Key, TestComparator> list(cmp, &arena);
ASSERT_TRUE(!list.Contains(10));
SkipList<Key, TestComparator>::Iterator iter(&list);
......@@ -54,9 +54,9 @@ TEST(SkipTest, InsertAndLookup) {
const int R = 5000;
Random rnd(1000);
std::set<Key> keys;
ArenaImpl arena_impl;
Arena arena;
TestComparator cmp;
SkipList<Key, TestComparator> list(cmp, &arena_impl);
SkipList<Key, TestComparator> list(cmp, &arena);
for (int i = 0; i < N; i++) {
Key key = rnd.Next() % R;
if (keys.insert(key).second) {
......@@ -209,14 +209,14 @@ class ConcurrentTest {
// Current state of the test
State current_;
ArenaImpl arena_impl_;
Arena arena_;
// SkipList is not protected by mu_. We just use a single writer
// thread to modify it.
SkipList<Key, TestComparator> list_;
public:
ConcurrentTest() : list_(TestComparator(), &arena_impl_) { }
ConcurrentTest() : list_(TestComparator(), &arena_) {}
// REQUIRES: External synchronization
void WriteStep(Random* rnd) {
......
......@@ -10,9 +10,10 @@
#include "db/table_cache.h"
#include "db/filename.h"
#include "db/version_edit.h"
#include "rocksdb/statistics.h"
#include "rocksdb/table.h"
#include "table/table_reader.h"
#include "util/coding.h"
#include "util/stop_watch.h"
......@@ -34,7 +35,6 @@ static Slice GetSliceForFileNumber(uint64_t* file_number) {
sizeof(*file_number));
}
// TODO(icanadi) Options -> DBOptions
TableCache::TableCache(const std::string& dbname, const Options* options,
const EnvOptions& storage_options, Cache* const cache)
: env_(options->env),
......@@ -46,7 +46,16 @@ TableCache::TableCache(const std::string& dbname, const Options* options,
TableCache::~TableCache() {
}
TableReader* TableCache::GetTableReaderFromHandle(Cache::Handle* handle) {
return reinterpret_cast<TableReader*>(cache_->Value(handle));
}
void TableCache::ReleaseHandle(Cache::Handle* handle) {
cache_->Release(handle);
}
Status TableCache::FindTable(const EnvOptions& toptions,
const InternalKeyComparator& internal_comparator,
uint64_t file_number, uint64_t file_size,
Cache::Handle** handle, bool* table_io,
const bool no_io) {
......@@ -70,8 +79,9 @@ Status TableCache::FindTable(const EnvOptions& toptions,
file->Hint(RandomAccessFile::RANDOM);
}
StopWatch sw(env_, options_->statistics.get(), TABLE_OPEN_IO_MICROS);
s = options_->table_factory->GetTableReader(
*options_, toptions, std::move(file), file_size, &table_reader);
s = options_->table_factory->NewTableReader(
*options_, toptions, internal_comparator, std::move(file), file_size,
&table_reader);
}
if (!s.ok()) {
......@@ -89,25 +99,28 @@ Status TableCache::FindTable(const EnvOptions& toptions,
Iterator* TableCache::NewIterator(const ReadOptions& options,
const EnvOptions& toptions,
uint64_t file_number,
uint64_t file_size,
const InternalKeyComparator& icomparator,
const FileMetaData& file_meta,
TableReader** table_reader_ptr,
bool for_compaction) {
if (table_reader_ptr != nullptr) {
*table_reader_ptr = nullptr;
}
Cache::Handle* handle = nullptr;
Status s = FindTable(toptions, file_number, file_size, &handle,
nullptr, options.read_tier == kBlockCacheTier);
Cache::Handle* handle = file_meta.table_reader_handle;
Status s;
if (!handle) {
s = FindTable(toptions, icomparator, file_meta.number, file_meta.file_size,
&handle, nullptr, options.read_tier == kBlockCacheTier);
}
if (!s.ok()) {
return NewErrorIterator(s);
}
TableReader* table_reader =
reinterpret_cast<TableReader*>(cache_->Value(handle));
TableReader* table_reader = GetTableReaderFromHandle(handle);
Iterator* result = table_reader->NewIterator(options);
result->RegisterCleanup(&UnrefEntry, cache_, handle);
if (!file_meta.table_reader_handle) {
result->RegisterCleanup(&UnrefEntry, cache_, handle);
}
if (table_reader_ptr != nullptr) {
*table_reader_ptr = table_reader;
}
......@@ -120,22 +133,24 @@ Iterator* TableCache::NewIterator(const ReadOptions& options,
}
Status TableCache::Get(const ReadOptions& options,
uint64_t file_number,
uint64_t file_size,
const Slice& k,
void* arg,
bool (*saver)(void*, const Slice&, const Slice&, bool),
bool* table_io,
void (*mark_key_may_exist)(void*)) {
Cache::Handle* handle = nullptr;
Status s = FindTable(storage_options_, file_number, file_size,
&handle, table_io,
options.read_tier == kBlockCacheTier);
const InternalKeyComparator& internal_comparator,
const FileMetaData& file_meta, const Slice& k, void* arg,
bool (*saver)(void*, const ParsedInternalKey&,
const Slice&, bool),
bool* table_io, void (*mark_key_may_exist)(void*)) {
Cache::Handle* handle = file_meta.table_reader_handle;
Status s;
if (!handle) {
s = FindTable(storage_options_, internal_comparator, file_meta.number,
file_meta.file_size, &handle, table_io,
options.read_tier == kBlockCacheTier);
}
if (s.ok()) {
TableReader* t =
reinterpret_cast<TableReader*>(cache_->Value(handle));
TableReader* t = GetTableReaderFromHandle(handle);
s = t->Get(options, k, arg, saver, mark_key_may_exist);
cache_->Release(handle);
if (!file_meta.table_reader_handle) {
ReleaseHandle(handle);
}
} else if (options.read_tier && s.IsIncomplete()) {
// Couldnt find Table in cache but treat as kFound if no_io set
(*mark_key_may_exist)(arg);
......@@ -145,19 +160,17 @@ Status TableCache::Get(const ReadOptions& options,
}
bool TableCache::PrefixMayMatch(const ReadOptions& options,
uint64_t file_number,
uint64_t file_size,
const Slice& internal_prefix,
bool* table_io) {
const InternalKeyComparator& icomparator,
uint64_t file_number, uint64_t file_size,
const Slice& internal_prefix, bool* table_io) {
Cache::Handle* handle = nullptr;
Status s = FindTable(storage_options_, file_number,
file_size, &handle, table_io);
Status s = FindTable(storage_options_, icomparator, file_number, file_size,
&handle, table_io);
bool may_match = true;
if (s.ok()) {
TableReader* t =
reinterpret_cast<TableReader*>(cache_->Value(handle));
TableReader* t = GetTableReaderFromHandle(handle);
may_match = t->PrefixMayMatch(internal_prefix);
cache_->Release(handle);
ReleaseHandle(handle);
}
return may_match;
}
......
......@@ -12,15 +12,18 @@
#pragma once
#include <string>
#include <stdint.h>
#include "db/dbformat.h"
#include "rocksdb/env.h"
#include "rocksdb/cache.h"
#include "port/port.h"
#include "rocksdb/cache.h"
#include "rocksdb/env.h"
#include "rocksdb/table.h"
#include "table/table_reader.h"
namespace rocksdb {
class Env;
struct FileMetaData;
class TableCache {
public:
......@@ -35,10 +38,9 @@ class TableCache {
// the returned iterator. The returned "*tableptr" object is owned by
// the cache and should not be deleted, and is valid for as long as the
// returned iterator is live.
Iterator* NewIterator(const ReadOptions& options,
const EnvOptions& toptions,
uint64_t file_number,
uint64_t file_size,
Iterator* NewIterator(const ReadOptions& options, const EnvOptions& toptions,
const InternalKeyComparator& internal_comparator,
const FileMetaData& file_meta,
TableReader** table_reader_ptr = nullptr,
bool for_compaction = false);
......@@ -46,33 +48,40 @@ class TableCache {
// call (*handle_result)(arg, found_key, found_value) repeatedly until
// it returns false.
Status Get(const ReadOptions& options,
uint64_t file_number,
uint64_t file_size,
const Slice& k,
void* arg,
bool (*handle_result)(void*, const Slice&, const Slice&, bool),
bool* table_io,
void (*mark_key_may_exist)(void*) = nullptr);
const InternalKeyComparator& internal_comparator,
const FileMetaData& file_meta, const Slice& k, void* arg,
bool (*handle_result)(void*, const ParsedInternalKey&,
const Slice&, bool),
bool* table_io, void (*mark_key_may_exist)(void*) = nullptr);
// Determine whether the table may contain the specified prefix. If
// the table index of blooms are not in memory, this may cause an I/O
bool PrefixMayMatch(const ReadOptions& options, uint64_t file_number,
uint64_t file_size, const Slice& internal_prefix,
bool* table_io);
// the table index or blooms are not in memory, this may cause an I/O
bool PrefixMayMatch(const ReadOptions& options,
const InternalKeyComparator& internal_comparator,
uint64_t file_number, uint64_t file_size,
const Slice& internal_prefix, bool* table_io);
// Evict any entry for the specified file number
static void Evict(Cache* cache, uint64_t file_number);
// Find table reader
Status FindTable(const EnvOptions& toptions,
const InternalKeyComparator& internal_comparator,
uint64_t file_number, uint64_t file_size, Cache::Handle**,
bool* table_io = nullptr, const bool no_io = false);
// Get TableReader from a cache handle.
TableReader* GetTableReaderFromHandle(Cache::Handle* handle);
// Release the handle from a cache
void ReleaseHandle(Cache::Handle* handle);
private:
Env* const env_;
const std::string dbname_;
const Options* options_;
const EnvOptions& storage_options_;
Cache* const cache_;
Status FindTable(const EnvOptions& toptions, uint64_t file_number,
uint64_t file_size, Cache::Handle**, bool* table_io=nullptr,
const bool no_io = false);
};
} // namespace rocksdb
......@@ -10,87 +10,6 @@
namespace rocksdb {
namespace {
void AppendProperty(
std::string& props,
const std::string& key,
const std::string& value,
const std::string& prop_delim,
const std::string& kv_delim) {
props.append(key);
props.append(kv_delim);
props.append(value);
props.append(prop_delim);
}
template <class TValue>
void AppendProperty(
std::string& props,
const std::string& key,
const TValue& value,
const std::string& prop_delim,
const std::string& kv_delim) {
AppendProperty(
props, key, std::to_string(value), prop_delim, kv_delim
);
}
}
std::string TableProperties::ToString(
const std::string& prop_delim,
const std::string& kv_delim) const {
std::string result;
result.reserve(1024);
// Basic Info
AppendProperty(
result, "# data blocks", num_data_blocks, prop_delim, kv_delim
);
AppendProperty(result, "# entries", num_entries, prop_delim, kv_delim);
AppendProperty(result, "raw key size", raw_key_size, prop_delim, kv_delim);
AppendProperty(
result,
"raw average key size",
num_entries != 0 ? 1.0 * raw_key_size / num_entries : 0.0,
prop_delim,
kv_delim
);
AppendProperty(
result, "raw value size", raw_value_size, prop_delim, kv_delim
);
AppendProperty(
result,
"raw average value size",
num_entries != 0 ? 1.0 * raw_value_size / num_entries : 0.0,
prop_delim,
kv_delim
);
AppendProperty(result, "data block size", data_size, prop_delim, kv_delim);
AppendProperty(result, "index block size", index_size, prop_delim, kv_delim);
AppendProperty(
result, "filter block size", filter_size, prop_delim, kv_delim
);
AppendProperty(
result,
"(estimated) table size",
data_size + index_size + filter_size,
prop_delim,
kv_delim
);
AppendProperty(
result,
"filter policy name",
filter_policy_name.empty() ? std::string("N/A") : filter_policy_name,
prop_delim,
kv_delim
);
return result;
}
Status InternalKeyPropertiesCollector::Add(
const Slice& key, const Slice& value) {
ParsedInternalKey ikey;
......@@ -106,7 +25,7 @@ Status InternalKeyPropertiesCollector::Add(
}
Status InternalKeyPropertiesCollector::Finish(
TableProperties::UserCollectedProperties* properties) {
UserCollectedProperties* properties) {
assert(properties);
assert(properties->find(
InternalKeyTablePropertiesNames::kDeletedKeys) == properties->end());
......@@ -118,7 +37,7 @@ Status InternalKeyPropertiesCollector::Finish(
return Status::OK();
}
TableProperties::UserCollectedProperties
UserCollectedProperties
InternalKeyPropertiesCollector::GetReadableProperties() const {
return {
{ "kDeletedKeys", std::to_string(deleted_keys_) }
......@@ -137,11 +56,11 @@ Status UserKeyTablePropertiesCollector::Add(
}
Status UserKeyTablePropertiesCollector::Finish(
TableProperties::UserCollectedProperties* properties) {
UserCollectedProperties* properties) {
return collector_->Finish(properties);
}
TableProperties::UserCollectedProperties
UserCollectedProperties
UserKeyTablePropertiesCollector::GetReadableProperties() const {
return collector_->GetReadableProperties();
}
......@@ -151,7 +70,7 @@ const std::string InternalKeyTablePropertiesNames::kDeletedKeys
= "rocksdb.deleted.keys";
uint64_t GetDeletedKeys(
const TableProperties::UserCollectedProperties& props) {
const UserCollectedProperties& props) {
auto pos = props.find(InternalKeyTablePropertiesNames::kDeletedKeys);
if (pos == props.end()) {
return 0;
......
......@@ -24,15 +24,13 @@ class InternalKeyPropertiesCollector : public TablePropertiesCollector {
public:
virtual Status Add(const Slice& key, const Slice& value) override;
virtual Status Finish(
TableProperties::UserCollectedProperties* properties) override;
virtual Status Finish(UserCollectedProperties* properties) override;
virtual const char* Name() const override {
return "InternalKeyPropertiesCollector";
}
TableProperties::UserCollectedProperties
GetReadableProperties() const override;
UserCollectedProperties GetReadableProperties() const override;
private:
uint64_t deleted_keys_ = 0;
......@@ -61,13 +59,11 @@ class UserKeyTablePropertiesCollector : public TablePropertiesCollector {
virtual Status Add(const Slice& key, const Slice& value) override;
virtual Status Finish(
TableProperties::UserCollectedProperties* properties) override;
virtual Status Finish(UserCollectedProperties* properties) override;
virtual const char* Name() const override { return collector_->Name(); }
TableProperties::UserCollectedProperties
GetReadableProperties() const override;
UserCollectedProperties GetReadableProperties() const override;
protected:
std::shared_ptr<TablePropertiesCollector> collector_;
......
......@@ -7,12 +7,14 @@
#include <memory>
#include <string>
#include "db/dbformat.h"
#include "db/db_impl.h"
#include "db/dbformat.h"
#include "db/table_properties_collector.h"
#include "rocksdb/table_properties.h"
#include "rocksdb/table.h"
#include "table/block_based_table_factory.h"
#include "table/meta_blocks.h"
#include "table/plain_table_factory.h"
#include "table/table_builder.h"
#include "util/coding.h"
#include "util/testharness.h"
#include "util/testutil.h"
......@@ -20,8 +22,6 @@
namespace rocksdb {
class TablePropertiesTest {
private:
unique_ptr<TableReader> table_reader_;
};
// TODO(kailiu) the following classes should be moved to some more general
......@@ -83,30 +83,13 @@ class DumbLogger : public Logger {
};
// Utilities test functions
void MakeBuilder(
const Options& options,
std::unique_ptr<FakeWritableFile>* writable,
std::unique_ptr<TableBuilder>* builder) {
void MakeBuilder(const Options& options,
const InternalKeyComparator& internal_comparator,
std::unique_ptr<FakeWritableFile>* writable,
std::unique_ptr<TableBuilder>* builder) {
writable->reset(new FakeWritableFile);
builder->reset(
options.table_factory->GetTableBuilder(options, writable->get(),
options.compression));
}
void OpenTable(
const Options& options,
const std::string& contents,
std::unique_ptr<TableReader>* table_reader) {
std::unique_ptr<RandomAccessFile> file(new FakeRandomeAccessFile(contents));
auto s = options.table_factory->GetTableReader(
options,
EnvOptions(),
std::move(file),
contents.size(),
table_reader
);
ASSERT_OK(s);
builder->reset(options.table_factory->NewTableBuilder(
options, internal_comparator, writable->get(), options.compression));
}
// Collects keys that starts with "A" in a table.
......@@ -114,10 +97,10 @@ class RegularKeysStartWithA: public TablePropertiesCollector {
public:
const char* Name() const { return "RegularKeysStartWithA"; }
Status Finish(TableProperties::UserCollectedProperties* properties) {
Status Finish(UserCollectedProperties* properties) {
std::string encoded;
PutVarint32(&encoded, count_);
*properties = TableProperties::UserCollectedProperties {
*properties = UserCollectedProperties {
{ "TablePropertiesTest", "Rocksdb" },
{ "Count", encoded }
};
......@@ -132,8 +115,7 @@ class RegularKeysStartWithA: public TablePropertiesCollector {
return Status::OK();
}
virtual TableProperties::UserCollectedProperties
GetReadableProperties() const {
virtual UserCollectedProperties GetReadableProperties() const {
return {};
}
......@@ -142,23 +124,65 @@ class RegularKeysStartWithA: public TablePropertiesCollector {
uint32_t count_ = 0;
};
TEST(TablePropertiesTest, CustomizedTablePropertiesCollector) {
Options options;
extern uint64_t kBlockBasedTableMagicNumber;
extern uint64_t kPlainTableMagicNumber;
void TestCustomizedTablePropertiesCollector(
uint64_t magic_number, bool encode_as_internal, const Options& options,
const InternalKeyComparator& internal_comparator) {
// make sure the entries will be inserted with order.
std::map<std::string, std::string> kvs = {
{"About", "val5"}, // starts with 'A'
{"Abstract", "val2"}, // starts with 'A'
{"Around", "val7"}, // starts with 'A'
{"Beyond", "val3"},
{"Builder", "val1"},
{"Cancel", "val4"},
{"Find", "val6"},
{"About ", "val5"}, // starts with 'A'
{"Abstract", "val2"}, // starts with 'A'
{"Around ", "val7"}, // starts with 'A'
{"Beyond ", "val3"},
{"Builder ", "val1"},
{"Cancel ", "val4"},
{"Find ", "val6"},
};
// -- Step 1: build table
std::unique_ptr<TableBuilder> builder;
std::unique_ptr<FakeWritableFile> writable;
MakeBuilder(options, internal_comparator, &writable, &builder);
for (const auto& kv : kvs) {
if (encode_as_internal) {
InternalKey ikey(kv.first, 0, ValueType::kTypeValue);
builder->Add(ikey.Encode(), kv.second);
} else {
builder->Add(kv.first, kv.second);
}
}
ASSERT_OK(builder->Finish());
// -- Step 2: Read properties
FakeRandomeAccessFile readable(writable->contents());
TableProperties props;
Status s = ReadTableProperties(
&readable,
writable->contents().size(),
magic_number,
Env::Default(),
nullptr,
&props
);
ASSERT_OK(s);
auto user_collected = props.user_collected_properties;
ASSERT_EQ("Rocksdb", user_collected.at("TablePropertiesTest"));
uint32_t starts_with_A = 0;
Slice key(user_collected.at("Count"));
ASSERT_TRUE(GetVarint32(&key, &starts_with_A));
ASSERT_EQ(3u, starts_with_A);
}
TEST(TablePropertiesTest, CustomizedTablePropertiesCollector) {
// Test properties collectors with internal keys or regular keys
// for block based table
for (bool encode_as_internal : { true, false }) {
// -- Step 1: build table
Options options;
auto collector = new RegularKeysStartWithA();
if (encode_as_internal) {
options.table_properties_collectors = {
......@@ -168,97 +192,111 @@ TEST(TablePropertiesTest, CustomizedTablePropertiesCollector) {
options.table_properties_collectors.resize(1);
options.table_properties_collectors[0].reset(collector);
}
std::unique_ptr<TableBuilder> builder;
std::unique_ptr<FakeWritableFile> writable;
MakeBuilder(options, &writable, &builder);
for (const auto& kv : kvs) {
if (encode_as_internal) {
InternalKey ikey(kv.first, 0, ValueType::kTypeValue);
builder->Add(ikey.Encode(), kv.second);
} else {
builder->Add(kv.first, kv.second);
}
}
ASSERT_OK(builder->Finish());
// -- Step 2: Open table
std::unique_ptr<TableReader> table_reader;
OpenTable(options, writable->contents(), &table_reader);
const auto& properties =
table_reader->GetTableProperties().user_collected_properties;
ASSERT_EQ("Rocksdb", properties.at("TablePropertiesTest"));
uint32_t starts_with_A = 0;
Slice key(properties.at("Count"));
ASSERT_TRUE(GetVarint32(&key, &starts_with_A));
ASSERT_EQ(3u, starts_with_A);
test::PlainInternalKeyComparator ikc(options.comparator);
TestCustomizedTablePropertiesCollector(kBlockBasedTableMagicNumber,
encode_as_internal, options, ikc);
}
// test plain table
Options options;
options.table_properties_collectors.push_back(
std::make_shared<RegularKeysStartWithA>()
);
options.table_factory = std::make_shared<PlainTableFactory>(8, 8, 0);
test::PlainInternalKeyComparator ikc(options.comparator);
TestCustomizedTablePropertiesCollector(kPlainTableMagicNumber, true, options,
ikc);
}
TEST(TablePropertiesTest, InternalKeyPropertiesCollector) {
void TestInternalKeyPropertiesCollector(
uint64_t magic_number,
bool sanitized,
std::shared_ptr<TableFactory> table_factory) {
InternalKey keys[] = {
InternalKey("A", 0, ValueType::kTypeValue),
InternalKey("B", 0, ValueType::kTypeValue),
InternalKey("C", 0, ValueType::kTypeValue),
InternalKey("W", 0, ValueType::kTypeDeletion),
InternalKey("X", 0, ValueType::kTypeDeletion),
InternalKey("Y", 0, ValueType::kTypeDeletion),
InternalKey("Z", 0, ValueType::kTypeDeletion),
InternalKey("A ", 0, ValueType::kTypeValue),
InternalKey("B ", 0, ValueType::kTypeValue),
InternalKey("C ", 0, ValueType::kTypeValue),
InternalKey("W ", 0, ValueType::kTypeDeletion),
InternalKey("X ", 0, ValueType::kTypeDeletion),
InternalKey("Y ", 0, ValueType::kTypeDeletion),
InternalKey("Z ", 0, ValueType::kTypeDeletion),
};
for (bool sanitized : { false, true }) {
std::unique_ptr<TableBuilder> builder;
std::unique_ptr<FakeWritableFile> writable;
Options options;
if (sanitized) {
options.table_properties_collectors = {
std::make_shared<RegularKeysStartWithA>()
};
// with sanitization, even regular properties collector will be able to
// handle internal keys.
auto comparator = options.comparator;
// HACK: Set options.info_log to avoid writing log in
// SanitizeOptions().
options.info_log = std::make_shared<DumbLogger>();
options = SanitizeOptions(
"db", // just a place holder
nullptr, // with skip internal key comparator
nullptr, // don't care filter policy
options
);
options.comparator = comparator;
} else {
options.table_properties_collectors = {
std::make_shared<InternalKeyPropertiesCollector>()
};
}
MakeBuilder(options, &writable, &builder);
for (const auto& k : keys) {
builder->Add(k.Encode(), "val");
}
std::unique_ptr<TableBuilder> builder;
std::unique_ptr<FakeWritableFile> writable;
Options options;
test::PlainInternalKeyComparator pikc(options.comparator);
options.table_factory = table_factory;
if (sanitized) {
options.table_properties_collectors = {
std::make_shared<RegularKeysStartWithA>()
};
// with sanitization, even regular properties collector will be able to
// handle internal keys.
auto comparator = options.comparator;
// HACK: Set options.info_log to avoid writing log in
// SanitizeOptions().
options.info_log = std::make_shared<DumbLogger>();
options = SanitizeOptions("db", // just a place holder
&pikc, nullptr, // don't care filter policy
options);
options.comparator = comparator;
} else {
options.table_properties_collectors = {
std::make_shared<InternalKeyPropertiesCollector>()
};
}
ASSERT_OK(builder->Finish());
MakeBuilder(options, pikc, &writable, &builder);
for (const auto& k : keys) {
builder->Add(k.Encode(), "val");
}
std::unique_ptr<TableReader> table_reader;
OpenTable(options, writable->contents(), &table_reader);
const auto& properties =
table_reader->GetTableProperties().user_collected_properties;
ASSERT_OK(builder->Finish());
FakeRandomeAccessFile readable(writable->contents());
TableProperties props;
Status s = ReadTableProperties(
&readable,
writable->contents().size(),
magic_number,
Env::Default(),
nullptr,
&props
);
ASSERT_OK(s);
uint64_t deleted = GetDeletedKeys(properties);
ASSERT_EQ(4u, deleted);
auto user_collected = props.user_collected_properties;
uint64_t deleted = GetDeletedKeys(user_collected);
ASSERT_EQ(4u, deleted);
if (sanitized) {
uint32_t starts_with_A = 0;
Slice key(properties.at("Count"));
ASSERT_TRUE(GetVarint32(&key, &starts_with_A));
ASSERT_EQ(1u, starts_with_A);
}
if (sanitized) {
uint32_t starts_with_A = 0;
Slice key(user_collected.at("Count"));
ASSERT_TRUE(GetVarint32(&key, &starts_with_A));
ASSERT_EQ(1u, starts_with_A);
}
}
TEST(TablePropertiesTest, InternalKeyPropertiesCollector) {
TestInternalKeyPropertiesCollector(
kBlockBasedTableMagicNumber,
true /* sanitize */,
std::make_shared<BlockBasedTableFactory>()
);
TestInternalKeyPropertiesCollector(
kBlockBasedTableMagicNumber,
true /* not sanitize */,
std::make_shared<BlockBasedTableFactory>()
);
TestInternalKeyPropertiesCollector(
kPlainTableMagicNumber,
false /* not sanitize */,
std::make_shared<PlainTableFactory>(8, 8, 0)
);
}
} // namespace rocksdb
int main(int argc, char** argv) {
......
......@@ -78,12 +78,10 @@ void VersionEdit::EncodeTo(std::string* dst) const {
PutVarint64(dst, last_sequence_);
}
for (DeletedFileSet::const_iterator iter = deleted_files_.begin();
iter != deleted_files_.end();
++iter) {
for (const auto& deleted : deleted_files_) {
PutVarint32(dst, kDeletedFile);
PutVarint32(dst, iter->first); // level
PutVarint64(dst, iter->second); // file number
PutVarint32(dst, deleted.first /* level */);
PutVarint64(dst, deleted.second /* file number */);
}
for (size_t i = 0; i < new_files_.size(); i++) {
......
......@@ -12,6 +12,7 @@
#include <utility>
#include <vector>
#include <string>
#include "rocksdb/cache.h"
#include "db/dbformat.h"
namespace rocksdb {
......@@ -29,8 +30,17 @@ struct FileMetaData {
SequenceNumber smallest_seqno;// The smallest seqno in this file
SequenceNumber largest_seqno; // The largest seqno in this file
FileMetaData() : refs(0), allowed_seeks(1 << 30), file_size(0),
being_compacted(false) {}
// Needs to be disposed when refs becomes 0.
Cache::Handle* table_reader_handle;
FileMetaData(uint64_t number, uint64_t file_size)
: refs(0),
allowed_seeks(1 << 30),
number(number),
file_size(file_size),
being_compacted(false),
table_reader_handle(nullptr) {}
FileMetaData() : FileMetaData(0, 0) {}
};
class VersionEdit {
......@@ -70,6 +80,7 @@ class VersionEdit {
const InternalKey& largest,
const SequenceNumber& smallest_seqno,
const SequenceNumber& largest_seqno) {
assert(smallest_seqno <= largest_seqno);
FileMetaData f;
f.number = file;
f.file_size = file_size;
......@@ -77,13 +88,12 @@ class VersionEdit {
f.largest = largest;
f.smallest_seqno = smallest_seqno;
f.largest_seqno = largest_seqno;
assert(smallest_seqno <= largest_seqno);
new_files_.push_back(std::make_pair(level, f));
}
// Delete the specified "file" from the specified "level".
void DeleteFile(int level, uint64_t file) {
deleted_files_.insert(std::make_pair(level, file));
deleted_files_.insert({level, file});
}
// Number of edits
......@@ -120,7 +130,7 @@ class VersionEdit {
private:
friend class VersionSet;
typedef std::set< std::pair<int, uint64_t> > DeletedFileSet;
typedef std::set< std::pair<int, uint64_t>> DeletedFileSet;
bool GetLevel(Slice* input, int* level, const char** msg);
......
此差异已折叠。
......@@ -85,8 +85,8 @@ class Version {
};
void Get(const ReadOptions&, const LookupKey& key, std::string* val,
Status* status, MergeContext* merge_context,
GetStats* stats, const Options& db_option, bool* value_found =
nullptr);
GetStats* stats, const Options& db_option,
bool* value_found = nullptr);
// Adds "stats" into the current state. Returns true if a new
// compaction may need to be triggered, false otherwise.
......@@ -101,7 +101,9 @@ class Version {
// Reference count management (so Versions do not disappear out from
// under live iterators)
void Ref();
void Unref();
// Decrease reference count. Delete the object if no reference left
// and return true. Otherwise, return false.
bool Unref();
// Returns true iff some level needs a compaction.
bool NeedsCompaction() const;
......@@ -384,7 +386,7 @@ class VersionSet {
bool VerifyCompactionFileConsistency(Compaction* c);
Status GetMetadataForFile(uint64_t number, int* filelevel,
FileMetaData* metadata, ColumnFamilyData** cfd);
FileMetaData** metadata, ColumnFamilyData** cfd);
void GetLiveFilesMetaData(
std::vector<LiveFileMetaData> *metadata);
......
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册