diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4edb525f9e458e1c075e9628fb05adb31180854a..b60a030b54ebf92f8fc89b96bcdc27c060a9fa79 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -58,7 +58,7 @@ add_custom_command(OUTPUT ${BUILD_VERSION_CC}
 add_custom_target(GenerateBuildVersion DEPENDS ${BUILD_VERSION_CC})

 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Zi /nologo /EHsc /GS /Gd /GR /GF /fp:precise /Zc:wchar_t /Zc:forScope /errorReport:queue")
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /FC /d2Zi+ /W3 /WX /wd4100 /wd4101 /wd4127 /wd4189 /wd4200 /wd4244 /wd4267 /wd4296 /wd4305 /wd4307 /wd4309 /wd4512 /wd4701 /wd4702 /wd4800 /wd4804 /wd4996")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /FC /d2Zi+ /W3 /WX /wd4127 /wd4200 /wd4244 /wd4267 /wd4305 /wd4307 /wd4309 /wd4512 /wd4701 /wd4702 /wd4800 /wd4804 /wd4996")

 # Used to run CI build and tests so we can run faster
 set(OPTIMIZE_DEBUG_DEFAULT 0) # Debug build is unoptimized by default use -DOPTDBG=1 to optimize
diff --git a/db/c.cc b/db/c.cc
index ec8e4fb9944d2e82e1d4df12f7eff0a55b2096a7..3c9842978748761b2fe601ff5af4b6517a7b31ae 100644
--- a/db/c.cc
+++ b/db/c.cc
@@ -1288,6 +1288,11 @@ void rocksdb_block_based_options_set_cache_index_and_filter_blocks(
   options->rep.cache_index_and_filter_blocks = v;
 }

+void rocksdb_block_based_options_set_skip_table_builder_flush(
+    rocksdb_block_based_table_options_t* options, unsigned char v) {
+  options->rep.skip_table_builder_flush = v;
+}
+
 void rocksdb_options_set_block_based_table_factory(
     rocksdb_options_t *opt,
     rocksdb_block_based_table_options_t* table_options) {
diff --git a/db/db_bench.cc b/db/db_bench.cc
index bb21115360416d468804bc6e05b46996aae98106..ebe74b6bf5d5f046c173ea350aa4849803c46750 100644
--- a/db/db_bench.cc
+++ b/db/db_bench.cc
@@ -376,6 +376,12 @@ DEFINE_int32(compaction_readahead_size, 0, "Compaction readahead size");
 DEFINE_int32(random_access_max_buffer_size, 1024 * 1024,
              "Maximum windows randomaccess buffer size");

+DEFINE_int32(writable_file_max_buffer_size, 1024 * 1024,
+             "Maximum write buffer for WritableFile");
+
+DEFINE_bool(skip_table_builder_flush, false,
+            "Skip flushing block in table builder");
+
 DEFINE_int32(bloom_bits, -1, "Bloom filter bits per key. Negative means"
              " use default settings.");
 DEFINE_int32(memtable_bloom_bits, 0, "Bloom filter bits per key for memtable. "
@@ -2299,6 +2305,7 @@ class Benchmark {
         FLAGS_new_table_reader_for_compaction_inputs;
     options.compaction_readahead_size = FLAGS_compaction_readahead_size;
     options.random_access_max_buffer_size = FLAGS_random_access_max_buffer_size;
+    options.writable_file_max_buffer_size = FLAGS_writable_file_max_buffer_size;
     options.statistics = dbstats;
     if (FLAGS_enable_io_prio) {
       FLAGS_env->LowerThreadPoolIOPriority(Env::LOW);
@@ -2441,6 +2448,8 @@ class Benchmark {
       block_based_options.block_size = FLAGS_block_size;
       block_based_options.block_restart_interval = FLAGS_block_restart_interval;
       block_based_options.filter_policy = filter_policy_;
+      block_based_options.skip_table_builder_flush =
+          FLAGS_skip_table_builder_flush;
       block_based_options.format_version = 2;
       options.table_factory.reset(
           NewBlockBasedTableFactory(block_based_options));
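For illustration, here is a minimal sketch of how the new C API entry point added in db/c.cc above could be called from client code. It uses only setters that already exist in rocksdb/c.h; opening the database itself is elided:

    #include "rocksdb/c.h"

    int main() {
      rocksdb_options_t* opts = rocksdb_options_create();
      rocksdb_block_based_table_options_t* table_opts =
          rocksdb_block_based_options_create();

      /* New setter from this patch: a non-zero value makes the table
         builder skip its per-block flush. */
      rocksdb_block_based_options_set_skip_table_builder_flush(table_opts, 1);

      rocksdb_options_set_block_based_table_factory(opts, table_opts);

      /* ... open and use the DB with opts via rocksdb_open() ... */

      rocksdb_block_based_options_destroy(table_opts);
      rocksdb_options_destroy(opts);
      return 0;
    }
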
" "When 0 it is deterministic."); -static rocksdb::Env* FLAGS_env = rocksdb::Env::Default(); - namespace rocksdb { namespace { diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h index 76c801e67a45d7b73004883733d43d3c6929cd50..6bb3b85383cdcf5f5e2261bc1b0b7a1c26569e3b 100644 --- a/include/rocksdb/c.h +++ b/include/rocksdb/c.h @@ -450,6 +450,9 @@ rocksdb_block_based_options_set_hash_index_allow_collision( extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_cache_index_and_filter_blocks( rocksdb_block_based_table_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API void +rocksdb_block_based_options_set_skip_table_builder_flush( + rocksdb_block_based_table_options_t* options, unsigned char); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_block_based_table_factory( rocksdb_options_t* opt, rocksdb_block_based_table_options_t* table_options); diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h index 9e20a15aa00c61b9067e4ab8cf873374f053b087..bbc2de579c2760483fe45e7f1b96553aecb67afd 100644 --- a/include/rocksdb/env.h +++ b/include/rocksdb/env.h @@ -94,6 +94,9 @@ struct EnvOptions { // See DBOPtions doc size_t random_access_max_buffer_size; + // See DBOptions doc + size_t writable_file_max_buffer_size = 1024 * 1024; + // If not nullptr, write rate limiting is enabled for flush and compaction RateLimiter* rate_limiter = nullptr; }; diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index acd0016ac5ae9c33248c52014e01a0bcc18b23bc..567c3a8671d5c3dda1024e6b303232deae4a46f8 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -1089,6 +1089,14 @@ struct DBOptions { // Default: 1 Mb size_t random_access_max_buffer_size; + // This is the maximum buffer size that is used by WritableFileWriter. + // On Windows, we need to maintain an aligned buffer for writes. + // We allow the buffer to grow until it's size hits the limit. + // + // Default: 1024 * 1024 (1 MB) + size_t writable_file_max_buffer_size; + + // Use adaptive mutex, which spins in the user space before resorting // to kernel. This could reduce context switch when the mutex is not // heavily contended. However, if the mutex is hot, we could end up diff --git a/include/rocksdb/table.h b/include/rocksdb/table.h index 4f62debd29441ee648805f2051730e8a95d3bdfd..932c77df5559bbb0245df8a41eda9bc43c8cb13d 100644 --- a/include/rocksdb/table.h +++ b/include/rocksdb/table.h @@ -128,6 +128,20 @@ struct BlockBasedTableOptions { // This must generally be true for gets to be efficient. bool whole_key_filtering = true; + // If true, block will not be explictly flushed to disk during building + // a SstTable. Instead, buffer in WritableFileWriter will take + // care of the flushing when it is full. + // + // On Windows, this option helps a lot when unbuffered I/O + // (allow_os_buffer = false) is used, since it avoids small + // unbuffered disk write. + // + // User may also adjust writable_file_max_buffer_size to optimize disk I/O + // size. + // + // Default: false + bool skip_table_builder_flush = false; + // We currently have three versions: // 0 -- This version is currently written out by all RocksDB's versions by // default. Can be read by really old RocksDB's. 
diff --git a/table/block_based_table_builder.cc b/table/block_based_table_builder.cc
index 402528a52df122f90584e82d6d4a83acc163e85d..319235fbe625ec09a8f7e800635b68484eef8c47 100644
--- a/table/block_based_table_builder.cc
+++ b/table/block_based_table_builder.cc
@@ -592,7 +592,7 @@ void BlockBasedTableBuilder::Flush() {
   if (!ok()) return;
   if (r->data_block.empty()) return;
   WriteBlock(&r->data_block, &r->pending_handle);
-  if (ok()) {
+  if (ok() && !r->table_options.skip_table_builder_flush) {
     r->status = r->file->Flush();
   }
   if (r->filter_block != nullptr) {
diff --git a/table/block_based_table_factory.cc b/table/block_based_table_factory.cc
index 1631652ddafedb4fe797f4f51709f6dce7fd9ed9..9805a28ec3acb41bcf678d90e53f0faad2302bb8 100644
--- a/table/block_based_table_factory.cc
+++ b/table/block_based_table_factory.cc
@@ -152,6 +152,10 @@ std::string BlockBasedTableFactory::GetPrintableTableOptions() const {
   ret.append(buffer);
   snprintf(buffer, kBufferSize, "  whole_key_filtering: %d\n",
            table_options_.whole_key_filtering);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, "  skip_table_builder_flush: %d\n",
+           table_options_.skip_table_builder_flush);
+  ret.append(buffer);
   snprintf(buffer, kBufferSize, "  format_version: %d\n",
            table_options_.format_version);
   ret.append(buffer);
diff --git a/util/env.cc b/util/env.cc
index f6cc40893409b5cf0237805cb4f08370f2a85294..5042251a8b5e82c86b7bb462cddfbac8e4a2f0cb 100644
--- a/util/env.cc
+++ b/util/env.cc
@@ -296,6 +296,8 @@ void AssignEnvOptions(EnvOptions* env_options, const DBOptions& options) {
   env_options->random_access_max_buffer_size =
       options.random_access_max_buffer_size;
   env_options->rate_limiter = options.rate_limiter.get();
+  env_options->writable_file_max_buffer_size =
+      options.writable_file_max_buffer_size;
   env_options->allow_fallocate = options.allow_fallocate;
 }

diff --git a/util/file_reader_writer.cc b/util/file_reader_writer.cc
index 050473bd2c25fd79c3e26d13d6cb5e8400e7bc5d..f5c17889667ff65958f40189d735396caf12f626 100644
--- a/util/file_reader_writer.cc
+++ b/util/file_reader_writer.cc
@@ -21,10 +21,6 @@

 namespace rocksdb {

-namespace {
-  const size_t c_OneMb = (1 << 20);
-}
-
 Status SequentialFileReader::Read(size_t n, Slice* result, char* scratch) {
   Status s = file_->Read(n, result, scratch);
   IOSTATS_ADD(bytes_read, result->size());
@@ -76,9 +72,9 @@ Status WritableFileWriter::Append(const Slice& data) {
       }
     }

-    if (buf_.Capacity() < c_OneMb) {
+    if (buf_.Capacity() < max_buffer_size_) {
       size_t desiredCapacity = buf_.Capacity() * 2;
-      desiredCapacity = std::min(desiredCapacity, c_OneMb);
+      desiredCapacity = std::min(desiredCapacity, max_buffer_size_);
       buf_.AllocateNewBuffer(desiredCapacity);
     }
     assert(buf_.CurrentSize() == 0);
@@ -102,9 +98,9 @@ Status WritableFileWriter::Append(const Slice& data) {
       // We double the buffer here because
       // Flush calls do not keep up with the incoming bytes
       // This is the only place when buffer is changed with unbuffered I/O
-      if (buf_.Capacity() < c_OneMb) {
+      if (buf_.Capacity() < max_buffer_size_) {
         size_t desiredCapacity = buf_.Capacity() * 2;
-        desiredCapacity = std::min(desiredCapacity, c_OneMb);
+        desiredCapacity = std::min(desiredCapacity, max_buffer_size_);
         buf_.AllocateNewBuffer(desiredCapacity);
       }
     }
@@ -156,7 +152,6 @@ Status WritableFileWriter::Close() {
   return s;
 }

-
 // write out the cached data to the OS cache
 Status WritableFileWriter::Flush() {
   Status s;
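The growth policy itself is unchanged: the buffer doubles each time Append outruns it and is clamped to the cap, which is now the configurable max_buffer_size_ rather than the hard-coded c_OneMb. As a standalone restatement of that logic (illustrative, not code from the patch):

    #include <algorithm>
    #include <cstddef>

    // Doubles the current capacity, never exceeding the configured cap.
    // Starting from 64 KB with a 1 MB cap, the sequence is
    // 128 KB, 256 KB, 512 KB, 1 MB, after which it stops growing.
    size_t NextCapacity(size_t current, size_t max_buffer_size) {
      return std::min(current * 2, max_buffer_size);
    }
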
diff --git a/util/file_reader_writer.h b/util/file_reader_writer.h
index 9a076af5668f27417083ce726d1cdba5096edfc8..720979099fbb93b887d7b5ef4815431cde1f18c9 100644
--- a/util/file_reader_writer.h
+++ b/util/file_reader_writer.h
@@ -93,6 +93,7 @@ class WritableFileWriter {
  private:
   std::unique_ptr<WritableFile> writable_file_;
   AlignedBuffer buf_;
+  size_t max_buffer_size_;
   // Actually written data size can be used for truncate
   // not counting padding data
   uint64_t filesize_;
@@ -113,6 +114,7 @@ class WritableFileWriter {
                      const EnvOptions& options)
       : writable_file_(std::move(file)),
         buf_(),
+        max_buffer_size_(options.writable_file_max_buffer_size),
         filesize_(0),
         next_write_offset_(0),
         pending_sync_(false),
diff --git a/util/options.cc b/util/options.cc
index 126aa212109f4812d546e67e7d7e9a79a7337739..027504ac642a6ca50657ad041b27e2475aee6cec 100644
--- a/util/options.cc
+++ b/util/options.cc
@@ -251,6 +251,7 @@ DBOptions::DBOptions()
       new_table_reader_for_compaction_inputs(false),
       compaction_readahead_size(0),
       random_access_max_buffer_size(1024 * 1024),
+      writable_file_max_buffer_size(1024 * 1024),
       use_adaptive_mutex(false),
       bytes_per_sync(0),
       wal_bytes_per_sync(0),
@@ -313,6 +314,7 @@ DBOptions::DBOptions(const Options& options)
           options.new_table_reader_for_compaction_inputs),
       compaction_readahead_size(options.compaction_readahead_size),
       random_access_max_buffer_size(options.random_access_max_buffer_size),
+      writable_file_max_buffer_size(options.writable_file_max_buffer_size),
       use_adaptive_mutex(options.use_adaptive_mutex),
       bytes_per_sync(options.bytes_per_sync),
       wal_bytes_per_sync(options.wal_bytes_per_sync),
@@ -412,6 +414,10 @@ void DBOptions::Dump(Logger* log) const {
          "               Options.random_access_max_buffer_size: %" ROCKSDB_PRIszt
          "d",
          random_access_max_buffer_size);
+  Header(log,
+         "               Options.writable_file_max_buffer_size: %" ROCKSDB_PRIszt
+         "d",
+         writable_file_max_buffer_size);
   Header(log, "                        Options.use_adaptive_mutex: %d",
          use_adaptive_mutex);
   Header(log, "                            Options.rate_limiter: %p",
diff --git a/util/options_helper.cc b/util/options_helper.cc
index f1f04c481a8d859f328a96d1d683f3350ffc9455..fa8dfb49fb0eb28517ab5eb87300c560d7f7a337 100644
--- a/util/options_helper.cc
+++ b/util/options_helper.cc
@@ -891,7 +891,7 @@ bool ParseColumnFamilyOption(const std::string& name,
           reinterpret_cast<char*>(new_options) + opt_info.offset,
           opt_info.type, value);
     }
-  } catch (std::exception& e) {
+  } catch (const std::exception&) {
     return false;
   }
   return true;
@@ -1058,7 +1058,7 @@ bool ParseDBOption(const std::string& name, const std::string& org_value,
                        opt_info.type, value);
       }
     }
-  } catch (const std::exception& e) {
+  } catch (const std::exception&) {
     return false;
   }
   return true;
diff --git a/util/options_helper.h b/util/options_helper.h
index 5c7277f467780b9d1a3fd87076cd5dbbc1ace4d1..4dead5507ccf95ea7aaf2ee022c956ffc997d49f 100644
--- a/util/options_helper.h
+++ b/util/options_helper.h
@@ -184,6 +184,9 @@ static std::unordered_map<std::string, OptionTypeInfo> db_options_type_info = {
     {"random_access_max_buffer_size",
      {offsetof(struct DBOptions, random_access_max_buffer_size),
       OptionType::kSizeT, OptionVerificationType::kNormal}},
+    {"writable_file_max_buffer_size",
+     {offsetof(struct DBOptions, writable_file_max_buffer_size),
+      OptionType::kSizeT, OptionVerificationType::kNormal}},
     {"use_adaptive_mutex",
      {offsetof(struct DBOptions, use_adaptive_mutex), OptionType::kBoolean,
       OptionVerificationType::kNormal}},
@@ -460,6 +463,9 @@ static std::unordered_map<std::string, OptionTypeInfo> block_based_table_type_i
     {"whole_key_filtering",
      {offsetof(struct BlockBasedTableOptions, whole_key_filtering),
       OptionType::kBoolean, OptionVerificationType::kNormal}},
+    {"skip_table_builder_flush",
+     {offsetof(struct BlockBasedTableOptions, skip_table_builder_flush),
+      OptionType::kBoolean, OptionVerificationType::kNormal}},
     {"format_version",
      {offsetof(struct BlockBasedTableOptions, format_version),
       OptionType::kUInt32T, OptionVerificationType::kNormal}},
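Because both options are registered in the type-info maps above, they can be parsed from option strings. A sketch, assuming the GetDBOptionsFromString and GetBlockBasedTableOptionsFromString helpers (declared in rocksdb/convenience.h, or rocksdb/utilities/convenience.h in older trees):

    #include <cassert>
    #include <string>

    #include "rocksdb/convenience.h"
    #include "rocksdb/options.h"
    #include "rocksdb/table.h"

    int main() {
      rocksdb::DBOptions base_db_opt, new_db_opt;
      rocksdb::Status s = rocksdb::GetDBOptionsFromString(
          base_db_opt, "writable_file_max_buffer_size=2097152", &new_db_opt);
      assert(s.ok() && new_db_opt.writable_file_max_buffer_size == 2097152);

      rocksdb::BlockBasedTableOptions base_tbl_opt, new_tbl_opt;
      s = rocksdb::GetBlockBasedTableOptionsFromString(
          base_tbl_opt, "skip_table_builder_flush=1", &new_tbl_opt);
      assert(s.ok() && new_tbl_opt.skip_table_builder_flush);
      return 0;
    }
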
"checksum=kxxHash;hash_index_allow_collision=1;no_block_cache=1;" "block_cache=1M;block_cache_compressed=1k;block_size=1024;" "block_size_deviation=8;block_restart_interval=4;" - "filter_policy=bloomfilter:4:true;whole_key_filtering=1", + "filter_policy=bloomfilter:4:true;whole_key_filtering=1;" + "skip_table_builder_flush=1", &new_opt)); ASSERT_TRUE(new_opt.cache_index_and_filter_blocks); ASSERT_EQ(new_opt.index_type, BlockBasedTableOptions::kHashSearch); @@ -636,6 +639,7 @@ TEST_F(OptionsTest, GetBlockBasedTableOptionsFromString) { ASSERT_EQ(new_opt.block_size_deviation, 8); ASSERT_EQ(new_opt.block_restart_interval, 4); ASSERT_TRUE(new_opt.filter_policy != nullptr); + ASSERT_TRUE(new_opt.skip_table_builder_flush); // unknown option ASSERT_NOK(GetBlockBasedTableOptionsFromString(table_opt,