diff --git a/db_stress_tool/db_stress_gflags.cc b/db_stress_tool/db_stress_gflags.cc
index 4c3a2b84fbc368a1a6ae9d91d0e0a5183fce7826..776ef357a10c4d15c9a5f2a2d5cd0b404c003e5a 100644
--- a/db_stress_tool/db_stress_gflags.cc
+++ b/db_stress_tool/db_stress_gflags.cc
@@ -1027,8 +1027,8 @@ DEFINE_int32(open_write_fault_one_in, 0,
 DEFINE_int32(open_read_fault_one_in, 0,
              "On non-zero, enables fault injection on file reads "
              "during DB reopen.");
-DEFINE_int32(injest_error_severity, 1,
-             "The severity of the injested IO Error. 1 is soft error (e.g. "
+DEFINE_int32(inject_error_severity, 1,
+             "The severity of the injected IO Error. 1 is soft error (e.g. "
              "retryable error), 2 is fatal error, and the default is "
              "retryable error.");
 DEFINE_int32(prepopulate_block_cache,
diff --git a/db_stress_tool/db_stress_listener.h b/db_stress_tool/db_stress_listener.h
index 97bbdaefa465845c06b6f6ac8af404cd33379c1f..998a6cc74b9d0e0048da37082259d34e49698a6d 100644
--- a/db_stress_tool/db_stress_listener.h
+++ b/db_stress_tool/db_stress_listener.h
@@ -9,6 +9,7 @@
 #include <mutex>
 #include <unordered_set>
+#include "db_stress_tool/db_stress_shared_state.h"
 #include "file/filename.h"
 #include "file/writable_file_writer.h"
 #include "rocksdb/db.h"
@@ -19,9 +20,12 @@
 #include "rocksdb/unique_id.h"
 #include "util/gflags_compat.h"
 #include "util/random.h"
+#include "utilities/fault_injection_fs.h"
 
 DECLARE_int32(compact_files_one_in);
 
+extern std::shared_ptr<FaultInjectionTestFS> fault_fs_guard;
+
 namespace ROCKSDB_NAMESPACE {
 
 // Verify across process executions that all seen IDs are unique
@@ -95,6 +99,17 @@ class DbStressListener : public EventListener {
     RandomSleep();
   }
 
+  void OnSubcompactionBegin(const SubcompactionJobInfo& /* si */) override {
+    if (FLAGS_read_fault_one_in) {
+      // Hardcoded to inject a retryable error, as a non-retryable error would
+      // put the DB in read-only mode and then crash on the next write.
+      fault_fs_guard->SetThreadLocalReadErrorContext(
+          static_cast<uint32_t>(FLAGS_seed), FLAGS_read_fault_one_in,
+          true /* retryable */);
+      fault_fs_guard->EnableErrorInjection();
+    }
+  }
+
   void OnTableFileCreationStarted(
       const TableFileCreationBriefInfo& /*info*/) override {
     ++num_pending_file_creations_;
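The listener hook above is the heart of the change: each subcompaction runs on its own thread and the read-error context is thread-local, so it has to be armed per subcompaction rather than once per compaction. The pattern is easier to see stripped of RocksDB specifics. Below is a minimal, self-contained sketch of thread-local fault injection; all names in it (ErrorContext, MaybeInjectReadError, tls_ctx) are illustrative, not RocksDB's API.

    #include <cstdint>
    #include <iostream>
    #include <memory>
    #include <random>
    #include <thread>

    // Per-thread injection state: a seeded RNG plus the 1/one_in odds and the
    // retryable flag; the same shape as FaultInjectionTestFS's ErrorContext.
    struct ErrorContext {
      std::mt19937 rand;
      int one_in = 0;
      bool retryable = false;
      explicit ErrorContext(uint32_t seed) : rand(seed) {}
    };

    // One context per thread, so subcompaction threads never share RNG state.
    thread_local std::unique_ptr<ErrorContext> tls_ctx;

    void SetThreadLocalReadErrorContext(uint32_t seed, int one_in,
                                        bool retryable) {
      if (!tls_ctx) {
        tls_ctx = std::make_unique<ErrorContext>(seed);
      }
      tls_ctx->one_in = one_in;
      tls_ctx->retryable = retryable;
    }

    // Fails roughly one call in `one_in`, like ctx->rand.OneIn(ctx->one_in).
    bool MaybeInjectReadError() {
      if (!tls_ctx || tls_ctx->one_in <= 0) {
        return false;
      }
      std::uniform_int_distribution<int> dist(0, tls_ctx->one_in - 1);
      return dist(tls_ctx->rand) == 0;
    }

    int main() {
      std::thread worker([] {
        SetThreadLocalReadErrorContext(/*seed=*/42, /*one_in=*/4,
                                       /*retryable=*/true);
        int injected = 0;
        for (int i = 0; i < 1000; ++i) {
          injected += MaybeInjectReadError() ? 1 : 0;
        }
        std::cout << "injected " << injected << " of 1000 reads\n";  // ~250
      });
      worker.join();
      return 0;
    }

In db_stress itself, the same arming happens in OnSubcompactionBegin, and the context is then consulted by FaultInjectionTestFS on every read the subcompaction issues.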
diff --git a/db_stress_tool/db_stress_shared_state.h b/db_stress_tool/db_stress_shared_state.h
index 604e8c6313f23b046daee3b78ecdc8ba39717a54..bad6a77e1fa52f8eaf3572df0932a6504e379431 100644
--- a/db_stress_tool/db_stress_shared_state.h
+++ b/db_stress_tool/db_stress_shared_state.h
@@ -35,7 +35,7 @@ DECLARE_int32(open_metadata_write_fault_one_in);
 DECLARE_int32(open_write_fault_one_in);
 DECLARE_int32(open_read_fault_one_in);
-DECLARE_int32(injest_error_severity);
+DECLARE_int32(inject_error_severity);
 
 namespace ROCKSDB_NAMESPACE {
 class StressTest;
diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc
index aff559a639163a05cd05548fb6ffe155c6c8080d..0195971c050795ceb4d6377efa28b6e34480cc96 100644
--- a/db_stress_tool/db_stress_test_base.cc
+++ b/db_stress_tool/db_stress_test_base.cc
@@ -773,17 +773,18 @@ void StressTest::OperateDb(ThreadState* thread) {
 #ifndef NDEBUG
     if (FLAGS_read_fault_one_in) {
-      fault_fs_guard->SetThreadLocalReadErrorContext(thread->shared->GetSeed(),
-                                                     FLAGS_read_fault_one_in);
+      fault_fs_guard->SetThreadLocalReadErrorContext(
+          thread->shared->GetSeed(), FLAGS_read_fault_one_in,
+          FLAGS_inject_error_severity == 1 /* retryable */);
     }
 #endif  // NDEBUG
 
     if (FLAGS_write_fault_one_in) {
       IOStatus error_msg;
-      if (FLAGS_injest_error_severity <= 1 || FLAGS_injest_error_severity > 2) {
+      if (FLAGS_inject_error_severity <= 1 || FLAGS_inject_error_severity > 2) {
         error_msg = IOStatus::IOError("Retryable IO Error");
         error_msg.SetRetryable(true);
-      } else if (FLAGS_injest_error_severity == 2) {
-        // Ingest the fatal error
+      } else if (FLAGS_inject_error_severity == 2) {
+        // Inject a fatal error
         error_msg = IOStatus::IOError("Fatal IO Error");
         error_msg.SetDataLoss(true);
       }
@@ -2684,14 +2685,14 @@ void StressTest::Open(SharedState* shared, bool reopen) {
     RegisterAdditionalListeners();
 
     if (!FLAGS_use_txn) {
-      // Determine whether we need to ingest file metadata write failures
+      // Determine whether we need to inject file metadata write failures
       // during DB reopen. If it does, enable it.
-      // Only ingest metadata error if it is reopening, as initial open
+      // Only inject metadata error if it is reopening, as initial open
       // failure doesn't need to be handled.
       // TODO cover transaction DB is not covered in this fault test too.
-      bool ingest_meta_error = false;
-      bool ingest_write_error = false;
-      bool ingest_read_error = false;
+      bool inject_meta_error = false;
+      bool inject_write_error = false;
+      bool inject_read_error = false;
       if ((FLAGS_open_metadata_write_fault_one_in ||
            FLAGS_open_write_fault_one_in || FLAGS_open_read_fault_one_in) &&
           fault_fs_guard
@@ -2704,15 +2705,15 @@ void StressTest::Open(SharedState* shared, bool reopen) {
         // solve it, skip WAL from failure injection.
         fault_fs_guard->SetSkipDirectWritableTypes({kWalFile});
       }
-      ingest_meta_error = FLAGS_open_metadata_write_fault_one_in;
-      ingest_write_error = FLAGS_open_write_fault_one_in;
-      ingest_read_error = FLAGS_open_read_fault_one_in;
-      if (ingest_meta_error) {
+      inject_meta_error = FLAGS_open_metadata_write_fault_one_in;
+      inject_write_error = FLAGS_open_write_fault_one_in;
+      inject_read_error = FLAGS_open_read_fault_one_in;
+      if (inject_meta_error) {
         fault_fs_guard->EnableMetadataWriteErrorInjection();
         fault_fs_guard->SetRandomMetadataWriteError(
             FLAGS_open_metadata_write_fault_one_in);
       }
-      if (ingest_write_error) {
+      if (inject_write_error) {
         fault_fs_guard->SetFilesystemDirectWritable(false);
         fault_fs_guard->EnableWriteErrorInjection();
         fault_fs_guard->SetRandomWriteError(
@@ -2720,7 +2721,7 @@ void StressTest::Open(SharedState* shared, bool reopen) {
             IOStatus::IOError("Injected Open Error"),
             /*inject_for_all_file_types=*/true, /*types=*/{});
       }
-      if (ingest_read_error) {
+      if (inject_read_error) {
         fault_fs_guard->SetRandomReadError(FLAGS_open_read_fault_one_in);
       }
     }
@@ -2752,14 +2753,14 @@ void StressTest::Open(SharedState* shared, bool reopen) {
         }
       }
 
-      if (ingest_meta_error || ingest_write_error || ingest_read_error) {
+      if (inject_meta_error || inject_write_error || inject_read_error) {
        fault_fs_guard->SetFilesystemDirectWritable(true);
        fault_fs_guard->DisableMetadataWriteErrorInjection();
        fault_fs_guard->DisableWriteErrorInjection();
        fault_fs_guard->SetSkipDirectWritableTypes({});
        fault_fs_guard->SetRandomReadError(0);
        if (s.ok()) {
-          // Ingested errors might happen in background compactions. We
+          // Injected errors might happen in background compactions. We
          // wait for all compactions to finish to make sure DB is in
          // clean state before executing queries.
          s = db_->GetRootDB()->WaitForCompact(WaitForCompactOptions());
@@ -2776,9 +2777,9 @@ void StressTest::Open(SharedState* shared, bool reopen) {
          // After failure to opening a DB due to IO error, retry should
          // successfully open the DB with correct data if no IO error shows
          // up.
-          ingest_meta_error = false;
-          ingest_write_error = false;
-          ingest_read_error = false;
+          inject_meta_error = false;
+          inject_write_error = false;
+          inject_read_error = false;
          // TODO: Unsynced data loss during DB reopen is not supported yet in
          // stress test.
          // Will need to recreate expected state if we decide
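A note on the OperateDb hunk above: the first branch intentionally catches out-of-range severities as well, so anything other than 2 degrades to a retryable error. Factored out into a standalone helper (the function name is ours; the IOStatus calls are the ones from the diff), the mapping is:

    #include "rocksdb/io_status.h"

    using ROCKSDB_NAMESPACE::IOStatus;

    // Maps --inject_error_severity to the error the stress test injects on
    // writes. Severity 2 means data loss, which the error handler escalates
    // to a fatal error; any other value falls back to a retryable IOError.
    IOStatus MakeInjectedWriteError(int inject_error_severity) {
      if (inject_error_severity == 2) {
        IOStatus error_msg = IOStatus::IOError("Fatal IO Error");
        error_msg.SetDataLoss(true);
        return error_msg;
      }
      IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
      error_msg.SetRetryable(true);
      return error_msg;
    }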
diff --git a/db_stress_tool/no_batched_ops_stress.cc b/db_stress_tool/no_batched_ops_stress.cc
index f822a76631ecb3396d1110af510b8d55f53b8f39..1d2e4a3b5717fdbc4cec9a2284b1db12e28d7836 100644
--- a/db_stress_tool/no_batched_ops_stress.cc
+++ b/db_stress_tool/no_batched_ops_stress.cc
@@ -1312,7 +1312,7 @@ class NonBatchedOpsStressTest : public StressTest {
     pending_expected_value.Commit();
 
     if (!s.ok()) {
-      if (FLAGS_injest_error_severity >= 2) {
+      if (FLAGS_inject_error_severity >= 2) {
         if (!is_db_stopped_ && s.severity() >= Status::Severity::kFatalError) {
           is_db_stopped_ = true;
         } else if (!is_db_stopped_ ||
@@ -1371,7 +1371,7 @@ class NonBatchedOpsStressTest : public StressTest {
     thread->stats.AddDeletes(1);
 
     if (!s.ok()) {
-      if (FLAGS_injest_error_severity >= 2) {
+      if (FLAGS_inject_error_severity >= 2) {
         if (!is_db_stopped_ &&
             s.severity() >= Status::Severity::kFatalError) {
           is_db_stopped_ = true;
@@ -1402,7 +1402,7 @@ class NonBatchedOpsStressTest : public StressTest {
       pending_expected_value.Commit();
       thread->stats.AddSingleDeletes(1);
       if (!s.ok()) {
-        if (FLAGS_injest_error_severity >= 2) {
+        if (FLAGS_inject_error_severity >= 2) {
           if (!is_db_stopped_ &&
               s.severity() >= Status::Severity::kFatalError) {
             is_db_stopped_ = true;
@@ -1464,7 +1464,7 @@ class NonBatchedOpsStressTest : public StressTest {
       s = db_->DeleteRange(write_opts, cfh, key, end_key);
     }
     if (!s.ok()) {
-      if (FLAGS_injest_error_severity >= 2) {
+      if (FLAGS_inject_error_severity >= 2) {
         if (!is_db_stopped_ && s.severity() >= Status::Severity::kFatalError) {
           is_db_stopped_ = true;
         } else if (!is_db_stopped_ ||
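The four hunks above are the same change applied to Put, Delete, SingleDelete, and DeleteRange: when fatal injection is enabled (severity >= 2), the first status at or above kFatalError flips is_db_stopped_, and any other failure is treated as a real bug. Condensed into one sketch (the free function and the abort path are ours; the stress test reports the error through its own machinery):

    #include <cstdlib>

    #include "rocksdb/status.h"

    using ROCKSDB_NAMESPACE::Status;

    // Classifies a failed write under fatal-error injection. `is_db_stopped`
    // records that the DB already entered read-only mode once.
    void HandleFailedWrite(const Status& s, int inject_error_severity,
                           bool* is_db_stopped) {
      if (inject_error_severity >= 2) {
        if (!*is_db_stopped &&
            s.severity() >= Status::Severity::kFatalError) {
          // First fatal error: expected from injection, remember it.
          *is_db_stopped = true;
        } else if (!*is_db_stopped ||
                   s.severity() < Status::Severity::kFatalError) {
          // Either a non-fatal failure, or a failure after the DB should
          // already have stopped: a genuine bug, so fail the test.
          std::abort();
        }
        // Otherwise the DB is already stopped and this fatal error is
        // expected; tolerate it.
      }
    }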
diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index 72ee148658c6210b3d8c81d6439065e3809726c6..8f3219ee610585dde2d32e593022f52293f2cfd4 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -158,9 +158,8 @@ default_params = {
     "sync": lambda: random.choice([1 if t == 0 else 0 for t in range(0, 20)]),
     "bytes_per_sync": lambda: random.choice([0, 262144]),
     "wal_bytes_per_sync": lambda: random.choice([0, 524288]),
-    # Disable compaction_readahead_size because the test is not passing.
-    # "compaction_readahead_size" : lambda : random.choice(
-    #     [0, 0, 1024 * 1024]),
+    "compaction_readahead_size": lambda: random.choice(
+        [0, 0, 1024 * 1024]),
     "db_write_buffer_size": lambda: random.choice(
         [0, 0, 0, 1024 * 1024, 8 * 1024 * 1024, 128 * 1024 * 1024]
     ),
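db_crashtest.py can randomize compaction_readahead_size again; the parameter had been commented out while the test was failing, and the compaction read path it exercises is now covered by the error injection above. For context, the knob being toggled is a plain DBOptions field. A minimal sketch of setting it directly (the DB path is illustrative):

    #include <cassert>

    #include "rocksdb/db.h"
    #include "rocksdb/options.h"

    using namespace ROCKSDB_NAMESPACE;

    int main() {
      Options options;
      options.create_if_missing = true;
      // The crash test now picks 0 or 1 MiB at random; this is the nonzero
      // case, asking compactions to read input files with 1 MiB of readahead.
      options.compaction_readahead_size = 1024 * 1024;

      DB* db = nullptr;
      Status s = DB::Open(options, "/tmp/readahead_demo", &db);
      assert(s.ok());
      delete db;
      return 0;
    }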
- # "compaction_readahead_size" : lambda : random.choice( - # [0, 0, 1024 * 1024]), + "compaction_readahead_size" : lambda : random.choice( + [0, 0, 1024 * 1024]), "db_write_buffer_size": lambda: random.choice( [0, 0, 0, 1024 * 1024, 8 * 1024 * 1024, 128 * 1024 * 1024] ), diff --git a/utilities/fault_injection_fs.cc b/utilities/fault_injection_fs.cc index fa15fc4a5d766fdd2b1ea64d749709c853f4407c..8db8be45f7cf8656da42d0fea05999a38f1b5331 100644 --- a/utilities/fault_injection_fs.cc +++ b/utilities/fault_injection_fs.cc @@ -956,6 +956,7 @@ IOStatus FaultInjectionTestFS::InjectThreadSpecificReadError( return IOStatus::OK(); } + IOStatus ret; if (ctx->rand.OneIn(ctx->one_in)) { if (ctx->count == 0) { ctx->message = ""; @@ -972,7 +973,7 @@ IOStatus FaultInjectionTestFS::InjectThreadSpecificReadError( // Likely non-per read status code for MultiRead ctx->message += "error; "; ret_fault_injected = true; - return IOStatus::IOError(); + ret = IOStatus::IOError(); } else if (Random::GetTLSInstance()->OneIn(8)) { assert(result); // For a small chance, set the failure to status but turn the @@ -1000,10 +1001,13 @@ IOStatus FaultInjectionTestFS::InjectThreadSpecificReadError( } else { ctx->message += "error result multiget single; "; ret_fault_injected = true; - return IOStatus::IOError(); + ret = IOStatus::IOError(); } } - return IOStatus::OK(); + if (ctx->retryable) { + ret.SetRetryable(true); + } + return ret; } bool FaultInjectionTestFS::TryParseFileName(const std::string& file_name, diff --git a/utilities/fault_injection_fs.h b/utilities/fault_injection_fs.h index cab0051bd14408d0c446d23e1a6fe1ca63d1ffe2..a481d86af22aa3863a69e3f5c19dbd7e075315ef 100644 --- a/utilities/fault_injection_fs.h +++ b/utilities/fault_injection_fs.h @@ -402,7 +402,8 @@ class FaultInjectionTestFS : public FileSystemWrapper { // seed is the seed for the random number generator, and one_in determines // the probability of injecting error (i.e an error is injected with // 1/one_in probability) - void SetThreadLocalReadErrorContext(uint32_t seed, int one_in) { + void SetThreadLocalReadErrorContext(uint32_t seed, int one_in, + bool retryable) { struct ErrorContext* ctx = static_cast(thread_local_error_->Get()); if (ctx == nullptr) { @@ -411,6 +412,7 @@ class FaultInjectionTestFS : public FileSystemWrapper { } ctx->one_in = one_in; ctx->count = 0; + ctx->retryable = retryable; } static void DeleteThreadLocalErrorContext(void* p) { @@ -556,12 +558,14 @@ class FaultInjectionTestFS : public FileSystemWrapper { std::string message; int frames; ErrorType type; + bool retryable; explicit ErrorContext(uint32_t seed) : rand(seed), enable_error_injection(false), callstack(nullptr), - frames(0) {} + frames(0), + retryable(false) {} ~ErrorContext() { if (callstack) { free(callstack);