From 499c9448d07ab10abaa75d6e51d30fe434c311c0 Mon Sep 17 00:00:00 2001
From: Peter Dillinger
Date: Thu, 3 Sep 2020 20:11:45 -0700
Subject: [PATCH] Fix, enable, and enhance backup/restore in db_stress (#7348)

Summary:
Although added to db_stress, testing of backup/restore was never integrated
into the crash test, originally out of concern about performance. I've enabled
it now, and to address the performance concern, testing backup/restore is
always skipped once the DB exceeds a certain size threshold, default 100MB.
This should provide sufficient opportunity for testing BackupEngine without
bogging down everything else with heavier and heavier operations.

Also fixed backup/restore in db_stress by making sure PurgeOldBackups can
remove manifest files, which are normally kept around for db_stress.

Added more coverage of backup options, and up to three backups being saved in
one backup directory (in some cases).
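
For reference, a minimal standalone sketch (not part of the patch) of the
backup -> verify -> restore -> purge sequence that the updated
TestBackupRestore exercises, using the same BackupEngine APIs. The /tmp paths
and the already-open db pointer are illustrative assumptions, not code from
this change:

  // Sketch only: scratch paths and the caller-provided DB* are assumptions.
  #include <vector>

  #include "rocksdb/db.h"
  #include "rocksdb/env.h"
  #include "rocksdb/utilities/backupable_db.h"

  using namespace ROCKSDB_NAMESPACE;

  Status BackupRestoreRoundTrip(DB* db) {
    BackupableDBOptions backup_opts("/tmp/rocksdb_backup_example");
    backup_opts.share_table_files = true;
    backup_opts.share_files_with_checksum = true;

    BackupEngine* backup_engine = nullptr;
    Status s = BackupEngine::Open(Env::Default(), backup_opts, &backup_engine);
    if (s.ok()) {
      s = backup_engine->CreateNewBackup(db);
    }
    std::vector<BackupInfo> backup_info;
    if (s.ok()) {
      backup_engine->GetBackupInfo(&backup_info);
      if (backup_info.empty()) {
        s = Status::NotFound("no backups found");
      }
    }
    if (s.ok()) {
      // Optional integrity check of the first listed backup
      s = backup_engine->VerifyBackup(backup_info.front().backup_id,
                                      /*verify_with_checksum=*/true);
    }
    if (s.ok()) {
      // Restore one of the listed backups into a fresh directory
      s = backup_engine->RestoreDBFromBackup(
          RestoreOptions(), backup_info.front().backup_id,
          "/tmp/rocksdb_restore_example" /* db_dir */,
          "/tmp/rocksdb_restore_example" /* wal_dir */);
    }
    if (s.ok()) {
      // Keep at most one backup, similar to what thread 0 may do in the test
      s = backup_engine->PurgeOldBackups(1 /* num_backups_to_keep */);
    }
    delete backup_engine;
    return s;
  }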

Pull Request resolved: https://github.com/facebook/rocksdb/pull/7348

Test Plan: ran 'make blackbox_crash_test' for a while, with heightened
probability of taking backups (1/10k). Also confirmed with some debug output
that the code is being covered, that TestBackupRestore only takes a few
seconds to complete when triggered, and that even at 1/10k and a ~50MB
database there is at most roughly one thread testing backups at any time.

Reviewed By: ajkr

Differential Revision: D23510835

Pulled By: pdillinger

fbshipit-source-id: b6b8735591808141f81f10773ac31634cf03b6c0
---
 db_stress_tool/db_stress_common.h      |  1 +
 db_stress_tool/db_stress_env_wrapper.h |  6 ++-
 db_stress_tool/db_stress_gflags.cc     |  4 ++
 db_stress_tool/db_stress_test_base.cc  | 70 +++++++++++++++++++++++---
 tools/db_crashtest.py                  |  3 ++
 5 files changed, 75 insertions(+), 9 deletions(-)

diff --git a/db_stress_tool/db_stress_common.h b/db_stress_tool/db_stress_common.h
index 5c2ff989f..4224a0193 100644
--- a/db_stress_tool/db_stress_common.h
+++ b/db_stress_tool/db_stress_common.h
@@ -174,6 +174,7 @@ DECLARE_bool(use_txn);
 DECLARE_uint64(txn_write_policy);
 DECLARE_bool(unordered_write);
 DECLARE_int32(backup_one_in);
+DECLARE_uint64(backup_max_size);
 DECLARE_int32(checkpoint_one_in);
 DECLARE_int32(ingest_external_file_one_in);
 DECLARE_int32(ingest_external_file_width);
diff --git a/db_stress_tool/db_stress_env_wrapper.h b/db_stress_tool/db_stress_env_wrapper.h
index 411150697..484071f10 100644
--- a/db_stress_tool/db_stress_env_wrapper.h
+++ b/db_stress_tool/db_stress_env_wrapper.h
@@ -20,10 +20,12 @@ class DbStressEnvWrapper : public EnvWrapper {
     // We determine whether it is a manifest file by searching a strong,
     // so that there will be false positive if the directory path contains the
     // keyword but it is unlikely.
-    // Checkpoint directory needs to be exempted.
+    // Checkpoint, backup, and restore directories need to be exempted.
     if (!if_preserve_all_manifests ||
         f.find("MANIFEST-") == std::string::npos ||
-        f.find("checkpoint") != std::string::npos) {
+        f.find("checkpoint") != std::string::npos ||
+        f.find(".backup") != std::string::npos ||
+        f.find(".restore") != std::string::npos) {
       return target()->DeleteFile(f);
     }
     return Status::OK();
diff --git a/db_stress_tool/db_stress_gflags.cc b/db_stress_tool/db_stress_gflags.cc
index 96e05a155..919a256c2 100644
--- a/db_stress_tool/db_stress_gflags.cc
+++ b/db_stress_tool/db_stress_gflags.cc
@@ -479,6 +479,10 @@ DEFINE_int32(backup_one_in, 0,
              "every N operations on average. 0 indicates CreateNewBackup() "
              "is disabled.");
 
+DEFINE_uint64(backup_max_size, 100 * 1024 * 1024,
+              "If non-zero, skip checking backup/restore when DB size in "
+              "bytes exceeds this setting.");
+
 DEFINE_int32(checkpoint_one_in, 0,
              "If non-zero, then CreateCheckpoint() will be called once for "
              "every N operations on average. 0 indicates CreateCheckpoint() "
diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc
index b0ab78adb..7dcce08b2 100644
--- a/db_stress_tool/db_stress_test_base.cc
+++ b/db_stress_tool/db_stress_test_base.cc
@@ -667,10 +667,23 @@ void StressTest::OperateDb(ThreadState* thread) {
     }
 
     if (thread->rand.OneInOpt(FLAGS_backup_one_in)) {
-      Status s = TestBackupRestore(thread, rand_column_families, rand_keys);
-      if (!s.ok()) {
-        VerificationAbort(shared, "Backup/restore gave inconsistent state",
-                          s);
+      // Beyond a certain DB size threshold, this test becomes heavier than
+      // it's worth.
+      uint64_t total_size = 0;
+      if (FLAGS_backup_max_size > 0) {
+        std::vector<FileAttributes> files;
+        db_stress_env->GetChildrenFileAttributes(FLAGS_db, &files);
+        for (auto& file : files) {
+          total_size += file.size_bytes;
+        }
+      }
+
+      if (total_size <= FLAGS_backup_max_size) {
+        Status s = TestBackupRestore(thread, rand_column_families, rand_keys);
+        if (!s.ok()) {
+          VerificationAbort(shared, "Backup/restore gave inconsistent state",
+                            s);
+        }
       }
     }
 
@@ -1211,6 +1224,30 @@ Status StressTest::TestBackupRestore(
   std::string backup_dir = FLAGS_db + "/.backup" + ToString(thread->tid);
   std::string restore_dir = FLAGS_db + "/.restore" + ToString(thread->tid);
   BackupableDBOptions backup_opts(backup_dir);
+  // For debugging, get info_log from live options
+  backup_opts.info_log = db_->GetDBOptions().info_log.get();
+  assert(backup_opts.info_log);
+  if (thread->rand.OneIn(2)) {
+    backup_opts.file_checksum_gen_factory = options_.file_checksum_gen_factory;
+  }
+  if (thread->rand.OneIn(10)) {
+    backup_opts.share_table_files = false;
+  } else {
+    backup_opts.share_table_files = true;
+    if (thread->rand.OneIn(5)) {
+      backup_opts.share_files_with_checksum = false;
+    } else {
+      backup_opts.share_files_with_checksum = true;
+      if (thread->rand.OneIn(2)) {
+        // old
+        backup_opts.share_files_with_checksum_naming = kChecksumAndFileSize;
+      } else {
+        // new
+        backup_opts.share_files_with_checksum_naming =
+            kOptionalChecksumAndDbSessionId;
+      }
+    }
+  }
   BackupEngine* backup_engine = nullptr;
   Status s = BackupEngine::Open(db_stress_env, backup_opts, &backup_engine);
   if (s.ok()) {
@@ -1221,12 +1258,31 @@ Status StressTest::TestBackupRestore(
     backup_engine = nullptr;
     s = BackupEngine::Open(db_stress_env, backup_opts, &backup_engine);
   }
+  std::vector<BackupInfo> backup_info;
   if (s.ok()) {
-    s = backup_engine->RestoreDBFromLatestBackup(restore_dir /* db_dir */,
-                                                 restore_dir /* wal_dir */);
+    backup_engine->GetBackupInfo(&backup_info);
+    if (backup_info.empty()) {
+      s = Status::NotFound("no backups found");
+    }
+  }
+  if (s.ok() && thread->rand.OneIn(2)) {
+    s = backup_engine->VerifyBackup(
+        backup_info.front().backup_id,
+        thread->rand.OneIn(2) /* verify_with_checksum */);
+  }
+  if (s.ok()) {
+    int count = static_cast<int>(backup_info.size());
+    s = backup_engine->RestoreDBFromBackup(
+        RestoreOptions(), backup_info[thread->rand.Uniform(count)].backup_id,
+        restore_dir /* db_dir */, restore_dir /* wal_dir */);
   }
   if (s.ok()) {
-    s = backup_engine->PurgeOldBackups(0 /* num_backups_to_keep */);
+    uint32_t to_keep = 0;
+    if (thread->tid == 0) {
+      // allow one thread to keep up to 2 backups
+      to_keep = thread->rand.Uniform(3);
+    }
+    s = backup_engine->PurgeOldBackups(to_keep);
   }
   DB* restored_db = nullptr;
   std::vector<ColumnFamilyHandle*> restored_cf_handles;
diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index 1e4b57480..0f7239b67 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -29,6 +29,9 @@ expected_values_file = tempfile.NamedTemporaryFile()
 
 default_params = {
     "acquire_snapshot_one_in": 10000,
+    "backup_max_size": 100 * 1024 * 1024,
+    # Consider larger number when backups considered more stable
+    "backup_one_in": 100000,
     "block_size": 16384,
     "bloom_bits": lambda: random.choice([random.randint(0,19),
                                          random.lognormvariate(2.3, 1.3)]),
-- 