From 4b97812da8a9fd7f5d84b6b4b21ee701a5e9a873 Mon Sep 17 00:00:00 2001
From: Maysam Yabandeh <myabandeh@fb.com>
Date: Sat, 14 Dec 2019 15:17:05 -0800
Subject: [PATCH] Add long-running snapshots to stress tests (#6171)

Summary:
When `long_running_snapshots` is set, the current implementation holds on to 10% of snapshots for 10x longer than `snapshot_hold_ops`, and 1% of snapshots for 100x longer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6171

Test Plan:
```
make -j32 crash_test
```

Differential Revision: D19038399

Pulled By: maysamyabandeh

fbshipit-source-id: 75da2dbb5c47a0b3f37d299b8719e392b73b42c0
---
 db_stress_tool/db_stress_common.h     |  1 +
 db_stress_tool/db_stress_gflags.cc    |  3 +++
 db_stress_tool/db_stress_test_base.cc | 18 +++++++++++++++---
 tools/db_crashtest.py                 |  1 +
 4 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/db_stress_tool/db_stress_common.h b/db_stress_tool/db_stress_common.h
index c02105a18..cb6c36049 100644
--- a/db_stress_tool/db_stress_common.h
+++ b/db_stress_tool/db_stress_common.h
@@ -175,6 +175,7 @@ DECLARE_int32(compact_range_width);
 DECLARE_int32(acquire_snapshot_one_in);
 DECLARE_bool(compare_full_db_state_snapshot);
 DECLARE_uint64(snapshot_hold_ops);
+DECLARE_bool(long_running_snapshots);
 DECLARE_bool(use_multiget);
 DECLARE_int32(readpercent);
 DECLARE_int32(prefixpercent);
diff --git a/db_stress_tool/db_stress_gflags.cc b/db_stress_tool/db_stress_gflags.cc
index a6aff93c6..b56e0fe1b 100644
--- a/db_stress_tool/db_stress_gflags.cc
+++ b/db_stress_tool/db_stress_gflags.cc
@@ -430,6 +430,9 @@ DEFINE_uint64(snapshot_hold_ops, 0,
               "If non-zero, then releases snapshots N operations after they're "
               "acquired.");
 
+DEFINE_bool(long_running_snapshots, false,
+            "If set, hold on to some snapshots for a much longer time.");
+
 DEFINE_bool(use_multiget, false,
             "If set, use the batched MultiGet API for reads");
 
diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc
index 767097ee0..03bb25537 100644
--- a/db_stress_tool/db_stress_test_base.cc
+++ b/db_stress_tool/db_stress_test_base.cc
@@ -682,9 +682,21 @@ void StressTest::OperateDb(ThreadState* thread) {
             snapshot, rand_column_family, column_family->GetName(),
             keystr,   status_at,          value_at,
             key_vec};
-        thread->snapshot_queue.emplace(
-            std::min(FLAGS_ops_per_thread - 1, i + FLAGS_snapshot_hold_ops),
-            snap_state);
+        uint64_t hold_for = FLAGS_snapshot_hold_ops;
+        if (FLAGS_long_running_snapshots) {
+          // Hold 10% of snapshots for 10x more
+          if (thread->rand.OneIn(10)) {
+            assert(hold_for < port::kMaxInt64 / 10);
+            hold_for *= 10;
+            // Hold 1% of snapshots for 100x more
+            if (thread->rand.OneIn(10)) {
+              assert(hold_for < port::kMaxInt64 / 10);
+              hold_for *= 10;
+            }
+          }
+        }
+        uint64_t release_at = std::min(FLAGS_ops_per_thread - 1, i + hold_for);
+        thread->snapshot_queue.emplace(release_at, snap_state);
       }
       while (!thread->snapshot_queue.empty() &&
              i >= thread->snapshot_queue.front().first) {
diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index ed50b5e80..2508b1036 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -61,6 +61,7 @@ default_params = {
     "recycle_log_file_num": lambda: random.randint(0, 1),
     "reopen": 20,
     "snapshot_hold_ops": 100000,
+    "long_running_snapshots": lambda: random.randint(0, 1),
     "subcompactions": lambda: random.randint(1, 4),
     "target_file_size_base": 2097152,
     "target_file_size_multiplier": 2,
-- 
GitLab