From c75ef03e589a1ca8e20ed042229b1a033a3ef43c Mon Sep 17 00:00:00 2001 From: "Peter (Stig) Edwards" Date: Thu, 27 May 2021 10:24:06 -0700 Subject: [PATCH] Do not truncate WAL if in read_only mode (#8313) Summary: I noticed ```openat``` system call with ```O_WRONLY``` flag and ```sync_file_range``` and ```truncate``` on WAL file when using ```rocksdb::DB::OpenForReadOnly``` by way of ```db_bench --readonly=true --benchmarks=readseq --use_existing_db=1 --num=1 ...``` Noticed in ```strace``` after seeing the last modification time of the WAL file change after each run (with ```--readonly=true```). I think introduced by https://github.com/facebook/rocksdb/commit/7d7f14480e135a4939ed6903f46b3f7056aa837a from https://github.com/facebook/rocksdb/pull/8122 I added a test to catch the WAL file being truncated and the modification time on it changing. I am not sure if a mock filesystem with mock clock could be used to avoid having to sleep 1.1s. The test could also check the set of files is the same and that the sizes are also unchanged. Before: ``` [ RUN ] DBBasicTest.ReadOnlyReopenMtimeUnchanged db/db_basic_test.cc:182: Failure Expected equality of these values: file_mtime_after_readonly_reopen Which is: 1621611136 file_mtime_before_readonly_reopen Which is: 1621611135 file is: 000010.log [ FAILED ] DBBasicTest.ReadOnlyReopenMtimeUnchanged (1108 ms) ``` After: ``` [ RUN ] DBBasicTest.ReadOnlyReopenMtimeUnchanged [ OK ] DBBasicTest.ReadOnlyReopenMtimeUnchanged (1108 ms) ``` Pull Request resolved: https://github.com/facebook/rocksdb/pull/8313 Reviewed By: pdillinger Differential Revision: D28656925 Pulled By: jay-zhuang fbshipit-source-id: ea9e215cb53e7c830e76bc5fc75c45e21f12a1d6 --- db/db_impl/db_impl_open.cc | 3 ++- db/db_wal_test.cc | 55 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+), 1 deletion(-) diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc index a57675c4d..9ed014f65 100644 --- a/db/db_impl/db_impl_open.cc +++ b/db/db_impl/db_impl_open.cc @@ -1259,7 +1259,8 @@ Status DBImpl::RecoverLogFiles(const std::vector& wal_numbers, // If there's no data in the WAL, or we flushed all the data, still // truncate the log file. If the process goes into a crash loop before // the file is deleted, the preallocated space will never get freed. - GetLogSizeAndMaybeTruncate(wal_numbers.back(), true, nullptr) + const bool truncate = !read_only; + GetLogSizeAndMaybeTruncate(wal_numbers.back(), truncate, nullptr) .PermitUncheckedError(); } } diff --git a/db/db_wal_test.cc b/db/db_wal_test.cc index 8aaa83cd1..2a6f65856 100644 --- a/db/db_wal_test.cc +++ b/db/db_wal_test.cc @@ -2001,6 +2001,61 @@ TEST_F(DBWALTest, TruncateLastLogAfterRecoverWALEmpty) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); } + +TEST_F(DBWALTest, ReadOnlyRecoveryNoTruncate) { + constexpr size_t kKB = 1024; + Options options = CurrentOptions(); + options.env = env_; + options.avoid_flush_during_recovery = true; + if (mem_env_) { + ROCKSDB_GTEST_SKIP("Test requires non-mem environment"); + return; + } + if (!IsFallocateSupported()) { + return; + } + + // create DB and close with file truncate disabled + std::atomic_bool enable_truncate{false}; + + SyncPoint::GetInstance()->SetCallBack( + "PosixWritableFile::Close", [&](void* arg) { + if (!enable_truncate) { + *(reinterpret_cast(arg)) = 0; + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + + DestroyAndReopen(options); + size_t preallocated_size = + dbfull()->TEST_GetWalPreallocateBlockSize(options.write_buffer_size); + ASSERT_OK(Put("foo", "v1")); + VectorLogPtr log_files_before; + ASSERT_OK(dbfull()->GetSortedWalFiles(log_files_before)); + ASSERT_EQ(1, log_files_before.size()); + auto& file_before = log_files_before[0]; + ASSERT_LT(file_before->SizeFileBytes(), 1 * kKB); + // The log file has preallocated space. + auto db_size = GetAllocatedFileSize(dbname_ + file_before->PathName()); + ASSERT_GE(db_size, preallocated_size); + Close(); + + // enable truncate and open DB as readonly, the file should not be truncated + // and DB size is not changed. + enable_truncate = true; + ASSERT_OK(ReadOnlyReopen(options)); + VectorLogPtr log_files_after; + ASSERT_OK(dbfull()->GetSortedWalFiles(log_files_after)); + ASSERT_EQ(1, log_files_after.size()); + ASSERT_LT(log_files_after[0]->SizeFileBytes(), 1 * kKB); + ASSERT_EQ(log_files_after[0]->PathName(), file_before->PathName()); + // The preallocated space should NOT be truncated. + // the DB size is almost the same. + ASSERT_NEAR(GetAllocatedFileSize(dbname_ + file_before->PathName()), db_size, + db_size / 100); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} #endif // ROCKSDB_FALLOCATE_PRESENT #endif // ROCKSDB_PLATFORM_POSIX -- GitLab