From d00e5de7fc85a3696642d14ba57568f4c490b810 Mon Sep 17 00:00:00 2001 From: Wez Furlong Date: Wed, 29 Aug 2018 20:24:17 -0700 Subject: [PATCH] use atomic O_CLOEXEC when available (#4328) Summary: In our application we spawn helper child processes concurrently with opening rocksdb. In one situation I observed that the child process had inherited the rocksdb lock file as well as directory handles to the rocksdb storage location. The code in env_posix takes care to set CLOEXEC but doesn't use `O_CLOEXEC` at the time that the files are opened which means that there is a window of opportunity to leak the descriptors across a fork/exec boundary. This diff introduces a helper that can conditionally set the `O_CLOEXEC` bit for the open call using the same logic as that in the existing helper for setting that flag post-open. I've preserved the post-open logic for systems that don't have `O_CLOEXEC`. I've introduced setting `O_CLOEXEC` for what appears to be a number of temporary or transient files and directory handles; I suspect that none of the files opened by Rocks are intended to be inherited by a forked child process. In one case, `fopen` is used to open a file. I've added the use of the glibc-specific `e` mode to turn on `O_CLOEXEC` for this case. While this doesn't cover all posix systems, it is an improvement for our common deployment system. Pull Request resolved: https://github.com/facebook/rocksdb/pull/4328 Reviewed By: ajkr Differential Revision: D9553046 Pulled By: wez fbshipit-source-id: acdb89f7a85ca649b22fe3c3bd76f82142bec2bf --- env/env_posix.cc | 45 ++++++++++++++++++++++++++++++++++++++------- 1 file changed, 38 insertions(+), 7 deletions(-) diff --git a/env/env_posix.cc b/env/env_posix.cc index ae8088f65..db28a2f1a 100644 --- a/env/env_posix.cc +++ b/env/env_posix.cc @@ -102,6 +102,18 @@ class PosixFileLock : public FileLock { std::string filename; }; +int cloexec_flags(int flags, const EnvOptions* options) { + // If the system supports opening the file with cloexec enabled, + // do so, as this avoids a race condition if a db is opened around + // the same time that a child process is forked +#ifdef O_CLOEXEC + if (options == nullptr || options->set_fd_cloexec) { + flags |= O_CLOEXEC; + } +#endif + return flags; +} + class PosixEnv : public Env { public: PosixEnv(); @@ -133,7 +145,7 @@ class PosixEnv : public Env { const EnvOptions& options) override { result->reset(); int fd = -1; - int flags = O_RDONLY; + int flags = cloexec_flags(O_RDONLY, &options); FILE* file = nullptr; if (options.use_direct_reads && !options.use_mmap_reads) { @@ -184,7 +196,8 @@ class PosixEnv : public Env { result->reset(); Status s; int fd; - int flags = O_RDONLY; + int flags = cloexec_flags(O_RDONLY, &options); + if (options.use_direct_reads && !options.use_mmap_reads) { #ifdef ROCKSDB_LITE return Status::IOError(fname, "Direct I/O not supported in RocksDB lite"); @@ -266,6 +279,8 @@ class PosixEnv : public Env { flags |= O_WRONLY; } + flags = cloexec_flags(flags, &options); + do { IOSTATS_TIMER_GUARD(open_nanos); fd = open(fname.c_str(), flags, GetDBFileMode(allow_non_owner_access_)); @@ -354,6 +369,8 @@ class PosixEnv : public Env { flags |= O_WRONLY; } + flags = cloexec_flags(flags, &options); + do { IOSTATS_TIMER_GUARD(open_nanos); fd = open(old_fname.c_str(), flags, @@ -415,9 +432,12 @@ class PosixEnv : public Env { unique_ptr* result, const EnvOptions& options) override { int fd = -1; + int flags = cloexec_flags(O_RDWR, &options); + while (fd < 0) { IOSTATS_TIMER_GUARD(open_nanos); - fd = open(fname.c_str(), O_RDWR, GetDBFileMode(allow_non_owner_access_)); + + fd = open(fname.c_str(), flags, GetDBFileMode(allow_non_owner_access_)); if (fd < 0) { // Error while opening the file if (errno == EINTR) { @@ -437,9 +457,11 @@ class PosixEnv : public Env { unique_ptr* result) override { int fd = -1; Status status; + int flags = cloexec_flags(O_RDWR, nullptr); + while (fd < 0) { IOSTATS_TIMER_GUARD(open_nanos); - fd = open(fname.c_str(), O_RDWR, 0644); + fd = open(fname.c_str(), flags, 0644); if (fd < 0) { // Error while opening the file if (errno == EINTR) { @@ -477,9 +499,10 @@ class PosixEnv : public Env { unique_ptr* result) override { result->reset(); int fd; + int flags = cloexec_flags(0, nullptr); { IOSTATS_TIMER_GUARD(open_nanos); - fd = open(name.c_str(), 0); + fd = open(name.c_str(), flags); } if (fd < 0) { return IOError("While open directory", name, errno); @@ -663,9 +686,11 @@ class PosixEnv : public Env { } int fd; + int flags = cloexec_flags(O_RDWR | O_CREAT, nullptr); + { IOSTATS_TIMER_GUARD(open_nanos); - fd = open(fname.c_str(), O_RDWR | O_CREAT, 0644); + fd = open(fname.c_str(), flags, 0644); } if (fd < 0) { result = IOError("while open a file for lock", fname, errno); @@ -756,7 +781,13 @@ class PosixEnv : public Env { FILE* f; { IOSTATS_TIMER_GUARD(open_nanos); - f = fopen(fname.c_str(), "w"); + f = fopen(fname.c_str(), "w" +#ifdef __GLIBC_PREREQ +#if __GLIBC_PREREQ(2, 7) + "e" // glibc extension to enable O_CLOEXEC +#endif +#endif + ); } if (f == nullptr) { result->reset(); -- GitLab