env_posix.cc 39.9 KB
Newer Older
1 2 3 4 5
//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
//  This source code is licensed under the BSD-style license found in the
//  LICENSE file in the root directory of this source tree. An additional grant
//  of patent rights can be found in the PATENTS file in the same directory.
//
J
jorlow@chromium.org 已提交
6 7 8 9 10
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#include <deque>
11
#include <set>
J
jorlow@chromium.org 已提交
12 13 14 15 16 17 18
#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
19
#include <sys/ioctl.h>
J
jorlow@chromium.org 已提交
20 21
#include <sys/mman.h>
#include <sys/stat.h>
K
kailiu 已提交
22
#ifdef OS_LINUX
A
Abhishek Kona 已提交
23
#include <sys/statfs.h>
K
kailiu 已提交
24
#endif
J
jorlow@chromium.org 已提交
25 26 27 28
#include <sys/time.h>
#include <sys/types.h>
#include <time.h>
#include <unistd.h>
29 30
#if defined(OS_LINUX)
#include <linux/fs.h>
31
#include <fcntl.h>
32
#endif
J
jorlow@chromium.org 已提交
33 34 35
#if defined(LEVELDB_PLATFORM_ANDROID)
#include <sys/stat.h>
#endif
36 37
#include "rocksdb/env.h"
#include "rocksdb/slice.h"
J
jorlow@chromium.org 已提交
38
#include "port/port.h"
39
#include "util/coding.h"
J
jorlow@chromium.org 已提交
40
#include "util/logging.h"
41
#include "util/posix_logger.h"
42 43
#include "util/random.h"
#include <signal.h>
J
jorlow@chromium.org 已提交
44

K
kailiu 已提交
45 46 47 48 49 50
// Get nano time for mach systems
#ifdef __MACH__
#include <mach/clock.h>
#include <mach/mach.h>
#endif

A
Abhishek Kona 已提交
51 52 53 54 55 56 57 58 59 60
#if !defined(TMPFS_MAGIC)
#define TMPFS_MAGIC 0x01021994
#endif
#if !defined(XFS_SUPER_MAGIC)
#define XFS_SUPER_MAGIC 0x58465342
#endif
#if !defined(EXT4_SUPER_MAGIC)
#define EXT4_SUPER_MAGIC 0xEF53
#endif

K
kailiu 已提交
61 62 63 64 65 66 67 68 69 70
// For non linux platform, the following macros are used only as place
// holder.
#ifndef OS_LINUX
#define POSIX_FADV_NORMAL 0 /* [MC1] no further special treatment */
#define POSIX_FADV_RANDOM 1 /* [MC1] expect random page refs */
#define POSIX_FADV_SEQUENTIAL 2 /* [MC1] expect sequential page refs */
#define POSIX_FADV_WILLNEED 3 /* [MC1] will need these pages */
#define POSIX_FADV_DONTNEED 4 /* [MC1] dont need these pages */
#endif

71 72
// This is only set from db_stress.cc and for testing only.
// If non-zero, kill at various points in source code with probability 1/this
73
int rocksdb_kill_odds = 0;
74

75
namespace rocksdb {
J
jorlow@chromium.org 已提交
76 77 78

namespace {

K
kailiu 已提交
79 80 81 82 83 84 85 86 87 88
// A wrapper for fadvise, if the platform doesn't support fadvise,
// it will simply return Status::NotSupport.
int Fadvise(int fd, off_t offset, size_t len, int advice) {
#ifdef OS_LINUX
  return posix_fadvise(fd, offset, len, advice);
#else
  return 0;  // simply do nothing.
#endif
}

89 90 91 92
// list of pathnames that are locked
static std::set<std::string> lockedFiles;
static port::Mutex mutex_lockedFiles;

93 94 95 96
static Status IOError(const std::string& context, int err_number) {
  return Status::IOError(context, strerror(err_number));
}

97 98
#ifdef NDEBUG
// empty in release build
99
#define TEST_KILL_RANDOM(rocksdb_kill_odds)
100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121
#else

// Kill the process with probablity 1/odds for testing.
static void TestKillRandom(int odds, const std::string& srcfile,
                           int srcline) {
  time_t curtime = time(nullptr);
  Random r((uint32_t)curtime);

  assert(odds > 0);
  bool crash = r.OneIn(odds);
  if (crash) {
    fprintf(stdout, "Crashing at %s:%d\n", srcfile.c_str(), srcline);
    fflush(stdout);
    kill(getpid(), SIGTERM);
  }
}

// To avoid crashing always at some frequently executed codepaths (during
// kill random test), use this factor to reduce odds
#define REDUCE_ODDS 2
#define REDUCE_ODDS2 4

122 123 124
#define TEST_KILL_RANDOM(rocksdb_kill_odds) {   \
  if (rocksdb_kill_odds > 0) { \
    TestKillRandom(rocksdb_kill_odds, __FILE__, __LINE__);     \
125 126 127 128 129
  } \
}

#endif

130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159
#if defined(OS_LINUX)
namespace {
  static size_t GetUniqueIdFromFile(int fd, char* id, size_t max_size) {
    if (max_size < kMaxVarint64Length*3) {
      return 0;
    }

    struct stat buf;
    int result = fstat(fd, &buf);
    if (result == -1) {
      return 0;
    }

    long version = 0;
    result = ioctl(fd, FS_IOC_GETVERSION, &version);
    if (result == -1) {
      return 0;
    }
    uint64_t uversion = (uint64_t)version;

    char* rid = id;
    rid = EncodeVarint64(rid, buf.st_dev);
    rid = EncodeVarint64(rid, buf.st_ino);
    rid = EncodeVarint64(rid, uversion);
    assert(rid >= id);
    return static_cast<size_t>(rid-id);
  }
}
#endif

J
jorlow@chromium.org 已提交
160 161 162 163
class PosixSequentialFile: public SequentialFile {
 private:
  std::string filename_;
  FILE* file_;
164
  int fd_;
165
  bool use_os_buffer_;
J
jorlow@chromium.org 已提交
166 167

 public:
168 169
  PosixSequentialFile(const std::string& fname, FILE* f,
      const EnvOptions& options)
170
      : filename_(fname), file_(f), fd_(fileno(f)),
H
Haobo Xu 已提交
171
        use_os_buffer_(options.use_os_buffer) {
172
  }
J
jorlow@chromium.org 已提交
173 174 175 176 177 178 179 180 181 182 183
  virtual ~PosixSequentialFile() { fclose(file_); }

  virtual Status Read(size_t n, Slice* result, char* scratch) {
    Status s;
    size_t r = fread_unlocked(scratch, 1, n, file_);
    *result = Slice(scratch, r);
    if (r < n) {
      if (feof(file_)) {
        // We leave status as ok if we hit the end of the file
      } else {
        // A partial read with an error: return a non-ok status
184
        s = IOError(filename_, errno);
J
jorlow@chromium.org 已提交
185 186
      }
    }
187
    if (!use_os_buffer_) {
188 189
      // we need to fadvise away the entire range of pages because
      // we do not want readahead pages to be cached.
K
kailiu 已提交
190
      Fadvise(fd_, 0, 0, POSIX_FADV_DONTNEED); // free OS pages
191
    }
J
jorlow@chromium.org 已提交
192 193
    return s;
  }
194 195 196

  virtual Status Skip(uint64_t n) {
    if (fseek(file_, n, SEEK_CUR)) {
197
      return IOError(filename_, errno);
198 199 200
    }
    return Status::OK();
  }
201 202

  virtual Status InvalidateCache(size_t offset, size_t length) {
K
kailiu 已提交
203 204 205
#ifndef OS_LINUX
    return Status::OK();
#else
206
    // free OS pages
K
kailiu 已提交
207
    int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
208 209 210 211
    if (ret == 0) {
      return Status::OK();
    }
    return IOError(filename_, errno);
K
kailiu 已提交
212
#endif
213
  }
J
jorlow@chromium.org 已提交
214 215
};

216
// pread() based random-access
J
jorlow@chromium.org 已提交
217 218 219 220
class PosixRandomAccessFile: public RandomAccessFile {
 private:
  std::string filename_;
  int fd_;
221
  bool use_os_buffer_;
J
jorlow@chromium.org 已提交
222 223

 public:
224 225
  PosixRandomAccessFile(const std::string& fname, int fd,
                        const EnvOptions& options)
H
Haobo Xu 已提交
226 227
      : filename_(fname), fd_(fd), use_os_buffer_(options.use_os_buffer) {
    assert(!options.use_mmap_reads);
228
  }
J
jorlow@chromium.org 已提交
229 230 231 232 233 234 235 236 237
  virtual ~PosixRandomAccessFile() { close(fd_); }

  virtual Status Read(uint64_t offset, size_t n, Slice* result,
                      char* scratch) const {
    Status s;
    ssize_t r = pread(fd_, scratch, n, static_cast<off_t>(offset));
    *result = Slice(scratch, (r < 0) ? 0 : r);
    if (r < 0) {
      // An error: return a non-ok status
238
      s = IOError(filename_, errno);
J
jorlow@chromium.org 已提交
239
    }
240
    if (!use_os_buffer_) {
241 242
      // we need to fadvise away the entire range of pages because
      // we do not want readahead pages to be cached.
K
kailiu 已提交
243
      Fadvise(fd_, 0, 0, POSIX_FADV_DONTNEED); // free OS pages
244
    }
J
jorlow@chromium.org 已提交
245 246
    return s;
  }
247

K
kailiu 已提交
248
#ifdef OS_LINUX
249
  virtual size_t GetUniqueId(char* id, size_t max_size) const {
250
    return GetUniqueIdFromFile(fd_, id, max_size);
251 252
  }
#endif
253 254 255 256

  virtual void Hint(AccessPattern pattern) {
    switch(pattern) {
      case NORMAL:
K
kailiu 已提交
257
        Fadvise(fd_, 0, 0, POSIX_FADV_NORMAL);
258 259
        break;
      case RANDOM:
K
kailiu 已提交
260
        Fadvise(fd_, 0, 0, POSIX_FADV_RANDOM);
261 262
        break;
      case SEQUENTIAL:
K
kailiu 已提交
263
        Fadvise(fd_, 0, 0, POSIX_FADV_SEQUENTIAL);
264 265
        break;
      case WILLNEED:
K
kailiu 已提交
266
        Fadvise(fd_, 0, 0, POSIX_FADV_WILLNEED);
267 268
        break;
      case DONTNEED:
K
kailiu 已提交
269
        Fadvise(fd_, 0, 0, POSIX_FADV_DONTNEED);
270 271 272 273 274 275 276
        break;
      default:
        assert(false);
        break;
    }
  }

277
  virtual Status InvalidateCache(size_t offset, size_t length) {
K
kailiu 已提交
278 279 280
#ifndef OS_LINUX
    return Status::OK();
#else
281
    // free OS pages
K
kailiu 已提交
282
    int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
283 284 285 286
    if (ret == 0) {
      return Status::OK();
    }
    return IOError(filename_, errno);
K
kailiu 已提交
287
#endif
288
  }
J
jorlow@chromium.org 已提交
289 290
};

291 292 293
// mmap() based random-access
class PosixMmapReadableFile: public RandomAccessFile {
 private:
294
  int fd_;
295 296 297 298 299 300
  std::string filename_;
  void* mmapped_region_;
  size_t length_;

 public:
  // base[0,length-1] contains the mmapped contents of the file.
301 302
  PosixMmapReadableFile(const int fd, const std::string& fname,
                        void* base, size_t length,
303
                        const EnvOptions& options)
304
      : fd_(fd), filename_(fname), mmapped_region_(base), length_(length) {
K
kailiu 已提交
305
    fd_ = fd_ + 0;  // suppress the warning for used variables
H
Haobo Xu 已提交
306 307
    assert(options.use_mmap_reads);
    assert(options.use_os_buffer);
308
  }
S
Siying Dong 已提交
309 310 311
  virtual ~PosixMmapReadableFile() {
    assert(munmap(mmapped_region_, length_) == 0);
  }
312 313 314 315 316 317 318 319 320 321 322 323

  virtual Status Read(uint64_t offset, size_t n, Slice* result,
                      char* scratch) const {
    Status s;
    if (offset + n > length_) {
      *result = Slice();
      s = IOError(filename_, EINVAL);
    } else {
      *result = Slice(reinterpret_cast<char*>(mmapped_region_) + offset, n);
    }
    return s;
  }
324
  virtual Status InvalidateCache(size_t offset, size_t length) {
K
kailiu 已提交
325 326 327
#ifndef OS_LINUX
    return Status::OK();
#else
328
    // free OS pages
K
kailiu 已提交
329
    int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
330 331 332 333
    if (ret == 0) {
      return Status::OK();
    }
    return IOError(filename_, errno);
K
kailiu 已提交
334
#endif
335
  }
336 337
};

J
jorlow@chromium.org 已提交
338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367
// We preallocate up to an extra megabyte and use memcpy to append new
// data to the file.  This is safe since we either properly close the
// file before reading from it, or for log files, the reading code
// knows enough to skip zero suffixes.
class PosixMmapFile : public WritableFile {
 private:
  std::string filename_;
  int fd_;
  size_t page_size_;
  size_t map_size_;       // How much extra memory to map at a time
  char* base_;            // The mapped region
  char* limit_;           // Limit of the mapped region
  char* dst_;             // Where to write next  (in range [base_,limit_])
  char* last_sync_;       // Where have we synced up to
  uint64_t file_offset_;  // Offset of base_ in file

  // Have we done an munmap of unsynced data?
  bool pending_sync_;

  // Roundup x to a multiple of y
  static size_t Roundup(size_t x, size_t y) {
    return ((x + y - 1) / y) * y;
  }

  size_t TruncateToPageBoundary(size_t s) {
    s -= (s & (page_size_ - 1));
    assert((s % page_size_) == 0);
    return s;
  }

368 369
  bool UnmapCurrentRegion() {
    bool result = true;
370
    TEST_KILL_RANDOM(rocksdb_kill_odds);
A
Abhishek Kona 已提交
371
    if (base_ != nullptr) {
J
jorlow@chromium.org 已提交
372 373 374 375
      if (last_sync_ < limit_) {
        // Defer syncing this data until next Sync() call, if any
        pending_sync_ = true;
      }
376 377 378
      if (munmap(base_, limit_ - base_) != 0) {
        result = false;
      }
J
jorlow@chromium.org 已提交
379
      file_offset_ += limit_ - base_;
A
Abhishek Kona 已提交
380 381 382 383
      base_ = nullptr;
      limit_ = nullptr;
      last_sync_ = nullptr;
      dst_ = nullptr;
J
jorlow@chromium.org 已提交
384 385 386 387 388 389

      // Increase the amount we map the next time, but capped at 1MB
      if (map_size_ < (1<<20)) {
        map_size_ *= 2;
      }
    }
390
    return result;
J
jorlow@chromium.org 已提交
391 392
  }

A
Abhishek Kona 已提交
393
  Status MapNewRegion() {
K
kailiu 已提交
394
#ifdef OS_LINUX
A
Abhishek Kona 已提交
395
    assert(base_ == nullptr);
A
Abhishek Kona 已提交
396

397
    TEST_KILL_RANDOM(rocksdb_kill_odds);
A
Abhishek Kona 已提交
398 399 400 401
    int alloc_status = posix_fallocate(fd_, file_offset_, map_size_);
    if (alloc_status != 0) {
      return Status::IOError("Error allocating space to file : " + filename_ +
        "Error : " + strerror(alloc_status));
J
jorlow@chromium.org 已提交
402
    }
A
Abhishek Kona 已提交
403

404
    TEST_KILL_RANDOM(rocksdb_kill_odds);
A
Abhishek Kona 已提交
405
    void* ptr = mmap(nullptr, map_size_, PROT_READ | PROT_WRITE, MAP_SHARED,
J
jorlow@chromium.org 已提交
406 407
                     fd_, file_offset_);
    if (ptr == MAP_FAILED) {
A
Abhishek Kona 已提交
408
      return Status::IOError("MMap failed on " + filename_);
J
jorlow@chromium.org 已提交
409
    }
410

411
    TEST_KILL_RANDOM(rocksdb_kill_odds);
412

J
jorlow@chromium.org 已提交
413 414 415 416
    base_ = reinterpret_cast<char*>(ptr);
    limit_ = base_ + map_size_;
    dst_ = base_;
    last_sync_ = base_;
A
Abhishek Kona 已提交
417
    return Status::OK();
K
kailiu 已提交
418 419 420
#else
    return Status::NotSupported("This platform doesn't support fallocate()");
#endif
J
jorlow@chromium.org 已提交
421 422 423
  }

 public:
424 425
  PosixMmapFile(const std::string& fname, int fd, size_t page_size,
                const EnvOptions& options)
J
jorlow@chromium.org 已提交
426 427 428 429
      : filename_(fname),
        fd_(fd),
        page_size_(page_size),
        map_size_(Roundup(65536, page_size)),
A
Abhishek Kona 已提交
430 431 432 433
        base_(nullptr),
        limit_(nullptr),
        dst_(nullptr),
        last_sync_(nullptr),
J
jorlow@chromium.org 已提交
434 435 436
        file_offset_(0),
        pending_sync_(false) {
    assert((page_size & (page_size - 1)) == 0);
H
Haobo Xu 已提交
437
    assert(options.use_mmap_writes);
J
jorlow@chromium.org 已提交
438 439 440 441 442 443 444 445 446 447 448 449
  }


  ~PosixMmapFile() {
    if (fd_ >= 0) {
      PosixMmapFile::Close();
    }
  }

  virtual Status Append(const Slice& data) {
    const char* src = data.data();
    size_t left = data.size();
450
    TEST_KILL_RANDOM(rocksdb_kill_odds * REDUCE_ODDS);
451
    PrepareWrite(GetFileSize(), left);
J
jorlow@chromium.org 已提交
452 453 454 455 456
    while (left > 0) {
      assert(base_ <= dst_);
      assert(dst_ <= limit_);
      size_t avail = limit_ - dst_;
      if (avail == 0) {
A
Abhishek Kona 已提交
457 458 459 460 461
        if (UnmapCurrentRegion()) {
          Status s = MapNewRegion();
          if (!s.ok()) {
            return s;
          }
462
          TEST_KILL_RANDOM(rocksdb_kill_odds);
463
        }
J
jorlow@chromium.org 已提交
464 465 466 467 468 469 470 471
      }

      size_t n = (left <= avail) ? left : avail;
      memcpy(dst_, src, n);
      dst_ += n;
      src += n;
      left -= n;
    }
472
    TEST_KILL_RANDOM(rocksdb_kill_odds);
J
jorlow@chromium.org 已提交
473 474 475 476 477 478
    return Status::OK();
  }

  virtual Status Close() {
    Status s;
    size_t unused = limit_ - dst_;
479

480
    TEST_KILL_RANDOM(rocksdb_kill_odds);
481

482 483 484
    if (!UnmapCurrentRegion()) {
      s = IOError(filename_, errno);
    } else if (unused > 0) {
J
jorlow@chromium.org 已提交
485 486
      // Trim the extra space at the end of the file
      if (ftruncate(fd_, file_offset_ - unused) < 0) {
487
        s = IOError(filename_, errno);
J
jorlow@chromium.org 已提交
488 489 490
      }
    }

491
    TEST_KILL_RANDOM(rocksdb_kill_odds);
492

J
jorlow@chromium.org 已提交
493 494
    if (close(fd_) < 0) {
      if (s.ok()) {
495
        s = IOError(filename_, errno);
J
jorlow@chromium.org 已提交
496 497 498 499
      }
    }

    fd_ = -1;
A
Abhishek Kona 已提交
500 501
    base_ = nullptr;
    limit_ = nullptr;
J
jorlow@chromium.org 已提交
502 503 504 505
    return s;
  }

  virtual Status Flush() {
506
    TEST_KILL_RANDOM(rocksdb_kill_odds);
J
jorlow@chromium.org 已提交
507 508 509 510 511 512 513 514
    return Status::OK();
  }

  virtual Status Sync() {
    Status s;

    if (pending_sync_) {
      // Some unmapped data was not synced
515
      TEST_KILL_RANDOM(rocksdb_kill_odds);
J
jorlow@chromium.org 已提交
516 517
      pending_sync_ = false;
      if (fdatasync(fd_) < 0) {
518
        s = IOError(filename_, errno);
J
jorlow@chromium.org 已提交
519
      }
520
      TEST_KILL_RANDOM(rocksdb_kill_odds * REDUCE_ODDS);
J
jorlow@chromium.org 已提交
521 522 523 524 525 526 527 528
    }

    if (dst_ > last_sync_) {
      // Find the beginnings of the pages that contain the first and last
      // bytes to be synced.
      size_t p1 = TruncateToPageBoundary(last_sync_ - base_);
      size_t p2 = TruncateToPageBoundary(dst_ - base_ - 1);
      last_sync_ = dst_;
529
      TEST_KILL_RANDOM(rocksdb_kill_odds);
J
jorlow@chromium.org 已提交
530
      if (msync(base_ + p1, p2 - p1 + page_size_, MS_SYNC) < 0) {
531
        s = IOError(filename_, errno);
J
jorlow@chromium.org 已提交
532
      }
533
      TEST_KILL_RANDOM(rocksdb_kill_odds);
J
jorlow@chromium.org 已提交
534 535 536 537
    }

    return s;
  }
538 539 540 541 542 543 544

  /**
   * Flush data as well as metadata to stable storage.
   */
  virtual Status Fsync() {
    if (pending_sync_) {
      // Some unmapped data was not synced
545
      TEST_KILL_RANDOM(rocksdb_kill_odds);
546 547 548 549
      pending_sync_ = false;
      if (fsync(fd_) < 0) {
        return IOError(filename_, errno);
      }
550
      TEST_KILL_RANDOM(rocksdb_kill_odds);
551 552 553 554 555
    }
    // This invocation to Sync will not issue the call to
    // fdatasync because pending_sync_ has already been cleared.
    return Sync();
  }
556 557 558 559 560 561 562 563 564 565

  /**
   * Get the size of valid data in the file. This will not match the
   * size that is returned from the filesystem because we use mmap
   * to extend file by map_size every time.
   */
  virtual uint64_t GetFileSize() {
    size_t used = dst_ - base_;
    return file_offset_ + used;
  }
566

567
  virtual Status InvalidateCache(size_t offset, size_t length) {
K
kailiu 已提交
568 569 570
#ifndef OS_LINUX
    return Status::OK();
#else
571
    // free OS pages
K
kailiu 已提交
572
    int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
573 574 575 576
    if (ret == 0) {
      return Status::OK();
    }
    return IOError(filename_, errno);
K
kailiu 已提交
577
#endif
578 579
  }

580
#ifdef OS_LINUX
581
  virtual Status Allocate(off_t offset, off_t len) {
582
    TEST_KILL_RANDOM(rocksdb_kill_odds);
583 584 585 586 587 588
    if (!fallocate(fd_, FALLOC_FL_KEEP_SIZE, offset, len)) {
      return Status::OK();
    } else {
      return IOError(filename_, errno);
    }
  }
589
#endif
J
jorlow@chromium.org 已提交
590 591
};

592 593 594 595 596 597 598
// Use posix write to write data to a file.
class PosixWritableFile : public WritableFile {
 private:
  const std::string filename_;
  int fd_;
  size_t cursize_;      // current size of cached data in buf_
  size_t capacity_;     // max size of buf_
M
Mayank Agarwal 已提交
599
  unique_ptr<char[]> buf_;           // a buffer to cache writes
600 601 602
  uint64_t filesize_;
  bool pending_sync_;
  bool pending_fsync_;
603
  uint64_t last_sync_size_;
H
Haobo Xu 已提交
604
  uint64_t bytes_per_sync_;
605 606

 public:
607 608
  PosixWritableFile(const std::string& fname, int fd, size_t capacity,
                    const EnvOptions& options) :
609 610 611 612 613 614 615
    filename_(fname),
    fd_(fd),
    cursize_(0),
    capacity_(capacity),
    buf_(new char[capacity]),
    filesize_(0),
    pending_sync_(false),
616
    pending_fsync_(false),
H
Haobo Xu 已提交
617 618
    last_sync_size_(0),
    bytes_per_sync_(options.bytes_per_sync) {
H
Haobo Xu 已提交
619
    assert(!options.use_mmap_writes);
620 621 622 623 624 625 626 627 628
  }

  ~PosixWritableFile() {
    if (fd_ >= 0) {
      PosixWritableFile::Close();
    }
  }

  virtual Status Append(const Slice& data) {
629
    const char* src = data.data();
630 631 632 633 634
    size_t left = data.size();
    Status s;
    pending_sync_ = true;
    pending_fsync_ = true;

635
    TEST_KILL_RANDOM(rocksdb_kill_odds * REDUCE_ODDS2);
636

637
    PrepareWrite(GetFileSize(), left);
638 639 640 641 642 643 644 645 646
    // if there is no space in the cache, then flush
    if (cursize_ + left > capacity_) {
      s = Flush();
      if (!s.ok()) {
        return s;
      }
      // Increase the buffer size, but capped at 1MB
      if (capacity_ < (1<<20)) {
        capacity_ *= 2;
M
Mayank Agarwal 已提交
647
        buf_.reset(new char[capacity_]);
648 649 650 651 652 653 654
      }
      assert(cursize_ == 0);
    }

    // if the write fits into the cache, then write to cache
    // otherwise do a write() syscall to write to OS buffers.
    if (cursize_ + left <= capacity_) {
M
Mayank Agarwal 已提交
655
      memcpy(buf_.get()+cursize_, src, left);
656 657 658
      cursize_ += left;
    } else {
      while (left != 0) {
C
Chip Turner 已提交
659
        ssize_t done = write(fd_, src, left);
660 661 662
        if (done < 0) {
          return IOError(filename_, errno);
        }
663
        TEST_KILL_RANDOM(rocksdb_kill_odds);
664

665 666 667 668 669 670 671 672 673 674 675 676 677
        left -= done;
        src += done;
      }
    }
    filesize_ += data.size();
    return Status::OK();
  }

  virtual Status Close() {
    Status s;
    s = Flush(); // flush cache to OS
    if (!s.ok()) {
    }
678

679
    TEST_KILL_RANDOM(rocksdb_kill_odds);
680

681 682 683 684 685 686 687 688 689 690 691
    if (close(fd_) < 0) {
      if (s.ok()) {
        s = IOError(filename_, errno);
      }
    }
    fd_ = -1;
    return s;
  }

  // write out the cached data to the OS cache
  virtual Status Flush() {
692
    TEST_KILL_RANDOM(rocksdb_kill_odds * REDUCE_ODDS2);
693
    size_t left = cursize_;
M
Mayank Agarwal 已提交
694
    char* src = buf_.get();
695
    while (left != 0) {
C
Chip Turner 已提交
696
      ssize_t done = write(fd_, src, left);
697 698 699
      if (done < 0) {
        return IOError(filename_, errno);
      }
700
      TEST_KILL_RANDOM(rocksdb_kill_odds * REDUCE_ODDS2);
701 702 703 704
      left -= done;
      src += done;
    }
    cursize_ = 0;
705

H
Haobo Xu 已提交
706 707 708 709 710 711
    // sync OS cache to disk for every bytes_per_sync_
    // TODO: give log file and sst file different options (log
    // files could be potentially cached in OS for their whole
    // life time, thus we might not want to flush at all).
    if (bytes_per_sync_ &&
        filesize_ - last_sync_size_ >= bytes_per_sync_) {
712 713 714 715
      RangeSync(last_sync_size_, filesize_ - last_sync_size_);
      last_sync_size_ = filesize_;
    }

716 717 718 719
    return Status::OK();
  }

  virtual Status Sync() {
720
    TEST_KILL_RANDOM(rocksdb_kill_odds);
721 722 723
    if (pending_sync_ && fdatasync(fd_) < 0) {
      return IOError(filename_, errno);
    }
724
    TEST_KILL_RANDOM(rocksdb_kill_odds);
725 726 727 728 729
    pending_sync_ = false;
    return Status::OK();
  }

  virtual Status Fsync() {
730
    TEST_KILL_RANDOM(rocksdb_kill_odds);
731 732 733
    if (pending_fsync_ && fsync(fd_) < 0) {
      return IOError(filename_, errno);
    }
734
    TEST_KILL_RANDOM(rocksdb_kill_odds);
735 736 737 738 739 740 741 742
    pending_fsync_ = false;
    pending_sync_ = false;
    return Status::OK();
  }

  virtual uint64_t GetFileSize() {
    return filesize_;
  }
743

744
  virtual Status InvalidateCache(size_t offset, size_t length) {
K
kailiu 已提交
745 746 747
#ifndef OS_LINUX
    return Status::OK();
#else
748
    // free OS pages
K
kailiu 已提交
749
    int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
750 751 752 753
    if (ret == 0) {
      return Status::OK();
    }
    return IOError(filename_, errno);
K
kailiu 已提交
754
#endif
755 756
  }

757
#ifdef OS_LINUX
758
  virtual Status Allocate(off_t offset, off_t len) {
759
    TEST_KILL_RANDOM(rocksdb_kill_odds);
760 761 762 763 764 765
    if (!fallocate(fd_, FALLOC_FL_KEEP_SIZE, offset, len)) {
      return Status::OK();
    } else {
      return IOError(filename_, errno);
    }
  }
766 767 768 769 770 771 772 773

  virtual Status RangeSync(off64_t offset, off64_t nbytes) {
    if (sync_file_range(fd_, offset, nbytes, SYNC_FILE_RANGE_WRITE) == 0) {
      return Status::OK();
    } else {
      return IOError(filename_, errno);
    }
  }
774 775 776
  virtual size_t GetUniqueId(char* id, size_t max_size) const {
    return GetUniqueIdFromFile(fd_, id, max_size);
  }
777
#endif
778 779
};

780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871
class PosixRandomRWFile : public RandomRWFile {
 private:
  const std::string filename_;
  int fd_;
  bool pending_sync_;
  bool pending_fsync_;

 public:
  PosixRandomRWFile(const std::string& fname, int fd,
                    const EnvOptions& options) :
      filename_(fname),
      fd_(fd),
      pending_sync_(false),
      pending_fsync_(false) {
    assert(!options.use_mmap_writes && !options.use_mmap_reads);
  }

  ~PosixRandomRWFile() {
    if (fd_ >= 0) {
      Close();
    }
  }

  virtual Status Write(uint64_t offset, const Slice& data) {
    const char* src = data.data();
    size_t left = data.size();
    Status s;
    pending_sync_ = true;
    pending_fsync_ = true;

    while (left != 0) {
      ssize_t done = pwrite(fd_, src, left, offset);
      if (done < 0) {
        return IOError(filename_, errno);
      }

      left -= done;
      src += done;
      offset += done;
    }

    return Status::OK();
  }

  virtual Status Read(uint64_t offset, size_t n, Slice* result,
                      char* scratch) const {
    Status s;
    ssize_t r = pread(fd_, scratch, n, static_cast<off_t>(offset));
    *result = Slice(scratch, (r < 0) ? 0 : r);
    if (r < 0) {
      s = IOError(filename_, errno);
    }
    return s;
  }

  virtual Status Close() {
    Status s = Status::OK();
    if (fd_ >= 0 && close(fd_) < 0) {
      s = IOError(filename_, errno);
    }
    fd_ = -1;
    return s;
  }

  virtual Status Sync() {
    if (pending_sync_ && fdatasync(fd_) < 0) {
      return IOError(filename_, errno);
    }
    pending_sync_ = false;
    return Status::OK();
  }

  virtual Status Fsync() {
    if (pending_fsync_ && fsync(fd_) < 0) {
      return IOError(filename_, errno);
    }
    pending_fsync_ = false;
    pending_sync_ = false;
    return Status::OK();
  }

#ifdef OS_LINUX
  virtual Status Allocate(off_t offset, off_t len) {
    if (!fallocate(fd_, FALLOC_FL_KEEP_SIZE, offset, len)) {
      return Status::OK();
    } else {
      return IOError(filename_, errno);
    }
  }
#endif
};

872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893
static int LockOrUnlock(const std::string& fname, int fd, bool lock) {
  mutex_lockedFiles.Lock();
  if (lock) {
    // If it already exists in the lockedFiles set, then it is already locked,
    // and fail this lock attempt. Otherwise, insert it into lockedFiles.
    // This check is needed because fcntl() does not detect lock conflict
    // if the fcntl is issued by the same thread that earlier acquired
    // this lock.
    if (lockedFiles.insert(fname).second == false) {
      mutex_lockedFiles.Unlock();
      errno = ENOLCK;
      return -1;
    }
  } else {
    // If we are unlocking, then verify that we had locked it earlier,
    // it should already exist in lockedFiles. Remove it from lockedFiles.
    if (lockedFiles.erase(fname) != 1) {
      mutex_lockedFiles.Unlock();
      errno = ENOLCK;
      return -1;
    }
  }
J
jorlow@chromium.org 已提交
894 895 896 897 898 899 900
  errno = 0;
  struct flock f;
  memset(&f, 0, sizeof(f));
  f.l_type = (lock ? F_WRLCK : F_UNLCK);
  f.l_whence = SEEK_SET;
  f.l_start = 0;
  f.l_len = 0;        // Lock/unlock entire file
901 902 903 904 905 906 907
  int value = fcntl(fd, F_SETLK, &f);
  if (value == -1 && lock) {
    // if there is an error in locking, then remove the pathname from lockedfiles
    lockedFiles.erase(fname);
  }
  mutex_lockedFiles.Unlock();
  return value;
J
jorlow@chromium.org 已提交
908 909 910 911 912
}

class PosixFileLock : public FileLock {
 public:
  int fd_;
913
  std::string filename;
J
jorlow@chromium.org 已提交
914 915
};

916 917 918 919 920 921 922 923 924 925

namespace {
void PthreadCall(const char* label, int result) {
  if (result != 0) {
    fprintf(stderr, "pthread %s: %s\n", label, strerror(result));
    exit(1);
  }
}
}

J
jorlow@chromium.org 已提交
926 927 928
class PosixEnv : public Env {
 public:
  PosixEnv();
929 930

  virtual ~PosixEnv(){
931 932 933
    for (const auto tid : threads_to_join_) {
      pthread_join(tid, nullptr);
    }
J
jorlow@chromium.org 已提交
934 935
  }

936
  void SetFD_CLOEXEC(int fd, const EnvOptions* options) {
H
Haobo Xu 已提交
937
    if ((options == nullptr || options->set_fd_cloexec) && fd > 0) {
938 939 940 941
      fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC);
    }
  }

J
jorlow@chromium.org 已提交
942
  virtual Status NewSequentialFile(const std::string& fname,
943 944
                                   unique_ptr<SequentialFile>* result,
                                   const EnvOptions& options) {
945
    result->reset();
J
jorlow@chromium.org 已提交
946
    FILE* f = fopen(fname.c_str(), "r");
A
Abhishek Kona 已提交
947 948
    if (f == nullptr) {
      *result = nullptr;
949
      return IOError(fname, errno);
J
jorlow@chromium.org 已提交
950
    } else {
951 952
      int fd = fileno(f);
      SetFD_CLOEXEC(fd, &options);
953
      result->reset(new PosixSequentialFile(fname, f, options));
J
jorlow@chromium.org 已提交
954 955 956 957 958
      return Status::OK();
    }
  }

  virtual Status NewRandomAccessFile(const std::string& fname,
959 960
                                     unique_ptr<RandomAccessFile>* result,
                                     const EnvOptions& options) {
961
    result->reset();
962
    Status s;
J
jorlow@chromium.org 已提交
963
    int fd = open(fname.c_str(), O_RDONLY);
964
    SetFD_CLOEXEC(fd, &options);
J
jorlow@chromium.org 已提交
965
    if (fd < 0) {
966
      s = IOError(fname, errno);
H
Haobo Xu 已提交
967
    } else if (options.use_mmap_reads && sizeof(void*) >= 8) {
968 969 970 971 972 973
      // Use of mmap for random reads has been removed because it
      // kills performance when storage is fast.
      // Use mmap when virtual address-space is plentiful.
      uint64_t size;
      s = GetFileSize(fname, &size);
      if (s.ok()) {
A
Abhishek Kona 已提交
974
        void* base = mmap(nullptr, size, PROT_READ, MAP_SHARED, fd, 0);
975
        if (base != MAP_FAILED) {
976 977
          result->reset(new PosixMmapReadableFile(fd, fname, base,
                                                  size, options));
978 979 980 981 982
        } else {
          s = IOError(fname, errno);
        }
      }
      close(fd);
983
    } else {
984
      result->reset(new PosixRandomAccessFile(fname, fd, options));
J
jorlow@chromium.org 已提交
985
    }
986
    return s;
J
jorlow@chromium.org 已提交
987 988 989
  }

  virtual Status NewWritableFile(const std::string& fname,
990 991
                                 unique_ptr<WritableFile>* result,
                                 const EnvOptions& options) {
992
    result->reset();
J
jorlow@chromium.org 已提交
993 994 995
    Status s;
    const int fd = open(fname.c_str(), O_CREAT | O_RDWR | O_TRUNC, 0644);
    if (fd < 0) {
996
      s = IOError(fname, errno);
J
jorlow@chromium.org 已提交
997
    } else {
998
      SetFD_CLOEXEC(fd, &options);
H
Haobo Xu 已提交
999
      if (options.use_mmap_writes) {
1000 1001
        if (!checkedDiskForMmap_) {
          // this will be executed once in the program's lifetime.
A
Abhishek Kona 已提交
1002
          // do not use mmapWrite on non ext-3/xfs/tmpfs systems.
1003 1004 1005 1006
          if (!SupportsFastAllocate(fname)) {
            forceMmapOff = true;
          }
          checkedDiskForMmap_ = true;
A
Abhishek Kona 已提交
1007 1008
        }
      }
H
Haobo Xu 已提交
1009
      if (options.use_mmap_writes && !forceMmapOff) {
1010
        result->reset(new PosixMmapFile(fname, fd, page_size_, options));
1011
      } else {
K
kailiu 已提交
1012 1013 1014 1015 1016 1017 1018
        // disable mmap writes
        EnvOptions no_mmap_writes_options = options;
        no_mmap_writes_options.use_mmap_writes = false;

        result->reset(
            new PosixWritableFile(fname, fd, 65536, no_mmap_writes_options)
        );
1019
      }
J
jorlow@chromium.org 已提交
1020 1021 1022 1023
    }
    return s;
  }

1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042
  virtual Status NewRandomRWFile(const std::string& fname,
                                 unique_ptr<RandomRWFile>* result,
                                 const EnvOptions& options) {
    result->reset();
    Status s;
    const int fd = open(fname.c_str(), O_CREAT | O_RDWR, 0644);
    if (fd < 0) {
      s = IOError(fname, errno);
    } else {
      SetFD_CLOEXEC(fd, &options);
      // no support for mmap yet
      if (options.use_mmap_writes || options.use_mmap_reads) {
        return Status::NotSupported("No support for mmap read/write yet");
      }
      result->reset(new PosixRandomRWFile(fname, fd, options));
    }
    return s;
  }

J
jorlow@chromium.org 已提交
1043 1044 1045 1046 1047 1048 1049 1050
  virtual bool FileExists(const std::string& fname) {
    return access(fname.c_str(), F_OK) == 0;
  }

  virtual Status GetChildren(const std::string& dir,
                             std::vector<std::string>* result) {
    result->clear();
    DIR* d = opendir(dir.c_str());
A
Abhishek Kona 已提交
1051
    if (d == nullptr) {
1052
      return IOError(dir, errno);
J
jorlow@chromium.org 已提交
1053 1054
    }
    struct dirent* entry;
A
Abhishek Kona 已提交
1055
    while ((entry = readdir(d)) != nullptr) {
J
jorlow@chromium.org 已提交
1056 1057 1058 1059 1060 1061 1062 1063 1064
      result->push_back(entry->d_name);
    }
    closedir(d);
    return Status::OK();
  }

  virtual Status DeleteFile(const std::string& fname) {
    Status result;
    if (unlink(fname.c_str()) != 0) {
1065
      result = IOError(fname, errno);
J
jorlow@chromium.org 已提交
1066 1067 1068 1069 1070 1071 1072
    }
    return result;
  };

  virtual Status CreateDir(const std::string& name) {
    Status result;
    if (mkdir(name.c_str(), 0755) != 0) {
1073
      result = IOError(name, errno);
J
jorlow@chromium.org 已提交
1074 1075 1076 1077
    }
    return result;
  };

1078 1079 1080 1081 1082
  virtual Status CreateDirIfMissing(const std::string& name) {
    Status result;
    if (mkdir(name.c_str(), 0755) != 0) {
      if (errno != EEXIST) {
        result = IOError(name, errno);
1083 1084 1085 1086
      } else if (!DirExists(name)) { // Check that name is actually a
                                     // directory.
        // Message is taken from mkdir
        result = Status::IOError("`"+name+"' exists but is not a directory");
1087 1088 1089 1090 1091
      }
    }
    return result;
  };

J
jorlow@chromium.org 已提交
1092 1093 1094
  virtual Status DeleteDir(const std::string& name) {
    Status result;
    if (rmdir(name.c_str()) != 0) {
1095
      result = IOError(name, errno);
J
jorlow@chromium.org 已提交
1096 1097 1098 1099 1100 1101 1102 1103 1104
    }
    return result;
  };

  virtual Status GetFileSize(const std::string& fname, uint64_t* size) {
    Status s;
    struct stat sbuf;
    if (stat(fname.c_str(), &sbuf) != 0) {
      *size = 0;
1105
      s = IOError(fname, errno);
J
jorlow@chromium.org 已提交
1106 1107 1108 1109 1110 1111
    } else {
      *size = sbuf.st_size;
    }
    return s;
  }

1112 1113 1114 1115 1116 1117 1118 1119 1120
  virtual Status GetFileModificationTime(const std::string& fname,
                                         uint64_t* file_mtime) {
    struct stat s;
    if (stat(fname.c_str(), &s) !=0) {
      return IOError(fname, errno);
    }
    *file_mtime = static_cast<uint64_t>(s.st_mtime);
    return Status::OK();
  }
J
jorlow@chromium.org 已提交
1121 1122 1123
  virtual Status RenameFile(const std::string& src, const std::string& target) {
    Status result;
    if (rename(src.c_str(), target.c_str()) != 0) {
1124
      result = IOError(src, errno);
J
jorlow@chromium.org 已提交
1125 1126 1127 1128 1129
    }
    return result;
  }

  virtual Status LockFile(const std::string& fname, FileLock** lock) {
A
Abhishek Kona 已提交
1130
    *lock = nullptr;
J
jorlow@chromium.org 已提交
1131 1132 1133
    Status result;
    int fd = open(fname.c_str(), O_RDWR | O_CREAT, 0644);
    if (fd < 0) {
1134
      result = IOError(fname, errno);
1135
    } else if (LockOrUnlock(fname, fd, true) == -1) {
1136
      result = IOError("lock " + fname, errno);
J
jorlow@chromium.org 已提交
1137 1138
      close(fd);
    } else {
1139
      SetFD_CLOEXEC(fd, nullptr);
J
jorlow@chromium.org 已提交
1140 1141
      PosixFileLock* my_lock = new PosixFileLock;
      my_lock->fd_ = fd;
1142
      my_lock->filename = fname;
J
jorlow@chromium.org 已提交
1143 1144 1145 1146 1147 1148 1149 1150
      *lock = my_lock;
    }
    return result;
  }

  virtual Status UnlockFile(FileLock* lock) {
    PosixFileLock* my_lock = reinterpret_cast<PosixFileLock*>(lock);
    Status result;
1151
    if (LockOrUnlock(my_lock->filename, my_lock->fd_, false) == -1) {
1152
      result = IOError("unlock", errno);
J
jorlow@chromium.org 已提交
1153 1154 1155 1156 1157 1158
    }
    close(my_lock->fd_);
    delete my_lock;
    return result;
  }

1159
  virtual void Schedule(void (*function)(void*), void* arg, Priority pri = LOW);
1160

J
jorlow@chromium.org 已提交
1161 1162 1163 1164 1165 1166 1167 1168
  virtual void StartThread(void (*function)(void* arg), void* arg);

  virtual Status GetTestDirectory(std::string* result) {
    const char* env = getenv("TEST_TMPDIR");
    if (env && env[0] != '\0') {
      *result = env;
    } else {
      char buf[100];
1169
      snprintf(buf, sizeof(buf), "/tmp/rocksdbtest-%d", int(geteuid()));
J
jorlow@chromium.org 已提交
1170 1171 1172 1173 1174 1175 1176
      *result = buf;
    }
    // Directory may already exist
    CreateDir(*result);
    return Status::OK();
  }

1177
  static uint64_t gettid() {
J
jorlow@chromium.org 已提交
1178 1179
    pthread_t tid = pthread_self();
    uint64_t thread_id = 0;
J
jorlow@chromium.org 已提交
1180
    memcpy(&thread_id, &tid, std::min(sizeof(thread_id), sizeof(tid)));
1181 1182
    return thread_id;
  }
J
jorlow@chromium.org 已提交
1183

1184 1185
  virtual Status NewLogger(const std::string& fname,
                           shared_ptr<Logger>* result) {
1186
    FILE* f = fopen(fname.c_str(), "w");
A
Abhishek Kona 已提交
1187
    if (f == nullptr) {
1188
      result->reset();
1189 1190
      return IOError(fname, errno);
    } else {
1191 1192
      int fd = fileno(f);
      SetFD_CLOEXEC(fd, nullptr);
I
Igor Canadi 已提交
1193
      result->reset(new PosixLogger(f, &PosixEnv::gettid, this));
1194
      return Status::OK();
J
jorlow@chromium.org 已提交
1195 1196 1197 1198 1199
    }
  }

  virtual uint64_t NowMicros() {
    struct timeval tv;
K
kailiu 已提交
1200
    // TODO(kailiu) MAC DON'T HAVE THIS
A
Abhishek Kona 已提交
1201
    gettimeofday(&tv, nullptr);
J
jorlow@chromium.org 已提交
1202 1203 1204
    return static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
  }

1205
  virtual uint64_t NowNanos() {
K
kailiu 已提交
1206
#ifdef OS_LINUX
1207 1208 1209
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return static_cast<uint64_t>(ts.tv_sec) * 1000000000 + ts.tv_nsec;
K
kailiu 已提交
1210 1211 1212 1213 1214 1215 1216 1217
#elif __MACH__
    clock_serv_t cclock;
    mach_timespec_t ts;
    host_get_clock_service(mach_host_self(), CALENDAR_CLOCK, &cclock);
    clock_get_time(cclock, &ts);
    mach_port_deallocate(mach_task_self(), cclock);
#endif
    return static_cast<uint64_t>(ts.tv_sec) * 1000000000 + ts.tv_nsec;
1218 1219
  }

J
jorlow@chromium.org 已提交
1220 1221 1222 1223
  virtual void SleepForMicroseconds(int micros) {
    usleep(micros);
  }

1224
  virtual Status GetHostName(char* name, uint64_t len) {
1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235
    int ret = gethostname(name, len);
    if (ret < 0) {
      if (errno == EFAULT || errno == EINVAL)
        return Status::InvalidArgument(strerror(errno));
      else
        return IOError("GetHostName", errno);
    }
    return Status::OK();
  }

  virtual Status GetCurrentTime(int64_t* unix_time) {
A
Abhishek Kona 已提交
1236
    time_t ret = time(nullptr);
1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252
    if (ret == (time_t) -1) {
      return IOError("GetCurrentTime", errno);
    }
    *unix_time = (int64_t) ret;
    return Status::OK();
  }

  virtual Status GetAbsolutePath(const std::string& db_path,
      std::string* output_path) {
    if (db_path.find('/') == 0) {
      *output_path = db_path;
      return Status::OK();
    }

    char the_path[256];
    char* ret = getcwd(the_path, 256);
A
Abhishek Kona 已提交
1253
    if (ret == nullptr) {
1254 1255 1256 1257 1258 1259 1260
      return Status::IOError(strerror(errno));
    }

    *output_path = ret;
    return Status::OK();
  }

A
Abhishek Kona 已提交
1261
  // Allow increasing the number of worker threads.
1262 1263 1264
  virtual void SetBackgroundThreads(int num, Priority pri) {
    assert(pri >= Priority::LOW && pri <= Priority::HIGH);
    thread_pools_[pri].SetBackgroundThreads(num);
1265 1266
  }

1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286
  virtual std::string TimeToString(uint64_t secondsSince1970) {
    const time_t seconds = (time_t)secondsSince1970;
    struct tm t;
    int maxsize = 64;
    std::string dummy;
    dummy.reserve(maxsize);
    dummy.resize(maxsize);
    char* p = &dummy[0];
    localtime_r(&seconds, &t);
    snprintf(p, maxsize,
             "%04d/%02d/%02d-%02d:%02d:%02d ",
             t.tm_year + 1900,
             t.tm_mon + 1,
             t.tm_mday,
             t.tm_hour,
             t.tm_min,
             t.tm_sec);
    return dummy;
  }

J
jorlow@chromium.org 已提交
1287
 private:
1288 1289
  bool checkedDiskForMmap_;
  bool forceMmapOff; // do we override Env options?
A
Abhishek Kona 已提交
1290

J
jorlow@chromium.org 已提交
1291

1292 1293 1294 1295 1296 1297 1298 1299 1300
  // Returns true iff the named directory exists and is a directory.
  virtual bool DirExists(const std::string& dname) {
    struct stat statbuf;
    if (stat(dname.c_str(), &statbuf) == 0) {
      return S_ISDIR(statbuf.st_mode);
    }
    return false; // stat() failed return false
  }

A
Abhishek Kona 已提交
1301
  bool SupportsFastAllocate(const std::string& path) {
K
kailiu 已提交
1302
#ifdef OS_LINUX
A
Abhishek Kona 已提交
1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316
    struct statfs s;
    if (statfs(path.c_str(), &s)){
      return false;
    }
    switch (s.f_type) {
      case EXT4_SUPER_MAGIC:
        return true;
      case XFS_SUPER_MAGIC:
        return true;
      case TMPFS_MAGIC:
        return true;
      default:
        return false;
    }
K
kailiu 已提交
1317 1318 1319
#else
    return false;
#endif
A
Abhishek Kona 已提交
1320 1321
  }

J
jorlow@chromium.org 已提交
1322 1323 1324
  size_t page_size_;


1325 1326
  class ThreadPool {
   public:
1327

1328 1329 1330 1331 1332 1333 1334 1335
    ThreadPool() :
        total_threads_limit_(1),
        bgthreads_(0),
        queue_(),
        exit_all_threads_(false) {
      PthreadCall("mutex_init", pthread_mutex_init(&mu_, nullptr));
      PthreadCall("cvar_init", pthread_cond_init(&bgsignal_, nullptr));
    }
J
jorlow@chromium.org 已提交
1336

1337 1338 1339 1340 1341 1342 1343 1344 1345 1346
    ~ThreadPool() {
      PthreadCall("lock", pthread_mutex_lock(&mu_));
      assert(!exit_all_threads_);
      exit_all_threads_ = true;
      PthreadCall("signalall", pthread_cond_broadcast(&bgsignal_));
      PthreadCall("unlock", pthread_mutex_unlock(&mu_));
      for (const auto tid : bgthreads_) {
        pthread_join(tid, nullptr);
      }
    }
J
jorlow@chromium.org 已提交
1347

1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361
    void BGThread() {
      while (true) {
        // Wait until there is an item that is ready to run
        PthreadCall("lock", pthread_mutex_lock(&mu_));
        while (queue_.empty() && !exit_all_threads_) {
          PthreadCall("wait", pthread_cond_wait(&bgsignal_, &mu_));
        }
        if (exit_all_threads_) { // mechanism to let BG threads exit safely
          PthreadCall("unlock", pthread_mutex_unlock(&mu_));
          break;
        }
        void (*function)(void*) = queue_.front().function;
        void* arg = queue_.front().arg;
        queue_.pop_front();
H
Haobo Xu 已提交
1362

1363 1364 1365 1366
        PthreadCall("unlock", pthread_mutex_unlock(&mu_));
        (*function)(arg);
      }
    }
J
jorlow@chromium.org 已提交
1367

1368 1369 1370 1371
    static void* BGThreadWrapper(void* arg) {
      reinterpret_cast<ThreadPool*>(arg)->BGThread();
      return nullptr;
    }
J
jorlow@chromium.org 已提交
1372

1373 1374 1375 1376 1377 1378 1379
    void SetBackgroundThreads(int num) {
      PthreadCall("lock", pthread_mutex_lock(&mu_));
      if (num > total_threads_limit_) {
        total_threads_limit_ = num;
      }
      assert(total_threads_limit_ > 0);
      PthreadCall("unlock", pthread_mutex_unlock(&mu_));
J
jorlow@chromium.org 已提交
1380
    }
1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397

    void Schedule(void (*function)(void*), void* arg) {
      PthreadCall("lock", pthread_mutex_lock(&mu_));

      if (exit_all_threads_) {
        PthreadCall("unlock", pthread_mutex_unlock(&mu_));
        return;
      }
      // Start background thread if necessary
      while ((int)bgthreads_.size() < total_threads_limit_) {
        pthread_t t;
        PthreadCall(
          "create thread",
          pthread_create(&t,
                         nullptr,
                         &ThreadPool::BGThreadWrapper,
                         this));
K
kailiu 已提交
1398 1399 1400
        fprintf(stdout,
                "Created bg thread 0x%lx\n",
                (unsigned long)t);
1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411
        bgthreads_.push_back(t);
      }

      // Add to priority queue
      queue_.push_back(BGItem());
      queue_.back().function = function;
      queue_.back().arg = arg;

      // always wake up at least one waiting thread.
      PthreadCall("signal", pthread_cond_signal(&bgsignal_));

1412 1413
      PthreadCall("unlock", pthread_mutex_unlock(&mu_));
    }
J
jorlow@chromium.org 已提交
1414

1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444
   private:
    // Entry per Schedule() call
    struct BGItem { void* arg; void (*function)(void*); };
    typedef std::deque<BGItem> BGQueue;

    pthread_mutex_t mu_;
    pthread_cond_t bgsignal_;
    int total_threads_limit_;
    std::vector<pthread_t> bgthreads_;
    BGQueue queue_;
    bool exit_all_threads_;
  };

  std::vector<ThreadPool> thread_pools_;

  pthread_mutex_t mu_;
  std::vector<pthread_t> threads_to_join_;

};

PosixEnv::PosixEnv() : checkedDiskForMmap_(false),
                       forceMmapOff(false),
                       page_size_(getpagesize()),
                       thread_pools_(Priority::TOTAL) {
  PthreadCall("mutex_init", pthread_mutex_init(&mu_, nullptr));
}

void PosixEnv::Schedule(void (*function)(void*), void* arg, Priority pri) {
  assert(pri >= Priority::LOW && pri <= Priority::HIGH);
  thread_pools_[pri].Schedule(function, arg);
J
jorlow@chromium.org 已提交
1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456
}

namespace {
struct StartThreadState {
  void (*user_function)(void*);
  void* arg;
};
}
static void* StartThreadWrapper(void* arg) {
  StartThreadState* state = reinterpret_cast<StartThreadState*>(arg);
  state->user_function(state->arg);
  delete state;
A
Abhishek Kona 已提交
1457
  return nullptr;
J
jorlow@chromium.org 已提交
1458 1459 1460 1461 1462 1463 1464 1465
}

void PosixEnv::StartThread(void (*function)(void* arg), void* arg) {
  pthread_t t;
  StartThreadState* state = new StartThreadState;
  state->user_function = function;
  state->arg = arg;
  PthreadCall("start thread",
A
Abhishek Kona 已提交
1466
              pthread_create(&t, nullptr,  &StartThreadWrapper, state));
1467
  PthreadCall("lock", pthread_mutex_lock(&mu_));
1468
  threads_to_join_.push_back(t);
1469
  PthreadCall("unlock", pthread_mutex_unlock(&mu_));
J
jorlow@chromium.org 已提交
1470 1471
}

H
Hans Wennborg 已提交
1472
}  // namespace
J
jorlow@chromium.org 已提交
1473

M
Mayank Agarwal 已提交
1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488
std::string Env::GenerateUniqueId() {
  std::string uuid_file = "/proc/sys/kernel/random/uuid";
  if (FileExists(uuid_file)) {
    std::string uuid;
    Status s = ReadFileToString(this, uuid_file, &uuid);
    if (s.ok()) {
      return uuid;
    }
  }
  // Could not read uuid_file - generate uuid using "nanos-random"
  Random64 r(time(nullptr));
  uint64_t random_uuid_portion =
    r.Uniform(std::numeric_limits<uint64_t>::max());
  uint64_t nanos_uuid_portion = NowNanos();
  char uuid2[200];
K
kailiu 已提交
1489 1490 1491 1492 1493
  snprintf(uuid2,
           200,
           "%lx-%lx",
           (unsigned long)nanos_uuid_portion,
           (unsigned long)random_uuid_portion);
M
Mayank Agarwal 已提交
1494 1495 1496
  return uuid2;
}

J
jorlow@chromium.org 已提交
1497
Env* Env::Default() {
1498
  static PosixEnv default_env;
1499
  return &default_env;
J
jorlow@chromium.org 已提交
1500 1501
}

1502
}  // namespace rocksdb