env_posix.cc 31.9 KB
Newer Older
J
jorlow@chromium.org 已提交
1 2 3 4 5
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#include <deque>
6
#include <set>
J
jorlow@chromium.org 已提交
7 8 9 10 11 12 13
#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
14
#include <sys/ioctl.h>
J
jorlow@chromium.org 已提交
15 16
#include <sys/mman.h>
#include <sys/stat.h>
A
Abhishek Kona 已提交
17
#include <sys/statfs.h>
J
jorlow@chromium.org 已提交
18 19
#include <sys/time.h>
#include <sys/types.h>
A
Abhishek Kona 已提交
20
#include <sys/vfs.h>
J
jorlow@chromium.org 已提交
21 22
#include <time.h>
#include <unistd.h>
23 24 25
#if defined(OS_LINUX)
#include <linux/fs.h>
#endif
J
jorlow@chromium.org 已提交
26 27 28
#if defined(LEVELDB_PLATFORM_ANDROID)
#include <sys/stat.h>
#endif
29 30
#include "leveldb/env.h"
#include "leveldb/slice.h"
J
jorlow@chromium.org 已提交
31
#include "port/port.h"
32
#include "util/coding.h"
J
jorlow@chromium.org 已提交
33
#include "util/logging.h"
34
#include "util/posix_logger.h"
35 36
#include "util/random.h"
#include <signal.h>
J
jorlow@chromium.org 已提交
37

A
Abhishek Kona 已提交
38 39 40 41 42 43 44 45 46 47
#if !defined(TMPFS_MAGIC)
#define TMPFS_MAGIC 0x01021994
#endif
#if !defined(XFS_SUPER_MAGIC)
#define XFS_SUPER_MAGIC 0x58465342
#endif
#if !defined(EXT4_SUPER_MAGIC)
#define EXT4_SUPER_MAGIC 0xEF53
#endif

A
Abhishek Kona 已提交
48
bool useOsBuffer = 1;     // cache data in OS buffers
49
bool useFsReadAhead = 1;  // allow filesystem to do readaheads
50 51
bool useMmapRead = 0;     // do not use mmaps for reading files
bool useMmapWrite = 1;    // use mmaps for appending to files
52

53 54 55 56
// This is only set from db_stress.cc and for testing only.
// If non-zero, kill at various points in source code with probability 1/this
int leveldb_kill_odds = 0;

J
jorlow@chromium.org 已提交
57 58
namespace leveldb {

59

J
jorlow@chromium.org 已提交
60 61
namespace {

62 63 64 65
// list of pathnames that are locked
static std::set<std::string> lockedFiles;
static port::Mutex mutex_lockedFiles;

66 67 68 69
// Builds an IOError status that pairs `context` (callers in this file pass a
// file name or a short operation label) with the strerror() text for
// `err_number`.
static Status IOError(const std::string& context, int err_number) {
  const char* const reason = strerror(err_number);
  return Status::IOError(context, reason);
}

70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102
#ifdef NDEBUG
// empty in release build
#define TEST_KILL_RANDOM(leveldb_kill_odds)
#else

// Kill the process with probablity 1/odds for testing.
static void TestKillRandom(int odds, const std::string& srcfile,
                           int srcline) {
  time_t curtime = time(nullptr);
  Random r((uint32_t)curtime);

  assert(odds > 0);
  bool crash = r.OneIn(odds);
  if (crash) {
    fprintf(stdout, "Crashing at %s:%d\n", srcfile.c_str(), srcline);
    fflush(stdout);
    kill(getpid(), SIGTERM);
  }
}

// To avoid crashing always at some frequently executed codepaths (during
// kill random test), use this factor to reduce odds
#define REDUCE_ODDS 2
#define REDUCE_ODDS2 4

#define TEST_KILL_RANDOM(leveldb_kill_odds) {   \
  if (leveldb_kill_odds > 0) { \
    TestKillRandom(leveldb_kill_odds, __FILE__, __LINE__);     \
  } \
}

#endif

J
jorlow@chromium.org 已提交
103 104 105 106
class PosixSequentialFile: public SequentialFile {
 private:
  std::string filename_;
  FILE* file_;
107
  int fd_;
108
  bool use_os_buffer_;
J
jorlow@chromium.org 已提交
109 110

 public:
111 112
  PosixSequentialFile(const std::string& fname, FILE* f,
      const EnvOptions& options)
113 114
      : filename_(fname), file_(f), fd_(fileno(f)),
        use_os_buffer_(options.UseOsBuffer()) {
115 116
    assert(!options.UseMmapReads());
  }
J
jorlow@chromium.org 已提交
117 118 119 120 121 122 123 124 125 126 127
  virtual ~PosixSequentialFile() { fclose(file_); }

  virtual Status Read(size_t n, Slice* result, char* scratch) {
    Status s;
    size_t r = fread_unlocked(scratch, 1, n, file_);
    *result = Slice(scratch, r);
    if (r < n) {
      if (feof(file_)) {
        // We leave status as ok if we hit the end of the file
      } else {
        // A partial read with an error: return a non-ok status
128
        s = IOError(filename_, errno);
J
jorlow@chromium.org 已提交
129 130
      }
    }
131
    if (!use_os_buffer_) {
132 133 134 135
      // we need to fadvise away the entire range of pages because
      // we do not want readahead pages to be cached.
      posix_fadvise(fd_, 0, 0, POSIX_FADV_DONTNEED); // free OS pages
    }
J
jorlow@chromium.org 已提交
136 137
    return s;
  }
138 139 140

  virtual Status Skip(uint64_t n) {
    if (fseek(file_, n, SEEK_CUR)) {
141
      return IOError(filename_, errno);
142 143 144
    }
    return Status::OK();
  }
J
jorlow@chromium.org 已提交
145 146
};

147
// pread() based random-access
J
jorlow@chromium.org 已提交
148 149 150 151
class PosixRandomAccessFile: public RandomAccessFile {
 private:
  std::string filename_;
  int fd_;
152
  bool use_os_buffer_;
J
jorlow@chromium.org 已提交
153 154

 public:
155 156
  PosixRandomAccessFile(const std::string& fname, int fd,
                        const EnvOptions& options)
157
      : filename_(fname), fd_(fd), use_os_buffer_(options.UseOsBuffer()) {
158 159
    assert(!options.UseMmapReads());
    if (!options.UseReadahead()) { // disable read-aheads
160 161 162
      posix_fadvise(fd, 0, 0, POSIX_FADV_RANDOM);
    }
  }
J
jorlow@chromium.org 已提交
163 164 165 166 167 168 169 170 171
  virtual ~PosixRandomAccessFile() { close(fd_); }

  virtual Status Read(uint64_t offset, size_t n, Slice* result,
                      char* scratch) const {
    Status s;
    ssize_t r = pread(fd_, scratch, n, static_cast<off_t>(offset));
    *result = Slice(scratch, (r < 0) ? 0 : r);
    if (r < 0) {
      // An error: return a non-ok status
172
      s = IOError(filename_, errno);
J
jorlow@chromium.org 已提交
173
    }
174
    if (!use_os_buffer_) {
175 176 177
      // we need to fadvise away the entire range of pages because
      // we do not want readahead pages to be cached.
      posix_fadvise(fd_, 0, 0, POSIX_FADV_DONTNEED); // free OS pages
178
    }
J
jorlow@chromium.org 已提交
179 180
    return s;
  }
181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209

#if defined(OS_LINUX)
  virtual size_t GetUniqueId(char* id, size_t max_size) const {
    // TODO: possibly allow this function to handle tighter bounds.
    if (max_size < kMaxVarint64Length*3) {
      return 0;
    }

    struct stat buf;
    int result = fstat(fd_, &buf);
    if (result == -1) {
      return 0;
    }

    long version = 0;
    result = ioctl(fd_, FS_IOC_GETVERSION, &version);
    if (result == -1) {
      return 0;
    }
    uint64_t uversion = (uint64_t)version;

    char* rid = id;
    rid = EncodeVarint64(rid, buf.st_dev);
    rid = EncodeVarint64(rid, buf.st_ino);
    rid = EncodeVarint64(rid, uversion);
    assert(rid >= id);
    return static_cast<size_t>(rid-id);
  }
#endif
210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233

  virtual void Hint(AccessPattern pattern) {
    switch(pattern) {
      case NORMAL:
        posix_fadvise(fd_, 0, 0, POSIX_FADV_NORMAL);
        break;
      case RANDOM:
        posix_fadvise(fd_, 0, 0, POSIX_FADV_RANDOM);
        break;
      case SEQUENTIAL:
        posix_fadvise(fd_, 0, 0, POSIX_FADV_SEQUENTIAL);
        break;
      case WILLNEED:
        posix_fadvise(fd_, 0, 0, POSIX_FADV_WILLNEED);
        break;
      case DONTNEED:
        posix_fadvise(fd_, 0, 0, POSIX_FADV_DONTNEED);
        break;
      default:
        assert(false);
        break;
    }
  }

J
jorlow@chromium.org 已提交
234 235
};

236 237 238 239 240 241 242 243 244
// mmap() based random-access
class PosixMmapReadableFile: public RandomAccessFile {
 private:
  std::string filename_;
  void* mmapped_region_;
  size_t length_;

 public:
  // base[0,length-1] contains the mmapped contents of the file.
245 246 247 248 249 250 251
  PosixMmapReadableFile(const std::string& fname, void* base, size_t length,
                        const EnvOptions& options)
      : filename_(fname), mmapped_region_(base), length_(length) {
    assert(options.UseMmapReads());
    assert(options.UseOsBuffer());
    assert(options.UseReadahead());
  }
252 253 254 255 256 257 258 259 260 261 262 263 264 265 266
  virtual ~PosixMmapReadableFile() { munmap(mmapped_region_, length_); }

  virtual Status Read(uint64_t offset, size_t n, Slice* result,
                      char* scratch) const {
    Status s;
    if (offset + n > length_) {
      *result = Slice();
      s = IOError(filename_, EINVAL);
    } else {
      *result = Slice(reinterpret_cast<char*>(mmapped_region_) + offset, n);
    }
    return s;
  }
};

J
jorlow@chromium.org 已提交
267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296
// We preallocate up to an extra megabyte and use memcpy to append new
// data to the file.  This is safe since we either properly close the
// file before reading from it, or for log files, the reading code
// knows enough to skip zero suffixes.
class PosixMmapFile : public WritableFile {
 private:
  std::string filename_;
  int fd_;
  size_t page_size_;
  size_t map_size_;       // How much extra memory to map at a time
  char* base_;            // The mapped region
  char* limit_;           // Limit of the mapped region
  char* dst_;             // Where to write next  (in range [base_,limit_])
  char* last_sync_;       // Where have we synced up to
  uint64_t file_offset_;  // Offset of base_ in file

  // Have we done an munmap of unsynced data?
  bool pending_sync_;

  // Roundup x to a multiple of y
  static size_t Roundup(size_t x, size_t y) {
    return ((x + y - 1) / y) * y;
  }

  size_t TruncateToPageBoundary(size_t s) {
    s -= (s & (page_size_ - 1));
    assert((s % page_size_) == 0);
    return s;
  }

297 298
  bool UnmapCurrentRegion() {
    bool result = true;
299
    TEST_KILL_RANDOM(leveldb_kill_odds);
A
Abhishek Kona 已提交
300
    if (base_ != nullptr) {
J
jorlow@chromium.org 已提交
301 302 303 304
      if (last_sync_ < limit_) {
        // Defer syncing this data until next Sync() call, if any
        pending_sync_ = true;
      }
305 306 307
      if (munmap(base_, limit_ - base_) != 0) {
        result = false;
      }
J
jorlow@chromium.org 已提交
308
      file_offset_ += limit_ - base_;
A
Abhishek Kona 已提交
309 310 311 312
      base_ = nullptr;
      limit_ = nullptr;
      last_sync_ = nullptr;
      dst_ = nullptr;
J
jorlow@chromium.org 已提交
313 314 315 316 317 318

      // Increase the amount we map the next time, but capped at 1MB
      if (map_size_ < (1<<20)) {
        map_size_ *= 2;
      }
    }
319
    return result;
J
jorlow@chromium.org 已提交
320 321
  }

A
Abhishek Kona 已提交
322
  Status MapNewRegion() {
A
Abhishek Kona 已提交
323
    assert(base_ == nullptr);
A
Abhishek Kona 已提交
324

325
    TEST_KILL_RANDOM(leveldb_kill_odds);
A
Abhishek Kona 已提交
326 327 328 329
    int alloc_status = posix_fallocate(fd_, file_offset_, map_size_);
    if (alloc_status != 0) {
      return Status::IOError("Error allocating space to file : " + filename_ +
        "Error : " + strerror(alloc_status));
J
jorlow@chromium.org 已提交
330
    }
A
Abhishek Kona 已提交
331

332
    TEST_KILL_RANDOM(leveldb_kill_odds);
A
Abhishek Kona 已提交
333
    void* ptr = mmap(nullptr, map_size_, PROT_READ | PROT_WRITE, MAP_SHARED,
J
jorlow@chromium.org 已提交
334 335
                     fd_, file_offset_);
    if (ptr == MAP_FAILED) {
A
Abhishek Kona 已提交
336
      return Status::IOError("MMap failed on " + filename_);
J
jorlow@chromium.org 已提交
337
    }
338 339 340

    TEST_KILL_RANDOM(leveldb_kill_odds);

J
jorlow@chromium.org 已提交
341 342 343 344
    base_ = reinterpret_cast<char*>(ptr);
    limit_ = base_ + map_size_;
    dst_ = base_;
    last_sync_ = base_;
A
Abhishek Kona 已提交
345
    return Status::OK();
J
jorlow@chromium.org 已提交
346 347 348
  }

 public:
349 350
  PosixMmapFile(const std::string& fname, int fd, size_t page_size,
                const EnvOptions& options)
J
jorlow@chromium.org 已提交
351 352 353 354
      : filename_(fname),
        fd_(fd),
        page_size_(page_size),
        map_size_(Roundup(65536, page_size)),
A
Abhishek Kona 已提交
355 356 357 358
        base_(nullptr),
        limit_(nullptr),
        dst_(nullptr),
        last_sync_(nullptr),
J
jorlow@chromium.org 已提交
359 360 361
        file_offset_(0),
        pending_sync_(false) {
    assert((page_size & (page_size - 1)) == 0);
362
    assert(options.UseMmapWrites());
J
jorlow@chromium.org 已提交
363 364 365 366 367 368 369 370 371 372 373 374
  }


  ~PosixMmapFile() {
    if (fd_ >= 0) {
      PosixMmapFile::Close();
    }
  }

  virtual Status Append(const Slice& data) {
    const char* src = data.data();
    size_t left = data.size();
375
    TEST_KILL_RANDOM(leveldb_kill_odds * REDUCE_ODDS);
376
    PrepareWrite(GetFileSize(), left);
J
jorlow@chromium.org 已提交
377 378 379 380 381
    while (left > 0) {
      assert(base_ <= dst_);
      assert(dst_ <= limit_);
      size_t avail = limit_ - dst_;
      if (avail == 0) {
A
Abhishek Kona 已提交
382 383 384 385 386
        if (UnmapCurrentRegion()) {
          Status s = MapNewRegion();
          if (!s.ok()) {
            return s;
          }
387
          TEST_KILL_RANDOM(leveldb_kill_odds);
388
        }
J
jorlow@chromium.org 已提交
389 390 391 392 393 394 395 396
      }

      size_t n = (left <= avail) ? left : avail;
      memcpy(dst_, src, n);
      dst_ += n;
      src += n;
      left -= n;
    }
397
    TEST_KILL_RANDOM(leveldb_kill_odds);
J
jorlow@chromium.org 已提交
398 399 400 401 402 403
    return Status::OK();
  }

  virtual Status Close() {
    Status s;
    size_t unused = limit_ - dst_;
404 405 406

    TEST_KILL_RANDOM(leveldb_kill_odds);

407 408 409
    if (!UnmapCurrentRegion()) {
      s = IOError(filename_, errno);
    } else if (unused > 0) {
J
jorlow@chromium.org 已提交
410 411
      // Trim the extra space at the end of the file
      if (ftruncate(fd_, file_offset_ - unused) < 0) {
412
        s = IOError(filename_, errno);
J
jorlow@chromium.org 已提交
413 414 415
      }
    }

416 417
    TEST_KILL_RANDOM(leveldb_kill_odds);

J
jorlow@chromium.org 已提交
418 419
    if (close(fd_) < 0) {
      if (s.ok()) {
420
        s = IOError(filename_, errno);
J
jorlow@chromium.org 已提交
421 422 423 424
      }
    }

    fd_ = -1;
A
Abhishek Kona 已提交
425 426
    base_ = nullptr;
    limit_ = nullptr;
J
jorlow@chromium.org 已提交
427 428 429 430
    return s;
  }

  virtual Status Flush() {
431
    TEST_KILL_RANDOM(leveldb_kill_odds);
J
jorlow@chromium.org 已提交
432 433 434 435 436 437 438 439
    return Status::OK();
  }

  virtual Status Sync() {
    Status s;

    if (pending_sync_) {
      // Some unmapped data was not synced
440
      TEST_KILL_RANDOM(leveldb_kill_odds);
J
jorlow@chromium.org 已提交
441 442
      pending_sync_ = false;
      if (fdatasync(fd_) < 0) {
443
        s = IOError(filename_, errno);
J
jorlow@chromium.org 已提交
444
      }
445
      TEST_KILL_RANDOM(leveldb_kill_odds * REDUCE_ODDS);
J
jorlow@chromium.org 已提交
446 447 448 449 450 451 452 453
    }

    if (dst_ > last_sync_) {
      // Find the beginnings of the pages that contain the first and last
      // bytes to be synced.
      size_t p1 = TruncateToPageBoundary(last_sync_ - base_);
      size_t p2 = TruncateToPageBoundary(dst_ - base_ - 1);
      last_sync_ = dst_;
454
      TEST_KILL_RANDOM(leveldb_kill_odds);
J
jorlow@chromium.org 已提交
455
      if (msync(base_ + p1, p2 - p1 + page_size_, MS_SYNC) < 0) {
456
        s = IOError(filename_, errno);
J
jorlow@chromium.org 已提交
457
      }
458
      TEST_KILL_RANDOM(leveldb_kill_odds);
J
jorlow@chromium.org 已提交
459 460 461 462
    }

    return s;
  }
463 464 465 466 467 468 469

  /**
   * Flush data as well as metadata to stable storage.
   */
  virtual Status Fsync() {
    if (pending_sync_) {
      // Some unmapped data was not synced
470
      TEST_KILL_RANDOM(leveldb_kill_odds);
471 472 473 474
      pending_sync_ = false;
      if (fsync(fd_) < 0) {
        return IOError(filename_, errno);
      }
475
      TEST_KILL_RANDOM(leveldb_kill_odds);
476 477 478 479 480
    }
    // This invocation to Sync will not issue the call to
    // fdatasync because pending_sync_ has already been cleared.
    return Sync();
  }
481 482 483 484 485 486 487 488 489 490

  /**
   * Get the size of valid data in the file. This will not match the
   * size that is returned from the filesystem because we use mmap
   * to extend file by map_size every time.
   */
  virtual uint64_t GetFileSize() {
    size_t used = dst_ - base_;
    return file_offset_ + used;
  }
491

492
#ifdef OS_LINUX
493
  virtual Status Allocate(off_t offset, off_t len) {
494
    TEST_KILL_RANDOM(leveldb_kill_odds);
495 496 497 498 499 500
    if (!fallocate(fd_, FALLOC_FL_KEEP_SIZE, offset, len)) {
      return Status::OK();
    } else {
      return IOError(filename_, errno);
    }
  }
501
#endif
J
jorlow@chromium.org 已提交
502 503
};

504 505 506 507 508 509 510
// Use posix write to write data to a file.
class PosixWritableFile : public WritableFile {
 private:
  const std::string filename_;
  int fd_;
  size_t cursize_;      // current size of cached data in buf_
  size_t capacity_;     // max size of buf_
M
Mayank Agarwal 已提交
511
  unique_ptr<char[]> buf_;           // a buffer to cache writes
512 513 514 515 516
  uint64_t filesize_;
  bool pending_sync_;
  bool pending_fsync_;

 public:
517 518
  PosixWritableFile(const std::string& fname, int fd, size_t capacity,
                    const EnvOptions& options) :
519 520 521 522 523 524 525 526
    filename_(fname),
    fd_(fd),
    cursize_(0),
    capacity_(capacity),
    buf_(new char[capacity]),
    filesize_(0),
    pending_sync_(false),
    pending_fsync_(false) {
527
    assert(!options.UseMmapWrites());
528 529 530 531 532 533 534 535 536 537 538 539 540 541 542
  }

  ~PosixWritableFile() {
    if (fd_ >= 0) {
      PosixWritableFile::Close();
    }
  }

  virtual Status Append(const Slice& data) {
    char* src = (char *)data.data();
    size_t left = data.size();
    Status s;
    pending_sync_ = true;
    pending_fsync_ = true;

543 544
    TEST_KILL_RANDOM(leveldb_kill_odds * REDUCE_ODDS2);

545
    PrepareWrite(GetFileSize(), left);
546 547 548 549 550 551 552 553 554
    // if there is no space in the cache, then flush
    if (cursize_ + left > capacity_) {
      s = Flush();
      if (!s.ok()) {
        return s;
      }
      // Increase the buffer size, but capped at 1MB
      if (capacity_ < (1<<20)) {
        capacity_ *= 2;
M
Mayank Agarwal 已提交
555
        buf_.reset(new char[capacity_]);
556 557 558 559 560 561 562
      }
      assert(cursize_ == 0);
    }

    // if the write fits into the cache, then write to cache
    // otherwise do a write() syscall to write to OS buffers.
    if (cursize_ + left <= capacity_) {
M
Mayank Agarwal 已提交
563
      memcpy(buf_.get()+cursize_, src, left);
564 565 566
      cursize_ += left;
    } else {
      while (left != 0) {
C
Chip Turner 已提交
567
        ssize_t done = write(fd_, src, left);
568 569 570
        if (done < 0) {
          return IOError(filename_, errno);
        }
571 572
        TEST_KILL_RANDOM(leveldb_kill_odds);

573 574 575 576 577 578 579 580 581 582 583 584 585
        left -= done;
        src += done;
      }
    }
    filesize_ += data.size();
    return Status::OK();
  }

  virtual Status Close() {
    Status s;
    s = Flush(); // flush cache to OS
    if (!s.ok()) {
    }
586 587 588

    TEST_KILL_RANDOM(leveldb_kill_odds);

589 590 591 592 593 594 595 596 597 598 599
    if (close(fd_) < 0) {
      if (s.ok()) {
        s = IOError(filename_, errno);
      }
    }
    fd_ = -1;
    return s;
  }

  // write out the cached data to the OS cache
  virtual Status Flush() {
600
    TEST_KILL_RANDOM(leveldb_kill_odds * REDUCE_ODDS2);
601
    size_t left = cursize_;
M
Mayank Agarwal 已提交
602
    char* src = buf_.get();
603
    while (left != 0) {
C
Chip Turner 已提交
604
      ssize_t done = write(fd_, src, left);
605 606 607
      if (done < 0) {
        return IOError(filename_, errno);
      }
608
      TEST_KILL_RANDOM(leveldb_kill_odds * REDUCE_ODDS2);
609 610 611 612 613 614 615 616
      left -= done;
      src += done;
    }
    cursize_ = 0;
    return Status::OK();
  }

  virtual Status Sync() {
617
    TEST_KILL_RANDOM(leveldb_kill_odds);
618 619 620
    if (pending_sync_ && fdatasync(fd_) < 0) {
      return IOError(filename_, errno);
    }
621
    TEST_KILL_RANDOM(leveldb_kill_odds);
622 623 624 625 626
    pending_sync_ = false;
    return Status::OK();
  }

  virtual Status Fsync() {
627
    TEST_KILL_RANDOM(leveldb_kill_odds);
628 629 630
    if (pending_fsync_ && fsync(fd_) < 0) {
      return IOError(filename_, errno);
    }
631
    TEST_KILL_RANDOM(leveldb_kill_odds);
632 633 634 635 636 637 638 639
    pending_fsync_ = false;
    pending_sync_ = false;
    return Status::OK();
  }

  virtual uint64_t GetFileSize() {
    return filesize_;
  }
640

641
#ifdef OS_LINUX
642
  virtual Status Allocate(off_t offset, off_t len) {
643
    TEST_KILL_RANDOM(leveldb_kill_odds);
644 645 646 647 648 649
    if (!fallocate(fd_, FALLOC_FL_KEEP_SIZE, offset, len)) {
      return Status::OK();
    } else {
      return IOError(filename_, errno);
    }
  }
650
#endif
651 652
};

653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674
// Acquires (lock == true) or releases (lock == false) a whole-file fcntl()
// write lock on fd, while keeping the lockedFiles set in step under
// mutex_lockedFiles. Returns the fcntl() result, or -1 with errno set to
// ENOLCK when the bookkeeping rejects the request.
static int LockOrUnlock(const std::string& fname, int fd, bool lock) {
  mutex_lockedFiles.Lock();

  // Bookkeeping comes first. Locking: the name must not be in lockedFiles
  // yet (fcntl() alone does not detect a conflict with a lock acquired
  // earlier by the same thread). Unlocking: the name must be present, and it
  // is removed here.
  const bool bookkeeping_ok = lock ? lockedFiles.insert(fname).second
                                   : (lockedFiles.erase(fname) == 1);
  if (!bookkeeping_ok) {
    mutex_lockedFiles.Unlock();
    errno = ENOLCK;
    return -1;
  }

  errno = 0;
  struct flock request;
  memset(&request, 0, sizeof(request));
  request.l_type = (lock ? F_WRLCK : F_UNLCK);
  request.l_whence = SEEK_SET;
  request.l_start = 0;
  request.l_len = 0;        // Lock/unlock entire file

  const int rc = fcntl(fd, F_SETLK, &request);
  if (lock && rc == -1) {
    // Locking failed: undo the insertion made above.
    lockedFiles.erase(fname);
  }
  mutex_lockedFiles.Unlock();
  return rc;
}

// Lock handle produced by PosixEnv::LockFile and consumed by
// PosixEnv::UnlockFile.
class PosixFileLock : public FileLock {
 public:
  int fd_;               // descriptor holding the fcntl() lock
  std::string filename;  // name under which the lock is tracked
};

class PosixEnv : public Env {
 public:
  PosixEnv();
700 701 702

  // Calls WaitForBGThreads() before the environment goes away.
  virtual ~PosixEnv() {
    WaitForBGThreads();
  }

705 706 707 708 709 710
  // Sets FD_CLOEXEC on fd when options is null or
  // options->IsFDCloseOnExec() is true. Negative descriptors are ignored and
  // fcntl() failures are not reported (best effort).
  void SetFD_CLOEXEC(int fd, const EnvOptions* options) {
    // Descriptor 0 is valid, so only negative values are rejected.
    if (fd < 0) {
      return;
    }
    if (options != nullptr && !options->IsFDCloseOnExec()) {
      return;
    }
    const int flags = fcntl(fd, F_GETFD);
    if (flags == -1) {
      // Do not OR the error value into the flags passed to F_SETFD.
      return;
    }
    fcntl(fd, F_SETFD, flags | FD_CLOEXEC);
  }

J
jorlow@chromium.org 已提交
711
  // Opens fname for reading with fopen() and hands back a
  // PosixSequentialFile in *result. On failure *result stays empty and an
  // IOError built from errno is returned.
  virtual Status NewSequentialFile(const std::string& fname,
                                   unique_ptr<SequentialFile>* result,
                                   const EnvOptions& options) {
    result->reset();
    FILE* const stream = fopen(fname.c_str(), "r");
    if (stream == nullptr) {
      return IOError(fname, errno);
    }
    SetFD_CLOEXEC(fileno(stream), &options);
    result->reset(new PosixSequentialFile(fname, stream, options));
    return Status::OK();
  }

  // Opens fname read-only and stores a random-access reader in *result:
  // an mmap-backed one when the options request mmap reads and pointers are
  // at least 8 bytes wide, a pread()-based one otherwise. On failure *result
  // stays empty.
  virtual Status NewRandomAccessFile(const std::string& fname,
                                     unique_ptr<RandomAccessFile>* result,
                                     const EnvOptions& options) {
    result->reset();
    const int fd = open(fname.c_str(), O_RDONLY);
    if (fd < 0) {
      return IOError(fname, errno);
    }
    SetFD_CLOEXEC(fd, &options);

    const bool map_file = options.UseMmapReads() && sizeof(void*) >= 8;
    if (!map_file) {
      // The pread()-based reader takes over the descriptor.
      result->reset(new PosixRandomAccessFile(fname, fd, options));
      return Status::OK();
    }

    // Use mmap when virtual address-space is plentiful.
    uint64_t size;
    Status s = GetFileSize(fname, &size);
    if (s.ok()) {
      void* base = mmap(nullptr, size, PROT_READ, MAP_SHARED, fd, 0);
      if (base == MAP_FAILED) {
        s = IOError(fname, errno);
      } else {
        result->reset(new PosixMmapReadableFile(fname, base, size, options));
      }
    }
    // The descriptor is closed on this path whether or not mapping worked.
    close(fd);
    return s;
  }

  // Creates (or truncates) fname and stores a writer in *result: a
  // PosixMmapFile when the options request mmap writes and mmap has not
  // been forced off, a buffered PosixWritableFile otherwise.
  virtual Status NewWritableFile(const std::string& fname,
                                 unique_ptr<WritableFile>* result,
                                 const EnvOptions& options) {
    result->reset();
    const int fd = open(fname.c_str(), O_CREAT | O_RDWR | O_TRUNC, 0644);
    if (fd < 0) {
      return IOError(fname, errno);
    }
    SetFD_CLOEXEC(fd, &options);

    const bool wants_mmap = options.UseMmapWrites();
    if (wants_mmap && !checkedDiskForMmap_) {
      // this will be executed once in the program's lifetime.
      // do not use mmapWrite on non ext-3/xfs/tmpfs systems.
      if (!SupportsFastAllocate(fname)) {
        forceMmapOff = true;
      }
      checkedDiskForMmap_ = true;
    }

    if (wants_mmap && !forceMmapOff) {
      result->reset(new PosixMmapFile(fname, fd, page_size_, options));
    } else {
      result->reset(new PosixWritableFile(fname, fd, 65536, options));
    }
    return Status::OK();
  }

  // True when access(fname, F_OK) succeeds.
  virtual bool FileExists(const std::string& fname) {
    const int rc = access(fname.c_str(), F_OK);
    return rc == 0;
  }

  // Replaces *result with the name of every entry readdir() reports for
  // dir. Returns an IOError built from errno when the directory cannot be
  // opened.
  virtual Status GetChildren(const std::string& dir,
                             std::vector<std::string>* result) {
    result->clear();
    DIR* const handle = opendir(dir.c_str());
    if (handle == nullptr) {
      return IOError(dir, errno);
    }
    for (struct dirent* item = readdir(handle); item != nullptr;
         item = readdir(handle)) {
      result->push_back(item->d_name);
    }
    closedir(handle);
    return Status::OK();
  }

  // Removes fname with unlink(); a failure becomes an IOError built from
  // errno.
  virtual Status DeleteFile(const std::string& fname) {
    if (unlink(fname.c_str()) == 0) {
      return Status::OK();
    }
    return IOError(fname, errno);
  }

  // Creates directory `name` with mode 0755; any mkdir() failure, including
  // an already existing path, becomes an IOError built from errno.
  virtual Status CreateDir(const std::string& name) {
    if (mkdir(name.c_str(), 0755) == 0) {
      return Status::OK();
    }
    return IOError(name, errno);
  }

821 822 823 824 825
  // Creates directory `name` with mode 0755. An already existing directory
  // is not an error; an existing path for which DirExists() is false is.
  virtual Status CreateDirIfMissing(const std::string& name) {
    if (mkdir(name.c_str(), 0755) == 0) {
      return Status::OK();
    }
    if (errno != EEXIST) {
      return IOError(name, errno);
    }
    // Check that name is actually a directory.
    if (!DirExists(name)) {
      // Message is taken from mkdir
      return Status::IOError("`"+name+"' exists but is not a directory");
    }
    return Status::OK();
  }

J
jorlow@chromium.org 已提交
835 836 837
  // Removes directory `name` with rmdir(); a failure becomes an IOError
  // built from errno.
  virtual Status DeleteDir(const std::string& name) {
    if (rmdir(name.c_str()) == 0) {
      return Status::OK();
    }
    return IOError(name, errno);
  }

  // Stores the st_size reported by stat() for fname in *size. On failure
  // *size is set to 0 and an IOError built from errno is returned.
  virtual Status GetFileSize(const std::string& fname, uint64_t* size) {
    struct stat info;
    if (stat(fname.c_str(), &info) == 0) {
      *size = info.st_size;
      return Status::OK();
    }
    *size = 0;
    return IOError(fname, errno);
  }

855 856 857 858 859 860 861 862 863
  // Stores the st_mtime reported by stat() for fname in *file_mtime.
  // *file_mtime is left untouched when stat() fails.
  virtual Status GetFileModificationTime(const std::string& fname,
                                         uint64_t* file_mtime) {
    struct stat info;
    const int rc = stat(fname.c_str(), &info);
    if (rc != 0) {
      return IOError(fname, errno);
    }
    *file_mtime = static_cast<uint64_t>(info.st_mtime);
    return Status::OK();
  }
J
jorlow@chromium.org 已提交
864 865 866
  // Renames src to target with rename(). The IOError returned on failure
  // names only src as its context.
  virtual Status RenameFile(const std::string& src, const std::string& target) {
    if (rename(src.c_str(), target.c_str()) == 0) {
      return Status::OK();
    }
    return IOError(src, errno);
  }

  // Opens (creating if needed) fname and takes a lock on it through
  // LockOrUnlock(). On success *lock receives a heap-allocated PosixFileLock
  // that UnlockFile() deletes; on failure *lock is nullptr.
  virtual Status LockFile(const std::string& fname, FileLock** lock) {
    *lock = nullptr;

    const int fd = open(fname.c_str(), O_RDWR | O_CREAT, 0644);
    if (fd < 0) {
      return IOError(fname, errno);
    }

    if (LockOrUnlock(fname, fd, true) == -1) {
      // Capture errno in the status before close() gets a chance to
      // change it.
      Status failure = IOError("lock " + fname, errno);
      close(fd);
      return failure;
    }

    SetFD_CLOEXEC(fd, nullptr);
    PosixFileLock* const held = new PosixFileLock;
    held->fd_ = fd;
    held->filename = fname;
    *lock = held;
    return Status::OK();
  }

  // Releases a lock obtained from LockFile(). The descriptor is closed and
  // the lock object deleted even when releasing the lock reports an error.
  virtual Status UnlockFile(FileLock* lock) {
    PosixFileLock* const held = reinterpret_cast<PosixFileLock*>(lock);
    Status outcome;
    const int rc = LockOrUnlock(held->filename, held->fd_, false);
    if (rc == -1) {
      outcome = IOError("unlock", errno);
    }
    close(held->fd_);
    delete held;
    return outcome;
  }

  virtual void Schedule(void (*function)(void*), void* arg);

904 905
  virtual void WaitForBGThreads();

J
jorlow@chromium.org 已提交
906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921
  virtual void StartThread(void (*function)(void* arg), void* arg);

  // Stores the test directory in *result: $TEST_TMPDIR when it is set and
  // non-empty, otherwise /tmp/leveldbtest-<effective uid>. The directory is
  // created on a best-effort basis and OK is always returned.
  virtual Status GetTestDirectory(std::string* result) {
    const char* const override_dir = getenv("TEST_TMPDIR");
    if (override_dir != nullptr && override_dir[0] != '\0') {
      *result = override_dir;
    } else {
      char fallback[100];
      snprintf(fallback, sizeof(fallback), "/tmp/leveldbtest-%d",
               int(geteuid()));
      *result = fallback;
    }
    // Directory may already exist
    CreateDir(*result);
    return Status::OK();
  }

922
  // Returns an integer id for the calling thread, made by copying the
  // leading bytes of pthread_self() (at most sizeof(uint64_t) of them) into
  // a zero-initialized uint64_t.
  static uint64_t gettid() {
    const pthread_t self = pthread_self();
    uint64_t id = 0;
    const size_t copy_len = std::min(sizeof(id), sizeof(self));
    memcpy(&id, &self, copy_len);
    return id;
  }
J
jorlow@chromium.org 已提交
928

929 930
  // Opens fname for writing (truncating it) and stores a PosixLogger bound
  // to that stream in *result. On failure *result is reset and an IOError
  // built from errno is returned.
  virtual Status NewLogger(const std::string& fname,
                           shared_ptr<Logger>* result) {
    FILE* const stream = fopen(fname.c_str(), "w");
    if (stream == nullptr) {
      result->reset();
      return IOError(fname, errno);
    }
    SetFD_CLOEXEC(fileno(stream), nullptr);
    result->reset(new PosixLogger(stream, &PosixEnv::gettid));
    return Status::OK();
  }

  // Returns the gettimeofday() wall-clock reading expressed in
  // microseconds (seconds * 1,000,000 + microseconds).
  virtual uint64_t NowMicros() {
    struct timeval tv;
    gettimeofday(&tv, nullptr);
    return static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
  }

  // Suspends the calling thread by handing `micros` straight to usleep().
  virtual void SleepForMicroseconds(int micros) { usleep(micros); }

953
  // Writes the host name into `name`, a caller-owned buffer of `len` bytes,
  // using gethostname().
  // Returns InvalidArgument when gethostname() fails with EFAULT or EINVAL,
  // an IOError for any other failure, and OK otherwise.
  virtual Status GetHostName(char* name, uint64_t len) {
    int ret = gethostname(name, len);
    if (ret < 0) {
      // Read errno once so later library calls cannot change the value
      // that is classified and reported.
      const int err = errno;
      if (err == EFAULT || err == EINVAL)
        return Status::InvalidArgument(strerror(err));
      else
        return IOError("GetHostName", err);
    }
    return Status::OK();
  }

  // Stores the current time() value (seconds) in *unix_time.
  // Returns an IOError built from errno when time() reports failure, in
  // which case *unix_time is left untouched.
  virtual Status GetCurrentTime(int64_t* unix_time) {
    const time_t ret = time(nullptr);
    if (ret == static_cast<time_t>(-1)) {
      return IOError("GetCurrentTime", errno);
    }
    *unix_time = static_cast<int64_t>(ret);
    return Status::OK();
  }

  // Stores an absolute path for `db_path` in *output_path.
  // A `db_path` that begins with '/' is copied through unchanged. For any
  // other value the current working directory is stored; `db_path` itself
  // is not appended. Returns an IOError when getcwd() fails, which includes
  // a working directory that does not fit in the 256-byte buffer.
  virtual Status GetAbsolutePath(const std::string& db_path,
      std::string* output_path) {
    // Only the first character matters; avoid scanning the whole string.
    if (!db_path.empty() && db_path[0] == '/') {
      *output_path = db_path;
      return Status::OK();
    }

    char the_path[256];
    char* ret = getcwd(the_path, sizeof(the_path));
    if (ret == nullptr) {
      return Status::IOError(strerror(errno));
    }

    *output_path = ret;
    return Status::OK();
  }

A
Abhishek Kona 已提交
990
  // Allow increasing the number of worker threads.
991
  virtual void SetBackgroundThreads(int num) {
H
Haobo Xu 已提交
992
    PthreadCall("lock", pthread_mutex_lock(&mu_));
993 994 995 996
    if (num > num_threads_) {
      num_threads_ = num;
      bgthread_.resize(num_threads_);
    }
H
Haobo Xu 已提交
997
    PthreadCall("unlock", pthread_mutex_unlock(&mu_));
998 999
  }

1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019
  // Formats `secondsSince1970` in local time as "YYYY/MM/DD-HH:MM:SS "
  // (with a trailing space). The returned string holds exactly the
  // formatted characters, with no padding bytes after them; an empty string
  // is returned if formatting fails.
  virtual std::string TimeToString(uint64_t secondsSince1970) {
    const time_t seconds = static_cast<time_t>(secondsSince1970);
    struct tm t;
    // Zero the fields so a localtime_r() failure cannot leave them
    // uninitialized for the snprintf() below.
    memset(&t, 0, sizeof(t));
    localtime_r(&seconds, &t);

    char buf[64];
    const int written = snprintf(buf, sizeof(buf),
             "%04d/%02d/%02d-%02d:%02d:%02d ",
             t.tm_year + 1900,
             t.tm_mon + 1,
             t.tm_mday,
             t.tm_hour,
             t.tm_min,
             t.tm_sec);
    if (written <= 0) {
      return std::string();
    }
    // snprintf() reports the untruncated length; clamp to what was stored.
    size_t length = static_cast<size_t>(written);
    if (length >= sizeof(buf)) {
      length = sizeof(buf) - 1;
    }
    return std::string(buf, length);
  }

J
jorlow@chromium.org 已提交
1020
 private:
1021 1022
  bool checkedDiskForMmap_;
  bool forceMmapOff; // do we override Env options?
A
Abhishek Kona 已提交
1023

J
jorlow@chromium.org 已提交
1024 1025 1026 1027 1028 1029 1030
  // Checks the return code of a pthread call. A zero `result` is a no-op;
  // anything else prints "pthread <label>: <error text>" to stderr and
  // terminates the process with exit status 1.
  void PthreadCall(const char* label, int result) {
    if (result == 0) {
      return;
    }
    fprintf(stderr, "pthread %s: %s\n", label, strerror(result));
    exit(1);
  }

1031 1032 1033 1034 1035 1036 1037 1038 1039 1040
  // Reports whether `dname` can be stat()ed and refers to a directory.
  // A failing stat() yields false.
  virtual bool DirExists(const std::string& dname) {
    struct stat info;
    if (stat(dname.c_str(), &info) != 0) {
      return false;
    }
    return S_ISDIR(info.st_mode);
  }


J
jorlow@chromium.org 已提交
1041 1042 1043 1044
  // BGThread() is the body of the background thread
  void BGThread();

  // pthread entry point: `arg` is the PosixEnv* passed to pthread_create();
  // runs that object's BGThread() and returns nullptr when it finishes.
  static void* BGThreadWrapper(void* arg) {
    static_cast<PosixEnv*>(arg)->BGThread();
    return nullptr;
  }

A
Abhishek Kona 已提交
1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064
  // Returns true when statfs() reports that `path` lives on ext4, XFS or
  // tmpfs. Any other filesystem type, or a statfs() failure, yields false.
  bool SupportsFastAllocate(const std::string& path) {
    struct statfs fs_info;
    if (statfs(path.c_str(), &fs_info) != 0) {
      return false;
    }
    switch (fs_info.f_type) {
      case EXT4_SUPER_MAGIC:
      case XFS_SUPER_MAGIC:
      case TMPFS_MAGIC:
        return true;
      default:
        break;
    }
    return false;
  }

J
jorlow@chromium.org 已提交
1065 1066 1067
  size_t page_size_;
  pthread_mutex_t mu_;
  pthread_cond_t bgsignal_;
1068 1069 1070
  std::vector<pthread_t> bgthread_;
  int started_bgthread_;
  int num_threads_;
J
jorlow@chromium.org 已提交
1071 1072 1073 1074

  // Entry per Schedule() call: the callback to run and the opaque argument
  // that BGThread() passes to it.
  struct BGItem { void* arg; void (*function)(void*); };
  // Container type for pending items: Schedule() appends at the back and
  // BGThread() takes from the front.
  typedef std::deque<BGItem> BGQueue;
1075 1076
  int queue_size_; // number of items in BGQueue
  bool exit_all_threads_;
J
jorlow@chromium.org 已提交
1077
  BGQueue queue_;
1078
  std::vector<pthread_t> threads_to_join_;
J
jorlow@chromium.org 已提交
1079 1080
};

1081 1082 1083
// Sets up an environment with a single background-thread slot and no thread
// started yet, then initializes the mutex and condition variable that guard
// the work queue. A failing pthread init call ends the process through
// PthreadCall().
PosixEnv::PosixEnv() : checkedDiskForMmap_(false),
                       forceMmapOff(false),
                       page_size_(getpagesize()),
                       started_bgthread_(0),
                       num_threads_(1),
                       queue_size_(0),
                       exit_all_threads_(false) {
  PthreadCall("mutex_init", pthread_mutex_init(&mu_, nullptr));
  PthreadCall("cvar_init", pthread_cond_init(&bgsignal_, nullptr));
  bgthread_.resize(num_threads_);
}

1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104
// Raises the exit flag under the mutex, wakes every thread waiting on the
// condition variable, and then joins each thread recorded in
// threads_to_join_. The flag must not already be set when this is called.
void PosixEnv::WaitForBGThreads() {
  PthreadCall("lock", pthread_mutex_lock(&mu_));
  assert(! exit_all_threads_);
  exit_all_threads_ = true;
  PthreadCall("signalall", pthread_cond_broadcast(&bgsignal_));
  PthreadCall("unlock", pthread_mutex_unlock(&mu_));

  unsigned int idx = 0;
  while (idx < threads_to_join_.size()) {
    pthread_join(threads_to_join_[idx], nullptr);
    idx++;
  }
}

J
jorlow@chromium.org 已提交
1105 1106 1107
void PosixEnv::Schedule(void (*function)(void*), void* arg) {
  PthreadCall("lock", pthread_mutex_lock(&mu_));

1108 1109 1110 1111
  if (exit_all_threads_) {
    PthreadCall("unlock", pthread_mutex_unlock(&mu_));
    return;
  }
J
jorlow@chromium.org 已提交
1112
  // Start background thread if necessary
1113
  for (; started_bgthread_ < num_threads_; started_bgthread_++) {
J
jorlow@chromium.org 已提交
1114 1115
    PthreadCall(
        "create thread",
A
Abhishek Kona 已提交
1116 1117 1118 1119
        pthread_create(&bgthread_[started_bgthread_],
                       nullptr,
                       &PosixEnv::BGThreadWrapper,
                       this));
1120
    threads_to_join_.push_back(bgthread_[started_bgthread_]);
1121
    fprintf(stdout, "Created bg thread 0x%lx\n", bgthread_[started_bgthread_]);
J
jorlow@chromium.org 已提交
1122 1123 1124 1125 1126 1127
  }

  // Add to priority queue
  queue_.push_back(BGItem());
  queue_.back().function = function;
  queue_.back().arg = arg;
H
Haobo Xu 已提交
1128 1129 1130

  // always wake up at least one waiting thread.
  PthreadCall("signal", pthread_cond_signal(&bgsignal_));
J
jorlow@chromium.org 已提交
1131 1132 1133 1134 1135 1136 1137 1138

  PthreadCall("unlock", pthread_mutex_unlock(&mu_));
}

void PosixEnv::BGThread() {
  while (true) {
    // Wait until there is an item that is ready to run
    PthreadCall("lock", pthread_mutex_lock(&mu_));
1139
    while (queue_.empty() && !exit_all_threads_) {
J
jorlow@chromium.org 已提交
1140 1141
      PthreadCall("wait", pthread_cond_wait(&bgsignal_, &mu_));
    }
1142 1143 1144 1145
    if (exit_all_threads_) { // mechanism to let BG threads exit safely
      PthreadCall("unlock", pthread_mutex_unlock(&mu_));
      break;
    }
J
jorlow@chromium.org 已提交
1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164
    void (*function)(void*) = queue_.front().function;
    void* arg = queue_.front().arg;
    queue_.pop_front();

    PthreadCall("unlock", pthread_mutex_unlock(&mu_));
    (*function)(arg);
  }
}

namespace {
// Carries a user routine and its argument across pthread_create():
// allocated by StartThread() and deleted by StartThreadWrapper().
struct StartThreadState {
  void (*user_function)(void*);  // routine to run on the new thread
  void* arg;                     // opaque value passed to user_function
};
}  // namespace
// pthread entry point used by StartThread(): `arg` is a heap-allocated
// StartThreadState. Runs the stored routine with its argument, deletes the
// state afterwards, and returns nullptr.
static void* StartThreadWrapper(void* arg) {
  StartThreadState* state = static_cast<StartThreadState*>(arg);
  state->user_function(state->arg);
  delete state;
  return nullptr;
}

void PosixEnv::StartThread(void (*function)(void* arg), void* arg) {
  pthread_t t;
  StartThreadState* state = new StartThreadState;
  state->user_function = function;
  state->arg = arg;
  PthreadCall("start thread",
A
Abhishek Kona 已提交
1174
              pthread_create(&t, nullptr,  &StartThreadWrapper, state));
1175
  threads_to_join_.push_back(t);
J
jorlow@chromium.org 已提交
1176 1177
}

H
Hans Wennborg 已提交
1178
}  // namespace
J
jorlow@chromium.org 已提交
1179 1180

// Returns the process-wide PosixEnv. The instance is a function-local
// static, so every call returns the same pointer and the object is never
// deleted by callers.
Env* Env::Default() {
  static PosixEnv default_env;
  return &default_env;
}

H
Hans Wennborg 已提交
1185
}  // namespace leveldb