env_posix.cc 34.3 KB
Newer Older
J
jorlow@chromium.org 已提交
1 2 3 4 5
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#include <deque>
6
#include <set>
J
jorlow@chromium.org 已提交
7 8 9 10 11 12 13
#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
14
#include <sys/ioctl.h>
J
jorlow@chromium.org 已提交
15 16
#include <sys/mman.h>
#include <sys/stat.h>
A
Abhishek Kona 已提交
17
#include <sys/statfs.h>
J
jorlow@chromium.org 已提交
18 19
#include <sys/time.h>
#include <sys/types.h>
20
#include <sys/vfs.h>
J
jorlow@chromium.org 已提交
21 22
#include <time.h>
#include <unistd.h>
23 24
#if defined(OS_LINUX)
#include <linux/fs.h>
25
#include <fcntl.h>
26
#endif
J
jorlow@chromium.org 已提交
27 28 29
#if defined(LEVELDB_PLATFORM_ANDROID)
#include <sys/stat.h>
#endif
30 31
#include "rocksdb/env.h"
#include "rocksdb/slice.h"
J
jorlow@chromium.org 已提交
32
#include "port/port.h"
33
#include "util/coding.h"
J
jorlow@chromium.org 已提交
34
#include "util/logging.h"
35
#include "util/posix_logger.h"
36 37
#include "util/random.h"
#include <signal.h>
J
jorlow@chromium.org 已提交
38

A
Abhishek Kona 已提交
39 40 41 42 43 44 45 46 47 48
#if !defined(TMPFS_MAGIC)
#define TMPFS_MAGIC 0x01021994
#endif
#if !defined(XFS_SUPER_MAGIC)
#define XFS_SUPER_MAGIC 0x58465342
#endif
#if !defined(EXT4_SUPER_MAGIC)
#define EXT4_SUPER_MAGIC 0xEF53
#endif

49 50
// This is only set from db_stress.cc and for testing only.
// If non-zero, kill at various points in source code with probability 1/this
51
int rocksdb_kill_odds = 0;
52

53
namespace rocksdb {
J
jorlow@chromium.org 已提交
54

55

J
jorlow@chromium.org 已提交
56 57
namespace {

58 59 60 61
// list of pathnames that are locked
static std::set<std::string> lockedFiles;
static port::Mutex mutex_lockedFiles;

62 63 64 65
static Status IOError(const std::string& context, int err_number) {
  return Status::IOError(context, strerror(err_number));
}

66 67
#ifdef NDEBUG
// empty in release build
68
#define TEST_KILL_RANDOM(rocksdb_kill_odds)
69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90
#else

// Kill the process with probablity 1/odds for testing.
static void TestKillRandom(int odds, const std::string& srcfile,
                           int srcline) {
  time_t curtime = time(nullptr);
  Random r((uint32_t)curtime);

  assert(odds > 0);
  bool crash = r.OneIn(odds);
  if (crash) {
    fprintf(stdout, "Crashing at %s:%d\n", srcfile.c_str(), srcline);
    fflush(stdout);
    kill(getpid(), SIGTERM);
  }
}

// To avoid crashing always at some frequently executed codepaths (during
// kill random test), use this factor to reduce odds
#define REDUCE_ODDS 2
#define REDUCE_ODDS2 4

91 92 93
#define TEST_KILL_RANDOM(rocksdb_kill_odds) {   \
  if (rocksdb_kill_odds > 0) { \
    TestKillRandom(rocksdb_kill_odds, __FILE__, __LINE__);     \
94 95 96 97 98
  } \
}

#endif

J
jorlow@chromium.org 已提交
99 100 101 102
class PosixSequentialFile: public SequentialFile {
 private:
  std::string filename_;
  FILE* file_;
103
  int fd_;
104
  bool use_os_buffer_;
J
jorlow@chromium.org 已提交
105 106

 public:
107 108
  PosixSequentialFile(const std::string& fname, FILE* f,
      const EnvOptions& options)
109
      : filename_(fname), file_(f), fd_(fileno(f)),
H
Haobo Xu 已提交
110
        use_os_buffer_(options.use_os_buffer) {
111
  }
J
jorlow@chromium.org 已提交
112 113 114 115 116 117 118 119 120 121 122
  virtual ~PosixSequentialFile() { fclose(file_); }

  virtual Status Read(size_t n, Slice* result, char* scratch) {
    Status s;
    size_t r = fread_unlocked(scratch, 1, n, file_);
    *result = Slice(scratch, r);
    if (r < n) {
      if (feof(file_)) {
        // We leave status as ok if we hit the end of the file
      } else {
        // A partial read with an error: return a non-ok status
123
        s = IOError(filename_, errno);
J
jorlow@chromium.org 已提交
124 125
      }
    }
126
    if (!use_os_buffer_) {
127 128 129 130
      // we need to fadvise away the entire range of pages because
      // we do not want readahead pages to be cached.
      posix_fadvise(fd_, 0, 0, POSIX_FADV_DONTNEED); // free OS pages
    }
J
jorlow@chromium.org 已提交
131 132
    return s;
  }
133 134 135

  virtual Status Skip(uint64_t n) {
    if (fseek(file_, n, SEEK_CUR)) {
136
      return IOError(filename_, errno);
137 138 139
    }
    return Status::OK();
  }
140 141 142 143 144 145 146 147 148

  virtual Status InvalidateCache(size_t offset, size_t length) {
    // free OS pages
    int ret = posix_fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
    if (ret == 0) {
      return Status::OK();
    }
    return IOError(filename_, errno);
  }
J
jorlow@chromium.org 已提交
149 150
};

151
// pread() based random-access
J
jorlow@chromium.org 已提交
152 153 154 155
class PosixRandomAccessFile: public RandomAccessFile {
 private:
  std::string filename_;
  int fd_;
156
  bool use_os_buffer_;
J
jorlow@chromium.org 已提交
157 158

 public:
159 160
  PosixRandomAccessFile(const std::string& fname, int fd,
                        const EnvOptions& options)
H
Haobo Xu 已提交
161 162
      : filename_(fname), fd_(fd), use_os_buffer_(options.use_os_buffer) {
    assert(!options.use_mmap_reads);
163
  }
J
jorlow@chromium.org 已提交
164 165 166 167 168 169 170 171 172
  virtual ~PosixRandomAccessFile() { close(fd_); }

  virtual Status Read(uint64_t offset, size_t n, Slice* result,
                      char* scratch) const {
    Status s;
    ssize_t r = pread(fd_, scratch, n, static_cast<off_t>(offset));
    *result = Slice(scratch, (r < 0) ? 0 : r);
    if (r < 0) {
      // An error: return a non-ok status
173
      s = IOError(filename_, errno);
J
jorlow@chromium.org 已提交
174
    }
175
    if (!use_os_buffer_) {
176 177 178
      // we need to fadvise away the entire range of pages because
      // we do not want readahead pages to be cached.
      posix_fadvise(fd_, 0, 0, POSIX_FADV_DONTNEED); // free OS pages
179
    }
J
jorlow@chromium.org 已提交
180 181
    return s;
  }
182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210

#if defined(OS_LINUX)
  virtual size_t GetUniqueId(char* id, size_t max_size) const {
    // TODO: possibly allow this function to handle tighter bounds.
    if (max_size < kMaxVarint64Length*3) {
      return 0;
    }

    struct stat buf;
    int result = fstat(fd_, &buf);
    if (result == -1) {
      return 0;
    }

    long version = 0;
    result = ioctl(fd_, FS_IOC_GETVERSION, &version);
    if (result == -1) {
      return 0;
    }
    uint64_t uversion = (uint64_t)version;

    char* rid = id;
    rid = EncodeVarint64(rid, buf.st_dev);
    rid = EncodeVarint64(rid, buf.st_ino);
    rid = EncodeVarint64(rid, uversion);
    assert(rid >= id);
    return static_cast<size_t>(rid-id);
  }
#endif
211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234

  virtual void Hint(AccessPattern pattern) {
    switch(pattern) {
      case NORMAL:
        posix_fadvise(fd_, 0, 0, POSIX_FADV_NORMAL);
        break;
      case RANDOM:
        posix_fadvise(fd_, 0, 0, POSIX_FADV_RANDOM);
        break;
      case SEQUENTIAL:
        posix_fadvise(fd_, 0, 0, POSIX_FADV_SEQUENTIAL);
        break;
      case WILLNEED:
        posix_fadvise(fd_, 0, 0, POSIX_FADV_WILLNEED);
        break;
      case DONTNEED:
        posix_fadvise(fd_, 0, 0, POSIX_FADV_DONTNEED);
        break;
      default:
        assert(false);
        break;
    }
  }

235 236 237 238 239 240 241 242
  virtual Status InvalidateCache(size_t offset, size_t length) {
    // free OS pages
    int ret = posix_fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
    if (ret == 0) {
      return Status::OK();
    }
    return IOError(filename_, errno);
  }
J
jorlow@chromium.org 已提交
243 244
};

245 246 247
// mmap() based random-access
class PosixMmapReadableFile: public RandomAccessFile {
 private:
248
  int fd_;
249 250 251 252 253 254
  std::string filename_;
  void* mmapped_region_;
  size_t length_;

 public:
  // base[0,length-1] contains the mmapped contents of the file.
255 256
  PosixMmapReadableFile(const int fd, const std::string& fname,
                        void* base, size_t length,
257
                        const EnvOptions& options)
258
      : fd_(fd), filename_(fname), mmapped_region_(base), length_(length) {
H
Haobo Xu 已提交
259 260
    assert(options.use_mmap_reads);
    assert(options.use_os_buffer);
261
  }
262 263 264 265 266 267 268 269 270 271 272 273 274
  virtual ~PosixMmapReadableFile() { munmap(mmapped_region_, length_); }

  virtual Status Read(uint64_t offset, size_t n, Slice* result,
                      char* scratch) const {
    Status s;
    if (offset + n > length_) {
      *result = Slice();
      s = IOError(filename_, EINVAL);
    } else {
      *result = Slice(reinterpret_cast<char*>(mmapped_region_) + offset, n);
    }
    return s;
  }
275 276 277 278 279 280 281 282
  virtual Status InvalidateCache(size_t offset, size_t length) {
    // free OS pages
    int ret = posix_fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
    if (ret == 0) {
      return Status::OK();
    }
    return IOError(filename_, errno);
  }
283 284
};

J
jorlow@chromium.org 已提交
285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314
// We preallocate up to an extra megabyte and use memcpy to append new
// data to the file.  This is safe since we either properly close the
// file before reading from it, or for log files, the reading code
// knows enough to skip zero suffixes.
class PosixMmapFile : public WritableFile {
 private:
  std::string filename_;
  int fd_;
  size_t page_size_;
  size_t map_size_;       // How much extra memory to map at a time
  char* base_;            // The mapped region
  char* limit_;           // Limit of the mapped region
  char* dst_;             // Where to write next  (in range [base_,limit_])
  char* last_sync_;       // Where have we synced up to
  uint64_t file_offset_;  // Offset of base_ in file

  // Have we done an munmap of unsynced data?
  bool pending_sync_;

  // Roundup x to a multiple of y
  static size_t Roundup(size_t x, size_t y) {
    return ((x + y - 1) / y) * y;
  }

  size_t TruncateToPageBoundary(size_t s) {
    s -= (s & (page_size_ - 1));
    assert((s % page_size_) == 0);
    return s;
  }

315 316
  bool UnmapCurrentRegion() {
    bool result = true;
317
    TEST_KILL_RANDOM(rocksdb_kill_odds);
A
Abhishek Kona 已提交
318
    if (base_ != nullptr) {
J
jorlow@chromium.org 已提交
319 320 321 322
      if (last_sync_ < limit_) {
        // Defer syncing this data until next Sync() call, if any
        pending_sync_ = true;
      }
323 324 325
      if (munmap(base_, limit_ - base_) != 0) {
        result = false;
      }
J
jorlow@chromium.org 已提交
326
      file_offset_ += limit_ - base_;
A
Abhishek Kona 已提交
327 328 329 330
      base_ = nullptr;
      limit_ = nullptr;
      last_sync_ = nullptr;
      dst_ = nullptr;
J
jorlow@chromium.org 已提交
331 332 333 334 335 336

      // Increase the amount we map the next time, but capped at 1MB
      if (map_size_ < (1<<20)) {
        map_size_ *= 2;
      }
    }
337
    return result;
J
jorlow@chromium.org 已提交
338 339
  }

A
Abhishek Kona 已提交
340
  Status MapNewRegion() {
A
Abhishek Kona 已提交
341
    assert(base_ == nullptr);
A
Abhishek Kona 已提交
342

343
    TEST_KILL_RANDOM(rocksdb_kill_odds);
A
Abhishek Kona 已提交
344 345 346 347
    int alloc_status = posix_fallocate(fd_, file_offset_, map_size_);
    if (alloc_status != 0) {
      return Status::IOError("Error allocating space to file : " + filename_ +
        "Error : " + strerror(alloc_status));
J
jorlow@chromium.org 已提交
348
    }
A
Abhishek Kona 已提交
349

350
    TEST_KILL_RANDOM(rocksdb_kill_odds);
A
Abhishek Kona 已提交
351
    void* ptr = mmap(nullptr, map_size_, PROT_READ | PROT_WRITE, MAP_SHARED,
J
jorlow@chromium.org 已提交
352 353
                     fd_, file_offset_);
    if (ptr == MAP_FAILED) {
A
Abhishek Kona 已提交
354
      return Status::IOError("MMap failed on " + filename_);
J
jorlow@chromium.org 已提交
355
    }
356

357
    TEST_KILL_RANDOM(rocksdb_kill_odds);
358

J
jorlow@chromium.org 已提交
359 360 361 362
    base_ = reinterpret_cast<char*>(ptr);
    limit_ = base_ + map_size_;
    dst_ = base_;
    last_sync_ = base_;
A
Abhishek Kona 已提交
363
    return Status::OK();
J
jorlow@chromium.org 已提交
364 365 366
  }

 public:
367 368
  PosixMmapFile(const std::string& fname, int fd, size_t page_size,
                const EnvOptions& options)
J
jorlow@chromium.org 已提交
369 370 371 372
      : filename_(fname),
        fd_(fd),
        page_size_(page_size),
        map_size_(Roundup(65536, page_size)),
A
Abhishek Kona 已提交
373 374 375 376
        base_(nullptr),
        limit_(nullptr),
        dst_(nullptr),
        last_sync_(nullptr),
J
jorlow@chromium.org 已提交
377 378 379
        file_offset_(0),
        pending_sync_(false) {
    assert((page_size & (page_size - 1)) == 0);
H
Haobo Xu 已提交
380
    assert(options.use_mmap_writes);
J
jorlow@chromium.org 已提交
381 382 383 384 385 386 387 388 389 390 391 392
  }


  ~PosixMmapFile() {
    if (fd_ >= 0) {
      PosixMmapFile::Close();
    }
  }

  virtual Status Append(const Slice& data) {
    const char* src = data.data();
    size_t left = data.size();
393
    TEST_KILL_RANDOM(rocksdb_kill_odds * REDUCE_ODDS);
394
    PrepareWrite(GetFileSize(), left);
J
jorlow@chromium.org 已提交
395 396 397 398 399
    while (left > 0) {
      assert(base_ <= dst_);
      assert(dst_ <= limit_);
      size_t avail = limit_ - dst_;
      if (avail == 0) {
A
Abhishek Kona 已提交
400 401 402 403 404
        if (UnmapCurrentRegion()) {
          Status s = MapNewRegion();
          if (!s.ok()) {
            return s;
          }
405
          TEST_KILL_RANDOM(rocksdb_kill_odds);
406
        }
J
jorlow@chromium.org 已提交
407 408 409 410 411 412 413 414
      }

      size_t n = (left <= avail) ? left : avail;
      memcpy(dst_, src, n);
      dst_ += n;
      src += n;
      left -= n;
    }
415
    TEST_KILL_RANDOM(rocksdb_kill_odds);
J
jorlow@chromium.org 已提交
416 417 418 419 420 421
    return Status::OK();
  }

  virtual Status Close() {
    Status s;
    size_t unused = limit_ - dst_;
422

423
    TEST_KILL_RANDOM(rocksdb_kill_odds);
424

425 426 427
    if (!UnmapCurrentRegion()) {
      s = IOError(filename_, errno);
    } else if (unused > 0) {
J
jorlow@chromium.org 已提交
428 429
      // Trim the extra space at the end of the file
      if (ftruncate(fd_, file_offset_ - unused) < 0) {
430
        s = IOError(filename_, errno);
J
jorlow@chromium.org 已提交
431 432 433
      }
    }

434
    TEST_KILL_RANDOM(rocksdb_kill_odds);
435

J
jorlow@chromium.org 已提交
436 437
    if (close(fd_) < 0) {
      if (s.ok()) {
438
        s = IOError(filename_, errno);
J
jorlow@chromium.org 已提交
439 440 441 442
      }
    }

    fd_ = -1;
A
Abhishek Kona 已提交
443 444
    base_ = nullptr;
    limit_ = nullptr;
J
jorlow@chromium.org 已提交
445 446 447 448
    return s;
  }

  virtual Status Flush() {
449
    TEST_KILL_RANDOM(rocksdb_kill_odds);
J
jorlow@chromium.org 已提交
450 451 452 453 454 455 456 457
    return Status::OK();
  }

  virtual Status Sync() {
    Status s;

    if (pending_sync_) {
      // Some unmapped data was not synced
458
      TEST_KILL_RANDOM(rocksdb_kill_odds);
J
jorlow@chromium.org 已提交
459 460
      pending_sync_ = false;
      if (fdatasync(fd_) < 0) {
461
        s = IOError(filename_, errno);
J
jorlow@chromium.org 已提交
462
      }
463
      TEST_KILL_RANDOM(rocksdb_kill_odds * REDUCE_ODDS);
J
jorlow@chromium.org 已提交
464 465 466 467 468 469 470 471
    }

    if (dst_ > last_sync_) {
      // Find the beginnings of the pages that contain the first and last
      // bytes to be synced.
      size_t p1 = TruncateToPageBoundary(last_sync_ - base_);
      size_t p2 = TruncateToPageBoundary(dst_ - base_ - 1);
      last_sync_ = dst_;
472
      TEST_KILL_RANDOM(rocksdb_kill_odds);
J
jorlow@chromium.org 已提交
473
      if (msync(base_ + p1, p2 - p1 + page_size_, MS_SYNC) < 0) {
474
        s = IOError(filename_, errno);
J
jorlow@chromium.org 已提交
475
      }
476
      TEST_KILL_RANDOM(rocksdb_kill_odds);
J
jorlow@chromium.org 已提交
477 478 479 480
    }

    return s;
  }
481 482 483 484 485 486 487

  /**
   * Flush data as well as metadata to stable storage.
   */
  virtual Status Fsync() {
    if (pending_sync_) {
      // Some unmapped data was not synced
488
      TEST_KILL_RANDOM(rocksdb_kill_odds);
489 490 491 492
      pending_sync_ = false;
      if (fsync(fd_) < 0) {
        return IOError(filename_, errno);
      }
493
      TEST_KILL_RANDOM(rocksdb_kill_odds);
494 495 496 497 498
    }
    // This invocation to Sync will not issue the call to
    // fdatasync because pending_sync_ has already been cleared.
    return Sync();
  }
499 500 501 502 503 504 505 506 507 508

  /**
   * Get the size of valid data in the file. This will not match the
   * size that is returned from the filesystem because we use mmap
   * to extend file by map_size every time.
   */
  virtual uint64_t GetFileSize() {
    size_t used = dst_ - base_;
    return file_offset_ + used;
  }
509

510 511 512 513 514 515 516 517 518
  virtual Status InvalidateCache(size_t offset, size_t length) {
    // free OS pages
    int ret = posix_fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
    if (ret == 0) {
      return Status::OK();
    }
    return IOError(filename_, errno);
  }

519
#ifdef OS_LINUX
520
  virtual Status Allocate(off_t offset, off_t len) {
521
    TEST_KILL_RANDOM(rocksdb_kill_odds);
522 523 524 525 526 527
    if (!fallocate(fd_, FALLOC_FL_KEEP_SIZE, offset, len)) {
      return Status::OK();
    } else {
      return IOError(filename_, errno);
    }
  }
528
#endif
J
jorlow@chromium.org 已提交
529 530
};

531 532 533 534 535 536 537
// Use posix write to write data to a file.
class PosixWritableFile : public WritableFile {
 private:
  const std::string filename_;
  int fd_;
  size_t cursize_;      // current size of cached data in buf_
  size_t capacity_;     // max size of buf_
M
Mayank Agarwal 已提交
538
  unique_ptr<char[]> buf_;           // a buffer to cache writes
539 540 541
  uint64_t filesize_;
  bool pending_sync_;
  bool pending_fsync_;
542
  uint64_t last_sync_size_;
H
Haobo Xu 已提交
543
  uint64_t bytes_per_sync_;
544 545

 public:
546 547
  PosixWritableFile(const std::string& fname, int fd, size_t capacity,
                    const EnvOptions& options) :
548 549 550 551 552 553 554
    filename_(fname),
    fd_(fd),
    cursize_(0),
    capacity_(capacity),
    buf_(new char[capacity]),
    filesize_(0),
    pending_sync_(false),
555
    pending_fsync_(false),
H
Haobo Xu 已提交
556 557
    last_sync_size_(0),
    bytes_per_sync_(options.bytes_per_sync) {
H
Haobo Xu 已提交
558
    assert(!options.use_mmap_writes);
559 560 561 562 563 564 565 566 567 568 569 570 571 572 573
  }

  ~PosixWritableFile() {
    if (fd_ >= 0) {
      PosixWritableFile::Close();
    }
  }

  virtual Status Append(const Slice& data) {
    char* src = (char *)data.data();
    size_t left = data.size();
    Status s;
    pending_sync_ = true;
    pending_fsync_ = true;

574
    TEST_KILL_RANDOM(rocksdb_kill_odds * REDUCE_ODDS2);
575

576
    PrepareWrite(GetFileSize(), left);
577 578 579 580 581 582 583 584 585
    // if there is no space in the cache, then flush
    if (cursize_ + left > capacity_) {
      s = Flush();
      if (!s.ok()) {
        return s;
      }
      // Increase the buffer size, but capped at 1MB
      if (capacity_ < (1<<20)) {
        capacity_ *= 2;
M
Mayank Agarwal 已提交
586
        buf_.reset(new char[capacity_]);
587 588 589 590 591 592 593
      }
      assert(cursize_ == 0);
    }

    // if the write fits into the cache, then write to cache
    // otherwise do a write() syscall to write to OS buffers.
    if (cursize_ + left <= capacity_) {
M
Mayank Agarwal 已提交
594
      memcpy(buf_.get()+cursize_, src, left);
595 596 597
      cursize_ += left;
    } else {
      while (left != 0) {
C
Chip Turner 已提交
598
        ssize_t done = write(fd_, src, left);
599 600 601
        if (done < 0) {
          return IOError(filename_, errno);
        }
602
        TEST_KILL_RANDOM(rocksdb_kill_odds);
603

604 605 606 607 608 609 610 611 612 613 614 615 616
        left -= done;
        src += done;
      }
    }
    filesize_ += data.size();
    return Status::OK();
  }

  virtual Status Close() {
    Status s;
    s = Flush(); // flush cache to OS
    if (!s.ok()) {
    }
617

618
    TEST_KILL_RANDOM(rocksdb_kill_odds);
619

620 621 622 623 624 625 626 627 628 629 630
    if (close(fd_) < 0) {
      if (s.ok()) {
        s = IOError(filename_, errno);
      }
    }
    fd_ = -1;
    return s;
  }

  // write out the cached data to the OS cache
  virtual Status Flush() {
631
    TEST_KILL_RANDOM(rocksdb_kill_odds * REDUCE_ODDS2);
632
    size_t left = cursize_;
M
Mayank Agarwal 已提交
633
    char* src = buf_.get();
634
    while (left != 0) {
C
Chip Turner 已提交
635
      ssize_t done = write(fd_, src, left);
636 637 638
      if (done < 0) {
        return IOError(filename_, errno);
      }
639
      TEST_KILL_RANDOM(rocksdb_kill_odds * REDUCE_ODDS2);
640 641 642 643
      left -= done;
      src += done;
    }
    cursize_ = 0;
644

H
Haobo Xu 已提交
645 646 647 648 649 650
    // sync OS cache to disk for every bytes_per_sync_
    // TODO: give log file and sst file different options (log
    // files could be potentially cached in OS for their whole
    // life time, thus we might not want to flush at all).
    if (bytes_per_sync_ &&
        filesize_ - last_sync_size_ >= bytes_per_sync_) {
651 652 653 654
      RangeSync(last_sync_size_, filesize_ - last_sync_size_);
      last_sync_size_ = filesize_;
    }

655 656 657 658
    return Status::OK();
  }

  virtual Status Sync() {
659
    TEST_KILL_RANDOM(rocksdb_kill_odds);
660 661 662
    if (pending_sync_ && fdatasync(fd_) < 0) {
      return IOError(filename_, errno);
    }
663
    TEST_KILL_RANDOM(rocksdb_kill_odds);
664 665 666 667 668
    pending_sync_ = false;
    return Status::OK();
  }

  virtual Status Fsync() {
669
    TEST_KILL_RANDOM(rocksdb_kill_odds);
670 671 672
    if (pending_fsync_ && fsync(fd_) < 0) {
      return IOError(filename_, errno);
    }
673
    TEST_KILL_RANDOM(rocksdb_kill_odds);
674 675 676 677 678 679 680 681
    pending_fsync_ = false;
    pending_sync_ = false;
    return Status::OK();
  }

  virtual uint64_t GetFileSize() {
    return filesize_;
  }
682

683 684 685 686 687 688 689 690 691
  virtual Status InvalidateCache(size_t offset, size_t length) {
    // free OS pages
    int ret = posix_fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
    if (ret == 0) {
      return Status::OK();
    }
    return IOError(filename_, errno);
  }

692
#ifdef OS_LINUX
693
  virtual Status Allocate(off_t offset, off_t len) {
694
    TEST_KILL_RANDOM(rocksdb_kill_odds);
695 696 697 698 699 700
    if (!fallocate(fd_, FALLOC_FL_KEEP_SIZE, offset, len)) {
      return Status::OK();
    } else {
      return IOError(filename_, errno);
    }
  }
701 702 703 704 705 706 707 708

  virtual Status RangeSync(off64_t offset, off64_t nbytes) {
    if (sync_file_range(fd_, offset, nbytes, SYNC_FILE_RANGE_WRITE) == 0) {
      return Status::OK();
    } else {
      return IOError(filename_, errno);
    }
  }
709
#endif
710 711
};

712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733
static int LockOrUnlock(const std::string& fname, int fd, bool lock) {
  mutex_lockedFiles.Lock();
  if (lock) {
    // If it already exists in the lockedFiles set, then it is already locked,
    // and fail this lock attempt. Otherwise, insert it into lockedFiles.
    // This check is needed because fcntl() does not detect lock conflict
    // if the fcntl is issued by the same thread that earlier acquired
    // this lock.
    if (lockedFiles.insert(fname).second == false) {
      mutex_lockedFiles.Unlock();
      errno = ENOLCK;
      return -1;
    }
  } else {
    // If we are unlocking, then verify that we had locked it earlier,
    // it should already exist in lockedFiles. Remove it from lockedFiles.
    if (lockedFiles.erase(fname) != 1) {
      mutex_lockedFiles.Unlock();
      errno = ENOLCK;
      return -1;
    }
  }
J
jorlow@chromium.org 已提交
734 735 736 737 738 739 740
  errno = 0;
  struct flock f;
  memset(&f, 0, sizeof(f));
  f.l_type = (lock ? F_WRLCK : F_UNLCK);
  f.l_whence = SEEK_SET;
  f.l_start = 0;
  f.l_len = 0;        // Lock/unlock entire file
741 742 743 744 745 746 747
  int value = fcntl(fd, F_SETLK, &f);
  if (value == -1 && lock) {
    // if there is an error in locking, then remove the pathname from lockedfiles
    lockedFiles.erase(fname);
  }
  mutex_lockedFiles.Unlock();
  return value;
J
jorlow@chromium.org 已提交
748 749 750 751 752
}

class PosixFileLock : public FileLock {
 public:
  int fd_;
753
  std::string filename;
J
jorlow@chromium.org 已提交
754 755
};

756 757 758 759 760 761 762 763 764 765

namespace {
void PthreadCall(const char* label, int result) {
  if (result != 0) {
    fprintf(stderr, "pthread %s: %s\n", label, strerror(result));
    exit(1);
  }
}
}

J
jorlow@chromium.org 已提交
766 767 768
class PosixEnv : public Env {
 public:
  PosixEnv();
769 770

  virtual ~PosixEnv(){
771 772 773
    for (const auto tid : threads_to_join_) {
      pthread_join(tid, nullptr);
    }
J
jorlow@chromium.org 已提交
774 775
  }

776
  void SetFD_CLOEXEC(int fd, const EnvOptions* options) {
H
Haobo Xu 已提交
777
    if ((options == nullptr || options->set_fd_cloexec) && fd > 0) {
778 779 780 781
      fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC);
    }
  }

J
jorlow@chromium.org 已提交
782
  virtual Status NewSequentialFile(const std::string& fname,
783 784
                                   unique_ptr<SequentialFile>* result,
                                   const EnvOptions& options) {
785
    result->reset();
J
jorlow@chromium.org 已提交
786
    FILE* f = fopen(fname.c_str(), "r");
A
Abhishek Kona 已提交
787 788
    if (f == nullptr) {
      *result = nullptr;
789
      return IOError(fname, errno);
J
jorlow@chromium.org 已提交
790
    } else {
791 792
      int fd = fileno(f);
      SetFD_CLOEXEC(fd, &options);
793
      result->reset(new PosixSequentialFile(fname, f, options));
J
jorlow@chromium.org 已提交
794 795 796 797 798
      return Status::OK();
    }
  }

  virtual Status NewRandomAccessFile(const std::string& fname,
799 800
                                     unique_ptr<RandomAccessFile>* result,
                                     const EnvOptions& options) {
801
    result->reset();
802
    Status s;
J
jorlow@chromium.org 已提交
803
    int fd = open(fname.c_str(), O_RDONLY);
804
    SetFD_CLOEXEC(fd, &options);
J
jorlow@chromium.org 已提交
805
    if (fd < 0) {
806
      s = IOError(fname, errno);
H
Haobo Xu 已提交
807
    } else if (options.use_mmap_reads && sizeof(void*) >= 8) {
808 809 810 811 812 813
      // Use of mmap for random reads has been removed because it
      // kills performance when storage is fast.
      // Use mmap when virtual address-space is plentiful.
      uint64_t size;
      s = GetFileSize(fname, &size);
      if (s.ok()) {
A
Abhishek Kona 已提交
814
        void* base = mmap(nullptr, size, PROT_READ, MAP_SHARED, fd, 0);
815
        if (base != MAP_FAILED) {
816 817
          result->reset(new PosixMmapReadableFile(fd, fname, base,
                                                  size, options));
818 819 820 821 822
        } else {
          s = IOError(fname, errno);
        }
      }
      close(fd);
823
    } else {
824
      result->reset(new PosixRandomAccessFile(fname, fd, options));
J
jorlow@chromium.org 已提交
825
    }
826
    return s;
J
jorlow@chromium.org 已提交
827 828 829
  }

  virtual Status NewWritableFile(const std::string& fname,
830 831
                                 unique_ptr<WritableFile>* result,
                                 const EnvOptions& options) {
832
    result->reset();
J
jorlow@chromium.org 已提交
833 834 835
    Status s;
    const int fd = open(fname.c_str(), O_CREAT | O_RDWR | O_TRUNC, 0644);
    if (fd < 0) {
836
      s = IOError(fname, errno);
J
jorlow@chromium.org 已提交
837
    } else {
838
      SetFD_CLOEXEC(fd, &options);
H
Haobo Xu 已提交
839
      if (options.use_mmap_writes) {
840 841
        if (!checkedDiskForMmap_) {
          // this will be executed once in the program's lifetime.
A
Abhishek Kona 已提交
842
          // do not use mmapWrite on non ext-3/xfs/tmpfs systems.
843 844 845 846
          if (!SupportsFastAllocate(fname)) {
            forceMmapOff = true;
          }
          checkedDiskForMmap_ = true;
A
Abhishek Kona 已提交
847 848
        }
      }
H
Haobo Xu 已提交
849
      if (options.use_mmap_writes && !forceMmapOff) {
850
        result->reset(new PosixMmapFile(fname, fd, page_size_, options));
851
      } else {
852
        result->reset(new PosixWritableFile(fname, fd, 65536, options));
853
      }
J
jorlow@chromium.org 已提交
854 855 856 857 858 859 860 861 862 863 864 865
    }
    return s;
  }

  virtual bool FileExists(const std::string& fname) {
    return access(fname.c_str(), F_OK) == 0;
  }

  virtual Status GetChildren(const std::string& dir,
                             std::vector<std::string>* result) {
    result->clear();
    DIR* d = opendir(dir.c_str());
A
Abhishek Kona 已提交
866
    if (d == nullptr) {
867
      return IOError(dir, errno);
J
jorlow@chromium.org 已提交
868 869
    }
    struct dirent* entry;
A
Abhishek Kona 已提交
870
    while ((entry = readdir(d)) != nullptr) {
J
jorlow@chromium.org 已提交
871 872 873 874 875 876 877 878 879
      result->push_back(entry->d_name);
    }
    closedir(d);
    return Status::OK();
  }

  virtual Status DeleteFile(const std::string& fname) {
    Status result;
    if (unlink(fname.c_str()) != 0) {
880
      result = IOError(fname, errno);
J
jorlow@chromium.org 已提交
881 882 883 884 885 886 887
    }
    return result;
  };

  virtual Status CreateDir(const std::string& name) {
    Status result;
    if (mkdir(name.c_str(), 0755) != 0) {
888
      result = IOError(name, errno);
J
jorlow@chromium.org 已提交
889 890 891 892
    }
    return result;
  };

893 894 895 896 897
  virtual Status CreateDirIfMissing(const std::string& name) {
    Status result;
    if (mkdir(name.c_str(), 0755) != 0) {
      if (errno != EEXIST) {
        result = IOError(name, errno);
898 899 900 901
      } else if (!DirExists(name)) { // Check that name is actually a
                                     // directory.
        // Message is taken from mkdir
        result = Status::IOError("`"+name+"' exists but is not a directory");
902 903 904 905 906
      }
    }
    return result;
  };

J
jorlow@chromium.org 已提交
907 908 909
  virtual Status DeleteDir(const std::string& name) {
    Status result;
    if (rmdir(name.c_str()) != 0) {
910
      result = IOError(name, errno);
J
jorlow@chromium.org 已提交
911 912 913 914 915 916 917 918 919
    }
    return result;
  };

  virtual Status GetFileSize(const std::string& fname, uint64_t* size) {
    Status s;
    struct stat sbuf;
    if (stat(fname.c_str(), &sbuf) != 0) {
      *size = 0;
920
      s = IOError(fname, errno);
J
jorlow@chromium.org 已提交
921 922 923 924 925 926
    } else {
      *size = sbuf.st_size;
    }
    return s;
  }

927 928 929 930 931 932 933 934 935
  virtual Status GetFileModificationTime(const std::string& fname,
                                         uint64_t* file_mtime) {
    struct stat s;
    if (stat(fname.c_str(), &s) !=0) {
      return IOError(fname, errno);
    }
    *file_mtime = static_cast<uint64_t>(s.st_mtime);
    return Status::OK();
  }
J
jorlow@chromium.org 已提交
936 937 938
  virtual Status RenameFile(const std::string& src, const std::string& target) {
    Status result;
    if (rename(src.c_str(), target.c_str()) != 0) {
939
      result = IOError(src, errno);
J
jorlow@chromium.org 已提交
940 941 942 943 944
    }
    return result;
  }

  virtual Status LockFile(const std::string& fname, FileLock** lock) {
A
Abhishek Kona 已提交
945
    *lock = nullptr;
J
jorlow@chromium.org 已提交
946 947 948
    Status result;
    int fd = open(fname.c_str(), O_RDWR | O_CREAT, 0644);
    if (fd < 0) {
949
      result = IOError(fname, errno);
950
    } else if (LockOrUnlock(fname, fd, true) == -1) {
951
      result = IOError("lock " + fname, errno);
J
jorlow@chromium.org 已提交
952 953
      close(fd);
    } else {
954
      SetFD_CLOEXEC(fd, nullptr);
J
jorlow@chromium.org 已提交
955 956
      PosixFileLock* my_lock = new PosixFileLock;
      my_lock->fd_ = fd;
957
      my_lock->filename = fname;
J
jorlow@chromium.org 已提交
958 959 960 961 962 963 964 965
      *lock = my_lock;
    }
    return result;
  }

  virtual Status UnlockFile(FileLock* lock) {
    PosixFileLock* my_lock = reinterpret_cast<PosixFileLock*>(lock);
    Status result;
966
    if (LockOrUnlock(my_lock->filename, my_lock->fd_, false) == -1) {
967
      result = IOError("unlock", errno);
J
jorlow@chromium.org 已提交
968 969 970 971 972 973
    }
    close(my_lock->fd_);
    delete my_lock;
    return result;
  }

974
  virtual void Schedule(void (*function)(void*), void* arg, Priority pri = LOW);
975

J
jorlow@chromium.org 已提交
976 977 978 979 980 981 982 983
  virtual void StartThread(void (*function)(void* arg), void* arg);

  virtual Status GetTestDirectory(std::string* result) {
    const char* env = getenv("TEST_TMPDIR");
    if (env && env[0] != '\0') {
      *result = env;
    } else {
      char buf[100];
984
      snprintf(buf, sizeof(buf), "/tmp/rocksdbtest-%d", int(geteuid()));
J
jorlow@chromium.org 已提交
985 986 987 988 989 990 991
      *result = buf;
    }
    // Directory may already exist
    CreateDir(*result);
    return Status::OK();
  }

992
  static uint64_t gettid() {
J
jorlow@chromium.org 已提交
993 994
    pthread_t tid = pthread_self();
    uint64_t thread_id = 0;
J
jorlow@chromium.org 已提交
995
    memcpy(&thread_id, &tid, std::min(sizeof(thread_id), sizeof(tid)));
996 997
    return thread_id;
  }
J
jorlow@chromium.org 已提交
998

999 1000
  virtual Status NewLogger(const std::string& fname,
                           shared_ptr<Logger>* result) {
1001
    FILE* f = fopen(fname.c_str(), "w");
A
Abhishek Kona 已提交
1002
    if (f == nullptr) {
1003
      result->reset();
1004 1005
      return IOError(fname, errno);
    } else {
1006 1007
      int fd = fileno(f);
      SetFD_CLOEXEC(fd, nullptr);
1008
      result->reset(new PosixLogger(f, &PosixEnv::gettid));
1009
      return Status::OK();
J
jorlow@chromium.org 已提交
1010 1011 1012 1013 1014
    }
  }

  virtual uint64_t NowMicros() {
    struct timeval tv;
A
Abhishek Kona 已提交
1015
    gettimeofday(&tv, nullptr);
J
jorlow@chromium.org 已提交
1016 1017 1018
    return static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
  }

1019 1020 1021 1022 1023 1024
  virtual uint64_t NowNanos() {
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return static_cast<uint64_t>(ts.tv_sec) * 1000000000 + ts.tv_nsec;
  }

J
jorlow@chromium.org 已提交
1025 1026 1027 1028
  virtual void SleepForMicroseconds(int micros) {
    usleep(micros);
  }

1029
  virtual Status GetHostName(char* name, uint64_t len) {
1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040
    int ret = gethostname(name, len);
    if (ret < 0) {
      if (errno == EFAULT || errno == EINVAL)
        return Status::InvalidArgument(strerror(errno));
      else
        return IOError("GetHostName", errno);
    }
    return Status::OK();
  }

  virtual Status GetCurrentTime(int64_t* unix_time) {
A
Abhishek Kona 已提交
1041
    time_t ret = time(nullptr);
1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057
    if (ret == (time_t) -1) {
      return IOError("GetCurrentTime", errno);
    }
    *unix_time = (int64_t) ret;
    return Status::OK();
  }

  virtual Status GetAbsolutePath(const std::string& db_path,
      std::string* output_path) {
    if (db_path.find('/') == 0) {
      *output_path = db_path;
      return Status::OK();
    }

    char the_path[256];
    char* ret = getcwd(the_path, 256);
A
Abhishek Kona 已提交
1058
    if (ret == nullptr) {
1059 1060 1061 1062 1063 1064 1065
      return Status::IOError(strerror(errno));
    }

    *output_path = ret;
    return Status::OK();
  }

A
Abhishek Kona 已提交
1066
  // Allow increasing the number of worker threads.
1067 1068 1069
  virtual void SetBackgroundThreads(int num, Priority pri) {
    assert(pri >= Priority::LOW && pri <= Priority::HIGH);
    thread_pools_[pri].SetBackgroundThreads(num);
1070 1071
  }

1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091
  virtual std::string TimeToString(uint64_t secondsSince1970) {
    const time_t seconds = (time_t)secondsSince1970;
    struct tm t;
    int maxsize = 64;
    std::string dummy;
    dummy.reserve(maxsize);
    dummy.resize(maxsize);
    char* p = &dummy[0];
    localtime_r(&seconds, &t);
    snprintf(p, maxsize,
             "%04d/%02d/%02d-%02d:%02d:%02d ",
             t.tm_year + 1900,
             t.tm_mon + 1,
             t.tm_mday,
             t.tm_hour,
             t.tm_min,
             t.tm_sec);
    return dummy;
  }

J
jorlow@chromium.org 已提交
1092
 private:
1093 1094
  bool checkedDiskForMmap_;
  bool forceMmapOff; // do we override Env options?
A
Abhishek Kona 已提交
1095

J
jorlow@chromium.org 已提交
1096

1097 1098 1099 1100 1101 1102 1103 1104 1105
  // Returns true iff the named directory exists and is a directory.
  virtual bool DirExists(const std::string& dname) {
    struct stat statbuf;
    if (stat(dname.c_str(), &statbuf) == 0) {
      return S_ISDIR(statbuf.st_mode);
    }
    return false; // stat() failed return false
  }

A
Abhishek Kona 已提交
1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122
  bool SupportsFastAllocate(const std::string& path) {
    struct statfs s;
    if (statfs(path.c_str(), &s)){
      return false;
    }
    switch (s.f_type) {
      case EXT4_SUPER_MAGIC:
        return true;
      case XFS_SUPER_MAGIC:
        return true;
      case TMPFS_MAGIC:
        return true;
      default:
        return false;
    }
  }

J
jorlow@chromium.org 已提交
1123 1124 1125
  size_t page_size_;


1126 1127
  class ThreadPool {
   public:
1128

1129 1130 1131 1132 1133 1134 1135 1136
    ThreadPool() :
        total_threads_limit_(1),
        bgthreads_(0),
        queue_(),
        exit_all_threads_(false) {
      PthreadCall("mutex_init", pthread_mutex_init(&mu_, nullptr));
      PthreadCall("cvar_init", pthread_cond_init(&bgsignal_, nullptr));
    }
J
jorlow@chromium.org 已提交
1137

1138 1139 1140 1141 1142 1143 1144 1145 1146 1147
    ~ThreadPool() {
      PthreadCall("lock", pthread_mutex_lock(&mu_));
      assert(!exit_all_threads_);
      exit_all_threads_ = true;
      PthreadCall("signalall", pthread_cond_broadcast(&bgsignal_));
      PthreadCall("unlock", pthread_mutex_unlock(&mu_));
      for (const auto tid : bgthreads_) {
        pthread_join(tid, nullptr);
      }
    }
J
jorlow@chromium.org 已提交
1148

1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162
    void BGThread() {
      while (true) {
        // Wait until there is an item that is ready to run
        PthreadCall("lock", pthread_mutex_lock(&mu_));
        while (queue_.empty() && !exit_all_threads_) {
          PthreadCall("wait", pthread_cond_wait(&bgsignal_, &mu_));
        }
        if (exit_all_threads_) { // mechanism to let BG threads exit safely
          PthreadCall("unlock", pthread_mutex_unlock(&mu_));
          break;
        }
        void (*function)(void*) = queue_.front().function;
        void* arg = queue_.front().arg;
        queue_.pop_front();
H
Haobo Xu 已提交
1163

1164 1165 1166 1167
        PthreadCall("unlock", pthread_mutex_unlock(&mu_));
        (*function)(arg);
      }
    }
J
jorlow@chromium.org 已提交
1168

1169 1170 1171 1172
    static void* BGThreadWrapper(void* arg) {
      reinterpret_cast<ThreadPool*>(arg)->BGThread();
      return nullptr;
    }
J
jorlow@chromium.org 已提交
1173

1174 1175 1176 1177 1178 1179 1180
    void SetBackgroundThreads(int num) {
      PthreadCall("lock", pthread_mutex_lock(&mu_));
      if (num > total_threads_limit_) {
        total_threads_limit_ = num;
      }
      assert(total_threads_limit_ > 0);
      PthreadCall("unlock", pthread_mutex_unlock(&mu_));
J
jorlow@chromium.org 已提交
1181
    }
1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210

    void Schedule(void (*function)(void*), void* arg) {
      PthreadCall("lock", pthread_mutex_lock(&mu_));

      if (exit_all_threads_) {
        PthreadCall("unlock", pthread_mutex_unlock(&mu_));
        return;
      }
      // Start background thread if necessary
      while ((int)bgthreads_.size() < total_threads_limit_) {
        pthread_t t;
        PthreadCall(
          "create thread",
          pthread_create(&t,
                         nullptr,
                         &ThreadPool::BGThreadWrapper,
                         this));
        fprintf(stdout, "Created bg thread 0x%lx\n", t);
        bgthreads_.push_back(t);
      }

      // Add to priority queue
      queue_.push_back(BGItem());
      queue_.back().function = function;
      queue_.back().arg = arg;

      // always wake up at least one waiting thread.
      PthreadCall("signal", pthread_cond_signal(&bgsignal_));

1211 1212
      PthreadCall("unlock", pthread_mutex_unlock(&mu_));
    }
J
jorlow@chromium.org 已提交
1213

1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243
   private:
    // Entry per Schedule() call
    struct BGItem { void* arg; void (*function)(void*); };
    typedef std::deque<BGItem> BGQueue;

    pthread_mutex_t mu_;
    pthread_cond_t bgsignal_;
    int total_threads_limit_;
    std::vector<pthread_t> bgthreads_;
    BGQueue queue_;
    bool exit_all_threads_;
  };

  std::vector<ThreadPool> thread_pools_;

  pthread_mutex_t mu_;
  std::vector<pthread_t> threads_to_join_;

};

PosixEnv::PosixEnv() : checkedDiskForMmap_(false),
                       forceMmapOff(false),
                       page_size_(getpagesize()),
                       thread_pools_(Priority::TOTAL) {
  PthreadCall("mutex_init", pthread_mutex_init(&mu_, nullptr));
}

void PosixEnv::Schedule(void (*function)(void*), void* arg, Priority pri) {
  assert(pri >= Priority::LOW && pri <= Priority::HIGH);
  thread_pools_[pri].Schedule(function, arg);
J
jorlow@chromium.org 已提交
1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255
}

namespace {
struct StartThreadState {
  void (*user_function)(void*);
  void* arg;
};
}
static void* StartThreadWrapper(void* arg) {
  StartThreadState* state = reinterpret_cast<StartThreadState*>(arg);
  state->user_function(state->arg);
  delete state;
A
Abhishek Kona 已提交
1256
  return nullptr;
J
jorlow@chromium.org 已提交
1257 1258 1259 1260 1261 1262 1263 1264
}

void PosixEnv::StartThread(void (*function)(void* arg), void* arg) {
  pthread_t t;
  StartThreadState* state = new StartThreadState;
  state->user_function = function;
  state->arg = arg;
  PthreadCall("start thread",
A
Abhishek Kona 已提交
1265
              pthread_create(&t, nullptr,  &StartThreadWrapper, state));
1266
  PthreadCall("lock", pthread_mutex_lock(&mu_));
1267
  threads_to_join_.push_back(t);
1268
  PthreadCall("unlock", pthread_mutex_unlock(&mu_));
J
jorlow@chromium.org 已提交
1269 1270
}

H
Hans Wennborg 已提交
1271
}  // namespace
J
jorlow@chromium.org 已提交
1272 1273

Env* Env::Default() {
1274
  static PosixEnv default_env;
1275
  return &default_env;
J
jorlow@chromium.org 已提交
1276 1277
}

1278
}  // namespace rocksdb