//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
//  This source code is licensed under the BSD-style license found in the
//  LICENSE file in the root directory of this source tree. An additional grant
//  of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#include <algorithm>
#include <deque>
#include <thread>
#include <ctime>

#include <errno.h>
#include <process.h>
#include <io.h>
#include <direct.h>
#include <sys/types.h>
#include <sys/stat.h>

#include "rocksdb/env.h"
#include "rocksdb/slice.h"

#include "port/port.h"
#include "port/dirent.h"
#include "port/win/win_logger.h"

#include "util/random.h"
#include "util/iostats_context_imp.h"
#include "util/rate_limiter.h"
32
#include "util/sync_point.h"
33
#include "util/aligned_buffer.h"
D
Dmitri Smirnov 已提交
34

35
#include "util/threadpool.h"
D
Dmitri Smirnov 已提交
36 37 38
#include "util/thread_status_updater.h"
#include "util/thread_status_util.h"

S
sdong 已提交
39
#include <Rpc.h>  // For UUID generation
D
Dmitri Smirnov 已提交
40 41
#include <Windows.h>

S
sdong 已提交
42
namespace rocksdb {
D
Dmitri Smirnov 已提交
43 44 45

// Translates a Windows error code (e.g. a GetLastError() value) into the
// message text supplied by the system.
//
// @param err  Windows error code to describe.
// @return     The system message for `err`. If the system cannot produce a
//             message, a generic text containing the numeric code is returned
//             instead, so the caller never receives garbage.
std::string GetWindowsErrSz(DWORD err) {
  // Must start out null: FormatMessageA leaves it untouched on failure.
  LPSTR lpMsgBuf = nullptr;

  // With FORMAT_MESSAGE_ALLOCATE_BUFFER the buffer argument is treated as a
  // pointer to the pointer that receives the allocated buffer, which is why
  // the address of lpMsgBuf is cast to LPSTR.
  const DWORD length = FormatMessageA(
      FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM |
          FORMAT_MESSAGE_IGNORE_INSERTS,
      NULL, err,
      0,  // Default language
      reinterpret_cast<LPSTR>(&lpMsgBuf), 0, NULL);

  std::string Err;
  if (length != 0 && lpMsgBuf != nullptr) {
    Err.assign(lpMsgBuf, length);
  } else {
    Err = "Unknown Windows error " + std::to_string(err);
  }

  if (lpMsgBuf != nullptr) {
    LocalFree(lpMsgBuf);
  }
  return Err;
}

S
sdong 已提交
57
namespace {
D
Dmitri Smirnov 已提交
58 59 60 61 62 63 64

// Number of bytes in one mebibyte (2^20).
const size_t c_OneMB = static_cast<size_t>(1024) * 1024;

// Heap-allocates a new ThreadStatusUpdater and hands the raw pointer to the
// caller; nothing in this function releases it.
ThreadStatusUpdater* CreateThreadStatusUpdater() {
  ThreadStatusUpdater* const updater = new ThreadStatusUpdater();
  return updater;
}

S
sdong 已提交
65
// Builds an IOError status whose first part is `context` and whose second
// part is the system message text for the Windows error code `err`.
inline Status IOErrorFromWindowsError(const std::string& context, DWORD err) {
  const std::string description = GetWindowsErrSz(err);
  return Status::IOError(context, description);
}

S
sdong 已提交
69
// Same as IOErrorFromWindowsError(), using the calling thread's current
// GetLastError() value as the error code.
inline Status IOErrorFromLastWindowsError(const std::string& context) {
  const DWORD lastError = GetLastError();
  return IOErrorFromWindowsError(context, lastError);
}

S
sdong 已提交
73
// Builds an IOError status from a C runtime errno value: `context` is the
// first part, strerror(err_number) the second.
inline Status IOError(const std::string& context, int err_number) {
  const char* const description = strerror(err_number);
  return Status::IOError(context, description);
}

// TODO(sdong): temporary logging that helps with debugging. Remove it once
// the feature has proved to be stable.
//
// Writes one line to stdout naming the background thread and the id it is
// terminating on.
inline void PrintThreadInfo(size_t thread_id, size_t terminatingId) {
  fprintf(stdout,
          "Bg thread %Iu terminates %Iu\n",
          thread_id,
          terminatingId);
}

// returns the ID of the current process
S
sdong 已提交
84
inline int current_process_id() { return _getpid(); }
D
Dmitri Smirnov 已提交
85 86 87 88 89

// RAII helpers for HANDLEs: a deleter that calls ::CloseHandle(), and a
// unique_ptr alias that uses it.
const auto CloseHandleFunc = [](HANDLE handle) { ::CloseHandle(handle); };
using UniqueCloseHandlePtr = std::unique_ptr<void, decltype(CloseHandleFunc)>;

S
sdong 已提交
90 91
// We preserve the original name of this interface to denote the original idea
// behind it.
D
Dmitri Smirnov 已提交
92
// All reads happen by a specified offset and pwrite interface does not change
S
sdong 已提交
93 94 95 96 97 98
// the position of the file pointer. Judging from the man page and errno it does
// execute
// lseek atomically to return the position of the file back where it was.
// WriteFile() does not
// have this capability. Therefore, for both pread and pwrite the pointer is
// advanced to the next position
D
Dmitri Smirnov 已提交
99
// which is fine for writes because they are (should be) sequential.
S
sdong 已提交
100 101
// Because all the reads/writes happen by the specified offset, the caller in
// theory should not
D
Dmitri Smirnov 已提交
102
// rely on the current file offset.
S
sdong 已提交
103 104
SSIZE_T pwrite(HANDLE hFile, const char* src, size_t numBytes,
               uint64_t offset) {
V
Vasili Svirski 已提交
105
  assert(numBytes <= std::numeric_limits<DWORD>::max());
S
sdong 已提交
106
  OVERLAPPED overlapped = {0};
D
Dmitri Smirnov 已提交
107 108 109 110 111 112 113 114 115 116
  ULARGE_INTEGER offsetUnion;
  offsetUnion.QuadPart = offset;

  overlapped.Offset = offsetUnion.LowPart;
  overlapped.OffsetHigh = offsetUnion.HighPart;

  SSIZE_T result = 0;

  unsigned long bytesWritten = 0;

V
Vasili Svirski 已提交
117 118
  if (FALSE == WriteFile(hFile, src, static_cast<DWORD>(numBytes), &bytesWritten,
    &overlapped)) {
S
sdong 已提交
119 120 121
    result = -1;
  } else {
    result = bytesWritten;
D
Dmitri Smirnov 已提交
122 123 124 125 126 127
  }

  return result;
}

// See comments for pwrite above
S
sdong 已提交
128
SSIZE_T pread(HANDLE hFile, char* src, size_t numBytes, uint64_t offset) {
V
Vasili Svirski 已提交
129
  assert(numBytes <= std::numeric_limits<DWORD>::max());
S
sdong 已提交
130
  OVERLAPPED overlapped = {0};
D
Dmitri Smirnov 已提交
131 132 133 134 135 136 137 138 139 140
  ULARGE_INTEGER offsetUnion;
  offsetUnion.QuadPart = offset;

  overlapped.Offset = offsetUnion.LowPart;
  overlapped.OffsetHigh = offsetUnion.HighPart;

  SSIZE_T result = 0;

  unsigned long bytesRead = 0;

141 142 143
  if (FALSE == ReadFile(hFile, src, static_cast<DWORD>(numBytes), &bytesRead,
    &overlapped)) {
    return -1;
S
sdong 已提交
144
  } else {
145
    result = bytesRead;
D
Dmitri Smirnov 已提交
146 147 148 149 150
  }

  return result;
}

S
sdong 已提交
151 152 153 154
// Note the below two do not set errno because they are used only here in this
// file
// on a Windows handle and, therefore, not necessary. Translating GetLastError()
// to errno
D
Dmitri Smirnov 已提交
155
// is a sad business
S
sdong 已提交
156
inline int fsync(HANDLE hFile) {
D
Dmitri Smirnov 已提交
157
  if (!FlushFileBuffers(hFile)) {
S
sdong 已提交
158
    return -1;
D
Dmitri Smirnov 已提交
159 160 161 162 163
  }

  return 0;
}

164 165 166
// SetFileInformationByHandle() is capable of fast pre-allocates.
// However, this does not change the file end position unless the file is
// truncated and the pre-allocated space is not considered filled with zeros.
S
sdong 已提交
167 168
inline Status fallocate(const std::string& filename, HANDLE hFile,
                        uint64_t to_size) {
D
Dmitri Smirnov 已提交
169 170 171 172 173
  Status status;

  FILE_ALLOCATION_INFO alloc_info;
  alloc_info.AllocationSize.QuadPart = to_size;

S
sdong 已提交
174 175
  if (!SetFileInformationByHandle(hFile, FileAllocationInfo, &alloc_info,
                                  sizeof(FILE_ALLOCATION_INFO))) {
D
Dmitri Smirnov 已提交
176
    auto lastError = GetLastError();
S
sdong 已提交
177 178
    status = IOErrorFromWindowsError(
        "Failed to pre-allocate space: " + filename, lastError);
D
Dmitri Smirnov 已提交
179 180 181 182 183
  }

  return status;
}

S
sdong 已提交
184 185
// Sets the end-of-file position of hFile to toSize bytes through
// SetFileInformationByHandle().
//
// @param filename  used only to build the error message
// @param hFile     handle of the file to resize
// @param toSize    new end-of-file position in bytes
// @return          OK on success, IOError carrying the Windows error text
//                  otherwise
inline Status ftruncate(const std::string& filename, HANDLE hFile,
                        uint64_t toSize) {
  FILE_END_OF_FILE_INFO request;
  request.EndOfFile.QuadPart = toSize;

  const BOOL succeeded = SetFileInformationByHandle(
      hFile, FileEndOfFileInfo, &request, sizeof(FILE_END_OF_FILE_INFO));
  if (succeeded) {
    return Status();
  }

  // Capture the error code before anything else can overwrite it.
  const auto errorCode = GetLastError();
  return IOErrorFromWindowsError("Failed to Set end of file: " + filename,
                                 errorCode);
}

// mmap() based random-access
class WinMmapReadableFile : public RandomAccessFile {
S
sdong 已提交
203 204 205
  const std::string fileName_;
  HANDLE hFile_;
  HANDLE hMap_;
D
Dmitri Smirnov 已提交
206

S
sdong 已提交
207 208
  const void* mapped_region_;
  const size_t length_;
D
Dmitri Smirnov 已提交
209

S
sdong 已提交
210
 public:
211
  // mapped_region_[0,length-1] contains the mmapped contents of the file.
S
sdong 已提交
212 213 214 215 216 217 218
  WinMmapReadableFile(const std::string& fileName, HANDLE hFile, HANDLE hMap,
                      const void* mapped_region, size_t length)
      : fileName_(fileName),
        hFile_(hFile),
        hMap_(hMap),
        mapped_region_(mapped_region),
        length_(length) {}
D
Dmitri Smirnov 已提交
219 220 221 222 223 224 225 226 227 228 229 230

  ~WinMmapReadableFile() {
    BOOL ret = ::UnmapViewOfFile(mapped_region_);
    assert(ret);

    ret = ::CloseHandle(hMap_);
    assert(ret);

    ret = ::CloseHandle(hFile_);
    assert(ret);
  }

S
sdong 已提交
231 232
  virtual Status Read(uint64_t offset, size_t n, Slice* result,
                      char* scratch) const override {
D
Dmitri Smirnov 已提交
233 234
    Status s;

235
    if (offset > length_) {
S
sdong 已提交
236
      *result = Slice();
237 238 239
      return IOError(fileName_, EINVAL);
    } else if (offset + n > length_) {
      n = length_ - offset;
D
Dmitri Smirnov 已提交
240
    }
241 242
    *result =
        Slice(reinterpret_cast<const char*>(mapped_region_) + offset, n);
D
Dmitri Smirnov 已提交
243 244 245 246 247 248 249 250 251 252 253 254 255
    return s;
  }

  virtual Status InvalidateCache(size_t offset, size_t length) override {
    return Status::OK();
  }
};

// We preallocate up to an extra megabyte and use memcpy to append new
// data to the file.  This is safe since we either properly close the
// file before reading from it, or for log files, the reading code
// knows enough to skip zero suffixes.
class WinMmapFile : public WritableFile {
S
sdong 已提交
256
 private:
D
Dmitri Smirnov 已提交
257
  const std::string filename_;
S
sdong 已提交
258 259
  HANDLE hFile_;
  HANDLE hMap_;
D
Dmitri Smirnov 已提交
260

S
sdong 已提交
261 262 263 264 265
  const size_t page_size_;  // We flush the mapping view in page_size
                            // increments. We may decide if this is a memory
                            // page size or SSD page size
  const size_t
      allocation_granularity_;  // View must start at such a granularity
266 267 268 269 270

  size_t reserved_size_;      // Preallocated size

  size_t mapping_size_;         // The max size of the mapping object
                                // we want to guess the final file size to minimize the remapping
S
sdong 已提交
271
  size_t view_size_;            // How much memory to map into a view at a time
D
Dmitri Smirnov 已提交
272

S
sdong 已提交
273 274 275 276 277
  char* mapped_begin_;  // Must begin at the file offset that is aligned with
                        // allocation_granularity_
  char* mapped_end_;
  char* dst_;  // Where to write next  (in range [mapped_begin_,mapped_end_])
  char* last_sync_;  // Where have we synced up to
D
Dmitri Smirnov 已提交
278

S
sdong 已提交
279
  uint64_t file_offset_;  // Offset of mapped_begin_ in file
D
Dmitri Smirnov 已提交
280 281

  // Do we have unsynced writes?
S
sdong 已提交
282
  bool pending_sync_;
D
Dmitri Smirnov 已提交
283 284 285 286

  // Can only truncate or reserve to a sector size aligned if
  // used on files that are opened with Unbuffered I/O
  Status TruncateFile(uint64_t toSize) {
S
sdong 已提交
287
    return ftruncate(filename_, hFile_, toSize);
D
Dmitri Smirnov 已提交
288 289 290 291 292 293 294
  }

  Status UnmapCurrentRegion() {
    Status status;

    if (mapped_begin_ != nullptr) {
      if (!::UnmapViewOfFile(mapped_begin_)) {
S
sdong 已提交
295 296
        status = IOErrorFromWindowsError(
            "Failed to unmap file view: " + filename_, GetLastError());
D
Dmitri Smirnov 已提交
297 298
      }

299 300 301
      // Move on to the next portion of the file
      file_offset_ += view_size_;

D
Dmitri Smirnov 已提交
302 303 304 305
      // UnmapView automatically sends data to disk but not the metadata
      // which is good and provides some equivalent of fdatasync() on Linux
      // therefore, we donot need separate flag for metadata
      mapped_begin_ = nullptr;
S
sdong 已提交
306 307
      mapped_end_ = nullptr;
      dst_ = nullptr;
D
Dmitri Smirnov 已提交
308

309 310
      last_sync_ = nullptr;
      pending_sync_ = false;
D
Dmitri Smirnov 已提交
311 312 313 314 315 316
    }

    return status;
  }

  Status MapNewRegion() {
317

D
Dmitri Smirnov 已提交
318 319 320 321
    Status status;

    assert(mapped_begin_ == nullptr);

322 323 324 325 326 327
    size_t minDiskSize = file_offset_ + view_size_;

    if (minDiskSize > reserved_size_) {
      status = Allocate(file_offset_, view_size_);
      if (!status.ok()) {
        return status;
D
Dmitri Smirnov 已提交
328
      }
329 330 331 332
    }

    // Need to remap
    if (hMap_ == NULL || reserved_size_ > mapping_size_) {
D
Dmitri Smirnov 已提交
333

334
      if (hMap_ != NULL) {
D
Dmitri Smirnov 已提交
335 336 337 338 339 340 341
        // Unmap the previous one
        BOOL ret = ::CloseHandle(hMap_);
        assert(ret);
        hMap_ = NULL;
      }

      ULARGE_INTEGER mappingSize;
342
      mappingSize.QuadPart = reserved_size_;
D
Dmitri Smirnov 已提交
343 344 345

      hMap_ = CreateFileMappingA(
          hFile_,
S
sdong 已提交
346 347 348
          NULL,                  // Security attributes
          PAGE_READWRITE,        // There is not a write only mode for mapping
          mappingSize.HighPart,  // Enable mapping the whole file but the actual
349
                                  // amount mapped is determined by MapViewOfFile
D
Dmitri Smirnov 已提交
350 351 352
          mappingSize.LowPart,
          NULL);  // Mapping name

S
sdong 已提交
353 354 355 356 357
      if (NULL == hMap_) {
        return IOErrorFromWindowsError(
            "WindowsMmapFile failed to create file mapping for: " + filename_,
            GetLastError());
      }
358 359

      mapping_size_ = reserved_size_;
D
Dmitri Smirnov 已提交
360 361 362 363 364 365
    }

    ULARGE_INTEGER offset;
    offset.QuadPart = file_offset_;

    // View must begin at the granularity aligned offset
S
sdong 已提交
366 367 368
    mapped_begin_ = reinterpret_cast<char*>(
        MapViewOfFileEx(hMap_, FILE_MAP_WRITE, offset.HighPart, offset.LowPart,
                        view_size_, NULL));
D
Dmitri Smirnov 已提交
369 370

    if (!mapped_begin_) {
S
sdong 已提交
371 372 373
      status = IOErrorFromWindowsError(
          "WindowsMmapFile failed to map file view: " + filename_,
          GetLastError());
D
Dmitri Smirnov 已提交
374
    } else {
S
sdong 已提交
375 376 377 378
      mapped_end_ = mapped_begin_ + view_size_;
      dst_ = mapped_begin_;
      last_sync_ = mapped_begin_;
      pending_sync_ = false;
D
Dmitri Smirnov 已提交
379 380 381 382
    }
    return status;
  }

S
sdong 已提交
383 384 385
 public:
  WinMmapFile(const std::string& fname, HANDLE hFile, size_t page_size,
              size_t allocation_granularity, const EnvOptions& options)
D
Dmitri Smirnov 已提交
386
      : filename_(fname),
S
sdong 已提交
387 388 389 390
        hFile_(hFile),
        hMap_(NULL),
        page_size_(page_size),
        allocation_granularity_(allocation_granularity),
391
        reserved_size_(0),
S
sdong 已提交
392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411
        mapping_size_(0),
        view_size_(0),
        mapped_begin_(nullptr),
        mapped_end_(nullptr),
        dst_(nullptr),
        last_sync_(nullptr),
        file_offset_(0),
        pending_sync_(false) {
    // Allocation granularity must be obtained from GetSystemInfo() and must be
    // a power of two.
    assert(allocation_granularity > 0);
    assert((allocation_granularity & (allocation_granularity - 1)) == 0);

    assert(page_size > 0);
    assert((page_size & (page_size - 1)) == 0);

    // Only for memory mapped writes
    assert(options.use_mmap_writes);

    // View size must be both the multiple of allocation_granularity AND the
412 413 414
    // page size and the granularity is usually a multiple of a page size.
    const size_t viewSize = 32 * 1024; // 32Kb similar to the Windows File Cache in buffered mode
    view_size_ = Roundup(viewSize, allocation_granularity_);
D
Dmitri Smirnov 已提交
415 416 417
  }

  ~WinMmapFile() {
S
sdong 已提交
418 419 420
    if (hFile_) {
      this->Close();
    }
D
Dmitri Smirnov 已提交
421 422 423 424 425 426 427 428 429 430 431
  }

  virtual Status Append(const Slice& data) override {
    const char* src = data.data();
    size_t left = data.size();

    while (left > 0) {
      assert(mapped_begin_ <= dst_);
      size_t avail = mapped_end_ - dst_;

      if (avail == 0) {
S
sdong 已提交
432 433 434 435
        Status s = UnmapCurrentRegion();
        if (s.ok()) {
          s = MapNewRegion();
        }
D
Dmitri Smirnov 已提交
436

S
sdong 已提交
437 438 439
        if (!s.ok()) {
          return s;
        }
440 441 442 443 444 445 446
      } else {
        size_t n = std::min(left, avail);
        memcpy(dst_, src, n);
        dst_ += n;
        src += n;
        left -= n;
        pending_sync_ = true;
D
Dmitri Smirnov 已提交
447
      }
448
    }
D
Dmitri Smirnov 已提交
449

450 451 452 453
    // Now make sure that the last partial page is padded with zeros if needed
    size_t bytesToPad = Roundup(size_t(dst_), page_size_) - size_t(dst_);
    if (bytesToPad > 0) {
      memset(dst_, 0, bytesToPad);
D
Dmitri Smirnov 已提交
454 455 456 457 458
    }

    return Status::OK();
  }

459 460 461 462 463 464
  // Means Close() will properly take care of truncate
  // and it does not need any additional information
  virtual Status Truncate(uint64_t size) override {
    return Status::OK();
  }

D
Dmitri Smirnov 已提交
465 466 467 468 469 470 471 472 473 474
  virtual Status Close() override {
    Status s;

    assert(NULL != hFile_);

    // We truncate to the precise size so no
    // uninitialized data at the end. SetEndOfFile
    // which we use does not write zeros and it is good.
    uint64_t targetSize = GetFileSize();

475 476 477 478 479 480 481
    if (mapped_begin_ != nullptr) {
      // Sync before unmapping to make sure everything
      // is on disk and there is not a lazy writing
      // so we are deterministic with the tests
      Sync();
      s = UnmapCurrentRegion();
    }
D
Dmitri Smirnov 已提交
482

S
sdong 已提交
483
    if (NULL != hMap_) {
D
Dmitri Smirnov 已提交
484 485
      BOOL ret = ::CloseHandle(hMap_);
      if (!ret && s.ok()) {
S
sdong 已提交
486 487 488
        auto lastError = GetLastError();
        s = IOErrorFromWindowsError(
            "Failed to Close mapping for file: " + filename_, lastError);
D
Dmitri Smirnov 已提交
489 490 491 492 493
      }

      hMap_ = NULL;
    }

494 495 496
    if (hFile_ != NULL) {

      TruncateFile(targetSize);
D
Dmitri Smirnov 已提交
497

498 499
      BOOL ret = ::CloseHandle(hFile_);
      hFile_ = NULL;
D
Dmitri Smirnov 已提交
500

501 502 503 504 505
      if (!ret && s.ok()) {
        auto lastError = GetLastError();
        s = IOErrorFromWindowsError(
            "Failed to close file map handle: " + filename_, lastError);
      }
D
Dmitri Smirnov 已提交
506 507 508 509 510
    }

    return s;
  }

S
sdong 已提交
511
  virtual Status Flush() override { return Status::OK(); }
D
Dmitri Smirnov 已提交
512 513 514 515 516 517

  // Flush only data
  virtual Status Sync() override {
    Status s;

    // Some writes occurred since last sync
518
    if (dst_ > last_sync_) {
D
Dmitri Smirnov 已提交
519 520 521 522 523
      assert(mapped_begin_);
      assert(dst_);
      assert(dst_ > mapped_begin_);
      assert(dst_ < mapped_end_);

S
sdong 已提交
524 525 526 527
      size_t page_begin =
          TruncateToPageBoundary(page_size_, last_sync_ - mapped_begin_);
      size_t page_end =
          TruncateToPageBoundary(page_size_, dst_ - mapped_begin_ - 1);
D
Dmitri Smirnov 已提交
528 529

      // Flush only the amount of that is a multiple of pages
S
sdong 已提交
530
      if (!::FlushViewOfFile(mapped_begin_ + page_begin,
531
                              (page_end - page_begin) + page_size_)) {
S
sdong 已提交
532 533
        s = IOErrorFromWindowsError("Failed to FlushViewOfFile: " + filename_,
                                    GetLastError());
534 535
      } else {
        last_sync_ = dst_;
D
Dmitri Smirnov 已提交
536 537 538 539 540 541 542 543 544 545
      }
    }

    return s;
  }

  /**
  * Flush data as well as metadata to stable storage.
  */
  virtual Status Fsync() override {
546
    Status s = Sync();
D
Dmitri Smirnov 已提交
547 548

    // Flush metadata
549
    if (s.ok() && pending_sync_) {
D
Dmitri Smirnov 已提交
550
      if (!::FlushFileBuffers(hFile_)) {
S
sdong 已提交
551 552
        s = IOErrorFromWindowsError("Failed to FlushFileBuffers: " + filename_,
                                    GetLastError());
D
Dmitri Smirnov 已提交
553
      }
554
      pending_sync_ = false;
D
Dmitri Smirnov 已提交
555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573
    }

    return s;
  }

  /**
  * Get the size of valid data in the file. This will not match the
  * size that is returned from the filesystem because we use mmap
  * to extend file by map_size every time.
  */
  virtual uint64_t GetFileSize() override {
    size_t used = dst_ - mapped_begin_;
    return file_offset_ + used;
  }

  virtual Status InvalidateCache(size_t offset, size_t length) override {
    return Status::OK();
  }

574
  virtual Status Allocate(uint64_t offset, uint64_t len) override {
575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592
    Status status;
    TEST_KILL_RANDOM("WinMmapFile::Allocate", rocksdb_kill_odds);

    // Make sure that we reserve an aligned amount of space
    // since the reservation block size is driven outside so we want
    // to check if we are ok with reservation here
    size_t spaceToReserve = Roundup(offset + len, view_size_);
    // Nothing to do
    if (spaceToReserve <= reserved_size_) {
      return status;
    }

    IOSTATS_TIMER_GUARD(allocate_nanos);
    status = fallocate(filename_, hFile_, spaceToReserve);
    if (status.ok()) {
      reserved_size_ = spaceToReserve;
    }
    return status;
D
Dmitri Smirnov 已提交
593 594 595
  }
};

S
sdong 已提交
596 597
class WinSequentialFile : public SequentialFile {
 private:
D
Dmitri Smirnov 已提交
598
  const std::string filename_;
599 600 601
  HANDLE file_;

  // There is no equivalent of advising away buffered pages as in posix.
I
Islam AbdelRahman 已提交
602
  // To implement this flag we would need to do unbuffered reads which
603 604 605 606 607 608
  // will need to be aligned (not sure there is a guarantee that the buffer
  // passed in is aligned).
  // Hence we currently ignore this flag. It is used only in a few cases
  // which should not be perf critical.
  // If perf evaluation finds this to be a problem, we can look into
  // implementing this.
S
sdong 已提交
609
  bool use_os_buffer_;
D
Dmitri Smirnov 已提交
610

S
sdong 已提交
611
 public:
612
  WinSequentialFile(const std::string& fname, HANDLE f,
S
sdong 已提交
613 614 615 616 617 618
                    const EnvOptions& options)
      : filename_(fname),
        file_(f),
        use_os_buffer_(options.use_os_buffer) {}

  virtual ~WinSequentialFile() {
619 620
    assert(file_ != INVALID_HANDLE_VALUE);
    CloseHandle(file_);
D
Dmitri Smirnov 已提交
621 622 623 624 625 626
  }

  virtual Status Read(size_t n, Slice* result, char* scratch) override {
    Status s;
    size_t r = 0;

627 628 629
    // Windows ReadFile API accepts a DWORD.
    // While it is possible to read in a loop if n is > UINT_MAX
    // it is a highly unlikely case.
630
    if (n > UINT_MAX) {
631 632 633 634 635 636 637 638 639 640 641
      return IOErrorFromWindowsError(filename_, ERROR_INVALID_PARAMETER);
    }

    DWORD bytesToRead = static_cast<DWORD>(n); //cast is safe due to the check above
    DWORD bytesRead = 0;
    BOOL ret = ReadFile(file_, scratch, bytesToRead, &bytesRead, NULL);
    if (ret == TRUE) {
      r = bytesRead;
    } else {
      return IOErrorFromWindowsError(filename_, GetLastError());
    }
D
Dmitri Smirnov 已提交
642 643 644 645 646 647 648

    *result = Slice(scratch, r);

    return s;
  }

  virtual Status Skip(uint64_t n) override {
649 650 651 652 653 654 655 656 657 658 659
    // Can't handle more than signed max as SetFilePointerEx accepts a signed 64-bit
    // integer. As such it is a highly unlikley case to have n so large.
    if (n > _I64_MAX) {
      return IOErrorFromWindowsError(filename_, ERROR_INVALID_PARAMETER);
    }

    LARGE_INTEGER li;
    li.QuadPart = static_cast<int64_t>(n); //cast is safe due to the check above
    BOOL ret = SetFilePointerEx(file_, li, NULL, FILE_CURRENT);
    if (ret == FALSE) {
      return IOErrorFromWindowsError(filename_, GetLastError());
D
Dmitri Smirnov 已提交
660 661 662 663
    }
    return Status::OK();
  }

S
sdong 已提交
664 665 666
  virtual Status InvalidateCache(size_t offset, size_t length) override {
    return Status::OK();
  }
D
Dmitri Smirnov 已提交
667 668 669
};

// pread() based random-access
S
sdong 已提交
670 671 672 673
class WinRandomAccessFile : public RandomAccessFile {
  const std::string filename_;
  HANDLE hFile_;
  const bool use_os_buffer_;
S
sdong 已提交
674 675 676
  bool read_ahead_;
  const size_t compaction_readahead_size_;
  const size_t random_access_max_buffer_size_;
S
sdong 已提交
677 678 679 680 681
  mutable std::mutex buffer_mut_;
  mutable AlignedBuffer buffer_;
  mutable uint64_t
      buffered_start_;  // file offset set that is currently buffered

682
  /*
S
sdong 已提交
683 684 685 686 687 688
   * The function reads a requested amount of bytes into the specified aligned
   * buffer Upon success the function sets the length of the buffer to the
   * amount of bytes actually read even though it might be less than actually
   * requested. It then copies the amount of bytes requested by the user (left)
   * to the user supplied buffer (dest) and reduces left by the amount of bytes
   * copied to the user buffer
689 690
   *
   * @user_offset [in] - offset on disk where the read was requested by the user
S
sdong 已提交
691 692 693 694 695 696 697 698 699 700
   * @first_page_start [in] - actual page aligned disk offset that we want to
   *                          read from
   * @bytes_to_read [in] - total amount of bytes that will be read from disk
   *                       which is generally greater or equal to the amount
   *                       that the user has requested due to the
   *                       either alignment requirements or read_ahead in
   *                       effect.
   * @left [in/out] total amount of bytes that needs to be copied to the user
   *                buffer. It is reduced by the amount of bytes that actually
   *                copied
701 702 703 704
   * @buffer - buffer to use
   * @dest - user supplied buffer
  */
  SSIZE_T ReadIntoBuffer(uint64_t user_offset, uint64_t first_page_start,
S
sdong 已提交
705 706
                         size_t bytes_to_read, size_t& left,
                         AlignedBuffer& buffer, char* dest) const {
707 708 709
    assert(buffer.CurrentSize() == 0);
    assert(buffer.Capacity() >= bytes_to_read);

S
sdong 已提交
710 711
    SSIZE_T read =
        pread(hFile_, buffer.Destination(), bytes_to_read, first_page_start);
712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729

    if (read > 0) {
      buffer.Size(read);

      // Let's figure out how much we read from the users standpoint
      if ((first_page_start + buffer.CurrentSize()) > user_offset) {
        assert(first_page_start <= user_offset);
        size_t buffer_offset = user_offset - first_page_start;
        read = buffer.Read(dest, buffer_offset, left);
      } else {
        read = 0;
      }
      left -= read;
    }
    return read;
  }

  SSIZE_T ReadIntoOneShotBuffer(uint64_t user_offset, uint64_t first_page_start,
S
sdong 已提交
730 731
                                size_t bytes_to_read, size_t& left,
                                char* dest) const {
732 733 734 735 736
    AlignedBuffer bigBuffer;
    bigBuffer.Alignment(buffer_.Alignment());
    bigBuffer.AllocateNewBuffer(bytes_to_read);

    return ReadIntoBuffer(user_offset, first_page_start, bytes_to_read, left,
S
sdong 已提交
737
                          bigBuffer, dest);
738 739
  }

S
sdong 已提交
740 741 742 743
  SSIZE_T ReadIntoInstanceBuffer(uint64_t user_offset,
                                 uint64_t first_page_start,
                                 size_t bytes_to_read, size_t& left,
                                 char* dest) const {
744
    SSIZE_T read = ReadIntoBuffer(user_offset, first_page_start, bytes_to_read,
S
sdong 已提交
745
                                  left, buffer_, dest);
746 747 748 749 750 751 752 753

    if (read > 0) {
      buffered_start_ = first_page_start;
    }

    return read;
  }

754 755 756 757 758 759 760 761 762 763 764 765
  void CalculateReadParameters(uint64_t offset, size_t bytes_requested,
                                size_t& actual_bytes_toread,
                                uint64_t& first_page_start) const {

    const size_t alignment = buffer_.Alignment();

    first_page_start = TruncateToPageBoundary(alignment, offset);
    const uint64_t last_page_start =
      TruncateToPageBoundary(alignment, offset + bytes_requested - 1);
    actual_bytes_toread = (last_page_start - first_page_start) + alignment;
  }

S
sdong 已提交
766 767 768 769 770 771
 public:
  WinRandomAccessFile(const std::string& fname, HANDLE hFile, size_t alignment,
                      const EnvOptions& options)
      : filename_(fname),
        hFile_(hFile),
        use_os_buffer_(options.use_os_buffer),
772 773 774
        read_ahead_(false),
        compaction_readahead_size_(options.compaction_readahead_size),
        random_access_max_buffer_size_(options.random_access_max_buffer_size),
775
        buffer_(),
S
sdong 已提交
776
        buffered_start_(0) {
D
Dmitri Smirnov 已提交
777 778 779 780
    assert(!options.use_mmap_reads);

    // Unbuffered access, use internal buffer for reads
    if (!use_os_buffer_) {
781 782
      // Do not allocate the buffer either until the first request or
      // until there is a call to allocate a read-ahead buffer
783
      buffer_.Alignment(alignment);
D
Dmitri Smirnov 已提交
784 785
    }
  }
S
sdong 已提交
786 787

  virtual ~WinRandomAccessFile() {
D
Dmitri Smirnov 已提交
788
    if (hFile_ != NULL && hFile_ != INVALID_HANDLE_VALUE) {
S
sdong 已提交
789
      ::CloseHandle(hFile_);
D
Dmitri Smirnov 已提交
790 791 792
    }
  }

S
sdong 已提交
793
  virtual void EnableReadAhead() override { this->Hint(SEQUENTIAL); }
794

S
sdong 已提交
795 796
  virtual Status Read(uint64_t offset, size_t n, Slice* result,
                      char* scratch) const override {
797

D
Dmitri Smirnov 已提交
798 799 800 801 802
    Status s;
    SSIZE_T r = -1;
    size_t left = n;
    char* dest = scratch;

803 804 805 806 807
    if (n == 0) {
      *result = Slice(scratch, 0);
      return s;
    }

D
Dmitri Smirnov 已提交
808 809 810 811 812
    // When in unbuffered mode we need to do the following changes:
    // - use our own aligned buffer
    // - always read at the offset of that is a multiple of alignment
    if (!use_os_buffer_) {

813 814 815
      uint64_t first_page_start = 0;
      size_t actual_bytes_toread = 0;
      size_t bytes_requested = left;
D
Dmitri Smirnov 已提交
816

817 818 819
      if (!read_ahead_ && random_access_max_buffer_size_ == 0) {
        CalculateReadParameters(offset, bytes_requested, actual_bytes_toread,
          first_page_start);
D
Dmitri Smirnov 已提交
820

821
        assert(actual_bytes_toread > 0);
D
Dmitri Smirnov 已提交
822

823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839
        r = ReadIntoOneShotBuffer(offset, first_page_start,
          actual_bytes_toread, left, dest);
      } else {

        std::unique_lock<std::mutex> lock(buffer_mut_);

        // Let's see if at least some of the requested data is already
        // in the buffer
        if (offset >= buffered_start_ &&
          offset < (buffered_start_ + buffer_.CurrentSize())) {
          size_t buffer_offset = offset - buffered_start_;
          r = buffer_.Read(dest, buffer_offset, left);
          assert(r >= 0);

          left -= size_t(r);
          offset += r;
          dest += r;
D
Dmitri Smirnov 已提交
840
        }
S
sdong 已提交
841

842 843 844 845
        // Still some left or none was buffered
        if (left > 0) {
          // Figure out the start/end offset for reading and amount to read
          bytes_requested = left;
D
Dmitri Smirnov 已提交
846

847 848 849 850 851 852 853 854 855 856 857 858 859 860
          if (read_ahead_ && bytes_requested < compaction_readahead_size_) {
            bytes_requested = compaction_readahead_size_;
          }

          CalculateReadParameters(offset, bytes_requested, actual_bytes_toread,
            first_page_start);

          assert(actual_bytes_toread > 0);

          if (buffer_.Capacity() < actual_bytes_toread) {
            // If we are in read-ahead mode or the requested size
            // exceeds max buffer size then use one-shot
            // big buffer otherwise reallocate main buffer
            if (read_ahead_ ||
S
sdong 已提交
861
              (actual_bytes_toread > random_access_max_buffer_size_)) {
862 863 864 865 866 867 868 869 870 871 872 873 874
              // Unlock the mutex since we are not using instance buffer
              lock.unlock();
              r = ReadIntoOneShotBuffer(offset, first_page_start,
                actual_bytes_toread, left, dest);
            }
            else {
              buffer_.AllocateNewBuffer(actual_bytes_toread);
              r = ReadIntoInstanceBuffer(offset, first_page_start,
                actual_bytes_toread, left, dest);
            }
          }
          else {
            buffer_.Clear();
875
            r = ReadIntoInstanceBuffer(offset, first_page_start,
876
              actual_bytes_toread, left, dest);
D
Dmitri Smirnov 已提交
877 878 879 880
          }
        }
      }
    } else {
S
sdong 已提交
881 882 883 884
      r = pread(hFile_, scratch, left, offset);
      if (r > 0) {
        left -= r;
      }
D
Dmitri Smirnov 已提交
885 886 887 888 889 890 891 892 893 894
    }

    *result = Slice(scratch, (r < 0) ? 0 : n - left);

    if (r < 0) {
      s = IOErrorFromLastWindowsError(filename_);
    }
    return s;
  }

895
  virtual bool ShouldForwardRawRequest() const override {
896 897 898
    return true;
  }

899
  virtual void Hint(AccessPattern pattern) override {
S
sdong 已提交
900
    if (pattern == SEQUENTIAL && !use_os_buffer_ &&
901 902 903 904 905 906 907 908
        compaction_readahead_size_ > 0) {
      std::lock_guard<std::mutex> lg(buffer_mut_);
      if (!read_ahead_) {
        read_ahead_ = true;
        // This would allocate read-ahead size + 2 alignments
        // - one for memory alignment which added implicitly by AlignedBuffer
        // - We add one more alignment because we will read one alignment more
        // from disk
S
sdong 已提交
909 910
        buffer_.AllocateNewBuffer(compaction_readahead_size_ +
                                  buffer_.Alignment());
911 912 913 914
      }
    }
  }

S
sdong 已提交
915 916 917
  virtual Status InvalidateCache(size_t offset, size_t length) override {
    return Status::OK();
  }
D
Dmitri Smirnov 已提交
918 919 920
};

// This is a sequential write class. It has been modeled (like the others)
// after the original Posix class. We add support for unbuffered I/O on
// Windows as well: the original buffer is used as an alignment buffer so
// that we can write directly to the file with no buffering.
// Unbuffered access requires that the provided buffer is aligned to the
// physical sector size (SSD page size) and that all SetFilePointer()
// operations occur with such an alignment.
// We thus always write in sector/page size increments to the drive and leave
// the tail for the next write OR for Close(), at which point we pad with
// zeros. No padding is required for buffered access.
class WinWritableFile : public WritableFile {
S
sdong 已提交
933 934
 private:
  const std::string filename_;
935 936 937 938 939 940
  HANDLE            hFile_;
  const bool        use_os_buffer_;  // Used to indicate unbuffered access, the file
  const uint64_t    alignment_;
  // must be opened as unbuffered if false
  uint64_t          filesize_;      // How much data is actually written disk
  uint64_t          reservedsize_;  // how far we have reserved space
D
Dmitri Smirnov 已提交
941

S
sdong 已提交
942 943 944 945 946
 public:
  WinWritableFile(const std::string& fname, HANDLE hFile, size_t alignment,
                  size_t capacity, const EnvOptions& options)
      : filename_(fname),
        hFile_(hFile),
947 948
        use_os_buffer_(options.use_os_buffer),
        alignment_(alignment),
S
sdong 已提交
949
        filesize_(0),
950
        reservedsize_(0) {
S
sdong 已提交
951
    assert(!options.use_mmap_writes);
D
Dmitri Smirnov 已提交
952 953 954 955
  }

  ~WinWritableFile() {
    if (NULL != hFile_ && INVALID_HANDLE_VALUE != hFile_) {
S
sdong 已提交
956
      WinWritableFile::Close();
D
Dmitri Smirnov 已提交
957 958 959
    }
  }

960 961 962 963
  // Indicates if the class makes use of unbuffered I/O
  virtual bool UseOSBuffer() const override {
    return use_os_buffer_;
  }
D
Dmitri Smirnov 已提交
964

965 966 967
  virtual size_t GetRequiredBufferAlignment() const override {
    return alignment_;
  }
D
Dmitri Smirnov 已提交
968

969
  virtual Status Append(const Slice& data) override {
D
Dmitri Smirnov 已提交
970

971 972
    // Used for buffered access ONLY
    assert(use_os_buffer_);
V
Vasili Svirski 已提交
973
    assert(data.size() < std::numeric_limits<DWORD>::max());
D
Dmitri Smirnov 已提交
974

975
    Status s;
D
Dmitri Smirnov 已提交
976

977 978
    DWORD bytesWritten = 0;
    if (!WriteFile(hFile_, data.data(),
V
Vasili Svirski 已提交
979
        static_cast<DWORD>(data.size()), &bytesWritten, NULL)) {
980 981 982 983 984 985 986
      auto lastError = GetLastError();
      s = IOErrorFromWindowsError(
        "Failed to WriteFile: " + filename_,
        lastError);
    } else {
      assert(size_t(bytesWritten) == data.size());
      filesize_ += data.size();
D
Dmitri Smirnov 已提交
987 988
    }

989 990
    return s;
  }
D
Dmitri Smirnov 已提交
991

992
  virtual Status PositionedAppend(const Slice& data, uint64_t offset) override {
993
    Status s;
D
Dmitri Smirnov 已提交
994

I
Islam AbdelRahman 已提交
995
    SSIZE_T ret = pwrite(hFile_, data.data(), data.size(), offset);
D
Dmitri Smirnov 已提交
996

997 998 999 1000 1001
    // Error break
    if (ret < 0) {
      auto lastError = GetLastError();
      s = IOErrorFromWindowsError(
        "Failed to pwrite for: " + filename_, lastError);
D
Dmitri Smirnov 已提交
1002
    } else {
1003 1004 1005 1006
      // With positional write it is not clear at all
      // if this actually extends the filesize
      assert(size_t(ret) == data.size());
      filesize_ += data.size();
D
Dmitri Smirnov 已提交
1007
    }
1008 1009
    return s;
  }
D
Dmitri Smirnov 已提交
1010

1011 1012 1013 1014 1015 1016 1017
  // Need to implement this so the file is truncated correctly
  // when buffered and unbuffered mode
  virtual Status Truncate(uint64_t size) override {
    Status s =  ftruncate(filename_, hFile_, size);
    if (s.ok()) {
      filesize_ = size;
    }
D
Dmitri Smirnov 已提交
1018 1019 1020 1021 1022
    return s;
  }

  virtual Status Close() override {

1023
    Status s;
D
Dmitri Smirnov 已提交
1024

1025
    assert(INVALID_HANDLE_VALUE != hFile_);
D
Dmitri Smirnov 已提交
1026

1027
    if (fsync(hFile_) < 0) {
D
Dmitri Smirnov 已提交
1028
      auto lastError = GetLastError();
S
sdong 已提交
1029
      s = IOErrorFromWindowsError("fsync failed at Close() for: " + filename_,
1030
        lastError);
D
Dmitri Smirnov 已提交
1031 1032 1033
    }

    if (FALSE == ::CloseHandle(hFile_)) {
1034 1035 1036
      auto lastError = GetLastError();
      s = IOErrorFromWindowsError("CloseHandle failed for: " + filename_,
                                  lastError);
D
Dmitri Smirnov 已提交
1037 1038 1039 1040 1041 1042 1043
    }

    hFile_ = INVALID_HANDLE_VALUE;
    return s;
  }

  // write out the cached data to the OS cache
1044
  // This is now taken care of the WritableFileWriter
D
Dmitri Smirnov 已提交
1045
  virtual Status Flush() override {
1046
    return Status::OK();
D
Dmitri Smirnov 已提交
1047 1048 1049
  }

  virtual Status Sync() override {
1050
    Status s;
D
Dmitri Smirnov 已提交
1051
    // Calls flush buffers
1052
    if (fsync(hFile_) < 0) {
S
sdong 已提交
1053 1054 1055
      auto lastError = GetLastError();
      s = IOErrorFromWindowsError("fsync failed at Sync() for: " + filename_,
                                  lastError);
D
Dmitri Smirnov 已提交
1056 1057 1058 1059
    }
    return s;
  }

S
sdong 已提交
1060
  virtual Status Fsync() override { return Sync(); }
D
Dmitri Smirnov 已提交
1061 1062

  virtual uint64_t GetFileSize() override {
1063 1064 1065 1066 1067 1068
    // Double accounting now here with WritableFileWriter
    // and this size will be wrong when unbuffered access is used
    // but tests implement their own writable files and do not use WritableFileWrapper
    // so we need to squeeze a square peg through
    // a round hole here.
    return filesize_;
D
Dmitri Smirnov 已提交
1069 1070
  }

1071
  virtual Status Allocate(uint64_t offset, uint64_t len) override {
D
Dmitri Smirnov 已提交
1072
    Status status;
1073
    TEST_KILL_RANDOM("WinWritableFile::Allocate", rocksdb_kill_odds);
D
Dmitri Smirnov 已提交
1074 1075 1076 1077

    // Make sure that we reserve an aligned amount of space
    // since the reservation block size is driven outside so we want
    // to check if we are ok with reservation here
1078
    size_t spaceToReserve = Roundup(offset + len, alignment_);
D
Dmitri Smirnov 已提交
1079 1080 1081 1082 1083
    // Nothing to do
    if (spaceToReserve <= reservedsize_) {
      return status;
    }

1084
    IOSTATS_TIMER_GUARD(allocate_nanos);
D
Dmitri Smirnov 已提交
1085 1086 1087 1088 1089 1090 1091 1092 1093
    status = fallocate(filename_, hFile_, spaceToReserve);
    if (status.ok()) {
      reservedsize_ = spaceToReserve;
    }
    return status;
  }
};

class WinDirectory : public Directory {
S
sdong 已提交
1094 1095
 public:
  WinDirectory() {}
D
Dmitri Smirnov 已提交
1096

S
sdong 已提交
1097
  virtual Status Fsync() override { return Status::OK(); }
D
Dmitri Smirnov 已提交
1098 1099 1100
};

class WinFileLock : public FileLock {
S
sdong 已提交
1101 1102
 public:
  explicit WinFileLock(HANDLE hFile) : hFile_(hFile) {
D
Dmitri Smirnov 已提交
1103 1104 1105 1106
    assert(hFile != NULL);
    assert(hFile != INVALID_HANDLE_VALUE);
  }

S
sdong 已提交
1107
  ~WinFileLock() {
D
Dmitri Smirnov 已提交
1108 1109 1110 1111
    BOOL ret = ::CloseHandle(hFile_);
    assert(ret);
  }

S
sdong 已提交
1112
 private:
D
Dmitri Smirnov 已提交
1113 1114 1115
  HANDLE hFile_;
};

S
sdong 已提交
1116
namespace {

// Checks the outcome of a threading call. A zero error value is a no-op; any
// other value is reported on stderr, prefixed with `label`, and the process
// is aborted.
void WinthreadCall(const char* label, std::error_code result) {
  const int code = result.value();
  if (code == 0) {
    return;
  }
  fprintf(stderr, "pthread %s: %s\n", label, strerror(code));
  abort();
}
}  // namespace

1126 1127
typedef VOID(WINAPI * FnGetSystemTimePreciseAsFileTime)(LPFILETIME);

D
Dmitri Smirnov 已提交
1128
class WinEnv : public Env {
S
sdong 已提交
1129
 public:
D
Dmitri Smirnov 已提交
1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166
  WinEnv();

  // Joins every thread recorded in threads_to_join_, then the threads of each
  // pool, and only then destroys the thread status updater.
  virtual ~WinEnv() {
    for (auto& pending : threads_to_join_) {
      pending.join();
    }
    threads_to_join_.clear();

    for (auto& pool : thread_pools_) {
      pool.JoinAllThreads();
    }

    // All threads must be joined before the deletion of
    // thread_status_updater_.
    delete thread_status_updater_;
  }

  // Removes the file `fname` with _unlink().
  // Returns OK on success, otherwise an IOError built from errno.
  virtual Status DeleteFile(const std::string& fname) override {
    Status result;

    if (_unlink(fname.c_str()) != 0) {
      // Read errno before the message is built: the string concatenation may
      // allocate, and the order in which call arguments are evaluated is
      // unspecified, so errno could otherwise be overwritten before it is
      // read. CreateDir()/DeleteDir() capture it the same way.
      const auto code = errno;
      result = IOError("Failed to delete: " + fname, code);
    }

    return result;
  }

  // Stores the value of std::time() in `*unix_time`.
  // Returns NotSupported, leaving `*unix_time` untouched, when std::time()
  // reports failure.
  Status GetCurrentTime(int64_t* unix_time) override {
    const time_t now = std::time(nullptr);
    if (now != (time_t)(-1)) {
      *unix_time = now;
      return Status::OK();
    }
    return Status::NotSupported("Failed to get time");
  }

S
sdong 已提交
1167 1168 1169
  // Opens the existing file `fname` for reading and wraps the handle in a
  // WinSequentialFile. `*result` is reset first and stays empty on failure.
  virtual Status NewSequentialFile(const std::string& fname,
                                   std::unique_ptr<SequentialFile>* result,
                                   const EnvOptions& options) override {
    result->reset();

    // Corruption test needs to rename and delete files of these kind
    // while they are still open with another handle. For that reason we
    // allow share_write and delete(allows rename).
    const DWORD share_mode =
        FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE;

    HANDLE handle = INVALID_HANDLE_VALUE;
    {
      IOSTATS_TIMER_GUARD(open_nanos);
      handle = CreateFileA(fname.c_str(), GENERIC_READ, share_mode, NULL,
                           OPEN_EXISTING,  // Original fopen mode is "rb"
                           FILE_ATTRIBUTE_NORMAL, NULL);
    }

    if (handle == INVALID_HANDLE_VALUE) {
      const auto err = GetLastError();
      return IOErrorFromWindowsError("Failed to open NewSequentialFile" + fname,
                                     err);
    }

    result->reset(new WinSequentialFile(fname, handle, options));
    return Status::OK();
  }

S
sdong 已提交
1197 1198 1199
  // Opens the existing file `fname` for read-only random access.
  // When mmap reads are requested and pointers are at least 8 bytes wide the
  // whole file is mapped and a WinMmapReadableFile is produced; otherwise a
  // WinRandomAccessFile is produced. `*result` is reset first and stays empty
  // on any failure; handles opened along the way are closed by their guards.
  virtual Status NewRandomAccessFile(const std::string& fname,
                                     std::unique_ptr<RandomAccessFile>* result,
                                     const EnvOptions& options) override {
    result->reset();

    // Open the file for read-only random access
    // Random access is to disable read-ahead as the system reads too much data
    const bool unbuffered = !options.use_os_buffer && !options.use_mmap_reads;
    DWORD open_flags = FILE_ATTRIBUTE_READONLY;
    open_flags |= unbuffered ? FILE_FLAG_NO_BUFFERING : FILE_FLAG_RANDOM_ACCESS;

    // Shared access is necessary for corruption test to pass
    // almost all tests would work with a possible exception of fault_injection
    HANDLE handle = 0;
    {
      IOSTATS_TIMER_GUARD(open_nanos);
      handle =
          CreateFileA(fname.c_str(), GENERIC_READ,
                      FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
                      NULL, OPEN_EXISTING, open_flags, NULL);
    }

    if (handle == INVALID_HANDLE_VALUE) {
      const auto err = GetLastError();
      return IOErrorFromWindowsError(
          "NewRandomAccessFile failed to Create/Open: " + fname, err);
    }

    UniqueCloseHandlePtr file_guard(handle, CloseHandleFunc);

    // Use mmap when virtual address-space is plentiful.
    const bool map_whole_file = options.use_mmap_reads && sizeof(void*) >= 8;
    if (!map_whole_file) {
      result->reset(
          new WinRandomAccessFile(fname, handle, page_size_, options));
      file_guard.release();
      return Status::OK();
    }

    // CAUTION! This will map the entire file into the process address space
    uint64_t file_size;
    Status s = GetFileSize(fname, &file_size);
    if (!s.ok()) {
      return s;
    }

    // Will not map empty files
    if (file_size == 0) {
      return IOError("NewRandomAccessFile failed to map empty file: " + fname,
                     EINVAL);
    }

    HANDLE mapping = CreateFileMappingA(handle, NULL, PAGE_READONLY,
                                        0,  // Whole file at its present length
                                        0,
                                        NULL);  // Mapping name
    if (!mapping) {
      const auto err = GetLastError();
      return IOErrorFromWindowsError(
          "Failed to create file mapping for NewRandomAccessFile: " + fname,
          err);
    }

    UniqueCloseHandlePtr mapping_guard(mapping, CloseHandleFunc);

    const void* view = MapViewOfFileEx(mapping, FILE_MAP_READ,
                                       0,  // High DWORD of access start
                                       0,  // Low DWORD
                                       file_size,
                                       NULL);  // Let the OS choose the mapping
    if (!view) {
      const auto err = GetLastError();
      return IOErrorFromWindowsError(
          "Failed to MapViewOfFile for NewRandomAccessFile: " + fname, err);
    }

    result->reset(
        new WinMmapReadableFile(fname, handle, mapping, view, file_size));

    // The new file object received both handles; stop guarding them.
    mapping_guard.release();
    file_guard.release();
    return s;
  }

S
sdong 已提交
1287 1288 1289
  // Creates (or truncates) `fname` for writing and wraps the handle in a
  // WinMmapFile when mmap writes are requested, otherwise in a
  // WinWritableFile. `*result` is reset first and stays empty on failure.
  virtual Status NewWritableFile(const std::string& fname,
                                 std::unique_ptr<WritableFile>* result,
                                 const EnvOptions& options) override {
    const size_t kBufferCapacity = 64 * 1024;

    EnvOptions local_options(options);
    result->reset();

    const bool mmap_writes = local_options.use_mmap_writes;
    const bool unbuffered = !local_options.use_os_buffer && !mmap_writes;
    const DWORD open_flags =
        unbuffered ? FILE_FLAG_NO_BUFFERING : FILE_ATTRIBUTE_NORMAL;

    // Desired access. We want to write only here, but if we want to memory
    // map the file then there is no write-only mode so we have to create it
    // Read/Write.
    // However, MapViewOfFile specifies only Write only
    DWORD desired_access = GENERIC_WRITE;
    DWORD shared_mode = FILE_SHARE_READ;
    if (mmap_writes) {
      desired_access |= GENERIC_READ;
    } else {
      // Adding this solely for tests to pass (fault_injection_test,
      // wal_manager_test).
      shared_mode |= (FILE_SHARE_WRITE | FILE_SHARE_DELETE);
    }

    HANDLE handle = 0;
    {
      IOSTATS_TIMER_GUARD(open_nanos);
      handle = CreateFileA(
          fname.c_str(),
          desired_access,  // Access desired
          shared_mode,
          NULL,           // Security attributes
          CREATE_ALWAYS,  // Posix env says O_CREAT | O_RDWR | O_TRUNC
          open_flags,     // Flags
          NULL);          // Template File
    }

    if (handle == INVALID_HANDLE_VALUE) {
      const auto err = GetLastError();
      return IOErrorFromWindowsError(
          "Failed to create a NewWriteableFile: " + fname, err);
    }

    if (options.use_mmap_writes) {
      // We usually do not use mmmapping on SSD and thus we pass memory
      // page_size
      result->reset(new WinMmapFile(fname, handle, page_size_,
                                    allocation_granularity_, local_options));
      return Status::OK();
    }

    // Here we want the buffer allocation to be aligned by the SSD page size
    // and to be a multiple of it
    result->reset(new WinWritableFile(fname, handle, page_size_,
                                      kBufferCapacity, local_options));
    return Status::OK();
  }

S
sdong 已提交
1352 1353
  // Produces a WinDirectory for `name`.
  // `*result` is reset first and stays empty (nullptr) on failure. Fails with
  // an IOError when DirExists() reports that `name` is not there.
  virtual Status NewDirectory(const std::string& name,
                              std::unique_ptr<Directory>* result) override {
    Status s;
    // Must be nullptr on failure
    result->reset();
    // Must fail if directory does not exist
    if (!DirExists(name)) {
      // ENOENT matches the condition being reported; the previous EEXIST
      // rendered as "File exists", contradicting the message.
      s = IOError("Directory does not exist: " + name, ENOENT);
    } else {
      IOSTATS_TIMER_GUARD(open_nanos);
      result->reset(new WinDirectory);
    }
    return s;
  }

A
agiardullo 已提交
1367
  // Returns OK when _access() succeeds for `fname` in existence-only mode,
  // NotFound otherwise.
  virtual Status FileExists(const std::string& fname) override {
    const int kExistenceOnly = 0;  // F_OK == 0
    if (_access(fname.c_str(), kExistenceOnly) != 0) {
      return Status::NotFound();
    }
    return Status::OK();
  }

S
sdong 已提交
1374 1375
  // Replaces `*result` with the names readdir() yields for `dir`.
  // When the directory cannot be opened, `*result` ends up empty and an
  // IOError built from errno is returned.
  virtual Status GetChildren(const std::string& dir,
                             std::vector<std::string>* result) override {
    std::vector<std::string> names;
    Status status;

    auto close_dir = [](DIR* d) { closedir(d); };
    std::unique_ptr<DIR, decltype(close_dir)> handle(opendir(dir.c_str()),
                                                     close_dir);

    if (handle) {
      // Reuse the caller's capacity as a size estimate.
      if (result->capacity() > 0) {
        names.reserve(result->capacity());
      }

      for (struct dirent* entry = readdir(handle.get()); entry != nullptr;
           entry = readdir(handle.get())) {
        names.push_back(entry->d_name);
      }
    } else {
      status = IOError(dir, errno);
    }

    result->swap(names);
    return status;
  }

  // Creates directory `name` with _mkdir().
  // Any failure is returned as an IOError built from errno.
  virtual Status CreateDir(const std::string& name) override {
    if (_mkdir(name.c_str()) == 0) {
      return Status::OK();
    }
    const auto code = errno;
    return IOError("Failed to create dir: " + name, code);
  }

  // Creates directory `name` unless it already exists.
  // Returns OK when the directory exists on return (whether or not this call
  // created it), an IOError when `name` exists but is not a directory, and an
  // IOError built from errno for any other _mkdir() failure.
  virtual Status CreateDirIfMissing(const std::string& name) override {
    Status result;

    if (DirExists(name)) {
      return result;
    }

    if (_mkdir(name.c_str()) != 0) {
      const auto code = errno;
      if (code != EEXIST) {
        result = IOError("Failed to create dir: " + name, code);
      } else if (!DirExists(name)) {
        // EEXIST alone does not prove a non-directory is in the way: someone
        // else may have created the directory between the DirExists() check
        // above and _mkdir(). Only report an error if it still is not a
        // directory now.
        result =
            Status::IOError("`" + name + "' exists but is not a directory");
      }
    }

    return result;
  }

  // Removes directory `name` with _rmdir().
  // Any failure is returned as an IOError built from errno.
  virtual Status DeleteDir(const std::string& name) override {
    if (_rmdir(name.c_str()) == 0) {
      return Status::OK();
    }
    const auto code = errno;
    return IOError("Failed to remove dir: " + name, code);
  }

S
sdong 已提交
1443 1444 1445
  // Stores the size of `fname`, as reported by GetFileAttributesExA(), in
  // `*size`. On failure `*size` is left untouched and the Windows error is
  // returned.
  virtual Status GetFileSize(const std::string& fname,
                             uint64_t* size) override {
    WIN32_FILE_ATTRIBUTE_DATA attrs;
    if (!GetFileAttributesExA(fname.c_str(), GetFileExInfoStandard, &attrs)) {
      const auto err = GetLastError();
      return IOErrorFromWindowsError("Can not get size for: " + fname, err);
    }

    // Combine the two 32-bit halves into one 64-bit value.
    ULARGE_INTEGER combined;
    combined.HighPart = attrs.nFileSizeHigh;
    combined.LowPart = attrs.nFileSizeLow;
    *size = combined.QuadPart;
    return Status::OK();
  }

  static inline uint64_t FileTimeToUnixTime(const FILETIME& ftTime) {
    const uint64_t c_FileTimePerSecond = 10000000U;
    // UNIX epoch starts on 1970-01-01T00:00:00Z
    // Windows FILETIME starts on 1601-01-01T00:00:00Z
    // Therefore, we need to subtract the below number of seconds from
S
sdong 已提交
1465 1466
    // the seconds that we obtain from FILETIME with an obvious loss of
    // precision
D
Dmitri Smirnov 已提交
1467 1468 1469 1470 1471 1472
    const uint64_t c_SecondBeforeUnixEpoch = 11644473600U;

    ULARGE_INTEGER li;
    li.HighPart = ftTime.dwHighDateTime;
    li.LowPart = ftTime.dwLowDateTime;

S
sdong 已提交
1473 1474
    uint64_t result =
        (li.QuadPart / c_FileTimePerSecond) - c_SecondBeforeUnixEpoch;
D
Dmitri Smirnov 已提交
1475 1476 1477
    return result;
  }

S
sdong 已提交
1478 1479
  // Stores the last-write time of `fname`, in seconds since the UNIX epoch,
  // in `*file_mtime`. On failure `*file_mtime` is set to 0 and the Windows
  // error is returned.
  virtual Status GetFileModificationTime(const std::string& fname,
                                         uint64_t* file_mtime) override {
    WIN32_FILE_ATTRIBUTE_DATA attrs;
    if (GetFileAttributesExA(fname.c_str(), GetFileExInfoStandard, &attrs)) {
      *file_mtime = FileTimeToUnixTime(attrs.ftLastWriteTime);
      return Status::OK();
    }

    const auto err = GetLastError();
    const Status failure = IOErrorFromWindowsError(
        "Can not get file modification time for: " + fname, err);
    *file_mtime = 0;
    return failure;
  }

S
sdong 已提交
1495 1496
  // Renames `src` to `target`, replacing `target` if it already exists.
  virtual Status RenameFile(const std::string& src,
                            const std::string& target) override {
    // rename() is not capable of replacing the existing file as on Linux
    // so use OS API directly
    if (MoveFileExA(src.c_str(), target.c_str(), MOVEFILE_REPLACE_EXISTING)) {
      return Status::OK();
    }

    const DWORD err = GetLastError();
    const std::string message = "Failed to rename: " + src + " to: " + target;
    return IOErrorFromWindowsError(message, err);
  }

S
sdong 已提交
1513 1514
  // Creates `target` as a hard link to `src` with CreateHardLinkA().
  virtual Status LinkFile(const std::string& src,
                          const std::string& target) override {
    if (CreateHardLinkA(target.c_str(), src.c_str(), NULL)) {
      return Status::OK();
    }

    const DWORD err = GetLastError();
    const std::string message = "Failed to link: " + src + " to: " + target;
    return IOErrorFromWindowsError(message, err);
  }

S
sdong 已提交
1529 1530
  // Creates `lockFname` with no sharing allowed and hands back a WinFileLock
  // that owns the handle. `*lock` is NULL when the call fails.
  virtual Status LockFile(const std::string& lockFname,
                          FileLock** lock) override {
    assert(lock != nullptr);
    *lock = NULL;

    // No-sharing, this is a LOCK file
    const DWORD kExclusiveAccess = 0;

    // Obtain exclusive access to the LOCK file
    // Previously, instead of NORMAL attr we set DELETE on close and that worked
    // well except with fault_injection test that insists on deleting it.
    HANDLE handle = 0;
    {
      IOSTATS_TIMER_GUARD(open_nanos);
      handle = CreateFileA(lockFname.c_str(), (GENERIC_READ | GENERIC_WRITE),
                           kExclusiveAccess, NULL, CREATE_ALWAYS,
                           FILE_ATTRIBUTE_NORMAL, NULL);
    }

    if (handle == INVALID_HANDLE_VALUE) {
      const auto err = GetLastError();
      return IOErrorFromWindowsError(
          "Failed to create lock file: " + lockFname, err);
    }

    *lock = new WinFileLock(handle);
    return Status::OK();
  }

  // Destroys `lock`; always reports success.
  virtual Status UnlockFile(FileLock* lock) override {
    assert(lock != nullptr);
    delete lock;
    return Status::OK();
  }

S
sdong 已提交
1571
  virtual void Schedule(void (*function)(void*), void* arg, Priority pri = LOW,
1572 1573
                        void* tag = nullptr,
                        void (*unschedFunction)(void* arg) = 0) override;
S
sdong 已提交
1574

D
Dmitri Smirnov 已提交
1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589
  virtual int UnSchedule(void* arg, Priority pri) override;

  virtual void StartThread(void (*function)(void* arg), void* arg) override;

  virtual void WaitForJoin() override;

  virtual unsigned int GetThreadPoolQueueLen(Priority pri = LOW) const override;

  // Builds "<base>\testrocksdb-<pid>" and stores it in `*result`.
  // <base> is $TEST_TMPDIR if set and non-empty, else $TMP if set and
  // non-empty, else "c:\tmp". Both the base and the final directory are
  // created on a best-effort basis: the outcome of CreateDir() is ignored and
  // the call always returns OK.
  virtual Status GetTestDirectory(std::string* result) override {
    auto is_usable = [](const char* value) {
      return value && value[0] != '\0';
    };

    std::string path;
    const char* test_tmpdir = getenv("TEST_TMPDIR");
    if (is_usable(test_tmpdir)) {
      path = test_tmpdir;
    } else {
      const char* tmp = getenv("TMP");
      path = is_usable(tmp) ? tmp : "c:\\tmp";
    }
    CreateDir(path);

    path += "\\testrocksdb-";
    path += std::to_string(_getpid());
    CreateDir(path);

    result->swap(path);
    return Status::OK();
  }

  // Forwards to the thread status updater, which must be set.
  virtual Status GetThreadList(
      std::vector<ThreadStatus>* thread_list) override {
    assert(thread_status_updater_);
    const Status listed = thread_status_updater_->GetThreadList(thread_list);
    return listed;
  }

  // Value of GetCurrentThreadId() for the calling thread, widened to 64 bits.
  static uint64_t gettid() {
    return static_cast<uint64_t>(GetCurrentThreadId());
  }

S
sdong 已提交
1623
  // Same value as gettid().
  virtual uint64_t GetThreadID() const override {
    const uint64_t current = gettid();
    return current;
  }
D
Dmitri Smirnov 已提交
1624

S
sdong 已提交
1625 1626
  // Creates (or truncates) `fname` and wraps the handle in a WinLogger.
  // `*result` is reset first and stays empty on failure.
  virtual Status NewLogger(const std::string& fname,
                           std::shared_ptr<Logger>* result) override {
    result->reset();

    // In RocksDb log files are renamed and deleted before they are closed.
    // Sharing for read and delete enables doing so.
    const DWORD share_mode = FILE_SHARE_READ | FILE_SHARE_DELETE;

    HANDLE log_handle = 0;
    {
      IOSTATS_TIMER_GUARD(open_nanos);
      log_handle = CreateFileA(fname.c_str(), GENERIC_WRITE, share_mode, NULL,
                               CREATE_ALWAYS,  // Original fopen mode is "w"
                               FILE_ATTRIBUTE_NORMAL, NULL);
    }

    if (log_handle == INVALID_HANDLE_VALUE) {
      const auto err = GetLastError();
      return IOErrorFromWindowsError("Failed to open LogFile" + fname, err);
    }

    // With log files we want to set the true creation time as of now because
    // the system for some reason caches the attributes of the previous file
    // that has just been renamed from this name, so auto_roll_logger_test
    // fails.
    FILETIME now;
    GetSystemTimeAsFileTime(&now);
    // Set creation, last access and last write time to the same value
    SetFileTime(log_handle, &now, &now, &now);

    result->reset(new WinLogger(&WinEnv::gettid, this, log_handle));
    return Status::OK();
  }

  virtual uint64_t NowMicros() override {
1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688
    if (GetSystemTimePreciseAsFileTime_ != NULL) {
      // all std::chrono clocks on windows proved to return
      // values that may repeat that is not good enough for some uses.
      const int64_t c_UnixEpochStartTicks = 116444736000000000i64;
      const int64_t c_FtToMicroSec = 10;

      // This interface needs to return system time and not
      // just any microseconds because it is often used as an argument
      // to TimedWait() on condition variable
      FILETIME ftSystemTime;
      GetSystemTimePreciseAsFileTime_(&ftSystemTime);

      LARGE_INTEGER li;
      li.LowPart = ftSystemTime.dwLowDateTime;
      li.HighPart = ftSystemTime.dwHighDateTime;
      // Subtract unix epoch start
      li.QuadPart -= c_UnixEpochStartTicks;
      // Convert to microsecs
      li.QuadPart /= c_FtToMicroSec;
      return li.QuadPart;
    }
    using namespace std::chrono;
    return duration_cast<microseconds>(system_clock::now().time_since_epoch()).count();
D
Dmitri Smirnov 已提交
1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704
  }

  // Returns the performance counter converted to nanoseconds, i.e.
  // counter * 1e9 / perf_counter_frequency_ (rounded down).
  virtual uint64_t NowNanos() override {
    // all std::chrono clocks on windows have the same resolution that is only
    // good enough for microseconds but not nanoseconds
    // On Windows 8 and Windows 2012 Server
    // GetSystemTimePreciseAsFileTime(&current_time) can be used
    LARGE_INTEGER li;
    QueryPerformanceCounter(&li);

    const uint64_t ticks = static_cast<uint64_t>(li.QuadPart);
    const uint64_t frequency = static_cast<uint64_t>(perf_counter_frequency_);
    const uint64_t nanos_per_second = static_cast<uint64_t>(std::nano::den);

    // Multiplying the raw counter by 1e9 overflows 64 bits once the counter
    // passes roughly 9.2e9 ticks (minutes of uptime at common frequencies).
    // Scale whole seconds and the sub-second remainder separately instead;
    // the remainder is below `frequency`, so its product stays in range and
    // the sum equals the rounded-down exact quotient.
    const uint64_t whole_seconds = ticks / frequency;
    const uint64_t leftover_ticks = ticks % frequency;
    return whole_seconds * nanos_per_second +
           (leftover_ticks * nanos_per_second) / frequency;
  }

S
sdong 已提交
1705
  // Blocks the calling thread for `micros` microseconds via
  // std::this_thread::sleep_for().
  virtual void SleepForMicroseconds(int micros) override {
    const std::chrono::microseconds pause(micros);
    std::this_thread::sleep_for(pause);
  }

  // Writes the name reported by GetComputerNameA() into `name`, a buffer of
  // `len` bytes, and terminates it at the length the API reports.
  // Returns the Windows error when the API call fails.
  virtual Status GetHostName(char* name, uint64_t len) override {
    // The API takes the buffer size as a DWORD, so clamp larger values.
    const uint64_t clamped =
        std::min<uint64_t>(len, std::numeric_limits<DWORD>::max());
    DWORD nSize = static_cast<DWORD>(clamped);

    if (::GetComputerNameA(name, &nSize)) {
      name[nSize] = 0;
      return Status::OK();
    }

    const auto err = GetLastError();
    return IOErrorFromWindowsError("GetHostName", err);
  }

  // Stores the value of time() in `*unix_time`.
  // When time() reports failure, `*unix_time` is set to 0 and an IOError
  // built from errno is returned.
  virtual Status GetCurrTime(int64_t* unix_time) {
    const time_t now = time(nullptr);
    if (now != (time_t)-1) {
      *unix_time = (int64_t)now;
      return Status::OK();
    }

    *unix_time = 0;
    return IOError("GetCurrTime", errno);
  }

S
sdong 已提交
1738 1739
  // If `db_path` already looks absolute it is copied to `*output_path`
  // unchanged; otherwise `*output_path` receives the current working
  // directory as reported by _getcwd().
  virtual Status GetAbsolutePath(const std::string& db_path,
                                 std::string* output_path) override {
    // Check if we already have an absolute path: one that starts with a
    // slash or backslash, or one that does not start with a dot and has a
    // colon followed by a slash or backslash right after the first character.
    const bool starts_at_root =
        !db_path.empty() && (db_path[0] == '/' || db_path[0] == '\\');
    const bool has_drive_prefix =
        db_path.size() > 2 && db_path[0] != '.' && db_path[1] == ':' &&
        (db_path[2] == '\\' || db_path[2] == '/');

    if (starts_at_root || has_drive_prefix) {
      *output_path = db_path;
      return Status::OK();
    }

    std::string cwd;
    cwd.resize(_MAX_PATH);

    if (_getcwd(&cwd[0], _MAX_PATH) == nullptr) {
      return Status::IOError("Failed to get current working directory",
                             strerror(errno));
    }

    // Trim the buffer down to the text that was actually written.
    cwd.resize(strlen(cwd.data()));

    output_path->swap(cwd);
    return Status::OK();
  }

  // Allows changing the number of worker threads of the pool that serves
  // priority `pri`.
  virtual void SetBackgroundThreads(int num, Priority pri) override {
    assert(Priority::LOW <= pri && pri <= Priority::HIGH);
    auto& pool = thread_pools_[pri];
    pool.SetBackgroundThreads(num);
  }

  // Forwards to IncBackgroundThreadsIfNeeded(num) on the pool that serves
  // priority `pri`.
  virtual void IncBackgroundThreadsIfNeeded(int num, Priority pri) override {
    assert(Priority::LOW <= pri && pri <= Priority::HIGH);
    auto& pool = thread_pools_[pri];
    pool.IncBackgroundThreadsIfNeeded(num);
  }

  virtual std::string TimeToString(uint64_t secondsSince1970) override {
    std::string result;

    const time_t seconds = secondsSince1970;
    const int maxsize = 64;

    struct tm t;
    errno_t ret = localtime_s(&t, &seconds);

    if (ret) {
      result = std::to_string(seconds);
    } else {
      result.resize(maxsize);
      char* p = &result[0];

S
sdong 已提交
1791 1792 1793
      int len = snprintf(p, maxsize, "%04d/%02d/%02d-%02d:%02d:%02d ",
                         t.tm_year + 1900, t.tm_mon + 1, t.tm_mday, t.tm_hour,
                         t.tm_min, t.tm_sec);
D
Dmitri Smirnov 已提交
1794 1795 1796 1797 1798 1799 1800 1801
      assert(len > 0);

      result.resize(len);
    }

    return result;
  }

S
sdong 已提交
1802 1803
  // Returns a copy of `env_options` adjusted for writing log (WAL) files:
  // mmap writes off, OS buffering on, bytes_per_sync taken from
  // db_options.wal_bytes_per_sync, and fallocate_with_keep_size on.
  EnvOptions OptimizeForLogWrite(const EnvOptions& env_options,
                                 const DBOptions& db_options) const override {
    EnvOptions log_options(env_options);
    log_options.bytes_per_sync = db_options.wal_bytes_per_sync;
    log_options.use_mmap_writes = false;
    // OS buffering stays on because we flush only whole pages on unbuffered
    // io and the last records are not guaranteed to be flushed.
    log_options.use_os_buffer = true;
    // TODO(icanadi) it's faster if fallocate_with_keep_size is false, but it
    // breaks TransactionLogIteratorStallAtLastRecord unit test. Fix the unit
    // test and make this false
    log_options.fallocate_with_keep_size = true;
    return log_options;
  }

S
sdong 已提交
1817 1818
  // Returns a copy of `env_options` adjusted for writing manifest files:
  // mmap writes off, OS buffering on, fallocate_with_keep_size on.
  EnvOptions OptimizeForManifestWrite(
      const EnvOptions& env_options) const override {
    EnvOptions manifest_options(env_options);
    manifest_options.fallocate_with_keep_size = true;
    manifest_options.use_os_buffer = true;
    manifest_options.use_mmap_writes = false;
    return manifest_options;
  }

 private:
S
sdong 已提交
1827 1828 1829 1830 1831
  // Returns true iff the named directory exists and is a directory.
  virtual bool DirExists(const std::string& dname) {
    WIN32_FILE_ATTRIBUTE_DATA attrs;
    if (GetFileAttributesExA(dname.c_str(), GetFileExInfoStandard, &attrs)) {
      return 0 != (attrs.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY);
D
Dmitri Smirnov 已提交
1832
    }
S
sdong 已提交
1833 1834
    return false;
  }
D
Dmitri Smirnov 已提交
1835

S
sdong 已提交
1836
  // Always answers false, regardless of the path supplied.
  bool SupportsFastAllocate(const std::string& /* path */) {
    return false;
  }
D
Dmitri Smirnov 已提交
1837

S
sdong 已提交
1838 1839 1840 1841 1842 1843 1844 1845
  // Starts out false (see constructor).
  // NOTE(review): presumably records that the mmap capability check has been
  // performed; its readers are outside this section - confirm.
  bool checkedDiskForMmap_;
  bool forceMmapOff;  // do we override Env options?
  // System page size; the constructor loads it from SYSTEM_INFO::dwPageSize.
  size_t page_size_;
  // The constructor loads it from SYSTEM_INFO::dwAllocationGranularity.
  size_t allocation_granularity_;
  // Performance-counter ticks per second, obtained from
  // QueryPerformanceFrequency in the constructor; NowNanos() uses it to
  // convert counter readings to nanoseconds.
  uint64_t perf_counter_frequency_;
  // One pool per Env::Priority value (sized Priority::TOTAL), indexed by
  // priority.
  std::vector<ThreadPool> thread_pools_;
  // Held by StartThread() while it appends to threads_to_join_.
  mutable std::mutex mu_;
  // Threads launched through StartThread(); WaitForJoin() joins and clears
  // them.
  std::vector<std::thread> threads_to_join_;
1846
  // Address of kernel32's GetSystemTimePreciseAsFileTime, looked up with
  // GetProcAddress in the constructor; stays NULL when the lookup is skipped
  // or fails.
  FnGetSystemTimePreciseAsFileTime GetSystemTimePreciseAsFileTime_;
D
Dmitri Smirnov 已提交
1847 1848
};

S
sdong 已提交
1849 1850 1851 1852 1853 1854
// Builds the Windows Env: resolves the optional precise-time API, caches the
// system page size / allocation granularity and the performance-counter
// frequency, and prepares one thread pool per priority.
WinEnv::WinEnv()
    : checkedDiskForMmap_(false),
      forceMmapOff(false),
      // Placeholder of 4 KiB; replaced below by the value from GetSystemInfo.
      page_size_(4 * 1024),
      allocation_granularity_(page_size_),
      perf_counter_frequency_(0),
      thread_pools_(Priority::TOTAL),
      GetSystemTimePreciseAsFileTime_(NULL) {
  // GetSystemTimePreciseAsFileTime is looked up at run time so that the
  // pointer simply stays NULL where kernel32 does not export it. The ANSI
  // entry point is named explicitly so the narrow string literal also
  // compiles when UNICODE is defined.
  HMODULE module = GetModuleHandleA("kernel32.dll");
  if (module != NULL) {
    GetSystemTimePreciseAsFileTime_ =
        reinterpret_cast<FnGetSystemTimePreciseAsFileTime>(
            GetProcAddress(module, "GetSystemTimePreciseAsFileTime"));
  }

  SYSTEM_INFO sinfo;
  GetSystemInfo(&sinfo);

  page_size_ = sinfo.dwPageSize;
  allocation_granularity_ = sinfo.dwAllocationGranularity;

  {
    LARGE_INTEGER qpf;
    BOOL ret = QueryPerformanceFrequency(&qpf);
    // Any non-zero BOOL means success, so test against FALSE rather than TRUE.
    assert(ret != FALSE);
    (void)ret;  // only read by the assert; avoids an unused warning in NDEBUG
    perf_counter_frequency_ = qpf.QuadPart;
  }

  for (int pool_id = 0; pool_id < Env::Priority::TOTAL; ++pool_id) {
    thread_pools_[pool_id].SetThreadPriority(
        static_cast<Env::Priority>(pool_id));
    // This allows later initializing the thread-local-env of each thread.
    thread_pools_[pool_id].SetHostEnv(this);
  }

  // Protected member of the base class
  thread_status_updater_ = CreateThreadStatusUpdater();
}

S
sdong 已提交
1888
void WinEnv::Schedule(void (*function)(void*), void* arg, Priority pri,
1889
                      void* tag, void (*unschedFunction)(void* arg)) {
D
Dmitri Smirnov 已提交
1890
  assert(pri >= Priority::LOW && pri <= Priority::HIGH);
1891
  thread_pools_[pri].Schedule(function, arg, tag, unschedFunction);
D
Dmitri Smirnov 已提交
1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902
}

// Forwards to UnSchedule(arg) on the thread pool that serves priority `pri`
// and returns that pool's result.
int WinEnv::UnSchedule(void* arg, Priority pri) {
  // Same range check as the other methods that index thread_pools_ by
  // priority; an out-of-range value would index past the vector.
  assert(pri >= Priority::LOW && pri <= Priority::HIGH);
  return thread_pools_[pri].UnSchedule(arg);
}

// Returns GetQueueLen() of the thread pool that serves priority `pri`.
unsigned int WinEnv::GetThreadPoolQueueLen(Priority pri) const {
  assert(Priority::LOW <= pri && pri <= Priority::HIGH);
  const auto& pool = thread_pools_[pri];
  return pool.GetQueueLen();
}

S
sdong 已提交
1903
namespace {

// Bundle handed to a newly started thread: the callback to run and the
// opaque argument to pass to it.
struct StartThreadState {
  // Entry point supplied by the caller of StartThread.
  void (*user_function)(void*);
  // Argument forwarded verbatim to user_function.
  void* arg;
};

}  // namespace

static void* StartThreadWrapper(void* arg) {
S
sdong 已提交
1911 1912
  std::unique_ptr<StartThreadState> state(
      reinterpret_cast<StartThreadState*>(arg));
D
Dmitri Smirnov 已提交
1913 1914 1915 1916 1917 1918 1919 1920 1921
  state->user_function(state->arg);
  return nullptr;
}

void WinEnv::StartThread(void (*function)(void* arg), void* arg) {
  StartThreadState* state = new StartThreadState;
  state->user_function = function;
  state->arg = arg;
  try {
S
sdong 已提交
1922
    std::thread th(&StartThreadWrapper, state);
D
Dmitri Smirnov 已提交
1923

S
sdong 已提交
1924 1925
    std::lock_guard<std::mutex> lg(mu_);
    threads_to_join_.push_back(std::move(th));
D
Dmitri Smirnov 已提交
1926

S
sdong 已提交
1927 1928
  } catch (const std::system_error& ex) {
    WinthreadCall("start thread", ex.code());
D
Dmitri Smirnov 已提交
1929 1930 1931 1932 1933
  }
}

// Joins every thread recorded in threads_to_join_, in insertion order, and
// then empties the list.
void WinEnv::WaitForJoin() {
  std::for_each(threads_to_join_.begin(), threads_to_join_.end(),
                [](std::thread& worker) { worker.join(); });
  threads_to_join_.clear();
}

}  // namespace

// Returns the string form of a UUID obtained from UuidCreateSequential.
// If the UUID cannot be converted to a string, an empty string is returned
// (debug builds assert first).
std::string Env::GenerateUniqueId() {
  std::string result;

  UUID uuid;
  UuidCreateSequential(&uuid);

  // Start from nullptr so a failed conversion can never leave a garbage
  // pointer behind for the code below to read.
  RPC_CSTR rpc_str = nullptr;
  auto status = UuidToStringA(&uuid, &rpc_str);
  assert(status == RPC_S_OK);
  if (status != RPC_S_OK || rpc_str == nullptr) {
    // With asserts compiled out, do not touch the unset string.
    return result;
  }

  result = reinterpret_cast<char*>(rpc_str);

  status = RpcStringFreeA(&rpc_str);
  assert(status == RPC_S_OK);
  (void)status;  // only read by the assert in NDEBUG builds

  return result;
}

S
sdong 已提交
1960 1961 1962 1963
// We choose to create this on the heap and to use std::call_once for the
// following reasons:
// 1) The currently available MS compiler does not implement atomic C++11
//    initialization of function-local statics.
// 2) We choose not to destroy the env, because joining the threads from the
//    system loader, which destroys the statics (same as from DllMain),
//    creates a system loader deadlock. In this manner any remaining threads
//    are terminated OK.
namespace {

// Ensures the default Env is constructed only once (see Env::Default()).
std::once_flag winenv_once_flag;

// The process-wide default Env; assigned by Env::Default() on first use.
Env* envptr = nullptr;

}  // namespace

// Returns the process-wide default Env, creating a heap-allocated WinEnv on
// the first call. std::call_once keeps the construction to a single run.
Env* Env::Default() {
  const auto create_env = []() { envptr = new WinEnv(); };
  std::call_once(winenv_once_flag, create_env);
  return envptr;
}

}  // namespace rocksdb