More changes from upstream.

git-svn-id: https://leveldb.googlecode.com/svn/trunk@12 62dab493-f737-651d-591e-8d6aee1b9529

More changes from upstream.
git-svn-id: https://leveldb.googlecode.com/svn/trunk@12 62dab493-f737-651d-591e-8d6aee1b9529
13b72af7 · jorlow@chromium.org · 9df5aa89 · 13b72af7 · 13b72af7 · 13b72af7
9 changed files
--- a/db/db_bench.cc
+++ b/db/db_bench.cc
@@ -17,11 +17,14 @@

 // Comma-separated list of operations to run in the specified order
 //   Actual benchmarks:
-//      writeseq    -- write N values in sequential key order
-//      writerandom -- write N values in random key order
-//      writebig    -- write N/1000 100K valuesin random order
-//      readseq     -- read N values sequentially
-//      readrandom  -- read N values in random order
+//      fillseq       -- write N values in sequential key order in async mode
+//      fillrandom    -- write N values in random key order in async mode
+//      overwrite     -- overwrite N values in random key order in async mode
+//      fillsync      -- write N/100 values in random key order in sync mode
+//      fill100K      -- write N/1000 100K values in random order in async mode
+//      readseq       -- read N values sequentially
+//      readreverse   -- read N values in reverse order
+//      readrandom    -- read N values in random order
 //   Meta operations:
 //      compact     -- Compact the entire DB
 //      heapprofile -- Dump a heap profile (if supported by this port)
@@ -30,10 +33,10 @@
 //      tenth       -- divide N by 10 (i.e., following benchmarks are smaller)
 //      normal      -- reset N back to its normal value (1000000)
 static const char* FLAGS_benchmarks =
-    "writeseq,"
-    "writeseq,"
-    "writerandom,"
-    "sync,tenth,tenth,writerandom,nosync,normal,"
+    "fillseq,"
+    "fillrandom,"
+    "overwrite,"
+    "fillsync,"
    "readseq,"
    "readreverse,"
    "readrandom,"
@@ -41,7 +44,7 @@ static const char* FLAGS_benchmarks =
    "readseq,"
    "readreverse,"
    "readrandom,"
-    "writebig";
+    "fill100K";

 // Number of key/values to place in database
 static int FLAGS_num = 1000000;
@@ -51,7 +54,7 @@ static int FLAGS_value_size = 100;

 // Arrange to generate values that shrink to this fraction of
 // their original size after compression
-static double FLAGS_compression_ratio = 0.25;
+static double FLAGS_compression_ratio = 0.5;

 // Print histogram of operation timings
 static bool FLAGS_histogram = false;
@@ -93,6 +96,19 @@ class RandomGenerator {
    return Slice(data_.data() + pos_ - len, len);
  }
 };
+
+// Returns the portion of "s" that remains after dropping leading and
+// trailing whitespace (as classified by isspace()).  The result points
+// into the same underlying bytes as "s"; nothing is copied.
+static Slice TrimSpace(Slice s) {
+  // isspace() is only defined for values representable as unsigned char
+  // (or EOF), so cast before classifying to avoid undefined behavior on
+  // bytes >= 0x80 when char is signed.
+  size_t start = 0;
+  while (start < s.size() &&
+         isspace(static_cast<unsigned char>(s[start]))) {
+    start++;
+  }
+  size_t limit = s.size();
+  while (limit > start &&
+         isspace(static_cast<unsigned char>(s[limit-1]))) {
+    limit--;
+  }
+  return Slice(s.data() + start, limit - start);
+}
+
 }

 class Benchmark {
@@ -100,7 +116,6 @@ class Benchmark {
  Cache* cache_;
  DB* db_;
  int num_;
-  bool sync_;
  int heap_counter_;
  double start_;
  double last_op_finish_;
@@ -114,6 +129,70 @@ class Benchmark {
  int done_;
  int next_report_;     // When to report next

+  // Prints the benchmark banner: environment information, key/value
+  // sizes, entry count, estimated raw and on-disk sizes, and any
+  // build-configuration warnings, followed by a separator line.
+  void PrintHeader() {
+    const int kKeySize = 16;
+    PrintEnvironment();
+    fprintf(stdout, "Keys:       %d bytes each\n", kKeySize);
+    fprintf(stdout, "Values:     %d bytes each (%d bytes after compression)\n",
+            FLAGS_value_size,
+            static_cast<int>(FLAGS_value_size * FLAGS_compression_ratio + 0.5));
+    fprintf(stdout, "Entries:    %d\n", num_);
+    // Do the multiplication in floating point: the product of the
+    // per-entry size and num_ can exceed the range of int.
+    fprintf(stdout, "RawSize:    %.1f MB (estimated)\n",
+            ((static_cast<double>(kKeySize + FLAGS_value_size) * num_)
+             / 1048576.0));
+    fprintf(stdout, "FileSize:   %.1f MB (estimated)\n",
+            (((kKeySize + FLAGS_value_size * FLAGS_compression_ratio) * num_)
+             / 1048576.0));
+    PrintWarnings();
+    fprintf(stdout, "------------------------------------------------\n");
+  }
+
+  // Writes a warning line to stdout for each build setting that makes
+  // the measured numbers slower than necessary: a GCC-compatible build
+  // without __OPTIMIZE__, and a build without NDEBUG.
+  void PrintWarnings() {
+#if defined(__GNUC__) && !defined(__OPTIMIZE__)
+    fputs("WARNING: Optimization is disabled: benchmarks unnecessarily slow\n",
+          stdout);
+#endif
+#ifndef NDEBUG
+    fputs("WARNING: Assertions are enabled; benchmarks unnecessarily slow\n",
+          stdout);
+#endif
+  }
+
+  // Writes a description of the test environment to stderr: the LevelDB
+  // version and, when built with __linux defined, the current date plus
+  // the CPU count/model and cache size parsed from /proc/cpuinfo.
+  void PrintEnvironment() {
+    fprintf(stderr, "LevelDB:    version %d.%d\n",
+            kMajorVersion, kMinorVersion);
+
+#if defined(__linux)
+    time_t now = time(NULL);
+    fprintf(stderr, "Date:       %s", ctime(&now));  // ctime() adds newline
+
+    FILE* cpuinfo = fopen("/proc/cpuinfo", "r");
+    if (cpuinfo != NULL) {
+      char line[1000];
+      int num_cpus = 0;
+      std::string cpu_type;
+      std::string cache_size;
+      while (fgets(line, sizeof(line), cpuinfo) != NULL) {
+        const char* sep = strchr(line, ':');
+        if (sep == NULL) {
+          continue;
+        }
+        // Take everything before the ':' and let TrimSpace() strip the
+        // padding.  Using (sep - line) keeps the length non-negative even
+        // when ':' is the first character of the line, and does not chop
+        // a key character when no whitespace precedes the ':'.
+        Slice key = TrimSpace(Slice(line, sep - line));
+        Slice val = TrimSpace(Slice(sep + 1));
+        if (key == "model name") {
+          // One "model name" entry is counted as one CPU.
+          ++num_cpus;
+          cpu_type = val.ToString();
+        } else if (key == "cache size") {
+          cache_size = val.ToString();
+        }
+      }
+      fclose(cpuinfo);
+      fprintf(stderr, "CPU:        %d * %s\n", num_cpus, cpu_type.c_str());
+      fprintf(stderr, "CPUCache:   %s\n", cache_size.c_str());
+    }
+#endif
+  }
+
  void Start() {
    start_ = Env::Default()->NowMicros() * 1e-6;
    bytes_ = 0;
@@ -164,9 +243,10 @@ class Benchmark {
      snprintf(rate, sizeof(rate), "%5.1f MB/s",
               (bytes_ / 1048576.0) / (finish - start_));
      if (!message_.empty()) {
-        message_.push_back(' ');
+        message_  = std::string(rate) + " " + message_;
+      } else {
+        message_ = rate;
      }
-      message_.append(rate);
    }

    fprintf(stdout, "%-12s : %11.3f micros/op;%s%s\n",
@@ -183,14 +263,16 @@ class Benchmark {
 public:
  enum Order {
    SEQUENTIAL,
-    REVERSE,  // Currently only supported for reads
    RANDOM
  };
+  enum DBState {
+    FRESH,
+    EXISTING
+  };

  Benchmark() : cache_(NewLRUCache(200<<20)),
                db_(NULL),
                num_(FLAGS_num),
-                sync_(false),
                heap_counter_(0),
                bytes_(0),
                rand_(301) {
@@ -210,19 +292,8 @@ class Benchmark {
  }

  void Run() {
-    Options options;
-    options.create_if_missing = true;
-    options.max_open_files = 10000;
-    options.block_cache = cache_;
-    options.write_buffer_size = FLAGS_write_buffer_size;
-
-    Start();
-    Status s = DB::Open(options, "/tmp/dbbench", &db_);
-    Stop("open");
-    if (!s.ok()) {
-      fprintf(stderr, "open error: %s\n", s.ToString().c_str());
-      exit(1);
-    }
+    PrintHeader();
+    Open();

    const char* benchmarks = FLAGS_benchmarks;
    while (benchmarks != NULL) {
@@ -237,30 +308,30 @@ class Benchmark {
      }

      Start();
-      if (name == Slice("writeseq")) {
-        Write(SEQUENTIAL, num_, FLAGS_value_size);
-      } else if (name == Slice("writerandom")) {
-        Write(RANDOM, num_, FLAGS_value_size);
-      } else if (name == Slice("writebig")) {
-        Write(RANDOM, num_ / 1000, 100 * 1000);
+
+      WriteOptions write_options;
+      write_options.sync = false;
+      if (name == Slice("fillseq")) {
+        Write(write_options, SEQUENTIAL, FRESH, num_, FLAGS_value_size);
+      } else if (name == Slice("fillrandom")) {
+        Write(write_options, RANDOM, FRESH, num_, FLAGS_value_size);
+      } else if (name == Slice("overwrite")) {
+        Write(write_options, RANDOM, EXISTING, num_, FLAGS_value_size);
+      } else if (name == Slice("fillsync")) {
+        write_options.sync = true;
+        Write(write_options, RANDOM, FRESH, num_ / 100, FLAGS_value_size);
+      } else if (name == Slice("fill100K")) {
+        Write(write_options, RANDOM, FRESH, num_ / 1000, 100 * 1000);
      } else if (name == Slice("readseq")) {
-        Read(SEQUENTIAL);
+        ReadSequential();
      } else if (name == Slice("readreverse")) {
-        Read(REVERSE);
+        ReadReverse();
      } else if (name == Slice("readrandom")) {
-        Read(RANDOM);
+        ReadRandom();
      } else if (name == Slice("compact")) {
        Compact();
      } else if (name == Slice("heapprofile")) {
        HeapProfile();
-      } else if (name == Slice("sync")) {
-        sync_ = true;
-      } else if (name == Slice("nosync")) {
-        sync_ = false;
-      } else if (name == Slice("tenth")) {
-        num_ = num_ / 10;
-      } else if (name == Slice("normal")) {
-        num_ = FLAGS_num;
      } else {
        fprintf(stderr, "unknown benchmark '%s'\n", name.ToString().c_str());
      }
@@ -268,16 +339,44 @@ class Benchmark {
    }
  }

-  void Write(Order order, int num_entries, int value_size) {
+ private:
+  // Opens the benchmark database at /tmp/dbbench (creating it if it is
+  // missing) and stores the handle in db_.  Prints the error and exits
+  // the process if the open fails.
+  // REQUIRES: db_ == NULL
+  void Open() {
+    assert(db_ == NULL);
+
+    Options opts;
+    opts.write_buffer_size = FLAGS_write_buffer_size;
+    opts.block_cache = cache_;
+    opts.max_open_files = 10000;
+    opts.create_if_missing = true;
+
+    Status open_status = DB::Open(opts, "/tmp/dbbench", &db_);
+    if (open_status.ok()) {
+      return;
+    }
+    fprintf(stderr, "open error: %s\n", open_status.ToString().c_str());
+    exit(1);
+  }
+
+  void Write(const WriteOptions& options, Order order, DBState state,
+             int num_entries, int value_size) {
+    if (state == FRESH) {
+      delete db_;
+      db_ = NULL;
+      DestroyDB("/tmp/dbbench", Options());
+      Open();
+      Start();  // Do not count time taken to destroy/open
+    }
+
+    if (num_entries != num_) {
+      char msg[100];
+      snprintf(msg, sizeof(msg), "(%d ops)", num_entries);
+      message_ = msg;
+    }
+
    WriteBatch batch;
    Status s;
    std::string val;
-    WriteOptions options;
-    options.sync = sync_;
    for (int i = 0; i < num_entries; i++) {
      const int k = (order == SEQUENTIAL) ? i : (rand_.Next() % FLAGS_num);
      char key[100];
-      snprintf(key, sizeof(key), "%012d", k);
+      snprintf(key, sizeof(key), "%016d", k);
      batch.Clear();
      batch.Put(key, gen_.Generate(value_size));
      s = db_->Write(options, &batch);
@@ -290,42 +389,37 @@ class Benchmark {
    }
  }

-  void Read(Order order) {
+  // Walks the database forward from the first entry, visiting at most
+  // num_ entries, adding each key/value size to bytes_ and recording one
+  // finished operation per entry.
+  void ReadSequential() {
+    Iterator* iter = db_->NewIterator(ReadOptions());
+    iter->SeekToFirst();
+    int visited = 0;
+    while (visited < num_ && iter->Valid()) {
+      bytes_ += iter->key().size() + iter->value().size();
+      FinishedSingleOp();
+      ++visited;
+      iter->Next();
+    }
+    delete iter;
+  }
+
+  // Walks the database backward from the last entry, visiting at most
+  // num_ entries, adding each key/value size to bytes_ and recording one
+  // finished operation per entry.
+  void ReadReverse() {
+    Iterator* iter = db_->NewIterator(ReadOptions());
+    iter->SeekToLast();
+    int visited = 0;
+    while (visited < num_ && iter->Valid()) {
+      bytes_ += iter->key().size() + iter->value().size();
+      FinishedSingleOp();
+      ++visited;
+      iter->Prev();
+    }
+    delete iter;
+  }
+
+  void ReadRandom() {
    ReadOptions options;
-    switch (order) {
-      case SEQUENTIAL: {
-        Iterator* iter = db_->NewIterator(options);
-        int i = 0;
-        for (iter->SeekToFirst(); i < num_ && iter->Valid(); iter->Next()) {
-          bytes_ += iter->key().size() + iter->value().size();
-          FinishedSingleOp();
-          ++i;
-        }
-        delete iter;
-        break;
-      }
-      case REVERSE: {
-        Iterator* iter = db_->NewIterator(options);
-        int i = 0;
-        for (iter->SeekToLast(); i < num_ && iter->Valid(); iter->Prev()) {
-          bytes_ += iter->key().size() + iter->value().size();
-          FinishedSingleOp();
-          ++i;
-        }
-        delete iter;
-        break;
-      }
-      case RANDOM: {
-        std::string value;
-        for (int i = 0; i < num_; i++) {
-          char key[100];
-          const int k = (order == SEQUENTIAL) ? i : (rand_.Next() % FLAGS_num);
-          snprintf(key, sizeof(key), "%012d", k);
-          db_->Get(options, key, &value);
-          FinishedSingleOp();
-        }
-        break;
-      }
+    std::string value;
+    for (int i = 0; i < num_; i++) {
+      char key[100];
+      const int k = rand_.Next() % FLAGS_num;
+      snprintf(key, sizeof(key), "%016d", k);
+      db_->Get(options, key, &value);
+      FinishedSingleOp();
    }
  }


--- a/db/db_impl.cc
+++ b/db/db_impl.cc
@@ -532,8 +532,9 @@ void DBImpl::BackgroundCompaction() {
  }

  Status status;
-  if (c->num_input_files(0) == 1 && c->num_input_files(1) == 0) {
+  if (c->IsTrivialMove()) {
    // Move file to next level
+    assert(c->num_input_files(0) == 1);
    FileMetaData* f = c->input(0, 0);
    c->edit()->DeleteFile(c->level(), f->number);
    c->edit()->AddFile(c->level() + 1, f->number, f->file_size,
@@ -718,8 +719,18 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) {
  bool has_current_user_key = false;
  SequenceNumber last_sequence_for_key = kMaxSequenceNumber;
  for (; input->Valid() && !shutting_down_.Acquire_Load(); ) {
-    // Handle key/value, add to state, etc.
    Slice key = input->key();
+    InternalKey tmp_internal_key;
+    tmp_internal_key.DecodeFrom(key);
+    if (compact->compaction->ShouldStopBefore(tmp_internal_key) &&
+        compact->builder != NULL) {
+      status = FinishCompactionOutputFile(compact, input);
+      if (!status.ok()) {
+        break;
+      }
+    }
+
+    // Handle key/value, add to state, etc.
    bool drop = false;
    if (!ParseInternalKey(key, &ikey)) {
      // Do not hide error keys
@@ -855,6 +866,11 @@ Iterator* DBImpl::TEST_NewInternalIterator() {
  return NewInternalIterator(ReadOptions(), &ignored);
 }

+// Test hook: acquires mutex_ and returns whatever
+// versions_->MaxNextLevelOverlappingBytes() reports.
+int64 DBImpl::TEST_MaxNextLevelOverlappingBytes() {
+  MutexLock guard(&mutex_);
+  const int64 max_overlap = versions_->MaxNextLevelOverlappingBytes();
+  return max_overlap;
+}
+
 Status DBImpl::Get(const ReadOptions& options,
                   const Slice& key,
                   std::string* value) {

--- a/db/db_impl.h
+++ b/db/db_impl.h
@@ -55,6 +55,10 @@ class DBImpl : public DB {
  // The returned iterator should be deleted when no longer needed.
  Iterator* TEST_NewInternalIterator();

+  // Return the maximum overlapping data (in bytes) at next level for any
+  // file at a level >= 1.
+  int64 TEST_MaxNextLevelOverlappingBytes();
+
 private:
  friend class DB;


--- a/db/db_test.cc
+++ b/db/db_test.cc
@@ -72,15 +72,19 @@ class DBTest {
  }

  Status Put(const std::string& k, const std::string& v) {
+    WriteOptions options;
+    options.sync = false;
    WriteBatch batch;
    batch.Put(k, v);
-    return db_->Write(WriteOptions(), &batch);
+    return db_->Write(options, &batch);
  }

  Status Delete(const std::string& k) {
+    WriteOptions options;
+    options.sync = false;
    WriteBatch batch;
    batch.Delete(k);
-    return db_->Write(WriteOptions(), &batch);
+    return db_->Write(options, &batch);
  }

  std::string Get(const std::string& k, const Snapshot* snapshot = NULL) {
@@ -176,6 +180,35 @@ class DBTest {
    fprintf(stderr, "Found %d live large value files\n", (int)live.size());
    return live;
  }
+
+  // Flushes the memtable and then compacts the key range [start, limit]
+  // at every level from 0 up to (but not including) the highest-numbered
+  // level that reports at least one file.
+  void Compact(const Slice& start, const Slice& limit) {
+    dbfull()->TEST_CompactMemTable();
+    // Find the highest-numbered level that currently holds files; levels
+    // at or beyond it have nothing below them to be pushed into.
+    int max_level_with_files = 1;
+    for (int level = 1; level < config::kNumLevels; level++) {
+      uint64_t v;
+      char name[100];
+      snprintf(name, sizeof(name), "leveldb.num-files-at-level%d", level);
+      if (dbfull()->GetProperty(name, &v) && v > 0) {
+        max_level_with_files = level;
+      }
+    }
+    // Honor the caller-supplied range instead of a hard-coded "" .. "~".
+    const std::string begin = start.ToString();
+    const std::string end = limit.ToString();
+    for (int level = 0; level < max_level_with_files; level++) {
+      dbfull()->TEST_CompactRange(level, begin, end);
+    }
+  }
+
+  // Debugging aid: prints "label", the current maximum next-level
+  // overlap in bytes, and the table-file count of every non-empty level
+  // to stderr.
+  void DumpFileCounts(const char* label) {
+    fprintf(stderr, "---\n%s:\n", label);
+    const long long max_overlap =
+        static_cast<long long>(dbfull()->TEST_MaxNextLevelOverlappingBytes());
+    fprintf(stderr, "maxoverlap: %lld\n", max_overlap);
+    for (int level = 0; level < config::kNumLevels; level++) {
+      const int file_count = NumTableFilesAtLevel(level);
+      if (file_count <= 0) {
+        continue;  // Only report levels that hold files
+      }
+      fprintf(stderr, "  level %3d : %d files\n", level, file_count);
+    }
+  }
 };

 TEST(DBTest, Empty) {
@@ -315,6 +348,43 @@ TEST(DBTest, CompactionsGenerateMultipleFiles) {
  }
 }

+TEST(DBTest, SparseMerge) {
+  Options options;
+  options.compression = kNoCompression;
+  Reopen(&options);
+
+  // Scenario under test:
+  //    a little data under prefix A
+  //    a lot of data under prefix B
+  //    a little data under prefix C
+  // followed by small recent updates touching all three prefixes.
+  // No compaction should end up merging all of B in one shot.
+  const int kNumBKeys = 100000;
+  const int kMaxOverlapBytes = 20*1048576;
+  const std::string value(1000, 'x');
+
+  Put("A", "va");
+  // kNumBKeys values of 1000 bytes each: roughly 100MB of "B" data
+  for (int n = 0; n < kNumBKeys; n++) {
+    char key[100];
+    snprintf(key, sizeof(key), "B%010d", n);
+    Put(key, value);
+  }
+  Put("C", "vc");
+  Compact("", "z");
+
+  // Sparse update: one small write per prefix
+  Put("A",    "va2");
+  Put("B100", "bvalue2");
+  Put("C",    "vc2");
+  dbfull()->TEST_CompactMemTable();
+
+  // At no point may a file overlap too much data at the next level:
+  // check after the memtable flush and after compacting levels 0 and 1.
+  ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), kMaxOverlapBytes);
+  dbfull()->TEST_CompactRange(0, "", "z");
+  ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), kMaxOverlapBytes);
+  dbfull()->TEST_CompactRange(1, "", "z");
+  ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), kMaxOverlapBytes);
+}
+
 static bool Between(uint64_t val, uint64_t low, uint64_t high) {
  bool result = (val >= low) && (val <= high);
  if (!result) {

--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -20,6 +20,10 @@

 namespace leveldb {

+// Maximum number of overlaps in grandparent (i.e., level+2) before we
+// stop building a single file in a level->level+1 compaction.
+static const int kMaxGrandParentFiles = 10;
+
 static double MaxBytesForLevel(int level) {
  if (level == 0) {
    return 4 * 1048576.0;
@@ -509,7 +513,7 @@ Status VersionSet::Finalize(Version* v) {
  double best_score = -1;

  Status s;
-  for (int level = 0; s.ok() && level < config::kNumLevels; level++) {
+  for (int level = 0; s.ok() && level < config::kNumLevels-1; level++) {
    s = SortLevel(v, level);

    // Compute the ratio of current size to size limit.
@@ -751,6 +755,25 @@ void VersionSet::AddLiveFiles(std::set<uint64_t>* live) {
  }
 }

+// Returns the largest total size (in bytes) of next-level files that
+// overlap the key range of any single file at a level >= 1 in the
+// current version.  Returns 0 if no such file has any overlap.
+int64 VersionSet::MaxNextLevelOverlappingBytes() {
+  int64 result = 0;
+  std::vector<FileMetaData*> overlaps;
+  // Start at level 1, matching the contract documented in the header
+  // ("any file at a level >= 1"); level-0 files are not considered.
+  for (int level = 1; level < config::kNumLevels - 1; level++) {
+    const std::vector<FileMetaData*>& files = current_->files_[level];
+    for (size_t i = 0; i < files.size(); i++) {
+      const FileMetaData* f = files[i];
+      // NOTE(review): relies on GetOverlappingInputs() replacing the
+      // previous contents of "overlaps" on each call -- confirm.
+      GetOverlappingInputs(level+1, f->smallest, f->largest, &overlaps);
+      int64 sum = 0;
+      for (size_t j = 0; j < overlaps.size(); j++) {
+        sum += overlaps[j]->file_size;
+      }
+      if (sum > result) {
+        result = sum;
+      }
+    }
+  }
+  return result;
+}
+
 // Store in "*inputs" all files in "level" that overlap [begin,end]
 void VersionSet::GetOverlappingInputs(
    int level,
@@ -797,6 +820,18 @@ void VersionSet::GetRange(const std::vector<FileMetaData*>& inputs,
  }
 }

+// Computes the minimal key range that spans every file in inputs1 and
+// inputs2, storing the bounds in *smallest and *largest.
+// REQUIRES: the combined inputs are not empty
+void VersionSet::GetRange2(const std::vector<FileMetaData*>& inputs1,
+                           const std::vector<FileMetaData*>& inputs2,
+                           InternalKey* smallest,
+                           InternalKey* largest) {
+  // Concatenate both input lists and delegate to GetRange().
+  std::vector<FileMetaData*> merged;
+  merged.reserve(inputs1.size() + inputs2.size());
+  merged.insert(merged.end(), inputs1.begin(), inputs1.end());
+  merged.insert(merged.end(), inputs2.begin(), inputs2.end());
+  GetRange(merged, smallest, largest);
+}
+
 Iterator* VersionSet::MakeInputIterator(Compaction* c) {
  ReadOptions options;
  options.verify_checksums = options_->paranoid_checks;
@@ -836,6 +871,7 @@ Compaction* VersionSet::PickCompaction() {
  }
  const int level = current_->compaction_level_;
  assert(level >= 0);
+  assert(level+1 < config::kNumLevels);

  Compaction* c = new Compaction(level);
  c->input_version_ = current_;
@@ -855,31 +891,36 @@ Compaction* VersionSet::PickCompaction() {
    c->inputs_[0].push_back(current_->files_[level][0]);
  }

-  // Find the range we are compacting
-  InternalKey smallest, largest;
-  GetRange(c->inputs_[0], &smallest, &largest);
-
  // Files in level 0 may overlap each other, so pick up all overlapping ones
  if (level == 0) {
+    InternalKey smallest, largest;
+    GetRange(c->inputs_[0], &smallest, &largest);
    // Note that the next call will discard the file we placed in
    // c->inputs_[0] earlier and replace it with an overlapping set
    // which will include the picked file.
    GetOverlappingInputs(0, smallest, largest, &c->inputs_[0]);
    assert(!c->inputs_[0].empty());
-    GetRange(c->inputs_[0], &smallest, &largest);
  }

+  SetupOtherInputs(c);
+
+  return c;
+}
+
+void VersionSet::SetupOtherInputs(Compaction* c) {
+  const int level = c->level();
+  InternalKey smallest, largest;
+  GetRange(c->inputs_[0], &smallest, &largest);
+
  GetOverlappingInputs(level+1, smallest, largest, &c->inputs_[1]);

+  // Get entire range covered by compaction
+  InternalKey all_start, all_limit;
+  GetRange2(c->inputs_[0], c->inputs_[1], &all_start, &all_limit);
+
  // See if we can grow the number of inputs in "level" without
  // changing the number of "level+1" files we pick up.
  if (!c->inputs_[1].empty()) {
-    // Get entire range covered by compaction
-    std::vector<FileMetaData*> all = c->inputs_[0];
-    all.insert(all.end(), c->inputs_[1].begin(), c->inputs_[1].end());
-    InternalKey all_start, all_limit;
-    GetRange(all, &all_start, &all_limit);
-
    std::vector<FileMetaData*> expanded0;
    GetOverlappingInputs(level, all_start, all_limit, &expanded0);
    if (expanded0.size() > c->inputs_[0].size()) {
@@ -899,10 +940,17 @@ Compaction* VersionSet::PickCompaction() {
        largest = new_limit;
        c->inputs_[0] = expanded0;
        c->inputs_[1] = expanded1;
+        GetRange2(c->inputs_[0], c->inputs_[1], &all_start, &all_limit);
      }
    }
  }

+  // Compute the set of grandparent files that overlap this compaction
+  // (parent == level+1; grandparent == level+2)
+  if (level + 2 < config::kNumLevels) {
+    GetOverlappingInputs(level + 2, all_start, all_limit, &c->grandparents_);
+  }
+
  if (false) {
    Log(env_, options_->info_log, "Compacting %d '%s' .. '%s'",
        level,
@@ -916,8 +964,6 @@ Compaction* VersionSet::PickCompaction() {
  // key range next time.
  compact_pointer_[level] = largest.Encode().ToString();
  c->edit_.SetCompactPointer(level, largest);
-
-  return c;
 }

 Compaction* VersionSet::CompactRange(
@@ -934,25 +980,16 @@ Compaction* VersionSet::CompactRange(
  c->input_version_ = current_;
  c->input_version_->Ref();
  c->inputs_[0] = inputs;
-
-  // Find the range we are compacting
-  InternalKey smallest, largest;
-  GetRange(c->inputs_[0], &smallest, &largest);
-
-  GetOverlappingInputs(level+1, smallest, largest, &c->inputs_[1]);
-  if (false) {
-    Log(env_, options_->info_log, "Compacting %d '%s' .. '%s'",
-        level,
-        EscapeString(smallest.Encode()).c_str(),
-        EscapeString(largest.Encode()).c_str());
-  }
+  SetupOtherInputs(c);
  return c;
 }

 Compaction::Compaction(int level)
    : level_(level),
      max_output_file_size_(MaxFileSizeForLevel(level)),
-      input_version_(NULL) {
+      input_version_(NULL),
+      grandparent_index_(0),
+      output_start_(-1) {
  for (int i = 0; i < config::kNumLevels; i++) {
    level_ptrs_[i] = 0;
  }
@@ -964,6 +1001,15 @@ Compaction::~Compaction() {
  }
 }

+bool Compaction::IsTrivialMove() const {
+  // A move needs exactly one input file at this level and nothing to
+  // merge with at the next level.
+  if (num_input_files(0) != 1) {
+    return false;
+  }
+  if (num_input_files(1) != 0) {
+    return false;
+  }
+  // Refuse the move when many grandparent files overlap: it would
+  // create a parent file that needs a very expensive merge later on.
+  return grandparents_.size() <= kMaxGrandParentFiles;
+}
+
 void Compaction::AddInputDeletions(VersionEdit* edit) {
  for (int which = 0; which < 2; which++) {
    for (int i = 0; i < inputs_[which].size(); i++) {
@@ -993,6 +1039,28 @@ bool Compaction::IsBaseLevelForKey(const Slice& user_key) {
  return true;
 }

+bool Compaction::ShouldStopBefore(const InternalKey& key) {
+  const InternalKeyComparator& icmp = input_version_->vset_->icmp_;
+
+  // Advance past every grandparent file whose largest key sorts before
+  // "key"; afterwards grandparent_index_ is the earliest grandparent
+  // file that can contain "key" (or grandparents_.size() if none can).
+  for (; grandparent_index_ < grandparents_.size(); grandparent_index_++) {
+    if (icmp.Compare(key, grandparents_[grandparent_index_]->largest) <= 0) {
+      break;
+    }
+  }
+
+  // On the first call, the current output starts at this grandparent.
+  if (output_start_ < 0) {
+    output_start_ = grandparent_index_;
+  }
+
+  const int files_spanned = grandparent_index_ - output_start_ + 1;
+  if (files_spanned <= kMaxGrandParentFiles) {
+    return false;
+  }
+
+  // Too many overlaps for the current output; the next output starts here.
+  output_start_ = grandparent_index_;
+  return true;
+}
+
 void Compaction::ReleaseInputs() {
  if (input_version_ != NULL) {
    input_version_->Unref();

--- a/db/version_set.h
+++ b/db/version_set.h
@@ -139,6 +139,10 @@ class VersionSet {
      const InternalKey& begin,
      const InternalKey& end);

+  // Return the maximum overlapping data (in bytes) at next level for any
+  // file at a level >= 1.
+  int64 MaxNextLevelOverlappingBytes();
+
  // Create an iterator that reads over the compaction inputs for "*c".
  // The caller should delete the iterator when no longer needed.
  Iterator* MakeInputIterator(Compaction* c);
@@ -195,6 +199,13 @@ class VersionSet {
                InternalKey* smallest,
                InternalKey* largest);

+  void GetRange2(const std::vector<FileMetaData*>& inputs1,
+                 const std::vector<FileMetaData*>& inputs2,
+                 InternalKey* smallest,
+                 InternalKey* largest);
+
+  void SetupOtherInputs(Compaction* c);
+
  Env* const env_;
  const std::string dbname_;
  const Options* const options_;
@@ -250,6 +261,10 @@ class Compaction {
  // Maximum size of files to build during this compaction.
  uint64_t MaxOutputFileSize() const { return max_output_file_size_; }

+  // Is this a trivial compaction that can be implemented by just
+  // moving a single input file to the next level (no merging or splitting)
+  bool IsTrivialMove() const;
+
  // Add all inputs to this compaction as delete operations to *edit.
  void AddInputDeletions(VersionEdit* edit);

@@ -258,6 +273,10 @@ class Compaction {
  // in levels greater than "level+1".
  bool IsBaseLevelForKey(const Slice& user_key);

+  // Returns true iff we should stop building the current output
+  // before processing "key".
+  bool ShouldStopBefore(const InternalKey& key);
+
  // Release the input version for the compaction, once the compaction
  // is successful.
  void ReleaseInputs();
@@ -276,6 +295,12 @@ class Compaction {
  // Each compaction reads inputs from "level_" and "level_+1"
  std::vector<FileMetaData*> inputs_[2];      // The two sets of inputs

+  // State used to check for number of overlapping grandparent files
+  // (parent == level_ + 1, grandparent == level_ + 2)
+  std::vector<FileMetaData*> grandparents_;
+  int grandparent_index_;   // Index in grandparents_
+  int output_start_;        // Index in grandparents_ where output started
+
  // State for implementing IsBaseLevelForKey

  // level_ptrs_ holds indices into input_version_->levels_: our state

--- a/doc/impl.html
+++ b/doc/impl.html
@@ -123,8 +123,14 @@ one level-0 file in case some of these files overlap each other.
 A compaction merges the contents of the picked files to produce a
 sequence of level-(L+1) files.  We switch to producing a new
 level-(L+1) file after the current output file has reached the target
-file size (2MB).  The old files are discarded and the new files are
-added to the serving state.
+file size (2MB).  We also switch to a new output file when the key
+range of the current output file has grown enough to overlap more than
+ten level-(L+2) files.  This last rule ensures that a later compaction
+of a level-(L+1) file will not pick up too much data from level-(L+2).
+
+<p>
+The old files are discarded and the new files are added to the serving
+state.

 <p>
 Compactions for a particular level rotate through the key space.  In

--- a/include/db.h
+++ b/include/db.h
@@ -12,6 +12,9 @@

 namespace leveldb {

+static const int kMajorVersion = 1;
+static const int kMinorVersion = 0;
+
 struct Options;
 struct ReadOptions;
 struct WriteOptions;

--- a/leveldb.gyp
+++ b/leveldb.gyp
@@ -4,7 +4,7 @@

 {
  'variables': {
-    'use_snappy%': 1,
+    'use_snappy%': 0,
  },
  'target_defaults': {
    'defines': [