diff --git a/.clang-format b/.clang-format index a1e9a48e4075b2968ca56267339e19f098fb485a..7c279811ac1c22459fd2f0c5d07e641e422739fa 100644 --- a/.clang-format +++ b/.clang-format @@ -2,46 +2,4 @@ # http://clang.llvm.org/docs/ClangFormatStyleOptions.html --- BasedOnStyle: Google -AccessModifierOffset: -1 -ConstructorInitializerIndentWidth: 4 -AlignEscapedNewlinesLeft: true -AlignTrailingComments: true -AllowAllParametersOfDeclarationOnNextLine: true -AllowShortIfStatementsOnASingleLine: false -AllowShortLoopsOnASingleLine: false -AlwaysBreakTemplateDeclarations: true -AlwaysBreakBeforeMultilineStrings: true -BreakBeforeBinaryOperators: false -BreakConstructorInitializersBeforeComma: false -BinPackParameters: false -ColumnLimit: 80 -ConstructorInitializerAllOnOneLineOrOnePerLine: true -DerivePointerBinding: true -ExperimentalAutoDetectBinPacking: true -IndentCaseLabels: false -MaxEmptyLinesToKeep: 1 -NamespaceIndentation: None -ObjCSpaceBeforeProtocolList: false -PenaltyBreakBeforeFirstCallParameter: 10 -PenaltyBreakComment: 60 -PenaltyBreakString: 1000 -PenaltyBreakFirstLessLess: 20 -PenaltyExcessCharacter: 1000000 -PenaltyReturnTypeOnItsOwnLine: 200 -PointerBindsToType: true -SpacesBeforeTrailingComments: 2 -Cpp11BracedListStyle: true -Standard: Cpp11 -IndentWidth: 2 -TabWidth: 8 -UseTab: Never -BreakBeforeBraces: Attach -IndentFunctionDeclarationAfterType: false -SpacesInParentheses: false -SpacesInAngles: false -SpaceInEmptyParentheses: false -SpacesInCStyleCastParentheses: false -SpaceAfterControlStatementKeyword: true -SpaceBeforeAssignmentOperators: true -ContinuationIndentWidth: 4 ... diff --git a/Makefile b/Makefile index 1412489c2a5a8faf85279cfc9848c8c7ee710f77..b770ca7b075744844370124b2951f05872554ac7 100644 --- a/Makefile +++ b/Makefile @@ -126,19 +126,22 @@ $(SHARED2): $(SHARED3) ln -fs $(SHARED3) $(SHARED2) endif -$(SHARED3): - $(CXX) $(PLATFORM_SHARED_LDFLAGS)$(SHARED2) $(CXXFLAGS) $(COVERAGEFLAGS) $(PLATFORM_SHARED_CFLAGS) $(SOURCES) -o $@ $(LDFLAGS) +$(SHARED3): $(LIBOBJECTS) + $(CXX) $(PLATFORM_SHARED_LDFLAGS)$(SHARED2) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $(LDFLAGS) $(SOURCES)-o $@ endif # PLATFORM_SHARED_EXT all: $(LIBRARY) $(PROGRAMS) .PHONY: blackbox_crash_test check clean coverage crash_test ldb_tests \ - release tags valgrind_check whitebox_crash_test + release tags valgrind_check whitebox_crash_test format +# Will also generate shared libraries. release: $(MAKE) clean OPT="-DNDEBUG -O2" $(MAKE) -j32 + OPT="-DNDEBUG -O2" $(MAKE) all -j32 + OPT="-DNDEBUG -O2" $(MAKE) $(SHARED) -j32 coverage: $(MAKE) clean @@ -195,6 +198,9 @@ tags: ctags * -R cscope -b `find . -name '*.cc'` `find . -name '*.h'` +format: + build_tools/format-diff.sh + # --------------------------------------------------------------------------- # Unit tests and tools # --------------------------------------------------------------------------- @@ -416,6 +422,12 @@ DEPFILES = $(filter-out util/build_version.d,$(SOURCES:.cc=.d)) depend: $(DEPFILES) +# if the make goal is either "clean" or "format", we shouldn't +# try to import the *.d files. +# TODO(kailiu) The unfamiliarity of Make's conditions leads to the ugly +# working solution. 
ifneq ($(MAKECMDGOALS),clean) +ifneq ($(MAKECMDGOALS),format) -include $(DEPFILES) endif +endif diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform index 87c4c871dcfd2212f9bfb43d1f0e7c05070e1d01..8e83ae497e697c38b8ced4beb2963ce6437d6152 100755 --- a/build_tools/build_detect_platform +++ b/build_tools/build_detect_platform @@ -81,9 +81,9 @@ PLATFORM_CCFLAGS= PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS ${CXXFLAGS}" PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS" PLATFORM_SHARED_EXT="so" -PLATFORM_SHARED_LDFLAGS="${EXEC_LDFLAGS_SHARED} -shared -Wl,-soname -Wl," +PLATFORM_SHARED_LDFLAGS="-shared -Wl,-soname -Wl," PLATFORM_SHARED_CFLAGS="-fPIC" -PLATFORM_SHARED_VERSIONED=true +PLATFORM_SHARED_VERSIONED=false # generic port files (working on all platform by #ifdef) go directly in /port GENERIC_PORT_FILES=`find $ROCKSDB_ROOT/port -name '*.cc' | tr "\n" " "` diff --git a/build_tools/fbcode.gcc481.sh b/build_tools/fbcode.gcc481.sh index ae2bb57da52c1de82d5d608d2c88ba79e511278d..e8c9f090b9c6266490e78e1fde357aef438f4a58 100644 --- a/build_tools/fbcode.gcc481.sh +++ b/build_tools/fbcode.gcc481.sh @@ -60,7 +60,7 @@ AR=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ar RANLIB=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ranlib CFLAGS="-B$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/gold -m64 -mtune=generic" -CFLAGS+=" -nostdlib $LIBGCC_INCLUDE $GLIBC_INCLUDE" +CFLAGS+=" $LIBGCC_INCLUDE $GLIBC_INCLUDE" CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_ATOMIC_PRESENT -DROCKSDB_FALLOCATE_PRESENT" CFLAGS+=" -DSNAPPY -DGFLAGS -DZLIB -DBZIP2" diff --git a/build_tools/format-diff.sh b/build_tools/format-diff.sh new file mode 100755 index 0000000000000000000000000000000000000000..758135c9f8b5d150b14ec525422d7338bbd6b6ba --- /dev/null +++ b/build_tools/format-diff.sh @@ -0,0 +1,83 @@ +#!/bin/bash +set -e +# If clang_format_diff.py command is not specfied, we assume we are able to +# access directly without any path. +if [ -z $CLANG_FORMAT_DIFF ] +then +CLANG_FORMAT_DIFF="clang-format-diff.py" +fi + +# Check clang-format-diff.py +if ! which $CLANG_FORMAT_DIFF &> /dev/null +then + echo "You didn't have clang-format-diff.py available in your computer!" + echo "You can download it by running: " + echo " curl https://fburl.com/clang-format-diff" + exit 128 +fi + +# Check argparse, a library that clang-format-diff.py requires. +python 2>/dev/null << EOF +import argparse +EOF + +if [ "$?" != 0 ] +then + echo "To run clang-format-diff.py, we'll need the library "argparse" to be" + echo "installed. You can try either of the follow ways to install it:" + echo " 1. Manually download argparse: https://pypi.python.org/pypi/argparse" + echo " 2. easy_install argparse (if you have easy_install)" + echo " 3. pip install argparse (if you have pip)" + exit 129 +fi + +# TODO(kailiu) following work is not complete since we still need to figure +# out how to add the modified files done pre-commit hook to git's commit index. +# +# Check if this script has already been added to pre-commit hook. +# Will suggest user to add this script to pre-commit hook if their pre-commit +# is empty. +# PRE_COMMIT_SCRIPT_PATH="`git rev-parse --show-toplevel`/.git/hooks/pre-commit" +# if ! 
ls $PRE_COMMIT_SCRIPT_PATH &> /dev/null +# then +# echo "Would you like to add this script to pre-commit hook, which will do " +# echo -n "the format check for all the affected lines before you check in (y/n):" +# read add_to_hook +# if [ "$add_to_hook" == "y" ] +# then +# ln -s `git rev-parse --show-toplevel`/build_tools/format-diff.sh $PRE_COMMIT_SCRIPT_PATH +# fi +# fi + +# Check the format of recently changed lines, +diffs=$(git diff -U0 HEAD^ | $CLANG_FORMAT_DIFF -p 1) + +if [ -z "$diffs" ] +then + echo "Nothing needs to be reformatted!" + exit 0 +fi + +# Highlight the insertion/deletion from the clang-format-diff.py's output +COLOR_END="\033[0m" +COLOR_RED="\033[0;31m" +COLOR_GREEN="\033[0;32m" + +echo -e "Detect lines that doesn't follow the format rules:\r" +# Add the color to the diff. lines added will be green; lines removed will be red. +echo "$diffs" | + sed -e "s/\(^-.*$\)/`echo -e \"$COLOR_RED\1$COLOR_END\"`/" | + sed -e "s/\(^+.*$\)/`echo -e \"$COLOR_GREEN\1$COLOR_END\"`/" +echo -e "Would you like to fix the format automatically (y/n): \c" + +# Make sure under any mode, we can read user input. +exec < /dev/tty +read to_fix + +if [ "$to_fix" != "y" ] +then + exit 1 +fi + +# Do in-place format adjustment. +git diff -U0 HEAD^ | $CLANG_FORMAT_DIFF -i -p 1 diff --git a/build_tools/regression_build_test.sh b/build_tools/regression_build_test.sh index b0c130e3cf32ca911eb6999956c37ab894b35c41..d38b67c3ce7ded61c5589516f00487103a61066b 100755 --- a/build_tools/regression_build_test.sh +++ b/build_tools/regression_build_test.sh @@ -50,7 +50,7 @@ make release --num=$NUM \ --writes=$NUM \ --cache_size=6442450944 \ - --cache_numshardbits=4 \ + --cache_numshardbits=6 \ --table_cache_numshardbits=4 \ --open_files=55000 \ --statistics=1 \ @@ -68,7 +68,7 @@ make release --num=$NUM \ --writes=$((NUM / 10)) \ --cache_size=6442450944 \ - --cache_numshardbits=4 \ + --cache_numshardbits=6 \ --table_cache_numshardbits=4 \ --open_files=55000 \ --statistics=1 \ @@ -87,7 +87,7 @@ make release --num=$NUM \ --writes=$NUM \ --cache_size=6442450944 \ - --cache_numshardbits=4 \ + --cache_numshardbits=6 \ --table_cache_numshardbits=4 \ --open_files=55000 \ --statistics=1 \ @@ -106,7 +106,7 @@ make release --num=$NUM \ --reads=$((NUM / 5)) \ --cache_size=6442450944 \ - --cache_numshardbits=4 \ + --cache_numshardbits=6 \ --table_cache_numshardbits=4 \ --open_files=55000 \ --disable_seek_compaction=1 \ @@ -126,7 +126,7 @@ make release --num=$NUM \ --reads=$((NUM / 5)) \ --cache_size=104857600 \ - --cache_numshardbits=4 \ + --cache_numshardbits=6 \ --table_cache_numshardbits=4 \ --open_files=55000 \ --disable_seek_compaction=1 \ @@ -147,7 +147,7 @@ make release --reads=$((NUM / 5)) \ --writes=512 \ --cache_size=6442450944 \ - --cache_numshardbits=4 \ + --cache_numshardbits=6 \ --table_cache_numshardbits=4 \ --write_buffer_size=1000000000 \ --open_files=55000 \ @@ -169,7 +169,7 @@ make release --num=$((NUM / 4)) \ --writes=$((NUM / 4)) \ --cache_size=6442450944 \ - --cache_numshardbits=4 \ + --cache_numshardbits=6 \ --table_cache_numshardbits=4 \ --open_files=55000 \ --statistics=1 \ @@ -179,6 +179,25 @@ make release --sync=0 \ --threads=1 > /dev/null +# dummy test just to compact the data +./db_bench \ + --benchmarks=readrandom \ + --db=$DATA_DIR \ + --use_existing_db=1 \ + --bloom_bits=10 \ + --num=$((NUM / 1000)) \ + --reads=$((NUM / 1000)) \ + --cache_size=6442450944 \ + --cache_numshardbits=6 \ + --table_cache_numshardbits=4 \ + --open_files=55000 \ + --statistics=1 \ + --histogram=1 \ + 
--disable_data_sync=1 \ + --disable_wal=1 \ + --sync=0 \ + --threads=16 > /dev/null + # measure readrandom after load with filluniquerandom with 6GB block cache ./db_bench \ --benchmarks=readrandom \ @@ -188,7 +207,7 @@ make release --num=$((NUM / 4)) \ --reads=$((NUM / 4)) \ --cache_size=6442450944 \ - --cache_numshardbits=4 \ + --cache_numshardbits=6 \ --table_cache_numshardbits=4 \ --open_files=55000 \ --disable_seek_compaction=1 \ @@ -200,6 +219,28 @@ make release --sync=0 \ --threads=16 > ${STAT_FILE}.readrandom_filluniquerandom +# measure readwhilewriting after load with filluniquerandom with 6GB block cache +./db_bench \ + --benchmarks=readwhilewriting \ + --db=$DATA_DIR \ + --use_existing_db=1 \ + --bloom_bits=10 \ + --num=$((NUM / 4)) \ + --reads=$((NUM / 4)) \ + --writes_per_second=1000 \ + --write_buffer_size=100000000 \ + --cache_size=6442450944 \ + --cache_numshardbits=6 \ + --table_cache_numshardbits=4 \ + --open_files=55000 \ + --disable_seek_compaction=1 \ + --statistics=1 \ + --histogram=1 \ + --disable_data_sync=1 \ + --disable_wal=1 \ + --sync=0 \ + --threads=16 > ${STAT_FILE}.readwhilewriting + # measure memtable performance -- none of the data gets flushed to disk ./db_bench \ --benchmarks=fillrandom,readrandom, \ @@ -208,7 +249,7 @@ make release --num=$((NUM / 10)) \ --reads=$NUM \ --cache_size=6442450944 \ - --cache_numshardbits=4 \ + --cache_numshardbits=6 \ --table_cache_numshardbits=4 \ --write_buffer_size=1000000000 \ --open_files=55000 \ @@ -264,3 +305,4 @@ send_benchmark_to_ods readrandom readrandom_memtable_sst $STAT_FILE.readrandom_m send_benchmark_to_ods readrandom readrandom_fillunique_random $STAT_FILE.readrandom_filluniquerandom send_benchmark_to_ods fillrandom memtablefillrandom $STAT_FILE.memtablefillreadrandom send_benchmark_to_ods readrandom memtablereadrandom $STAT_FILE.memtablefillreadrandom +send_benchmark_to_ods readwhilewriting readwhilewriting $STAT_FILE.readwhilewriting diff --git a/db/builder.cc b/db/builder.cc index 53d2f8985390832978722df412fb94f2449071a7..930fc180aefc054d468d6dcea678194abec93c38 100644 --- a/db/builder.cc +++ b/db/builder.cc @@ -42,7 +42,7 @@ Status BuildTable(const std::string& dbname, const Comparator* user_comparator, const SequenceNumber newest_snapshot, const SequenceNumber earliest_seqno_in_memtable, - const bool enable_compression) { + const CompressionType compression) { Status s; meta->file_size = 0; meta->smallest_seqno = meta->largest_seqno = 0; @@ -65,7 +65,7 @@ Status BuildTable(const std::string& dbname, } TableBuilder* builder = GetTableBuilder(options, file.get(), - options.compression); + compression); // the first key is the smallest key Slice key = iter->key(); diff --git a/db/builder.h b/db/builder.h index 8c525bd05042fd2448fdf21991ff1da826226cf5..2600dc24b8bd20d348808af5e9400b29bcdacc54 100644 --- a/db/builder.h +++ b/db/builder.h @@ -43,6 +43,6 @@ extern Status BuildTable(const std::string& dbname, const Comparator* user_comparator, const SequenceNumber newest_snapshot, const SequenceNumber earliest_seqno_in_memtable, - const bool enable_compression); + const CompressionType compression); } // namespace rocksdb diff --git a/db/c.cc b/db/c.cc index 36ee2d486bd9f5b8d0ca69f53a208c18c4ae1413..68f361336752679a41e6b009de08712d6004e542 100644 --- a/db/c.cc +++ b/db/c.cc @@ -788,6 +788,10 @@ void rocksdb_env_set_background_threads(rocksdb_env_t* env, int n) { env->rep->SetBackgroundThreads(n); } +void rocksdb_env_set_high_priority_background_threads(rocksdb_env_t* env, int n) { + 
env->rep->SetBackgroundThreads(n, Env::HIGH); +} + void rocksdb_env_destroy(rocksdb_env_t* env) { if (!env->is_default) delete env->rep; delete env; diff --git a/db/compaction.cc b/db/compaction.cc new file mode 100644 index 0000000000000000000000000000000000000000..703e7aeaeb6652dca3589c090630e2a68d35278e --- /dev/null +++ b/db/compaction.cc @@ -0,0 +1,214 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/compaction.h" + +namespace rocksdb { + +static uint64_t TotalFileSize(const std::vector& files) { + uint64_t sum = 0; + for (size_t i = 0; i < files.size() && files[i]; i++) { + sum += files[i]->file_size; + } + return sum; +} + +Compaction::Compaction(Version* input_version, int level, int out_level, + uint64_t target_file_size, + uint64_t max_grandparent_overlap_bytes, + bool seek_compaction, bool enable_compression) + : level_(level), + out_level_(out_level), + max_output_file_size_(target_file_size), + maxGrandParentOverlapBytes_(max_grandparent_overlap_bytes), + input_version_(input_version), + number_levels_(input_version_->NumberLevels()), + seek_compaction_(seek_compaction), + enable_compression_(enable_compression), + grandparent_index_(0), + seen_key_(false), + overlapped_bytes_(0), + base_index_(-1), + parent_index_(-1), + score_(0), + bottommost_level_(false), + is_full_compaction_(false), + level_ptrs_(std::vector(number_levels_)) { + + input_version_->Ref(); + edit_ = new VersionEdit(); + for (int i = 0; i < number_levels_; i++) { + level_ptrs_[i] = 0; + } +} + +Compaction::~Compaction() { + delete edit_; + if (input_version_ != nullptr) { + input_version_->Unref(); + } +} + +bool Compaction::IsTrivialMove() const { + // Avoid a move if there is lots of overlapping grandparent data. + // Otherwise, the move could create a parent file that will require + // a very expensive merge later on. + // If level_== out_level_, the purpose is to force compaction filter to be + // applied to that level, and thus cannot be a trivia move. + return (level_ != out_level_ && + num_input_files(0) == 1 && + num_input_files(1) == 0 && + TotalFileSize(grandparents_) <= maxGrandParentOverlapBytes_); +} + +void Compaction::AddInputDeletions(VersionEdit* edit) { + for (int which = 0; which < 2; which++) { + for (size_t i = 0; i < inputs_[which].size(); i++) { + edit->DeleteFile(level_ + which, inputs_[which][i]->number); + } + } +} + +bool Compaction::IsBaseLevelForKey(const Slice& user_key) { + if (input_version_->vset_->options_->compaction_style == + kCompactionStyleUniversal) { + return bottommost_level_; + } + // Maybe use binary search to find right entry instead of linear search? 
+ const Comparator* user_cmp = input_version_->vset_->icmp_.user_comparator(); + for (int lvl = level_ + 2; lvl < number_levels_; lvl++) { + const std::vector& files = input_version_->files_[lvl]; + for (; level_ptrs_[lvl] < files.size(); ) { + FileMetaData* f = files[level_ptrs_[lvl]]; + if (user_cmp->Compare(user_key, f->largest.user_key()) <= 0) { + // We've advanced far enough + if (user_cmp->Compare(user_key, f->smallest.user_key()) >= 0) { + // Key falls in this file's range, so definitely not base level + return false; + } + break; + } + level_ptrs_[lvl]++; + } + } + return true; +} + +bool Compaction::ShouldStopBefore(const Slice& internal_key) { + // Scan to find earliest grandparent file that contains key. + const InternalKeyComparator* icmp = &input_version_->vset_->icmp_; + while (grandparent_index_ < grandparents_.size() && + icmp->Compare(internal_key, + grandparents_[grandparent_index_]->largest.Encode()) > 0) { + if (seen_key_) { + overlapped_bytes_ += grandparents_[grandparent_index_]->file_size; + } + assert(grandparent_index_ + 1 >= grandparents_.size() || + icmp->Compare(grandparents_[grandparent_index_]->largest.Encode(), + grandparents_[grandparent_index_+1]->smallest.Encode()) + < 0); + grandparent_index_++; + } + seen_key_ = true; + + if (overlapped_bytes_ > maxGrandParentOverlapBytes_) { + // Too much overlap for current output; start new output + overlapped_bytes_ = 0; + return true; + } else { + return false; + } +} + +// Mark (or clear) each file that is being compacted +void Compaction::MarkFilesBeingCompacted(bool value) { + for (int i = 0; i < 2; i++) { + std::vector v = inputs_[i]; + for (unsigned int j = 0; j < inputs_[i].size(); j++) { + assert(value ? !inputs_[i][j]->being_compacted : + inputs_[i][j]->being_compacted); + inputs_[i][j]->being_compacted = value; + } + } +} + +// Is this compaction producing files at the bottommost level? +void Compaction::SetupBottomMostLevel(bool isManual) { + if (input_version_->vset_->options_->compaction_style == + kCompactionStyleUniversal) { + // If universal compaction style is used and manual + // compaction is occuring, then we are guaranteed that + // all files will be picked in a single compaction + // run. We can safely set bottommost_level_ = true. + // If it is not manual compaction, then bottommost_level_ + // is already set when the Compaction was created. 
+ if (isManual) { + bottommost_level_ = true; + } + return; + } + bottommost_level_ = true; + int num_levels = input_version_->vset_->NumberLevels(); + for (int i = output_level() + 1; i < num_levels; i++) { + if (input_version_->NumLevelFiles(i) > 0) { + bottommost_level_ = false; + break; + } + } +} + +void Compaction::ReleaseInputs() { + if (input_version_ != nullptr) { + input_version_->Unref(); + input_version_ = nullptr; + } +} + +void Compaction::ResetNextCompactionIndex() { + input_version_->ResetNextCompactionIndex(level_); +} + +static void InputSummary(std::vector& files, char* output, + int len) { + int write = 0; + for (unsigned int i = 0; i < files.size(); i++) { + int sz = len - write; + int ret = snprintf(output + write, sz, "%lu(%lu) ", + (unsigned long)files.at(i)->number, + (unsigned long)files.at(i)->file_size); + if (ret < 0 || ret >= sz) + break; + write += ret; + } +} + +void Compaction::Summary(char* output, int len) { + int write = snprintf(output, len, + "Base version %lu Base level %d, seek compaction:%d, inputs:", + (unsigned long)input_version_->GetVersionNumber(), + level_, + seek_compaction_); + if (write < 0 || write > len) { + return; + } + + char level_low_summary[100]; + InputSummary(inputs_[0], level_low_summary, sizeof(level_low_summary)); + char level_up_summary[100]; + if (inputs_[1].size()) { + InputSummary(inputs_[1], level_up_summary, sizeof(level_up_summary)); + } else { + level_up_summary[0] = '\0'; + } + + snprintf(output + write, len - write, "[%s],[%s]", + level_low_summary, level_up_summary); +} + +} // namespace rocksdb diff --git a/db/compaction.h b/db/compaction.h new file mode 100644 index 0000000000000000000000000000000000000000..4cc0197da8a8dff21bb7a06b4a6d8aa49f04725a --- /dev/null +++ b/db/compaction.h @@ -0,0 +1,131 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include "db/version_set.h" + +namespace rocksdb { + +class Version; + +// A Compaction encapsulates information about a compaction. +class Compaction { + public: + ~Compaction(); + + // Return the level that is being compacted. Inputs from "level" + // will be merged. + int level() const { return level_; } + + // Outputs will go to this level + int output_level() const { return out_level_; } + + // Return the object that holds the edits to the descriptor done + // by this compaction. + VersionEdit* edit() { return edit_; } + + // "which" must be either 0 or 1 + int num_input_files(int which) const { return inputs_[which].size(); } + + // Return the ith input file at "level()+which" ("which" must be 0 or 1). + FileMetaData* input(int which, int i) const { return inputs_[which][i]; } + + // Maximum size of files to build during this compaction. 
+ uint64_t MaxOutputFileSize() const { return max_output_file_size_; } + + // Whether compression will be enabled for compaction outputs + bool enable_compression() const { return enable_compression_; } + + // Is this a trivial compaction that can be implemented by just + // moving a single input file to the next level (no merging or splitting) + bool IsTrivialMove() const; + + // Add all inputs to this compaction as delete operations to *edit. + void AddInputDeletions(VersionEdit* edit); + + // Returns true if the information we have available guarantees that + // the compaction is producing data in "level+1" for which no data exists + // in levels greater than "level+1". + bool IsBaseLevelForKey(const Slice& user_key); + + // Returns true iff we should stop building the current output + // before processing "internal_key". + bool ShouldStopBefore(const Slice& internal_key); + + // Release the input version for the compaction, once the compaction + // is successful. + void ReleaseInputs(); + + void Summary(char* output, int len); + + // Return the score that was used to pick this compaction run. + double score() const { return score_; } + + // Is this compaction creating a file in the bottom most level? + bool BottomMostLevel() { return bottommost_level_; } + + // Does this compaction include all sst files? + bool IsFullCompaction() { return is_full_compaction_; } + + private: + friend class Version; + friend class VersionSet; + + Compaction(Version* input_version, int level, int out_level, + uint64_t target_file_size, uint64_t max_grandparent_overlap_bytes, + bool seek_compaction = false, bool enable_compression = true); + + int level_; + int out_level_; // levels to which output files are stored + uint64_t max_output_file_size_; + uint64_t maxGrandParentOverlapBytes_; + Version* input_version_; + VersionEdit* edit_; + int number_levels_; + + bool seek_compaction_; + bool enable_compression_; + + // Each compaction reads inputs from "level_" and "level_+1" + std::vector inputs_[2]; // The two sets of inputs + + // State used to check for number of of overlapping grandparent files + // (parent == level_ + 1, grandparent == level_ + 2) + std::vector grandparents_; + size_t grandparent_index_; // Index in grandparent_starts_ + bool seen_key_; // Some output key has been seen + uint64_t overlapped_bytes_; // Bytes of overlap between current output + // and grandparent files + int base_index_; // index of the file in files_[level_] + int parent_index_; // index of some file with same range in files_[level_+1] + double score_; // score that was used to pick this compaction. + + // Is this compaction creating a file in the bottom most level? + bool bottommost_level_; + // Does this compaction include all sst files? + bool is_full_compaction_; + + // level_ptrs_ holds indices into input_version_->levels_: our state + // is that we are positioned at one of the file ranges for each + // higher level than the ones involved in this compaction (i.e. for + // all L >= level_ + 2). 
+ std::vector level_ptrs_; + + // mark (or clear) all files that are being compacted + void MarkFilesBeingCompacted(bool); + + // Initialize whether compaction producing files at the bottommost level + void SetupBottomMostLevel(bool isManual); + + // In case of compaction error, reset the nextIndex that is used + // to pick up the next file to be compacted from files_by_size_ + void ResetNextCompactionIndex(); +}; + +} // namespace rocksdb diff --git a/db/db_bench.cc b/db/db_bench.cc index de57326a6e4be24ae97de5a6aab95d36e201baf6..6636278370b1b5e17cca41bade5af430dd99d42c 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -95,6 +95,8 @@ DEFINE_string(benchmarks, "\tmergerandom -- same as updaterandom/appendrandom using merge" " operator. " "Must be used with merge_operator\n" + "\treadrandommergerandom -- perform N random read-or-merge " + "operations. Must be used with merge_operator\n" "\tseekrandom -- N random seeks\n" "\tcrc32c -- repeated crc32c of 4K of data\n" "\tacquireload -- load N*1000 times\n" @@ -113,6 +115,11 @@ DEFINE_int64(numdistinct, 1000, "read/write on fewer keys so that gets are more likely to find the" " key and puts are more likely to update the same key"); +DEFINE_int64(merge_keys, -1, + "Number of distinct keys to use for MergeRandom and " + "ReadRandomMergeRandom. " + "If negative, there will be FLAGS_num keys."); + DEFINE_int64(reads, -1, "Number of read operations to do. " "If negative, do FLAGS_num reads."); @@ -298,6 +305,11 @@ DEFINE_int32(readwritepercent, 90, "Ratio of reads to reads/writes (expressed" "default value 90 means 90% operations out of all reads and writes" " operations are reads. In other words, 9 gets for every 1 put."); +DEFINE_int32(mergereadpercent, 70, "Ratio of merges to merges&reads (expressed" + " as percentage) for the ReadRandomMergeRandom workload. The" + " default value 70 means 70% out of all read and merge operations" + " are merges. In other words, 7 merges for every 3 gets."); + DEFINE_int32(deletepercent, 2, "Percentage of deletes out of reads/writes/" "deletes (used in RandomWithVerify only). RandomWithVerify " "calculates writepercent as (100 - FLAGS_readwritepercent - " @@ -449,6 +461,9 @@ DEFINE_uint64(bytes_per_sync, rocksdb::Options().bytes_per_sync, DEFINE_bool(filter_deletes, false, " On true, deletes use bloom-filter and drop" " the delete if key not present"); +DEFINE_int32(max_successive_merges, 0, "Maximum number of successive merge" + " operations on a key in the memtable"); + static bool ValidatePrefixSize(const char* flagname, int32_t value) { if (value < 0 || value>=2000000000) { fprintf(stderr, "Invalid value for --%s: %d. 0<= PrefixSize <=2000000000\n", @@ -788,6 +803,7 @@ class Benchmark { long long reads_; long long writes_; long long readwrites_; + long long merge_keys_; int heap_counter_; char keyFormat_[100]; // will contain the format of key. e.g "%016d" void PrintHeader() { @@ -963,6 +979,7 @@ class Benchmark { readwrites_((FLAGS_writes < 0 && FLAGS_reads < 0)? FLAGS_num : ((FLAGS_writes > FLAGS_reads) ? FLAGS_writes : FLAGS_reads) ), + merge_keys_(FLAGS_merge_keys < 0 ? 
FLAGS_num : FLAGS_merge_keys), heap_counter_(0) { std::vector files; FLAGS_env->GetChildren(FLAGS_db, &files); @@ -990,8 +1007,8 @@ class Benchmark { } unique_ptr GenerateKeyFromInt(long long v, const char* suffix = "") { - unique_ptr keyInStr(new char[kMaxKeySize]); - snprintf(keyInStr.get(), kMaxKeySize, keyFormat_, v, suffix); + unique_ptr keyInStr(new char[kMaxKeySize + 1]); + snprintf(keyInStr.get(), kMaxKeySize + 1, keyFormat_, v, suffix); return keyInStr; } @@ -1092,6 +1109,14 @@ class Benchmark { method = &Benchmark::ReadWhileWriting; } else if (name == Slice("readrandomwriterandom")) { method = &Benchmark::ReadRandomWriteRandom; + } else if (name == Slice("readrandommergerandom")) { + if (FLAGS_merge_operator.empty()) { + fprintf(stdout, "%-12s : skipped (--merge_operator is unknown)\n", + name.ToString().c_str()); + method = nullptr; + } else { + method = &Benchmark::ReadRandomMergeRandom; + } } else if (name == Slice("updaterandom")) { method = &Benchmark::UpdateRandom; } else if (name == Slice("appendrandom")) { @@ -1427,6 +1452,7 @@ class Benchmark { FLAGS_merge_operator.c_str()); exit(1); } + options.max_successive_merges = FLAGS_max_successive_merges; // set universal style compaction configurations, if applicable if (FLAGS_universal_size_ratio != 0) { @@ -2381,13 +2407,16 @@ class Benchmark { // // For example, use FLAGS_merge_operator="uint64add" and FLAGS_value_size=8 // to simulate random additions over 64-bit integers using merge. + // + // The number of merges on the same key can be controlled by adjusting + // FLAGS_merge_keys. void MergeRandom(ThreadState* thread) { RandomGenerator gen; // The number of iterations is the larger of read_ or write_ Duration duration(FLAGS_duration, readwrites_); while (!duration.Done(1)) { - const long long k = thread->rand.Next() % FLAGS_num; + const long long k = thread->rand.Next() % merge_keys_; unique_ptr key = GenerateKeyFromInt(k); Status s = db_->Merge(write_options_, key.get(), @@ -2406,6 +2435,68 @@ class Benchmark { thread->stats.AddMessage(msg); } + // Read and merge random keys. The amount of reads and merges are controlled + // by adjusting FLAGS_num and FLAGS_mergereadpercent. The number of distinct + // keys (and thus also the number of reads and merges on the same key) can be + // adjusted with FLAGS_merge_keys. + // + // As with MergeRandom, the merge operator to use should be defined by + // FLAGS_merge_operator. 
+ void ReadRandomMergeRandom(ThreadState* thread) { + ReadOptions options(FLAGS_verify_checksum, true); + RandomGenerator gen; + std::string value; + long long num_hits = 0; + long long num_gets = 0; + long long num_merges = 0; + size_t max_length = 0; + + // the number of iterations is the larger of read_ or write_ + Duration duration(FLAGS_duration, readwrites_); + + while (!duration.Done(1)) { + const long long k = thread->rand.Next() % merge_keys_; + unique_ptr key = GenerateKeyFromInt(k); + + bool do_merge = int(thread->rand.Next() % 100) < FLAGS_mergereadpercent; + + if (do_merge) { + Status s = db_->Merge(write_options_, key.get(), + gen.Generate(value_size_)); + if (!s.ok()) { + fprintf(stderr, "merge error: %s\n", s.ToString().c_str()); + exit(1); + } + + num_merges++; + + } else { + Status s = db_->Get(options, key.get(), &value); + if (value.length() > max_length) + max_length = value.length(); + + if (!s.ok() && !s.IsNotFound()) { + fprintf(stderr, "get error: %s\n", s.ToString().c_str()); + // we continue after error rather than exiting so that we can + // find more errors if any + } else if (!s.IsNotFound()) { + num_hits++; + } + + num_gets++; + + } + + thread->stats.FinishedSingleOp(db_); + } + char msg[100]; + snprintf(msg, sizeof(msg), + "(reads:%lld merges:%lld total:%lld hits:%lld maxlength:%zu)", + num_gets, num_merges, readwrites_, num_hits, max_length); + thread->stats.AddMessage(msg); + } + + void Compact(ThreadState* thread) { db_->CompactRange(nullptr, nullptr); } diff --git a/db/db_filesnapshot.cc b/db/db_filesnapshot.cc index a7232246a3f3c76dc3f4d23f505dbb6e34c0527e..04d6d0e17a1116fec2c48e7df0401646b039252a 100644 --- a/db/db_filesnapshot.cc +++ b/db/db_filesnapshot.cc @@ -74,7 +74,7 @@ Status DBImpl::GetLiveFiles(std::vector& ret, // Make a set of all of the live *.sst files std::set live; - versions_->AddLiveFilesCurrentVersion(&live); + versions_->current()->AddLiveFiles(&live); ret.clear(); ret.reserve(live.size() + 2); //*.sst + CURRENT + MANIFEST diff --git a/db/db_impl.cc b/db/db_impl.cc index 3e470389bd14a90d8a3184f681a693b2e7ae831a..57565996add679996be989075bd16981fa3d35b6 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -56,6 +56,7 @@ #include "util/mutexlock.h" #include "util/perf_context_imp.h" #include "util/stop_watch.h" +#include "util/autovector.h" namespace rocksdb { @@ -228,12 +229,34 @@ CompressionType GetCompressionType(const Options& options, int level, } } +CompressionType GetCompressionFlush(const Options& options) { + // Compressing memtable flushes might not help unless the sequential load + // optimization is used for leveled compaction. Otherwise the CPU and + // latency overhead is not offset by saving much space. + + bool can_compress; + + if (options.compaction_style == kCompactionStyleUniversal) { + can_compress = + (options.compaction_options_universal.compression_size_percent < 0); + } else { + // For leveled compress when min_level_to_compress == 0. 
+ can_compress = (GetCompressionType(options, 0, true) != kNoCompression); + } + + if (can_compress) { + return options.compression; + } else { + return kNoCompression; + } +} + DBImpl::DBImpl(const Options& options, const std::string& dbname) : env_(options.env), dbname_(dbname), internal_comparator_(options.comparator), - options_(SanitizeOptions( - dbname, &internal_comparator_, &internal_filter_policy_, options)), + options_(SanitizeOptions(dbname, &internal_comparator_, + &internal_filter_policy_, options)), internal_filter_policy_(options.filter_policy), owns_info_log_(options_.info_log != options.info_log), db_lock_(nullptr), @@ -241,8 +264,7 @@ DBImpl::DBImpl(const Options& options, const std::string& dbname) shutting_down_(nullptr), bg_cv_(&mutex_), mem_rep_factory_(options_.memtable_factory.get()), - mem_(new MemTable(internal_comparator_, mem_rep_factory_, - NumberLevels(), options_)), + mem_(new MemTable(internal_comparator_, options_)), logfile_number_(0), super_version_(nullptr), tmp_batch_(), @@ -390,7 +412,7 @@ uint64_t DBImpl::TEST_Current_Manifest_FileNo() { } Status DBImpl::NewDB() { - VersionEdit new_db(NumberLevels()); + VersionEdit new_db; new_db.SetComparatorName(user_comparator()->Name()); new_db.SetLogNumber(0); new_db.SetNextFile(2); @@ -857,10 +879,8 @@ void DBImpl::PurgeObsoleteWALFiles() { // If externalTable is set, then apply recovered transactions // to that table. This is used for readonly mode. -Status DBImpl::Recover( - VersionEdit* edit, - MemTable* external_table, - bool error_if_log_file_exist) { +Status DBImpl::Recover(VersionEdit* edit, MemTable* external_table, + bool error_if_log_file_exist) { mutex_.AssertHeld(); assert(db_lock_ == nullptr); @@ -1028,8 +1048,7 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, WriteBatchInternal::SetContents(&batch, record); if (mem == nullptr) { - mem = new MemTable(internal_comparator_, mem_rep_factory_, - NumberLevels(), options_); + mem = new MemTable(internal_comparator_, options_); mem->Ref(); } status = WriteBatchInternal::InsertInto(&batch, mem, &options_); @@ -1088,7 +1107,8 @@ Status DBImpl::WriteLevel0TableForRecovery(MemTable* mem, VersionEdit* edit) { s = BuildTable(dbname_, env_, options_, storage_options_, table_cache_.get(), iter, &meta, user_comparator(), newest_snapshot, - earliest_seqno_in_memtable, true); + earliest_seqno_in_memtable, + GetCompressionFlush(options_)); LogFlush(options_.info_log); mutex_.Lock(); } @@ -1149,15 +1169,11 @@ Status DBImpl::WriteLevel0Table(autovector& mems, VersionEdit* edit, Log(options_.info_log, "Level-0 flush table #%lu: started", (unsigned long)meta.number); - // We skip compression if universal compression is used and the size - // threshold is set for compression. 
- bool enable_compression = (options_.compaction_style - != kCompactionStyleUniversal || - options_.compaction_options_universal.compression_size_percent < 0); + s = BuildTable(dbname_, env_, options_, storage_options_, table_cache_.get(), iter, &meta, user_comparator(), newest_snapshot, - earliest_seqno_in_memtable, enable_compression); + earliest_seqno_in_memtable, GetCompressionFlush(options_)); LogFlush(options_.info_log); delete iter; Log(options_.info_log, "Level-0 flush table #%lu: %lu bytes %s", @@ -1279,8 +1295,11 @@ Status DBImpl::FlushMemTableToOutputFile(bool* madeProgress, return s; } -void DBImpl::CompactRange(const Slice* begin, const Slice* end, - bool reduce_level, int target_level) { +void DBImpl::CompactRange(const Slice* begin, + const Slice* end, + bool reduce_level, + int target_level) { + FlushMemTable(FlushOptions()); int max_level_with_files = 1; { MutexLock l(&mutex_); @@ -1291,9 +1310,15 @@ void DBImpl::CompactRange(const Slice* begin, const Slice* end, } } } - TEST_FlushMemTable(); // TODO(sanjay): Skip if memtable does not overlap - for (int level = 0; level < max_level_with_files; level++) { - TEST_CompactRange(level, begin, end); + for (int level = 0; level <= max_level_with_files; level++) { + // in case the compaction is unversal or if we're compacting the + // bottom-most level, the output level will be the same as input one + if (options_.compaction_style == kCompactionStyleUniversal || + level == max_level_with_files) { + RunManualCompaction(level, level, begin, end); + } else { + RunManualCompaction(level, level + 1, begin, end); + } } if (reduce_level) { @@ -1305,13 +1330,13 @@ void DBImpl::CompactRange(const Slice* begin, const Slice* end, // return the same level if it cannot be moved int DBImpl::FindMinimumEmptyLevelFitting(int level) { mutex_.AssertHeld(); + Version* current = versions_->current(); int minimum_level = level; for (int i = level - 1; i > 0; --i) { // stop if level i is not empty - if (versions_->NumLevelFiles(i) > 0) break; - + if (current->NumLevelFiles(i) > 0) break; // stop if level i is too small (cannot fit the level files) - if (versions_->MaxBytesForLevel(i) < versions_->NumLevelBytes(level)) break; + if (versions_->MaxBytesForLevel(i) < current->NumLevelBytes(level)) break; minimum_level = i; } @@ -1356,7 +1381,7 @@ void DBImpl::ReFitLevel(int level, int target_level) { Log(options_.info_log, "Before refitting:\n%s", versions_->current()->DebugString().data()); - VersionEdit edit(NumberLevels()); + VersionEdit edit; for (const auto& f : versions_->current()->files_[level]) { edit.DeleteFile(level, f->number); edit.AddFile(to_level, f->number, f->file_size, f->smallest, f->largest, @@ -1590,13 +1615,17 @@ Status DBImpl::AppendSortedWalsOfType(const std::string& path, return status; } -void DBImpl::TEST_CompactRange(int level, const Slice* begin,const Slice* end) { - assert(level >= 0); +void DBImpl::RunManualCompaction(int input_level, + int output_level, + const Slice* begin, + const Slice* end) { + assert(input_level >= 0); InternalKey begin_storage, end_storage; ManualCompaction manual; - manual.level = level; + manual.input_level = input_level; + manual.output_level = output_level; manual.done = false; manual.in_progress = false; // For universal compaction, we enforce every manual compaction to compact @@ -1624,11 +1653,11 @@ void DBImpl::TEST_CompactRange(int level, const Slice* begin,const Slice* end) { // can compact any range of keys/files. 
// // bg_manual_only_ is non-zero when at least one thread is inside - // TEST_CompactRange(), i.e. during that time no other compaction will + // RunManualCompaction(), i.e. during that time no other compaction will // get scheduled (see MaybeScheduleFlushOrCompaction). // // Note that the following loop doesn't stop more that one thread calling - // TEST_CompactRange() from getting to the second while loop below. + // RunManualCompaction() from getting to the second while loop below. // However, only one of them will actually schedule compaction, while // others will wait on a condition variable until it completes. @@ -1658,6 +1687,15 @@ void DBImpl::TEST_CompactRange(int level, const Slice* begin,const Slice* end) { --bg_manual_only_; } +void DBImpl::TEST_CompactRange(int level, + const Slice* begin, + const Slice* end) { + int output_level = (options_.compaction_style == kCompactionStyleUniversal) + ? level + : level + 1; + RunManualCompaction(level, output_level, begin, end); +} + Status DBImpl::FlushMemTable(const FlushOptions& options) { // nullptr batch means just wait for earlier writes to be done Status s = Write(WriteOptions(), nullptr); @@ -1803,6 +1841,11 @@ void DBImpl::TEST_PurgeObsoleteteWAL() { PurgeObsoleteWALFiles(); } +uint64_t DBImpl::TEST_GetLevel0TotalSize() { + MutexLock l(&mutex_); + return versions_->current()->NumLevelBytes(0); +} + void DBImpl::BackgroundCallCompaction() { bool madeProgress = false; DeletionState deletion_state(true); @@ -1877,23 +1920,27 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, unique_ptr c; bool is_manual = (manual_compaction_ != nullptr) && (manual_compaction_->in_progress == false); - InternalKey manual_end; + InternalKey manual_end_storage; + InternalKey* manual_end = &manual_end_storage; if (is_manual) { ManualCompaction* m = manual_compaction_; assert(!m->in_progress); m->in_progress = true; // another thread cannot pick up the same work - c.reset(versions_->CompactRange(m->level, m->begin, m->end)); - if (c) { - manual_end = c->input(0, c->num_input_files(0) - 1)->largest; - } else { + c.reset(versions_->CompactRange( + m->input_level, m->output_level, m->begin, m->end, &manual_end)); + if (!c) { m->done = true; } Log(options_.info_log, - "Manual compaction at level-%d from %s .. %s; will stop at %s\n", - m->level, + "Manual compaction from level-%d to level-%d from %s .. %s; will stop " + "at %s\n", + m->input_level, + m->output_level, (m->begin ? m->begin->DebugString().c_str() : "(begin)"), (m->end ? m->end->DebugString().c_str() : "(end)"), - (m->done ? "(end)" : manual_end.DebugString().c_str())); + ((m->done || manual_end == nullptr) + ? 
"(end)" + : manual_end->DebugString().c_str())); } else if (!options_.disable_auto_compactions) { c.reset(versions_->PickCompaction()); } @@ -1912,13 +1959,11 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, f->smallest_seqno, f->largest_seqno); status = versions_->LogAndApply(c->edit(), &mutex_); InstallSuperVersion(deletion_state); - VersionSet::LevelSummaryStorage tmp; + Version::LevelSummaryStorage tmp; Log(options_.info_log, "Moved #%lld to level-%d %lld bytes %s: %s\n", - static_cast(f->number), - c->level() + 1, + static_cast(f->number), c->level() + 1, static_cast(f->file_size), - status.ToString().c_str(), - versions_->LevelSummary(&tmp)); + status.ToString().c_str(), versions_->current()->LevelSummary(&tmp)); versions_->ReleaseCompactionFiles(c.get(), status); *madeProgress = true; } else { @@ -1958,13 +2003,19 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, // Also note that, if we don't stop here, then the current compaction // writes a new file back to level 0, which will be used in successive // compaction. Hence the manual compaction will never finish. - if (options_.compaction_style == kCompactionStyleUniversal) { + // + // Stop the compaction if manual_end points to nullptr -- this means + // that we compacted the whole range. manual_end should always point + // to nullptr in case of universal compaction + if (manual_end == nullptr) { m->done = true; } if (!m->done) { // We only compacted part of the requested range. Update *m // to the range that is left to be compacted. - m->tmp_storage = manual_end; + // Universal compaction should always compact the whole range + assert(options_.compaction_style != kCompactionStyleUniversal); + m->tmp_storage = *manual_end; m->begin = &m->tmp_storage; } m->in_progress = false; // not being processed anymore @@ -1996,14 +2047,14 @@ void DBImpl::CleanupCompaction(CompactionState* compact, Status status) { } // Allocate the file numbers for the output file. We allocate as -// many output file numbers as there are files in level+1. +// many output file numbers as there are files in level+1 (at least one) // Insert them into pending_outputs so that they do not get deleted. void DBImpl::AllocateCompactionOutputFileNumbers(CompactionState* compact) { mutex_.AssertHeld(); assert(compact != nullptr); assert(compact->builder == nullptr); int filesNeeded = compact->compaction->num_input_files(1); - for (int i = 0; i < filesNeeded; i++) { + for (int i = 0; i < std::max(filesNeeded, 1); i++) { uint64_t file_number = versions_->NewFileNumber(); pending_outputs_.insert(file_number); compact->allocated_file_numbers.push_back(file_number); @@ -2147,14 +2198,11 @@ Status DBImpl::InstallCompactionResults(CompactionState* compact) { // Add compaction outputs compact->compaction->AddInputDeletions(compact->compaction->edit()); - const int level = compact->compaction->level(); for (size_t i = 0; i < compact->outputs.size(); i++) { const CompactionState::Output& out = compact->outputs[i]; compact->compaction->edit()->AddFile( - (options_.compaction_style == kCompactionStyleUniversal) ? 
- level : level + 1, - out.number, out.file_size, out.smallest, out.largest, - out.smallest_seqno, out.largest_seqno); + compact->compaction->output_level(), out.number, out.file_size, + out.smallest, out.largest, out.smallest_seqno, out.largest_seqno); } return versions_->LogAndApply(compact->compaction->edit(), &mutex_); } @@ -2196,14 +2244,14 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, compact->compaction->num_input_files(0), compact->compaction->level(), compact->compaction->num_input_files(1), - compact->compaction->level() + 1, + compact->compaction->output_level(), compact->compaction->score(), options_.max_background_compactions - bg_compaction_scheduled_); char scratch[256]; compact->compaction->Summary(scratch, sizeof(scratch)); Log(options_.info_log, "Compaction start summary: %s\n", scratch); - assert(versions_->NumLevelFiles(compact->compaction->level()) > 0); + assert(versions_->current()->NumLevelFiles(compact->compaction->level()) > 0); assert(compact->builder == nullptr); assert(!compact->outfile); @@ -2575,22 +2623,21 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, status = InstallCompactionResults(compact); InstallSuperVersion(deletion_state); } - VersionSet::LevelSummaryStorage tmp; + Version::LevelSummaryStorage tmp; Log(options_.info_log, "compacted to: %s, %.1f MB/sec, level %d, files in(%d, %d) out(%d) " "MB in(%.1f, %.1f) out(%.1f), read-write-amplify(%.1f) " "write-amplify(%.1f) %s\n", - versions_->LevelSummary(&tmp), + versions_->current()->LevelSummary(&tmp), (stats.bytes_readn + stats.bytes_readnp1 + stats.bytes_written) / - (double) stats.micros, - compact->compaction->output_level(), - stats.files_in_leveln, stats.files_in_levelnp1, stats.files_out_levelnp1, - stats.bytes_readn / 1048576.0, - stats.bytes_readnp1 / 1048576.0, + (double)stats.micros, + compact->compaction->output_level(), stats.files_in_leveln, + stats.files_in_levelnp1, stats.files_out_levelnp1, + stats.bytes_readn / 1048576.0, stats.bytes_readnp1 / 1048576.0, stats.bytes_written / 1048576.0, (stats.bytes_written + stats.bytes_readnp1 + stats.bytes_readn) / - (double) stats.bytes_readn, - stats.bytes_written / (double) stats.bytes_readn, + (double)stats.bytes_readn, + stats.bytes_written / (double)stats.bytes_readn, status.ToString().c_str()); return status; @@ -2658,6 +2705,7 @@ Iterator* DBImpl::NewInternalIterator(const ReadOptions& options, Iterator* internal_iter = NewMergingIterator( env_, &internal_comparator_, memtables.data(), memtables.size() ); + cleanup->version = version; cleanup->mu = &mutex_; cleanup->db = this; @@ -2673,7 +2721,7 @@ Iterator* DBImpl::TEST_NewInternalIterator() { int64_t DBImpl::TEST_MaxNextLevelOverlappingBytes() { MutexLock l(&mutex_); - return versions_->MaxNextLevelOverlappingBytes(); + return versions_->current()->MaxNextLevelOverlappingBytes(); } Status DBImpl::Get(const ReadOptions& options, @@ -2992,12 +3040,8 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { uint64_t last_sequence = versions_->LastSequence(); Writer* last_writer = &w; if (status.ok() && my_batch != nullptr) { // nullptr batch is for compactions - // TODO: BuildBatchGroup physically concatenate/copy all write batches into - // a new one. Mem copy is done with the lock held. Ideally, we only need - // the lock to obtain the last_writer and the references to all batches. - // Creation (copy) of the merged batch could have been done outside of the - // lock protected region. 
- WriteBatch* updates = BuildBatchGroup(&last_writer); + autovector write_batch_group; + BuildBatchGroup(&last_writer, &write_batch_group); // Add to log and apply to memtable. We can release the lock // during this phase since &w is currently responsible for logging @@ -3005,6 +3049,16 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { // into mem_. { mutex_.Unlock(); + WriteBatch* updates = nullptr; + if (write_batch_group.size() == 1) { + updates = write_batch_group[0]; + } else { + updates = &tmp_batch_; + for (size_t i = 0; i < write_batch_group.size(); ++i) { + WriteBatchInternal::Append(updates, write_batch_group[i]); + } + } + const SequenceNumber current_sequence = last_sequence + 1; WriteBatchInternal::SetSequence(updates, current_sequence); int my_batch_count = WriteBatchInternal::Count(updates); @@ -3056,12 +3110,12 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { SEQUENCE_NUMBER, last_sequence); } StartPerfTimer(&pre_post_process_timer); + if (updates == &tmp_batch_) tmp_batch_.Clear(); mutex_.Lock(); if (status.ok()) { versions_->SetLastSequence(last_sequence); } } - if (updates == &tmp_batch_) tmp_batch_.Clear(); } if (options_.paranoid_checks && !status.ok() && bg_error_.ok()) { bg_error_ = status; // stop compaction & fail any further writes @@ -3091,13 +3145,14 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { // REQUIRES: Writer list must be non-empty // REQUIRES: First writer must have a non-nullptr batch -WriteBatch* DBImpl::BuildBatchGroup(Writer** last_writer) { +void DBImpl::BuildBatchGroup(Writer** last_writer, + autovector* write_batch_group) { assert(!writers_.empty()); Writer* first = writers_.front(); - WriteBatch* result = first->batch; - assert(result != nullptr); + assert(first->batch != nullptr); size_t size = WriteBatchInternal::ByteSize(first->batch); + write_batch_group->push_back(first->batch); // Allow the group to grow up to a maximum size, but if the // original write is small, limit the growth so we do not slow @@ -3130,18 +3185,10 @@ WriteBatch* DBImpl::BuildBatchGroup(Writer** last_writer) { break; } - // Append to *reuslt - if (result == first->batch) { - // Switch to temporary batch instead of disturbing caller's batch - result = &tmp_batch_; - assert(WriteBatchInternal::Count(result) == 0); - WriteBatchInternal::Append(result, first->batch); - } - WriteBatchInternal::Append(result, w->batch); + write_batch_group->push_back(w->batch); } *last_writer = w; } - return result; } // This function computes the amount of time in microseconds by which a write @@ -3195,10 +3242,7 @@ Status DBImpl::MakeRoomForWrite(bool force, // Yield previous error s = bg_error_; break; - } else if ( - allow_delay && - versions_->NumLevelFiles(0) >= - options_.level0_slowdown_writes_trigger) { + } else if (allow_delay && versions_->NeedSlowdownForNumLevel0Files()) { // We are getting close to hitting a hard limit on the number of // L0 files. 
Rather than delaying a single write by several // seconds when we hit the hard limit, start delaying each @@ -3210,7 +3254,7 @@ Status DBImpl::MakeRoomForWrite(bool force, { StopWatch sw(env_, options_.statistics.get(), STALL_L0_SLOWDOWN_COUNT); env_->SleepForMicroseconds( - SlowdownAmount(versions_->NumLevelFiles(0), + SlowdownAmount(versions_->current()->NumLevelFiles(0), options_.level0_slowdown_writes_trigger, options_.level0_stop_writes_trigger) ); @@ -3245,7 +3289,7 @@ Status DBImpl::MakeRoomForWrite(bool force, STALL_MEMTABLE_COMPACTION_MICROS, stall); stall_memtable_compaction_ += stall; stall_memtable_compaction_count_++; - } else if (versions_->NumLevelFiles(0) >= + } else if (versions_->current()->NumLevelFiles(0) >= options_.level0_stop_writes_trigger) { // There are too many level-0 files. DelayLoggingAndReset(); @@ -3321,17 +3365,13 @@ Status DBImpl::MakeRoomForWrite(bool force, EnvOptions soptions(storage_options_); soptions.use_mmap_writes = false; DelayLoggingAndReset(); - s = env_->NewWritableFile( - LogFileName(options_.wal_dir, new_log_number), - &lfile, - soptions - ); + s = env_->NewWritableFile(LogFileName(options_.wal_dir, new_log_number), + &lfile, soptions); if (s.ok()) { // Our final size should be less than write_buffer_size // (compression, etc) but err on the side of caution. lfile->SetPreallocationBlockSize(1.1 * options_.write_buffer_size); - new_mem = new MemTable( - internal_comparator_, mem_rep_factory_, NumberLevels(), options_); + new_mem = new MemTable(internal_comparator_, options_); new_superversion = new SuperVersion(); } } @@ -3379,6 +3419,7 @@ bool DBImpl::GetProperty(const Slice& property, std::string* value) { value->clear(); MutexLock l(&mutex_); + Version* current = versions_->current(); Slice in = property; Slice prefix("rocksdb."); if (!in.starts_with(prefix)) return false; @@ -3393,7 +3434,7 @@ bool DBImpl::GetProperty(const Slice& property, std::string* value) { } else { char buf[100]; snprintf(buf, sizeof(buf), "%d", - versions_->NumLevelFiles(static_cast(level))); + current->NumLevelFiles(static_cast(level))); *value = buf; return true; } @@ -3408,8 +3449,8 @@ bool DBImpl::GetProperty(const Slice& property, std::string* value) { snprintf(buf, sizeof(buf), "%3d %8d %8.0f\n", level, - versions_->NumLevelFiles(level), - versions_->NumLevelBytes(level) / 1048576.0); + current->NumLevelFiles(level), + current->NumLevelBytes(level) / 1048576.0); value->append(buf); } return true; @@ -3452,8 +3493,8 @@ bool DBImpl::GetProperty(const Slice& property, std::string* value) { "--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n" ); value->append(buf); - for (int level = 0; level < NumberLevels(); level++) { - int files = versions_->NumLevelFiles(level); + for (int level = 0; level < current->NumberLevels(); level++) { + int files = current->NumLevelFiles(level); if (stats_[level].micros > 0 || files > 0) { int64_t bytes_read = stats_[level].bytes_readn + stats_[level].bytes_readnp1; @@ -3474,8 +3515,8 @@ bool DBImpl::GetProperty(const Slice& property, std::string* value) { "%3d %8d %8.0f %5.1f %9.0f %9.0f %9.0f %9.0f %9.0f %9.0f %10.1f %9.1f %11.1f %8d %8d %8d %8d %8d %9.1f %9lu\n", level, files, - versions_->NumLevelBytes(level) / 1048576.0, - versions_->NumLevelBytes(level) / + current->NumLevelBytes(level) / 1048576.0, + current->NumLevelBytes(level) / versions_->MaxBytesForLevel(level), stats_[level].micros / 1e6, 
bytes_read / 1048576.0, @@ -3712,7 +3753,7 @@ Status DBImpl::DeleteFile(std::string name) { int level; FileMetaData* metadata; int maxlevel = NumberLevels(); - VersionEdit edit(maxlevel); + VersionEdit edit; DeletionState deletion_state(true); { MutexLock l(&mutex_); @@ -3735,7 +3776,7 @@ Status DBImpl::DeleteFile(std::string name) { // This is to make sure that any deletion tombstones are not // lost. Check that the level passed is the last level. for (int i = level + 1; i < maxlevel; i++) { - if (versions_->NumLevelFiles(i) != 0) { + if (versions_->current()->NumLevelFiles(i) != 0) { Log(options_.info_log, "DeleteFile %s FAILED. File not in last level\n", name.c_str()); return Status::InvalidArgument("File not in last level"); @@ -3834,19 +3875,19 @@ Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) { return s; } impl->mutex_.Lock(); // DBImpl::Recover() requires lock being held - VersionEdit edit(impl->NumberLevels()); + VersionEdit edit; s = impl->Recover(&edit); // Handles create_if_missing, error_if_exists if (s.ok()) { uint64_t new_log_number = impl->versions_->NewFileNumber(); unique_ptr lfile; soptions.use_mmap_writes = false; - s = options.env->NewWritableFile( + s = impl->options_.env->NewWritableFile( LogFileName(impl->options_.wal_dir, new_log_number), &lfile, soptions ); if (s.ok()) { - lfile->SetPreallocationBlockSize(1.1 * options.write_buffer_size); + lfile->SetPreallocationBlockSize(1.1 * impl->options_.write_buffer_size); edit.SetLogNumber(new_log_number); impl->logfile_number_ = new_log_number; impl->log_.reset(new log::Writer(std::move(lfile))); @@ -3860,12 +3901,11 @@ Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) { impl->MaybeScheduleLogDBDeployStats(); } } - impl->mutex_.Unlock(); - if (options.compaction_style == kCompactionStyleUniversal) { - int num_files; + if (s.ok() && impl->options_.compaction_style == kCompactionStyleUniversal) { + Version* current = impl->versions_->current(); for (int i = 1; i < impl->NumberLevels(); i++) { - num_files = impl->versions_->NumLevelFiles(i); + int num_files = current->NumLevelFiles(i); if (num_files > 0) { s = Status::InvalidArgument("Not all files are at level 0. Cannot " "open with universal compaction style."); @@ -3874,6 +3914,8 @@ Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) { } } + impl->mutex_.Unlock(); + if (s.ok()) { *dbptr = impl; } else { diff --git a/db/db_impl.h b/db/db_impl.h index a5e4d42bcd27b9a2d4be3989e49e2ea0809e073d..c43f720b698b06544db2afd1d31226c69bea4a3f 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -91,10 +91,17 @@ class DBImpl : public DB { virtual Status GetDbIdentity(std::string& identity); + void RunManualCompaction(int input_level, + int output_level, + const Slice* begin, + const Slice* end); + // Extra methods (for testing) that are not in the public DB interface // Compact any files in the named level that overlap [*begin, *end] - void TEST_CompactRange(int level, const Slice* begin, const Slice* end); + void TEST_CompactRange(int level, + const Slice* begin, + const Slice* end); // Force current memtable contents to be flushed. Status TEST_FlushMemTable(); @@ -124,7 +131,7 @@ class DBImpl : public DB { void TEST_PurgeObsoleteteWAL(); // get total level0 file size. Only for testing. 
- uint64_t TEST_GetLevel0TotalSize() { return versions_->NumLevelBytes(0);} + uint64_t TEST_GetLevel0TotalSize(); void TEST_SetDefaultTimeToCheck(uint64_t default_interval_to_delete_obsolete_WAL) { @@ -292,7 +299,8 @@ class DBImpl : public DB { // the superversion outside of mutex Status MakeRoomForWrite(bool force /* compact even if there is room? */, SuperVersion** superversion_to_free); - WriteBatch* BuildBatchGroup(Writer** last_writer); + void BuildBatchGroup(Writer** last_writer, + autovector* write_batch_group); // Force current memtable contents to be flushed. Status FlushMemTable(const FlushOptions& options); @@ -405,7 +413,8 @@ class DBImpl : public DB { // Information for a manual compaction struct ManualCompaction { - int level; + int input_level; + int output_level; bool done; bool in_progress; // compaction request being processed? const InternalKey* begin; // nullptr means beginning of key range @@ -590,4 +599,7 @@ extern Options SanitizeOptions(const std::string& db, CompressionType GetCompressionType(const Options& options, int level, const bool enable_compression); +// Determine compression type for L0 file written by memtable flush. +CompressionType GetCompressionFlush(const Options& options); + } // namespace rocksdb diff --git a/db/db_impl_readonly.cc b/db/db_impl_readonly.cc index dbb297e93a691663e2740e3ac6263a175cd8617a..04033b2fa33e3162d9e6153edc67f56ebb04ff17 100644 --- a/db/db_impl_readonly.cc +++ b/db/db_impl_readonly.cc @@ -86,7 +86,7 @@ Status DB::OpenForReadOnly(const Options& options, const std::string& dbname, DBImplReadOnly* impl = new DBImplReadOnly(options, dbname); impl->mutex_.Lock(); - VersionEdit edit(impl->NumberLevels()); + VersionEdit edit; Status s = impl->Recover(&edit, impl->GetMemTable(), error_if_log_file_exist); impl->mutex_.Unlock(); diff --git a/db/db_stats_logger.cc b/db/db_stats_logger.cc index 91810abe3885844abf6e5177c310e5873c931bcd..db86865ca0a08bd39652f7ebedbeded3c1b2033b 100644 --- a/db/db_stats_logger.cc +++ b/db/db_stats_logger.cc @@ -65,13 +65,14 @@ void DBImpl::LogDBDeployStats() { uint64_t file_total_size = 0; uint32_t file_total_num = 0; - for (int i = 0; i < versions_->NumberLevels(); i++) { - file_total_num += versions_->NumLevelFiles(i); - file_total_size += versions_->NumLevelBytes(i); + Version* current = versions_->current(); + for (int i = 0; i < current->NumberLevels(); i++) { + file_total_num += current->NumLevelFiles(i); + file_total_size += current->NumLevelBytes(i); } - VersionSet::LevelSummaryStorage scratch; - const char* file_num_summary = versions_->LevelSummary(&scratch); + Version::LevelSummaryStorage scratch; + const char* file_num_summary = current->LevelSummary(&scratch); std::string file_num_per_level(file_num_summary); std::string data_size_per_level(file_num_summary); diff --git a/db/db_test.cc b/db/db_test.cc index ad55ecb1b0c5e31f39884473ac513ecc210dd637..9a5d128df10337a3749d6b4456d7cb544f1c975f 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -832,6 +832,156 @@ TEST(DBTest, IndexAndFilterBlocksOfNewTableAddedToCache) { options.statistics.get()->getTickerCount(BLOCK_CACHE_FILTER_HIT)); } +TEST(DBTest, LevelLimitReopen) { + Options options = CurrentOptions(); + Reopen(&options); + + const std::string value(1024 * 1024, ' '); + int i = 0; + while (NumTableFilesAtLevel(2) == 0) { + ASSERT_OK(Put(Key(i++), value)); + } + + options.num_levels = 1; + options.max_bytes_for_level_multiplier_additional.resize(1, 1); + Status s = TryReopen(&options); + ASSERT_EQ(s.IsInvalidArgument(), true); + 
ASSERT_EQ(s.ToString(), + "Invalid argument: db has more levels than options.num_levels"); + + options.num_levels = 10; + options.max_bytes_for_level_multiplier_additional.resize(10, 1); + ASSERT_OK(TryReopen(&options)); +} + +TEST(DBTest, Preallocation) { + const std::string src = dbname_ + "/alloc_test"; + unique_ptr srcfile; + const EnvOptions soptions; + ASSERT_OK(env_->NewWritableFile(src, &srcfile, soptions)); + srcfile->SetPreallocationBlockSize(1024 * 1024); + + // No writes should mean no preallocation + size_t block_size, last_allocated_block; + srcfile->GetPreallocationStatus(&block_size, &last_allocated_block); + ASSERT_EQ(last_allocated_block, 0UL); + + // Small write should preallocate one block + srcfile->Append("test"); + srcfile->GetPreallocationStatus(&block_size, &last_allocated_block); + ASSERT_EQ(last_allocated_block, 1UL); + + // Write an entire preallocation block, make sure we increased by two. + std::string buf(block_size, ' '); + srcfile->Append(buf); + srcfile->GetPreallocationStatus(&block_size, &last_allocated_block); + ASSERT_EQ(last_allocated_block, 2UL); + + // Write five more blocks at once, ensure we're where we need to be. + buf = std::string(block_size * 5, ' '); + srcfile->Append(buf); + srcfile->GetPreallocationStatus(&block_size, &last_allocated_block); + ASSERT_EQ(last_allocated_block, 7UL); +} + +TEST(DBTest, PutDeleteGet) { + do { + ASSERT_OK(db_->Put(WriteOptions(), "foo", "v1")); + ASSERT_EQ("v1", Get("foo")); + ASSERT_OK(db_->Put(WriteOptions(), "foo", "v2")); + ASSERT_EQ("v2", Get("foo")); + ASSERT_OK(db_->Delete(WriteOptions(), "foo")); + ASSERT_EQ("NOT_FOUND", Get("foo")); + } while (ChangeOptions()); +} + + +TEST(DBTest, GetFromImmutableLayer) { + do { + Options options = CurrentOptions(); + options.env = env_; + options.write_buffer_size = 100000; // Small write buffer + Reopen(&options); + + ASSERT_OK(Put("foo", "v1")); + ASSERT_EQ("v1", Get("foo")); + + env_->delay_sstable_sync_.Release_Store(env_); // Block sync calls + Put("k1", std::string(100000, 'x')); // Fill memtable + Put("k2", std::string(100000, 'y')); // Trigger compaction + ASSERT_EQ("v1", Get("foo")); + env_->delay_sstable_sync_.Release_Store(nullptr); // Release sync calls + } while (ChangeOptions()); +} + +TEST(DBTest, GetFromVersions) { + do { + ASSERT_OK(Put("foo", "v1")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v1", Get("foo")); + } while (ChangeOptions()); +} + +TEST(DBTest, GetSnapshot) { + do { + // Try with both a short key and a long key + for (int i = 0; i < 2; i++) { + std::string key = (i == 0) ? std::string("foo") : std::string(200, 'x'); + ASSERT_OK(Put(key, "v1")); + const Snapshot* s1 = db_->GetSnapshot(); + ASSERT_OK(Put(key, "v2")); + ASSERT_EQ("v2", Get(key)); + ASSERT_EQ("v1", Get(key, s1)); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v2", Get(key)); + ASSERT_EQ("v1", Get(key, s1)); + db_->ReleaseSnapshot(s1); + } + } while (ChangeOptions()); +} + +TEST(DBTest, GetLevel0Ordering) { + do { + // Check that we process level-0 files in correct order. The code + // below generates two level-0 files where the earlier one comes + // before the later one in the level-0 file list since the earlier + // one has a smaller "smallest" key. 
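// The assertions in the Preallocation test above follow from simple
// ceiling arithmetic on the total bytes appended so far. A sketch of the
// invariant the test exercises (ExpectedPreallocatedBlocks is a
// hypothetical helper, not part of the Env API):
#include <cstddef>
static size_t ExpectedPreallocatedBlocks(size_t bytes_written,
                                         size_t block_size) {
  return (bytes_written + block_size - 1) / block_size;  // ceiling division
}
// With block_size = 1MB: 4 bytes -> 1 block, 1MB + 4 bytes -> 2 blocks,
// and 6MB + 4 bytes -> 7 blocks, matching last_allocated_block above.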
+ ASSERT_OK(Put("bar", "b")); + ASSERT_OK(Put("foo", "v1")); + dbfull()->TEST_FlushMemTable(); + ASSERT_OK(Put("foo", "v2")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v2", Get("foo")); + } while (ChangeOptions()); +} + +TEST(DBTest, GetOrderedByLevels) { + do { + ASSERT_OK(Put("foo", "v1")); + Compact("a", "z"); + ASSERT_EQ("v1", Get("foo")); + ASSERT_OK(Put("foo", "v2")); + ASSERT_EQ("v2", Get("foo")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v2", Get("foo")); + } while (ChangeOptions()); +} + +TEST(DBTest, GetPicksCorrectFile) { + do { + // Arrange to have multiple files in a non-level-0 level. + ASSERT_OK(Put("a", "va")); + Compact("a", "b"); + ASSERT_OK(Put("x", "vx")); + Compact("x", "y"); + ASSERT_OK(Put("f", "vf")); + Compact("f", "g"); + ASSERT_EQ("va", Get("a")); + ASSERT_EQ("vf", Get("f")); + ASSERT_EQ("vx", Get("x")); + } while (ChangeOptions()); +} + TEST(DBTest, GetEncountersEmptyLevel) { do { // Arrange for the following to happen: @@ -3325,34 +3475,46 @@ TEST(DBTest, ManualCompaction) { ASSERT_EQ(dbfull()->MaxMemCompactionLevel(), 2) << "Need to update this test to match kMaxMemCompactLevel"; - MakeTables(3, "p", "q"); - ASSERT_EQ("1,1,1", FilesPerLevel()); + // iter - 0 with 7 levels + // iter - 1 with 3 levels + for (int iter = 0; iter < 2; ++iter) { + MakeTables(3, "p", "q"); + ASSERT_EQ("1,1,1", FilesPerLevel()); - // Compaction range falls before files - Compact("", "c"); - ASSERT_EQ("1,1,1", FilesPerLevel()); + // Compaction range falls before files + Compact("", "c"); + ASSERT_EQ("1,1,1", FilesPerLevel()); - // Compaction range falls after files - Compact("r", "z"); - ASSERT_EQ("1,1,1", FilesPerLevel()); + // Compaction range falls after files + Compact("r", "z"); + ASSERT_EQ("1,1,1", FilesPerLevel()); - // Compaction range overlaps files - Compact("p1", "p9"); - ASSERT_EQ("0,0,1", FilesPerLevel()); + // Compaction range overlaps files + Compact("p1", "p9"); + ASSERT_EQ("0,0,1", FilesPerLevel()); - // Populate a different range - MakeTables(3, "c", "e"); - ASSERT_EQ("1,1,2", FilesPerLevel()); + // Populate a different range + MakeTables(3, "c", "e"); + ASSERT_EQ("1,1,2", FilesPerLevel()); - // Compact just the new range - Compact("b", "f"); - ASSERT_EQ("0,0,2", FilesPerLevel()); + // Compact just the new range + Compact("b", "f"); + ASSERT_EQ("0,0,2", FilesPerLevel()); + + // Compact all + MakeTables(1, "a", "z"); + ASSERT_EQ("0,1,2", FilesPerLevel()); + db_->CompactRange(nullptr, nullptr); + ASSERT_EQ("0,0,1", FilesPerLevel()); + + if (iter == 0) { + Options options = CurrentOptions(); + options.num_levels = 3; + options.create_if_missing = true; + DestroyAndReopen(&options); + } + } - // Compact all - MakeTables(1, "a", "z"); - ASSERT_EQ("0,1,2", FilesPerLevel()); - db_->CompactRange(nullptr, nullptr); - ASSERT_EQ("0,0,1", FilesPerLevel()); } TEST(DBTest, DBOpen_Options) { @@ -3413,7 +3575,7 @@ TEST(DBTest, DBOpen_Change_NumLevels) { opts.create_if_missing = false; opts.num_levels = 2; s = DB::Open(opts, dbname, &db); - ASSERT_TRUE(strstr(s.ToString().c_str(), "Corruption") != nullptr); + ASSERT_TRUE(strstr(s.ToString().c_str(), "Invalid argument") != nullptr); ASSERT_TRUE(db == nullptr); } @@ -4348,6 +4510,70 @@ TEST(DBTest, MultiThreaded) { } while (ChangeOptions()); } +// Group commit test: +namespace { + +static const int kGCNumThreads = 4; +static const int kGCNumKeys = 1000; + +struct GCThread { + DB* db; + int id; + std::atomic done; +}; + +static void GCThreadBody(void* arg) { + GCThread* t = reinterpret_cast(arg); + int id = t->id; + DB* db = 
t->db; + WriteOptions wo; + + for (int i = 0; i < kGCNumKeys; ++i) { + std::string kv(std::to_string(i + id * kGCNumKeys)); + ASSERT_OK(db->Put(wo, kv, kv)); + } + t->done = true; +} + +} // namespace + +TEST(DBTest, GroupCommitTest) { + do { + // Start threads + GCThread thread[kGCNumThreads]; + for (int id = 0; id < kGCNumThreads; id++) { + thread[id].id = id; + thread[id].db = db_; + thread[id].done = false; + env_->StartThread(GCThreadBody, &thread[id]); + } + + for (int id = 0; id < kGCNumThreads; id++) { + while (thread[id].done == false) { + env_->SleepForMicroseconds(100000); + } + } + + std::vector expected_db; + for (int i = 0; i < kGCNumThreads * kGCNumKeys; ++i) { + expected_db.push_back(std::to_string(i)); + } + sort(expected_db.begin(), expected_db.end()); + + Iterator* itr = db_->NewIterator(ReadOptions()); + itr->SeekToFirst(); + for (auto x : expected_db) { + ASSERT_TRUE(itr->Valid()); + ASSERT_EQ(itr->key().ToString(), x); + ASSERT_EQ(itr->value().ToString(), x); + itr->Next(); + } + ASSERT_TRUE(!itr->Valid()); + delete itr; + + } while (ChangeOptions()); +} + namespace { typedef std::map KVMap; } @@ -4892,7 +5118,7 @@ void BM_LogAndApply(int iters, int num_base_files) { EnvOptions sopt; VersionSet vset(dbname, &options, sopt, nullptr, &cmp); ASSERT_OK(vset.Recover()); - VersionEdit vbase(vset.NumberLevels()); + VersionEdit vbase; uint64_t fnum = 1; for (int i = 0; i < num_base_files; i++) { InternalKey start(MakeKey(2*fnum), 1, kTypeValue); @@ -4904,7 +5130,7 @@ void BM_LogAndApply(int iters, int num_base_files) { uint64_t start_micros = env->NowMicros(); for (int i = 0; i < iters; i++) { - VersionEdit vedit(vset.NumberLevels()); + VersionEdit vedit; vedit.DeleteFile(2, fnum); InternalKey start(MakeKey(2*fnum), 1, kTypeValue); InternalKey limit(MakeKey(2*fnum+1), 1, kTypeDeletion); diff --git a/db/memtable.cc b/db/memtable.cc index c556412ea5f151b4901145be8c20d0d6935ecddd..430c3589bc2a37d94c7557cab09c1272f4567508 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -35,27 +35,22 @@ struct hash { namespace rocksdb { -MemTable::MemTable(const InternalKeyComparator& cmp, - MemTableRepFactory* table_factory, - int numlevel, - const Options& options) +MemTable::MemTable(const InternalKeyComparator& cmp, const Options& options) : comparator_(cmp), refs_(0), arena_impl_(options.arena_block_size), - table_(table_factory->CreateMemTableRep(comparator_, &arena_impl_)), + table_(options.memtable_factory->CreateMemTableRep(comparator_, + &arena_impl_)), flush_in_progress_(false), flush_completed_(false), file_number_(0), - edit_(numlevel), first_seqno_(0), mem_next_logfile_number_(0), mem_logfile_number_(0), - locks_(options.inplace_update_support - ? options.inplace_update_num_locks - : 0), + locks_(options.inplace_update_support ? 
options.inplace_update_num_locks + : 0), prefix_extractor_(options.prefix_extractor) { - - if (prefix_extractor_ && options.memtable_prefix_bloom_bits > 0) { + if (prefix_extractor_ && options.memtable_prefix_bloom_bits > 0) { prefix_bloom_.reset(new DynamicBloom(options.memtable_prefix_bloom_bits, options.memtable_prefix_bloom_probes)); } @@ -67,7 +62,7 @@ MemTable::~MemTable() { size_t MemTable::ApproximateMemoryUsage() { return arena_impl_.ApproximateMemoryUsage() + - table_->ApproximateMemoryUsage(); + table_->ApproximateMemoryUsage(); } int MemTable::KeyComparator::operator()(const char* aptr, const char* bptr) @@ -98,12 +93,11 @@ class MemTableIterator: public Iterator { MemTableIterator(const MemTable& mem, const ReadOptions& options) : mem_(mem), iter_(), dynamic_prefix_seek_(false), valid_(false) { if (options.prefix) { - iter_ = mem_.table_->GetPrefixIterator(*options.prefix); + iter_.reset(mem_.table_->GetPrefixIterator(*options.prefix)); } else if (options.prefix_seek) { - dynamic_prefix_seek_ = true; - iter_ = mem_.table_->GetDynamicPrefixIterator(); + iter_.reset(mem_.table_->GetDynamicPrefixIterator()); } else { - iter_ = mem_.table_->GetIterator(); + iter_.reset(mem_.table_->GetIterator()); } } @@ -212,12 +206,12 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, Slice mem_key = key.memtable_key(); Slice user_key = key.user_key(); - std::shared_ptr iter; + std::unique_ptr iter; if (prefix_bloom_ && !prefix_bloom_->MayContain(prefix_extractor_->Transform(user_key))) { // iter is null if prefix bloom says the key does not exist } else { - iter = table_->GetIterator(user_key); + iter.reset(table_->GetIterator(user_key)); iter->Seek(user_key, mem_key.data()); } @@ -328,11 +322,10 @@ void MemTable::Update(SequenceNumber seq, LookupKey lkey(key, seq); Slice mem_key = lkey.memtable_key(); - std::shared_ptr iter( + std::unique_ptr iter( table_->GetIterator(lkey.user_key())); iter->Seek(key, mem_key.data()); - if (iter->Valid()) { // entry format is: // key_length varint32 @@ -462,4 +455,37 @@ bool MemTable::UpdateCallback(SequenceNumber seq, // or key doesn't exist return false; } + +size_t MemTable::CountSuccessiveMergeEntries(const LookupKey& key) { + Slice memkey = key.memtable_key(); + + // A total ordered iterator is costly for some memtablerep (prefix aware + // reps). By passing in the user key, we allow efficient iterator creation. + // The iterator only needs to be ordered within the same user key. + std::unique_ptr iter( + table_->GetIterator(key.user_key())); + iter->Seek(key.user_key(), memkey.data()); + + size_t num_successive_merges = 0; + + for (; iter->Valid(); iter->Next()) { + const char* entry = iter->key(); + uint32_t key_length; + const char* iter_key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length); + if (!comparator_.comparator.user_comparator()->Compare( + Slice(iter_key_ptr, key_length - 8), key.user_key()) == 0) { + break; + } + + const uint64_t tag = DecodeFixed64(iter_key_ptr + key_length - 8); + if (static_cast(tag & 0xff) != kTypeMerge) { + break; + } + + ++num_successive_merges; + } + + return num_successive_merges; +} + } // namespace rocksdb diff --git a/db/memtable.h b/db/memtable.h index c94bf0b0b35a02b3b3648cf8ea7eb2bc8a3fb686..5e7eeb4a1809379c10e63edd2ed923fdb8b24278 100644 --- a/db/memtable.h +++ b/db/memtable.h @@ -35,11 +35,8 @@ class MemTable { // MemTables are reference counted. The initial reference count // is zero and the caller must call Ref() at least once. 
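// Sketch of how the new CountSuccessiveMergeEntries() is meant to be
// consumed on the write path: once a key has accumulated
// max_successive_merges merge operands in the memtable, the next Merge can
// be folded into a single Put. The surrounding helper is hypothetical;
// only the counting call and the option name come from this patch.
bool ShouldFoldMergeSketch(MemTable* mem, const LookupKey& lkey,
                           const Options& options) {
  if (options.max_successive_merges == 0) {
    return false;  // feature disabled
  }
  // Count existing merge operands for this key, newest first.
  size_t merges = mem->CountSuccessiveMergeEntries(lkey);
  // At the cap: read the current value, apply the merge operator once,
  // and write the folded result instead of stacking another operand.
  return merges >= options.max_successive_merges;
}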
- explicit MemTable( - const InternalKeyComparator& comparator, - MemTableRepFactory* table_factory, - int numlevel = 7, - const Options& options = Options()); + explicit MemTable(const InternalKeyComparator& comparator, + const Options& options = Options()); ~MemTable(); @@ -123,6 +120,11 @@ class MemTable { const Slice& delta, const Options& options); + // Returns the number of successive merge entries starting from the newest + // entry for the key up to the last non-merge entry or last entry for the + // key in the memtable. + size_t CountSuccessiveMergeEntries(const LookupKey& key); + // Returns the edits area that is needed for flushing the memtable VersionEdit* GetEdits() { return &edit_; } @@ -157,7 +159,7 @@ class MemTable { KeyComparator comparator_; int refs_; ArenaImpl arena_impl_; - shared_ptr table_; + unique_ptr table_; // These are used to manage memtable flushes to storage bool flush_in_progress_; // started the flush diff --git a/db/merge_test.cc b/db/merge_test.cc index 0c14aff2ce9ab9d3007421ba166164157ddc8d4e..887d8ad425f1e915299e22b151a146f6684fc98c 100644 --- a/db/merge_test.cc +++ b/db/merge_test.cc @@ -14,6 +14,7 @@ #include "rocksdb/merge_operator.h" #include "db/dbformat.h" #include "db/db_impl.h" +#include "db/write_batch_internal.h" #include "utilities/merge_operators.h" #include "util/testharness.h" #include "utilities/utility_db.h" @@ -21,13 +22,52 @@ using namespace std; using namespace rocksdb; +namespace { + int numMergeOperatorCalls; -std::shared_ptr OpenDb(const string& dbname, const bool ttl = false) { + void resetNumMergeOperatorCalls() { + numMergeOperatorCalls = 0; + } +} + +class CountMergeOperator : public AssociativeMergeOperator { + public: + CountMergeOperator() { + mergeOperator_ = MergeOperators::CreateUInt64AddOperator(); + } + + virtual bool Merge(const Slice& key, + const Slice* existing_value, + const Slice& value, + std::string* new_value, + Logger* logger) const override { + ++numMergeOperatorCalls; + return mergeOperator_->PartialMerge( + key, + *existing_value, + value, + new_value, + logger); + } + + virtual const char* Name() const override { + return "UInt64AddOperator"; + } + + private: + std::shared_ptr mergeOperator_; +}; + +std::shared_ptr OpenDb( + const string& dbname, + const bool ttl = false, + const unsigned max_successive_merges = 0) { DB* db; StackableDB* sdb; Options options; options.create_if_missing = true; - options.merge_operator = MergeOperators::CreateUInt64AddOperator(); + options.merge_operator = std::make_shared(); + options.max_successive_merges = max_successive_merges; Status s; DestroyDB(dbname, Options()); if (ttl) { @@ -243,6 +283,67 @@ void testCounters(Counters& counters, DB* db, bool test_compaction) { } } +void testSuccessiveMerge( + Counters& counters, int max_num_merges, int num_merges) { + + counters.assert_remove("z"); + uint64_t sum = 0; + + for (int i = 1; i <= num_merges; ++i) { + resetNumMergeOperatorCalls(); + counters.assert_add("z", i); + sum += i; + + if (i % (max_num_merges + 1) == 0) { + assert(numMergeOperatorCalls == max_num_merges + 1); + } else { + assert(numMergeOperatorCalls == 0); + } + + resetNumMergeOperatorCalls(); + assert(counters.assert_get("z") == sum); + assert(numMergeOperatorCalls == i % (max_num_merges + 1)); + } +} + +void testSingleBatchSuccessiveMerge( + DB* db, + int max_num_merges, + int num_merges) { + assert(num_merges > max_num_merges); + + Slice key("BatchSuccessiveMerge"); + uint64_t merge_value = 1; + Slice merge_value_slice((char *)&merge_value, 
                                sizeof(merge_value));
+
+  // Create the batch
+  WriteBatch batch;
+  for (int i = 0; i < num_merges; ++i) {
+    batch.Merge(key, merge_value_slice);
+  }
+
+  // Apply to memtable and count the number of merges
+  resetNumMergeOperatorCalls();
+  {
+    Status s = db->Write(WriteOptions(), &batch);
+    assert(s.ok());
+  }
+  assert(numMergeOperatorCalls ==
+         num_merges - (num_merges % (max_num_merges + 1)));
+
+  // Get the value
+  resetNumMergeOperatorCalls();
+  string get_value_str;
+  {
+    Status s = db->Get(ReadOptions(), key, &get_value_str);
+    assert(s.ok());
+  }
+  assert(get_value_str.size() == sizeof(uint64_t));
+  uint64_t get_value = DecodeFixed64(&get_value_str[0]);
+  ASSERT_EQ(get_value, num_merges * merge_value);
+  ASSERT_EQ(numMergeOperatorCalls, (num_merges % (max_num_merges + 1)));
+}
+
 void runTest(int argc, const string& dbname, const bool use_ttl = false) {
   auto db = OpenDb(dbname, use_ttl);
@@ -265,6 +366,19 @@ void runTest(int argc, const string& dbname, const bool use_ttl = false) {
   }
   DestroyDB(dbname, Options());
+  db.reset();
+
+  {
+    cout << "Test merge in memtable... \n";
+    unsigned maxMerge = 5;
+    auto db = OpenDb(dbname, use_ttl, maxMerge);
+    MergeBasedCounters counters(db, 0);
+    testCounters(counters, db.get(), compact);
+    testSuccessiveMerge(counters, maxMerge, maxMerge * 2);
+    testSingleBatchSuccessiveMerge(db.get(), 5, 7);
+    DestroyDB(dbname, Options());
+  }
+
 }

 int main(int argc, char *argv[]) {
diff --git a/db/repair.cc b/db/repair.cc
index 802c04fc4f7eb3bbf5be6814a3281cdb9bdba93f..f72bca6b73fdfb762fb1067e028890999f801c6c 100644
--- a/db/repair.cc
+++ b/db/repair.cc
@@ -58,7 +58,7 @@ class Repairer {
         next_file_number_(1) {
     // TableCache can be small since we expect each table to be opened once.
     table_cache_ = new TableCache(dbname_, &options_, storage_options_, 10);
-    edit_ = new VersionEdit(options.num_levels);
+    edit_ = new VersionEdit();
   }

   ~Repairer() {
@@ -196,8 +196,7 @@ class Repairer {
     std::string scratch;
     Slice record;
     WriteBatch batch;
-    MemTable* mem = new MemTable(icmp_, options_.memtable_factory.get(),
-                                 options_.num_levels);
+    MemTable* mem = new MemTable(icmp_, options_);
     mem->Ref();
     int counter = 0;
     while (reader.ReadRecord(&record, &scratch)) {
@@ -225,7 +224,8 @@ class Repairer {
     Iterator* iter = mem->NewIterator();
     status = BuildTable(dbname_, env_, options_, storage_options_,
                         table_cache_, iter, &meta,
-                        icmp_.user_comparator(), 0, 0, true);
+                        icmp_.user_comparator(), 0, 0,
+                        kNoCompression);
     delete iter;
     delete mem->Unref();
     mem = nullptr;
diff --git a/db/version_edit.cc b/db/version_edit.cc
index 1a6a119564f26e3a106d368469e9bc4206614c83..6645998852db2435c96981ae66443c4f21d2d9c5 100644
--- a/db/version_edit.cc
+++ b/db/version_edit.cc
@@ -33,6 +33,7 @@ enum Tag {

 void VersionEdit::Clear() {
   comparator_.clear();
+  max_level_ = 0;
   log_number_ = 0;
   prev_log_number_ = 0;
   last_sequence_ = 0;
@@ -105,14 +106,13 @@ static bool GetInternalKey(Slice* input, InternalKey* dst) {
 bool VersionEdit::GetLevel(Slice* input, int* level,
                            const char** msg) {
   uint32_t v;
-  if (GetVarint32(input, &v) &&
-      (int)v < number_levels_) {
+  if (GetVarint32(input, &v)) {
     *level = v;
+    if (max_level_ < *level) {
+      max_level_ = *level;
+    }
     return true;
   } else {
-    if ((int)v >= number_levels_) {
-      *msg = "db already has more levels than options.num_levels";
-    }
     return false;
   }
 }
diff --git a/db/version_edit.h b/db/version_edit.h
index 7751ad92efbaae903c6814d6ce4b85c128fc5e49..ddba78bf26092c5222a51ca976462f954b36074a 100644
--- a/db/version_edit.h
+++ b/db/version_edit.h
@@ -41,10 +41,7 @@ struct FileMetaData {

 class VersionEdit {
  public:
-  explicit VersionEdit(int number_levels) :
-      number_levels_(number_levels) {
-    Clear();
-  }
+  VersionEdit() { Clear(); }
   ~VersionEdit() { }

   void Clear();
@@ -115,7 +112,7 @@ class VersionEdit {
   bool GetLevel(Slice* input, int* level, const char** msg);

-  int number_levels_;
+  int max_level_;
   std::string comparator_;
   uint64_t log_number_;
   uint64_t prev_log_number_;
@@ -127,9 +124,9 @@ class VersionEdit {
   bool has_next_file_number_;
   bool has_last_sequence_;

-  std::vector<std::pair<int, InternalKey>> compact_pointers_;
+  std::vector<std::pair<int, InternalKey> > compact_pointers_;
   DeletedFileSet deleted_files_;
-  std::vector<std::pair<int, FileMetaData>> new_files_;
+  std::vector<std::pair<int, FileMetaData> > new_files_;
 };

 }  // namespace rocksdb
diff --git a/db/version_edit_test.cc b/db/version_edit_test.cc
index 4a00822f7994084bee31cbe0d7f13c2de6f06de5..63aa32e8f69f14c574f25f460357d44c8e177cdb 100644
--- a/db/version_edit_test.cc
+++ b/db/version_edit_test.cc
@@ -15,7 +15,7 @@ namespace rocksdb {
 static void TestEncodeDecode(const VersionEdit& edit) {
   std::string encoded, encoded2;
   edit.EncodeTo(&encoded);
-  VersionEdit parsed(7);
+  VersionEdit parsed;
   Status s = parsed.DecodeFrom(encoded);
   ASSERT_TRUE(s.ok()) << s.ToString();
   parsed.EncodeTo(&encoded2);
@@ -27,7 +27,7 @@ class VersionEditTest { };

 TEST(VersionEditTest, EncodeDecode) {
   static const uint64_t kBig = 1ull << 50;
-  VersionEdit edit(7);
+  VersionEdit edit;
   for (int i = 0; i < 4; i++) {
     TestEncodeDecode(edit);
     edit.AddFile(3, kBig + 300 + i, kBig + 400 + i,
diff --git a/db/version_set.cc b/db/version_set.cc
index eb58e27804e6cf9907038f574cebfe302f8828af..f86de6ae7195d0869d148dabb1cb2deaaa0f5846 100644
--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -18,6 +18,7 @@
 #include "db/memtable.h"
 #include "db/merge_context.h"
 #include "db/table_cache.h"
+#include "db/compaction.h"
 #include "rocksdb/env.h"
 #include "rocksdb/merge_operator.h"
 #include "rocksdb/table.h"
@@ -45,7 +46,7 @@ Version::~Version() {
   next_->prev_ = prev_;

   // Drop references to files
-  for (int level = 0; level < vset_->NumberLevels(); level++) {
+  for (int level = 0; level < num_levels_; level++) {
     for (size_t i = 0; i < files_[level].size(); i++) {
       FileMetaData* f = files_[level][i];
       assert(f->refs > 0);
@@ -269,7 +270,7 @@ void Version::AddIterators(const ReadOptions& options,
   // For levels > 0, we can use a concatenating iterator that sequentially
   // walks through the non-overlapping files in the level, opening them
   // lazily.
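// The "concatenating iterator" described above relies on each level > 0
// being sorted and non-overlapping, so locating the one file that may hold
// a key is a binary search over largest keys. A self-contained sketch with
// plain string keys standing in for InternalKeys (illustrative, not the
// actual FindFile/TwoLevelIterator code):
#include <string>
#include <vector>
struct FileRangeSketch { std::string smallest, largest; };
// Returns the index of the first file whose largest key is >= key; that is
// the only candidate file in the level, or files.size() if none qualifies.
static size_t FindFileSketch(const std::vector<FileRangeSketch>& files,
                             const std::string& key) {
  size_t lo = 0, hi = files.size();
  while (lo < hi) {
    size_t mid = (lo + hi) / 2;
    if (files[mid].largest < key) {
      lo = mid + 1;  // file at mid ends before key; search the right half
    } else {
      hi = mid;      // file at mid may contain key; keep it in range
    }
  }
  return lo;
}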
- for (int level = 1; level < vset_->NumberLevels(); level++) { + for (int level = 1; level < num_levels_; level++) { if (!files_[level].empty()) { iters->push_back(NewConcatenatingIterator(options, soptions, level)); } @@ -408,17 +409,19 @@ static bool NewestFirstBySeqNo(FileMetaData* a, FileMetaData* b) { } Version::Version(VersionSet* vset, uint64_t version_number) - : vset_(vset), next_(this), prev_(this), refs_(0), - files_(new std::vector[vset->NumberLevels()]), - files_by_size_(vset->NumberLevels()), - next_file_to_compact_by_size_(vset->NumberLevels()), + : vset_(vset), + next_(this), + prev_(this), + refs_(0), + num_levels_(vset->num_levels_), + files_(new std::vector[num_levels_]), + files_by_size_(num_levels_), + next_file_to_compact_by_size_(num_levels_), file_to_compact_(nullptr), file_to_compact_level_(-1), - compaction_score_(vset->NumberLevels()), - compaction_level_(vset->NumberLevels()), - offset_manifest_file_(0), - version_number_(version_number) { -} + compaction_score_(num_levels_), + compaction_level_(num_levels_), + version_number_(version_number) {} void Version::Get(const ReadOptions& options, const LookupKey& k, @@ -457,7 +460,7 @@ void Version::Get(const ReadOptions& options, // levels. Therefore we are guaranteed that if we find data // in an smaller level, later levels are irrelevant (unless we // are MergeInProgress). - for (int level = 0; level < vset_->NumberLevels(); level++) { + for (int level = 0; level < num_levels_; level++) { size_t num_files = files_[level].size(); if (num_files == 0) continue; @@ -590,6 +593,159 @@ bool Version::UpdateStats(const GetStats& stats) { return false; } +void Version::Finalize(std::vector& size_being_compacted) { + // Pre-sort level0 for Get() + if (vset_->options_->compaction_style == kCompactionStyleUniversal) { + std::sort(files_[0].begin(), files_[0].end(), NewestFirstBySeqNo); + } else { + std::sort(files_[0].begin(), files_[0].end(), NewestFirst); + } + + double max_score = 0; + int max_score_level = 0; + + int num_levels_to_check = + (vset_->options_->compaction_style != kCompactionStyleUniversal) + ? NumberLevels() - 1 + : 1; + + for (int level = 0; level < num_levels_to_check; level++) { + double score; + if (level == 0) { + // We treat level-0 specially by bounding the number of files + // instead of number of bytes for two reasons: + // + // (1) With larger write-buffer sizes, it is nice not to do too + // many level-0 compactions. + // + // (2) The files in level-0 are merged on every read and + // therefore we wish to avoid too many files when the individual + // file size is small (perhaps because of a small write-buffer + // setting, or very high compression ratios, or lots of + // overwrites/deletions). + int numfiles = 0; + for (unsigned int i = 0; i < files_[level].size(); i++) { + if (!files_[level][i]->being_compacted) { + numfiles++; + } + } + + // If we are slowing down writes, then we better compact that first + if (numfiles >= vset_->options_->level0_stop_writes_trigger) { + score = 1000000; + // Log(options_->info_log, "XXX score l0 = 1000000000 max"); + } else if (numfiles >= vset_->options_->level0_slowdown_writes_trigger) { + score = 10000; + // Log(options_->info_log, "XXX score l0 = 1000000 medium"); + } else { + score = static_cast(numfiles) / + vset_->options_->level0_file_num_compaction_trigger; + if (score >= 1) { + // Log(options_->info_log, "XXX score l0 = %d least", (int)score); + } + } + } else { + // Compute the ratio of current size to size limit. 
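// Worked example of the level-0 scoring in Finalize() above, with assumed
// option values level0_stop_writes_trigger = 12,
// level0_slowdown_writes_trigger = 8, level0_file_num_compaction_trigger = 4:
//   13 uncompacted L0 files -> score = 1000000 (writes stopped; compact now)
//    9 uncompacted L0 files -> score = 10000   (writes already being slowed)
//    2 uncompacted L0 files -> score = 2/4 = 0.5 (below the trigger)
// Levels >= 1 instead score on bytes relative to MaxBytesForLevel(), as the
// branch that follows shows.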
+ const uint64_t level_bytes = + TotalFileSize(files_[level]) - size_being_compacted[level]; + score = static_cast(level_bytes) / vset_->MaxBytesForLevel(level); + if (score > 1) { + // Log(options_->info_log, "XXX score l%d = %d ", level, (int)score); + } + if (max_score < score) { + max_score = score; + max_score_level = level; + } + } + compaction_level_[level] = level; + compaction_score_[level] = score; + } + + // update the max compaction score in levels 1 to n-1 + max_compaction_score_ = max_score; + max_compaction_score_level_ = max_score_level; + + // sort all the levels based on their score. Higher scores get listed + // first. Use bubble sort because the number of entries are small. + for (int i = 0; i < NumberLevels() - 2; i++) { + for (int j = i + 1; j < NumberLevels() - 1; j++) { + if (compaction_score_[i] < compaction_score_[j]) { + double score = compaction_score_[i]; + int level = compaction_level_[i]; + compaction_score_[i] = compaction_score_[j]; + compaction_level_[i] = compaction_level_[j]; + compaction_score_[j] = score; + compaction_level_[j] = level; + } + } + } +} + +namespace { + +// Compator that is used to sort files based on their size +// In normal mode: descending size +bool CompareSizeDescending(const Version::Fsize& first, + const Version::Fsize& second) { + return (first.file->file_size > second.file->file_size); +} +// A static compator used to sort files based on their seqno +// In universal style : descending seqno +bool CompareSeqnoDescending(const Version::Fsize& first, + const Version::Fsize& second) { + if (first.file->smallest_seqno > second.file->smallest_seqno) { + assert(first.file->largest_seqno > second.file->largest_seqno); + return true; + } + assert(first.file->largest_seqno <= second.file->largest_seqno); + return false; +} + +} // anonymous namespace + +void Version::UpdateFilesBySize() { + // No need to sort the highest level because it is never compacted. + int max_level = + (vset_->options_->compaction_style == kCompactionStyleUniversal) + ? 
NumberLevels() + : NumberLevels() - 1; + + for (int level = 0; level < max_level; level++) { + const std::vector& files = files_[level]; + std::vector& files_by_size = files_by_size_[level]; + assert(files_by_size.size() == 0); + + // populate a temp vector for sorting based on size + std::vector temp(files.size()); + for (unsigned int i = 0; i < files.size(); i++) { + temp[i].index = i; + temp[i].file = files[i]; + } + + // sort the top number_of_files_to_sort_ based on file size + if (vset_->options_->compaction_style == kCompactionStyleUniversal) { + int num = temp.size(); + std::partial_sort(temp.begin(), temp.begin() + num, temp.end(), + CompareSeqnoDescending); + } else { + int num = Version::number_of_files_to_sort_; + if (num > (int)temp.size()) { + num = temp.size(); + } + std::partial_sort(temp.begin(), temp.begin() + num, temp.end(), + CompareSizeDescending); + } + assert(temp.size() == files.size()); + + // initialize files_by_size_ + for (unsigned int i = 0; i < temp.size(); i++) { + files_by_size.push_back(temp[i].index); + } + next_file_to_compact_by_size_[level] = 0; + assert(files_[level].size() == files_by_size_[level].size()); + } +} + void Version::Ref() { ++refs_; } @@ -627,7 +783,7 @@ int Version::PickLevelForMemTableOutput( if (OverlapInLevel(level + 1, &smallest_user_key, &largest_user_key)) { break; } - if (level + 2 >= vset_->NumberLevels()) { + if (level + 2 >= num_levels_) { level++; break; } @@ -860,9 +1016,70 @@ bool Version::HasOverlappingUserKey( return false; } +int64_t Version::NumLevelBytes(int level) const { + assert(level >= 0); + assert(level < NumberLevels()); + return TotalFileSize(files_[level]); +} + +const char* Version::LevelSummary(LevelSummaryStorage* scratch) const { + int len = snprintf(scratch->buffer, sizeof(scratch->buffer), "files["); + for (int i = 0; i < NumberLevels(); i++) { + int sz = sizeof(scratch->buffer) - len; + int ret = snprintf(scratch->buffer + len, sz, "%d ", int(files_[i].size())); + if (ret < 0 || ret >= sz) break; + len += ret; + } + snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, "]"); + return scratch->buffer; +} + +const char* Version::LevelFileSummary(FileSummaryStorage* scratch, + int level) const { + int len = snprintf(scratch->buffer, sizeof(scratch->buffer), "files_size["); + for (const auto& f : files_[level]) { + int sz = sizeof(scratch->buffer) - len; + int ret = snprintf(scratch->buffer + len, sz, + "#%lu(seq=%lu,sz=%lu,%lu) ", + (unsigned long)f->number, + (unsigned long)f->smallest_seqno, + (unsigned long)f->file_size, + (unsigned long)f->being_compacted); + if (ret < 0 || ret >= sz) + break; + len += ret; + } + snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, "]"); + return scratch->buffer; +} + +int64_t Version::MaxNextLevelOverlappingBytes() { + uint64_t result = 0; + std::vector overlaps; + for (int level = 1; level < NumberLevels() - 1; level++) { + for (const auto& f : files_[level]) { + GetOverlappingInputs(level + 1, &f->smallest, &f->largest, &overlaps); + const uint64_t sum = TotalFileSize(overlaps); + if (sum > result) { + result = sum; + } + } + } + return result; +} + +void Version::AddLiveFiles(std::set* live) { + for (int level = 0; level < NumberLevels(); level++) { + const std::vector& files = files_[level]; + for (const auto& file : files) { + live->insert(file->number); + } + } +} + std::string Version::DebugString(bool hex) const { std::string r; - for (int level = 0; level < vset_->NumberLevels(); level++) { + for (int level = 0; level < num_levels_; 
level++) { // E.g., // --- level 1 --- // 17:123['a' .. 'd'] @@ -931,20 +1148,18 @@ class VersionSet::Builder { public: // Initialize a builder with the files from *base and other info from *vset - Builder(VersionSet* vset, Version* base) - : vset_(vset), - base_(base) { + Builder(VersionSet* vset, Version* base) : vset_(vset), base_(base) { base_->Ref(); - levels_ = new LevelState[vset_->NumberLevels()]; + levels_ = new LevelState[base->NumberLevels()]; BySmallestKey cmp; cmp.internal_comparator = &vset_->icmp_; - for (int level = 0; level < vset_->NumberLevels(); level++) { + for (int level = 0; level < base->NumberLevels(); level++) { levels_[level].added_files = new FileSet(cmp); } } ~Builder() { - for (int level = 0; level < vset_->NumberLevels(); level++) { + for (int level = 0; level < base_->NumberLevels(); level++) { const FileSet* added = levels_[level].added_files; std::vector to_unref; to_unref.reserve(added->size()); @@ -973,7 +1188,7 @@ class VersionSet::Builder { void CheckConsistency(Version* v) { #ifndef NDEBUG - for (int level = 0; level < vset_->NumberLevels(); level++) { + for (int level = 0; level < v->NumberLevels(); level++) { // Make sure there is no overlap in levels > 0 if (level > 0) { for (uint32_t i = 1; i < v->files_[level].size(); i++) { @@ -991,14 +1206,12 @@ class VersionSet::Builder { #endif } - void CheckConsistencyForDeletes( - VersionEdit* edit, - unsigned int number, - int level) { + void CheckConsistencyForDeletes(VersionEdit* edit, unsigned int number, + int level) { #ifndef NDEBUG // a file to be deleted better exist in the previous version bool found = false; - for (int l = 0; !found && l < edit->number_levels_; l++) { + for (int l = 0; !found && l < base_->NumberLevels(); l++) { const std::vector& base_files = base_->files_[l]; for (unsigned int i = 0; i < base_files.size(); i++) { FileMetaData* f = base_files[i]; @@ -1011,7 +1224,7 @@ class VersionSet::Builder { // if the file did not exist in the previous version, then it // is possibly moved from lower level to higher level in current // version - for (int l = level+1; !found && l < edit->number_levels_; l++) { + for (int l = level+1; !found && l < base_->NumberLevels(); l++) { const FileSet* added = levels_[l].added_files; for (FileSet::const_iterator added_iter = added->begin(); added_iter != added->end(); ++added_iter) { @@ -1092,7 +1305,7 @@ class VersionSet::Builder { CheckConsistency(v); BySmallestKey cmp; cmp.internal_comparator = &vset_->icmp_; - for (int level = 0; level < vset_->NumberLevels(); level++) { + for (int level = 0; level < base_->NumberLevels(); level++) { // Merge the set of added files with the set of pre-existing files. // Drop any deleted files. Store the result in *v. 
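// The merge the comment above describes is one ordered pass over two sorted
// sequences plus a deletion filter. A self-contained sketch with a
// simplified stand-in for FileMetaData (the real SaveTo also runs the
// consistency checks shown elsewhere in this Builder):
#include <set>
#include <string>
#include <vector>
struct FileSketch { uint64_t number; std::string smallest; };
static std::vector<FileSketch> MergeLevelSketch(
    const std::vector<FileSketch>& base,    // pre-existing files, sorted
    const std::vector<FileSketch>& added,   // files from the edit, sorted
    const std::set<uint64_t>& deleted) {    // file numbers dropped by edit
  std::vector<FileSketch> out;
  size_t i = 0, j = 0;
  auto keep = [&](const FileSketch& f) {
    if (deleted.count(f.number) == 0) out.push_back(f);
  };
  while (i < base.size() || j < added.size()) {
    if (j == added.size() ||
        (i < base.size() && base[i].smallest < added[j].smallest)) {
      keep(base[i++]);  // next pre-existing file sorts first
    } else {
      keep(added[j++]);  // next added file sorts first
    }
  }
  return out;
}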
const auto& base_files = base_->files_[level]; @@ -1151,8 +1364,7 @@ class VersionSet::Builder { } }; -VersionSet::VersionSet(const std::string& dbname, - const Options* options, +VersionSet::VersionSet(const std::string& dbname, const Options* options, const EnvOptions& storage_options, TableCache* table_cache, const InternalKeyComparator* cmp) @@ -1169,11 +1381,12 @@ VersionSet::VersionSet(const std::string& dbname, num_levels_(options_->num_levels), dummy_versions_(this), current_(nullptr), + need_slowdown_for_num_level0_files_(false), compactions_in_progress_(options_->num_levels), current_version_number_(0), - last_observed_manifest_size_(0), + manifest_file_size_(0), storage_options_(storage_options), - storage_options_compactions_(storage_options_) { + storage_options_compactions_(storage_options_) { compact_pointer_ = new std::string[options_->num_levels]; Init(options_->num_levels); AppendVersion(new Version(this, current_version_number_++)); @@ -1220,6 +1433,9 @@ void VersionSet::AppendVersion(Version* v) { current_->Unref(); } current_ = v; + need_slowdown_for_num_level0_files_ = + (options_->level0_slowdown_writes_trigger >= 0 && current_ != nullptr && + v->NumLevelFiles(0) >= options_->level0_slowdown_writes_trigger); v->Ref(); // Append to linked list @@ -1229,10 +1445,8 @@ void VersionSet::AppendVersion(Version* v) { v->next_->prev_ = v; } -Status VersionSet::LogAndApply( - VersionEdit* edit, - port::Mutex* mu, - bool new_descriptor_log) { +Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu, + bool new_descriptor_log) { mu->AssertHeld(); // queue our request @@ -1270,7 +1484,7 @@ Status VersionSet::LogAndApply( // No need to perform this check if a new Manifest is being created anyways. if (!descriptor_log_ || - last_observed_manifest_size_ > options_->max_manifest_file_size) { + manifest_file_size_ > options_->max_manifest_file_size) { new_descriptor_log = true; manifest_file_number_ = NewFileNumber(); // Change manifest file no. } @@ -1284,7 +1498,7 @@ Status VersionSet::LogAndApply( // because &w is ensuring that all new writes get queued. { // calculate the amount of data being compacted at every level - std::vector size_being_compacted(NumberLevels()-1); + std::vector size_being_compacted(v->NumberLevels() - 1); SizeBeingCompacted(size_being_compacted); mu->Unlock(); @@ -1310,8 +1524,8 @@ Status VersionSet::LogAndApply( // The calls to Finalize and UpdateFilesBySize are cpu-heavy // and is best called outside the mutex. 
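// The locking pattern that comment refers to, in outline (assumed shape of
// the surrounding LogAndApply code; see the hunk that follows):
//   mu->Unlock();                       // drop the DB mutex
//   v->Finalize(size_being_compacted);  // cpu-heavy scoring, no lock needed
//   v->UpdateFilesBySize();             // cpu-heavy sort, no lock needed
//   ... write the new record to the MANIFEST log ...
//   mu->Lock();                         // retake it before AppendVersion(v)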
- Finalize(v, size_being_compacted); - UpdateFilesBySize(v); + v->Finalize(size_being_compacted); + v->UpdateFilesBySize(); // Write new record to MANIFEST log if (s.ok()) { @@ -1367,14 +1581,11 @@ Status VersionSet::LogAndApply( LogFlush(options_->info_log); mu->Lock(); - // cache the manifest_file_size so that it can be used to rollover in the - // next call to LogAndApply - last_observed_manifest_size_ = new_manifest_file_size; } // Install the new version if (s.ok()) { - v->offset_manifest_file_ = new_manifest_file_size; + manifest_file_size_ = new_manifest_file_size; AppendVersion(v); log_number_ = edit->log_number_; prev_log_number_ = edit->prev_log_number_; @@ -1408,7 +1619,7 @@ Status VersionSet::LogAndApply( } void VersionSet::LogAndApplyHelper(Builder* builder, Version* v, - VersionEdit* edit, port::Mutex* mu) { + VersionEdit* edit, port::Mutex* mu) { mu->AssertHeld(); if (edit->has_log_number_) { @@ -1486,21 +1697,28 @@ Status VersionSet::Recover() { Slice record; std::string scratch; while (reader.ReadRecord(&record, &scratch) && s.ok()) { - VersionEdit edit(NumberLevels()); + VersionEdit edit; s = edit.DecodeFrom(record); - if (s.ok()) { - if (edit.has_comparator_ && - edit.comparator_ != icmp_.user_comparator()->Name()) { - s = Status::InvalidArgument(icmp_.user_comparator()->Name(), - "does not match existing comparator " + - edit.comparator_); - } + if (!s.ok()) { + break; } - if (s.ok()) { - builder.Apply(&edit); + if (edit.max_level_ >= current_->NumberLevels()) { + s = Status::InvalidArgument( + "db has more levels than options.num_levels"); + break; } + if (edit.has_comparator_ && + edit.comparator_ != icmp_.user_comparator()->Name()) { + s = Status::InvalidArgument(icmp_.user_comparator()->Name(), + "does not match existing comparator " + + edit.comparator_); + break; + } + + builder.Apply(&edit); + if (edit.has_log_number_) { log_number = edit.log_number_; have_log_number = true; @@ -1545,11 +1763,11 @@ Status VersionSet::Recover() { builder.SaveTo(v); // Install recovered version - std::vector size_being_compacted(NumberLevels()-1); + std::vector size_being_compacted(v->NumberLevels() - 1); SizeBeingCompacted(size_being_compacted); - Finalize(v, size_being_compacted); + v->Finalize(size_being_compacted); - v->offset_manifest_file_ = manifest_file_size; + manifest_file_size_ = manifest_file_size; AppendVersion(v); manifest_file_number_ = next_file; next_file_number_ = next_file + 1; @@ -1573,7 +1791,7 @@ Status VersionSet::Recover() { } Status VersionSet::DumpManifest(Options& options, std::string& dscname, - bool verbose, bool hex) { + bool verbose, bool hex) { struct LogReporter : public log::Reader::Reporter { Status* status; virtual void Corruption(size_t bytes, const Status& s) { @@ -1607,7 +1825,7 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, Slice record; std::string scratch; while (reader.ReadRecord(&record, &scratch) && s.ok()) { - VersionEdit edit(NumberLevels()); + VersionEdit edit; s = edit.DecodeFrom(record); if (s.ok()) { if (edit.has_comparator_ && @@ -1677,9 +1895,9 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, builder.SaveTo(v); // Install recovered version - std::vector size_being_compacted(NumberLevels()-1); + std::vector size_being_compacted(v->NumberLevels() - 1); SizeBeingCompacted(size_being_compacted); - Finalize(v, size_being_compacted); + v->Finalize(size_being_compacted); AppendVersion(v); manifest_file_number_ = next_file; @@ -1707,162 +1925,11 @@ void 
VersionSet::MarkFileNumberUsed(uint64_t number) { } } -void VersionSet::Finalize(Version* v, - std::vector& size_being_compacted) { - // Pre-sort level0 for Get() - if (options_->compaction_style == kCompactionStyleUniversal) { - std::sort(v->files_[0].begin(), v->files_[0].end(), NewestFirstBySeqNo); - } else { - std::sort(v->files_[0].begin(), v->files_[0].end(), NewestFirst); - } - - double max_score = 0; - int max_score_level = 0; - - int num_levels_to_check = - (options_->compaction_style != kCompactionStyleUniversal) ? - NumberLevels() - 1 : 1; - - for (int level = 0; level < num_levels_to_check; level++) { - - double score; - if (level == 0) { - // We treat level-0 specially by bounding the number of files - // instead of number of bytes for two reasons: - // - // (1) With larger write-buffer sizes, it is nice not to do too - // many level-0 compactions. - // - // (2) The files in level-0 are merged on every read and - // therefore we wish to avoid too many files when the individual - // file size is small (perhaps because of a small write-buffer - // setting, or very high compression ratios, or lots of - // overwrites/deletions). - int numfiles = 0; - for (unsigned int i = 0; i < v->files_[level].size(); i++) { - if (!v->files_[level][i]->being_compacted) { - numfiles++; - } - } - - // If we are slowing down writes, then we better compact that first - if (numfiles >= options_->level0_stop_writes_trigger) { - score = 1000000; - // Log(options_->info_log, "XXX score l0 = 1000000000 max"); - } else if (numfiles >= options_->level0_slowdown_writes_trigger) { - score = 10000; - // Log(options_->info_log, "XXX score l0 = 1000000 medium"); - } else { - score = numfiles / - static_cast(options_->level0_file_num_compaction_trigger); - if (score >= 1) { - // Log(options_->info_log, "XXX score l0 = %d least", (int)score); - } - } - } else { - // Compute the ratio of current size to size limit. - const uint64_t level_bytes = TotalFileSize(v->files_[level]) - - size_being_compacted[level]; - score = static_cast(level_bytes) / MaxBytesForLevel(level); - if (score > 1) { - // Log(options_->info_log, "XXX score l%d = %d ", level, (int)score); - } - if (max_score < score) { - max_score = score; - max_score_level = level; - } - } - v->compaction_level_[level] = level; - v->compaction_score_[level] = score; - } - - // update the max compaction score in levels 1 to n-1 - v->max_compaction_score_ = max_score; - v->max_compaction_score_level_ = max_score_level; - - // sort all the levels based on their score. Higher scores get listed - // first. Use bubble sort because the number of entries are small. 
- for (int i = 0; i < NumberLevels()-2; i++) { - for (int j = i+1; j < NumberLevels()-1; j++) { - if (v->compaction_score_[i] < v->compaction_score_[j]) { - double score = v->compaction_score_[i]; - int level = v->compaction_level_[i]; - v->compaction_score_[i] = v->compaction_score_[j]; - v->compaction_level_[i] = v->compaction_level_[j]; - v->compaction_score_[j] = score; - v->compaction_level_[j] = level; - } - } - } -} - -// A static compator used to sort files based on their size -// In normal mode: descending size -static bool compareSizeDescending(const VersionSet::Fsize& first, - const VersionSet::Fsize& second) { - return (first.file->file_size > second.file->file_size); -} -// A static compator used to sort files based on their seqno -// In universal style : descending seqno -static bool compareSeqnoDescending(const VersionSet::Fsize& first, - const VersionSet::Fsize& second) { - if (first.file->smallest_seqno > second.file->smallest_seqno) { - assert(first.file->largest_seqno > second.file->largest_seqno); - return true; - } - assert(first.file->largest_seqno <= second.file->largest_seqno); - return false; -} - -// sort all files in level1 to level(n-1) based on file size -void VersionSet::UpdateFilesBySize(Version* v) { - - // No need to sort the highest level because it is never compacted. - int max_level = (options_->compaction_style == kCompactionStyleUniversal) ? - NumberLevels() : NumberLevels() - 1; - - for (int level = 0; level < max_level; level++) { - - const std::vector& files = v->files_[level]; - std::vector& files_by_size = v->files_by_size_[level]; - assert(files_by_size.size() == 0); - - // populate a temp vector for sorting based on size - std::vector temp(files.size()); - for (unsigned int i = 0; i < files.size(); i++) { - temp[i].index = i; - temp[i].file = files[i]; - } - - // sort the top number_of_files_to_sort_ based on file size - if (options_->compaction_style == kCompactionStyleUniversal) { - int num = temp.size(); - std::partial_sort(temp.begin(), temp.begin() + num, - temp.end(), compareSeqnoDescending); - } else { - int num = Version::number_of_files_to_sort_; - if (num > (int)temp.size()) { - num = temp.size(); - } - std::partial_sort(temp.begin(), temp.begin() + num, - temp.end(), compareSizeDescending); - } - assert(temp.size() == files.size()); - - // initialize files_by_size_ - for (unsigned int i = 0; i < temp.size(); i++) { - files_by_size.push_back(temp[i].index); - } - v->next_file_to_compact_by_size_[level] = 0; - assert(v->files_[level].size() == v->files_by_size_[level].size()); - } -} - Status VersionSet::WriteSnapshot(log::Writer* log) { // TODO: Break up into multiple records to reduce memory usage on recovery? 
// Save metadata - VersionEdit edit(NumberLevels()); + VersionEdit edit; edit.SetComparatorName(icmp_.user_comparator()->Name()); // Save compaction pointers @@ -1875,7 +1942,7 @@ Status VersionSet::WriteSnapshot(log::Writer* log) { } // Save files - for (int level = 0; level < NumberLevels(); level++) { + for (int level = 0; level < current_->NumberLevels(); level++) { const auto& files = current_->files_[level]; for (size_t i = 0; i < files.size(); i++) { const auto f = files[i]; @@ -1889,61 +1956,6 @@ Status VersionSet::WriteSnapshot(log::Writer* log) { return log->AddRecord(record); } -int VersionSet::NumLevelFiles(int level) const { - assert(level >= 0); - assert(level < NumberLevels()); - return current_->files_[level].size(); -} - -const char* VersionSet::LevelSummary(LevelSummaryStorage* scratch) const { - int len = snprintf(scratch->buffer, sizeof(scratch->buffer), "files["); - for (int i = 0; i < NumberLevels(); i++) { - int sz = sizeof(scratch->buffer) - len; - int ret = snprintf(scratch->buffer + len, sz, "%d ", - int(current_->files_[i].size())); - if (ret < 0 || ret >= sz) - break; - len += ret; - } - snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, "]"); - return scratch->buffer; -} - -const char* VersionSet::LevelDataSizeSummary( - LevelSummaryStorage* scratch) const { - int len = snprintf(scratch->buffer, sizeof(scratch->buffer), "files_size["); - for (int i = 0; i < NumberLevels(); i++) { - int sz = sizeof(scratch->buffer) - len; - int ret = snprintf(scratch->buffer + len, sz, "%lu ", - (unsigned long)NumLevelBytes(i)); - if (ret < 0 || ret >= sz) - break; - len += ret; - } - snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, "]"); - return scratch->buffer; -} - -const char* VersionSet::LevelFileSummary( - FileSummaryStorage* scratch, int level) const { - int len = snprintf(scratch->buffer, sizeof(scratch->buffer), "files_size["); - for (unsigned int i = 0; i < current_->files_[level].size(); i++) { - FileMetaData* f = current_->files_[level][i]; - int sz = sizeof(scratch->buffer) - len; - int ret = snprintf(scratch->buffer + len, sz, - "#%lu(seq=%lu,sz=%lu,%lu) ", - (unsigned long)f->number, - (unsigned long)f->smallest_seqno, - (unsigned long)f->file_size, - (unsigned long)f->being_compacted); - if (ret < 0 || ret >= sz) - break; - len += ret; - } - snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, "]"); - return scratch->buffer; -} - // Opens the mainfest file and reads all records // till it finds the record we are looking for. 
bool VersionSet::ManifestContains(const std::string& record) const { @@ -1975,7 +1987,7 @@ bool VersionSet::ManifestContains(const std::string& record) const { uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) { uint64_t result = 0; - for (int level = 0; level < NumberLevels(); level++) { + for (int level = 0; level < v->NumberLevels(); level++) { const std::vector& files = v->files_[level]; for (size_t i = 0; i < files.size(); i++) { if (icmp_.Compare(files[i]->largest, ikey) <= 0) { @@ -2011,7 +2023,7 @@ void VersionSet::AddLiveFiles(std::vector* live_list) { for (Version* v = dummy_versions_.next_; v != &dummy_versions_; v = v->next_) { - for (int level = 0; level < NumberLevels(); level++) { + for (int level = 0; level < v->NumberLevels(); level++) { total_files += v->files_[level].size(); } } @@ -2022,7 +2034,7 @@ void VersionSet::AddLiveFiles(std::vector* live_list) { for (Version* v = dummy_versions_.next_; v != &dummy_versions_; v = v->next_) { - for (int level = 0; level < NumberLevels(); level++) { + for (int level = 0; level < v->NumberLevels(); level++) { for (const auto& f : v->files_[level]) { live_list->push_back(f->number); } @@ -2030,40 +2042,6 @@ void VersionSet::AddLiveFiles(std::vector* live_list) { } } -void VersionSet::AddLiveFilesCurrentVersion(std::set* live) { - Version* v = current_; - for (int level = 0; level < NumberLevels(); level++) { - const std::vector& files = v->files_[level]; - for (size_t i = 0; i < files.size(); i++) { - live->insert(files[i]->number); - } - } -} - -int64_t VersionSet::NumLevelBytes(int level) const { - assert(level >= 0); - assert(level < NumberLevels()); - assert(current_); - return TotalFileSize(current_->files_[level]); -} - -int64_t VersionSet::MaxNextLevelOverlappingBytes() { - uint64_t result = 0; - std::vector overlaps; - for (int level = 1; level < NumberLevels() - 1; level++) { - for (size_t i = 0; i < current_->files_[level].size(); i++) { - const FileMetaData* f = current_->files_[level][i]; - current_->GetOverlappingInputs(level+1, &f->smallest, &f->largest, - &overlaps); - const uint64_t sum = TotalFileSize(overlaps); - if (sum > result) { - result = sum; - } - } - } - return result; -} - // Stores the minimal range that covers all entries in inputs in // *smallest, *largest. // REQUIRES: inputs is not empty @@ -2223,7 +2201,7 @@ void VersionSet::ReleaseCompactionFiles(Compaction* c, Status status) { // The total size of files that are currently being compacted // at at every level upto the penultimate level. void VersionSet::SizeBeingCompacted(std::vector& sizes) { - for (int level = 0; level < NumberLevels()-1; level++) { + for (int level = 0; level < NumberLevels() - 1; level++) { uint64_t total = 0; for (std::set::iterator it = compactions_in_progress_[level].begin(); @@ -2246,8 +2224,8 @@ void VersionSet::SizeBeingCompacted(std::vector& sizes) { // base file (overrides configured values of file-size ratios, // min_merge_width and max_merge_width). // -Compaction* VersionSet::PickCompactionUniversalSizeAmp( - int level, double score) { +Compaction* VersionSet::PickCompactionUniversalSizeAmp(int level, + double score) { assert (level == 0); // percentage flexibilty while reducing size amplification @@ -2329,13 +2307,13 @@ Compaction* VersionSet::PickCompactionUniversalSizeAmp( // create a compaction request // We always compact all the files, so always compress. 
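// Sketch of the size-amplification trigger this universal-compaction path
// implements (assumed semantics: when the bytes a full compaction would
// rewrite exceed max_size_amplification_percent of the oldest file's size,
// compact everything; the helper name is illustrative):
#include <cstdint>
static bool NeedsSizeAmpCompactionSketch(uint64_t candidate_bytes,
                                         uint64_t oldest_file_bytes,
                                         unsigned max_size_amp_percent) {
  // e.g. 200 means the newer files may reach twice the oldest file's size
  // before a full compaction is forced.
  return candidate_bytes * 100 > oldest_file_bytes * max_size_amp_percent;
}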
- Compaction* c = new Compaction(level, level, MaxFileSizeForLevel(level), - LLONG_MAX, NumberLevels(), false, - true); + Compaction* c = + new Compaction(current_, level, level, MaxFileSizeForLevel(level), + LLONG_MAX, false, true); c->score_ = score; for (unsigned int loop = start_index; loop < file_by_time.size(); loop++) { int index = file_by_time[loop]; - f = current_->files_[level][index]; + f = c->input_version_->files_[level][index]; c->inputs_[0].push_back(f); Log(options_->info_log, "Universal: size amp picking file %lu[%d] with size %lu", @@ -2459,14 +2437,14 @@ Compaction* VersionSet::PickCompactionUniversalReadAmp( } } } - Compaction* c = new Compaction(level, level, MaxFileSizeForLevel(level), - LLONG_MAX, NumberLevels(), false, - enable_compression); + Compaction* c = + new Compaction(current_, level, level, MaxFileSizeForLevel(level), + LLONG_MAX, false, enable_compression); c->score_ = score; for (unsigned int i = start_index; i < first_index_after; i++) { int index = file_by_time[i]; - FileMetaData* f = current_->files_[level][index]; + FileMetaData* f = c->input_version_->files_[level][index]; c->inputs_[0].push_back(f); Log(options_->info_log, "Universal: Picking file %lu[%d] with size %lu\n", (unsigned long)f->number, @@ -2488,10 +2466,10 @@ Compaction* VersionSet::PickCompactionUniversal(int level, double score) { Log(options_->info_log, "Universal: nothing to do\n"); return nullptr; } - VersionSet::FileSummaryStorage tmp; + Version::FileSummaryStorage tmp; Log(options_->info_log, "Universal: candidate files(%lu): %s\n", current_->files_[level].size(), - LevelFileSummary(&tmp, 0)); + current_->LevelFileSummary(&tmp, 0)); // Check for size amplification first. Compaction* c = PickCompactionUniversalSizeAmp(level, score); @@ -2528,11 +2506,11 @@ Compaction* VersionSet::PickCompactionUniversal(int level, double score) { } // The files are sorted from newest first to oldest last. - std::vector& file_by_time = current_->files_by_size_[level]; + std::vector& file_by_time = c->input_version_->files_by_size_[level]; // Is the earliest file part of this compaction? int last_index = file_by_time[file_by_time.size()-1]; - FileMetaData* last_file = current_->files_[level][last_index]; + FileMetaData* last_file = c->input_version_->files_[level][last_index]; if (c->inputs_[0][c->inputs_[0].size()-1] == last_file) { c->bottommost_level_ = true; } @@ -2543,9 +2521,6 @@ Compaction* VersionSet::PickCompactionUniversal(int level, double score) { c->inputs_[0].size()); } - c->input_version_ = current_; - c->input_version_->Ref(); - // mark all the files that are being compacted c->MarkFilesBeingCompacted(true); @@ -2554,7 +2529,8 @@ Compaction* VersionSet::PickCompactionUniversal(int level, double score) { // Record whether this compaction includes all sst files. // For now, it is only relevant in universal compaction mode. 
- c->is_full_compaction_ = (c->inputs_[0].size() == current_->files_[0].size()); + c->is_full_compaction_ = + (c->inputs_[0].size() == c->input_version_->files_[0].size()); return c; } @@ -2571,27 +2547,28 @@ Compaction* VersionSet::PickCompactionBySize(int level, double score) { } assert(level >= 0); - assert(level+1 < NumberLevels()); - c = new Compaction(level, level+1, MaxFileSizeForLevel(level+1), - MaxGrandParentOverlapBytes(level), NumberLevels()); + assert(level + 1 < current_->NumberLevels()); + c = new Compaction(current_, level, level + 1, MaxFileSizeForLevel(level + 1), + MaxGrandParentOverlapBytes(level)); c->score_ = score; // Pick the largest file in this level that is not already // being compacted - std::vector& file_size = current_->files_by_size_[level]; + std::vector& file_size = c->input_version_->files_by_size_[level]; // record the first file that is not yet compacted int nextIndex = -1; - for (unsigned int i = current_->next_file_to_compact_by_size_[level]; + for (unsigned int i = c->input_version_->next_file_to_compact_by_size_[level]; i < file_size.size(); i++) { int index = file_size[i]; - FileMetaData* f = current_->files_[level][index]; + FileMetaData* f = c->input_version_->files_[level][index]; // check to verify files are arranged in descending size assert((i == file_size.size() - 1) || - (i >= Version::number_of_files_to_sort_-1) || - (f->file_size >= current_->files_[level][file_size[i+1]]->file_size)); + (i >= Version::number_of_files_to_sort_ - 1) || + (f->file_size >= + c->input_version_->files_[level][file_size[i + 1]]->file_size)); // do not pick a file to compact if it is being compacted // from n-1 level. @@ -2627,7 +2604,7 @@ Compaction* VersionSet::PickCompactionBySize(int level, double score) { } // store where to start the iteration in the next call to PickCompaction - current_->next_file_to_compact_by_size_[level] = nextIndex; + c->input_version_->next_file_to_compact_by_size_[level] = nextIndex; return c; } @@ -2640,7 +2617,7 @@ Compaction* VersionSet::PickCompaction() { // and also in LogAndApply(), otherwise the values could be stale. std::vector size_being_compacted(NumberLevels()-1); current_->vset_->SizeBeingCompacted(size_being_compacted); - Finalize(current_, size_being_compacted); + current_->Finalize(size_being_compacted); // In universal style of compaction, compact L0 files back into L0. if (options_->compaction_style == kCompactionStyleUniversal) { @@ -2678,11 +2655,12 @@ Compaction* VersionSet::PickCompaction() { if (level != 0 || compactions_in_progress_[0].empty()) { if(!ParentRangeInCompaction(&f->smallest, &f->largest, level, &parent_index)) { - c = new Compaction(level, level+1, MaxFileSizeForLevel(level+1), - MaxGrandParentOverlapBytes(level), NumberLevels(), true); + c = new Compaction(current_, level, level + 1, + MaxFileSizeForLevel(level + 1), + MaxGrandParentOverlapBytes(level), true); c->inputs_[0].push_back(f); c->parent_index_ = parent_index; - current_->file_to_compact_ = nullptr; + c->input_version_->file_to_compact_ = nullptr; ExpandWhileOverlapping(c); } } @@ -2692,9 +2670,6 @@ Compaction* VersionSet::PickCompaction() { return nullptr; } - c->input_version_ = current_; - c->input_version_->Ref(); - // Two level 0 compaction won't run at the same time, so don't need to worry // about files on level 0 being compacted. if (level == 0) { @@ -2705,7 +2680,8 @@ Compaction* VersionSet::PickCompaction() { // c->inputs_[0] earlier and replace it with an overlapping set // which will include the picked file. 
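// ---------------------------------------------------------------------------
// [Editor's aside -- sketch, not part of this patch; the hunk resumes below.]
// PickCompactionBySize above walks files_by_size_ (file indices ordered by
// decreasing file size) starting at the persisted cursor
// next_file_to_compact_by_size_, and takes the first file that is not busy.
// A simplified restatement with stand-in types (the real loop also rejects
// files whose parent-level range is already being compacted and records the
// resume cursor):
#include <vector>

struct FileMetaStandIn {
  unsigned long file_size;
  bool being_compacted;
};

// Returns the files_ index of the largest idle file, or -1 if none.
int PickLargestIdleFile(const std::vector<FileMetaStandIn>& files,
                        const std::vector<int>& files_by_size,
                        size_t cursor /* next_file_to_compact_by_size_ */) {
  for (size_t i = cursor; i < files_by_size.size(); i++) {
    const int index = files_by_size[i];
    if (!files[index].being_compacted) {
      return index;
    }
  }
  return -1;
}
// [End of aside.]
// ---------------------------------------------------------------------------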
c->inputs_[0].clear(); - current_->GetOverlappingInputs(0, &smallest, &largest, &c->inputs_[0]); + c->input_version_->GetOverlappingInputs(0, &smallest, &largest, + &c->inputs_[0]); // If we include more L0 files in the same compaction run it can // cause the 'smallest' and 'largest' key to get extended to a @@ -2736,11 +2712,13 @@ Compaction* VersionSet::PickCompaction() { // Returns true if any one of the parent files are being compacted bool VersionSet::ParentRangeInCompaction(const InternalKey* smallest, - const InternalKey* largest, int level, int* parent_index) { + const InternalKey* largest, int level, + int* parent_index) { std::vector inputs; + assert(level + 1 < current_->NumberLevels()); - current_->GetOverlappingInputs(level+1, smallest, largest, - &inputs, *parent_index, parent_index); + current_->GetOverlappingInputs(level + 1, smallest, largest, &inputs, + *parent_index, parent_index); return FilesInCompaction(inputs); } @@ -2788,8 +2766,8 @@ void VersionSet::ExpandWhileOverlapping(Compaction* c) { old_size = c->inputs_[0].size(); GetRange(c->inputs_[0], &smallest, &largest); c->inputs_[0].clear(); - current_->GetOverlappingInputs(level, &smallest, &largest, &c->inputs_[0], - hint_index, &hint_index); + c->input_version_->GetOverlappingInputs( + level, &smallest, &largest, &c->inputs_[0], hint_index, &hint_index); } while(c->inputs_[0].size() > old_size); // Get the new range @@ -2799,7 +2777,8 @@ void VersionSet::ExpandWhileOverlapping(Compaction* c) { // compaction, then we must drop/cancel this compaction. int parent_index = -1; if (FilesInCompaction(c->inputs_[0]) || - ParentRangeInCompaction(&smallest, &largest, level, &parent_index)) { + (c->level() != c->output_level() && + ParentRangeInCompaction(&smallest, &largest, level, &parent_index))) { c->inputs_[0].clear(); c->inputs_[1].clear(); delete c; @@ -2813,7 +2792,9 @@ void VersionSet::ExpandWhileOverlapping(Compaction* c) { // user-key with another file. void VersionSet::SetupOtherInputs(Compaction* c) { // If inputs are empty, then there is nothing to expand. - if (c->inputs_[0].empty()) { + // If both input and output levels are the same, no need to consider + // files at level "level+1" + if (c->inputs_[0].empty() || c->level() == c->output_level()) { return; } @@ -2824,8 +2805,9 @@ void VersionSet::SetupOtherInputs(Compaction* c) { GetRange(c->inputs_[0], &smallest, &largest); // Populate the set of next-level files (inputs_[1]) to include in compaction - current_->GetOverlappingInputs(level+1, &smallest, &largest, &c->inputs_[1], - c->parent_index_, &c->parent_index_); + c->input_version_->GetOverlappingInputs(level + 1, &smallest, &largest, + &c->inputs_[1], c->parent_index_, + &c->parent_index_); // Get entire range covered by compaction InternalKey all_start, all_limit; @@ -2838,8 +2820,8 @@ void VersionSet::SetupOtherInputs(Compaction* c) { // can happen when one user key spans multiple files. 
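// ---------------------------------------------------------------------------
// [Editor's aside -- sketch, not part of this patch; the hunk resumes below.]
// The level-0 re-pick above and ExpandWhileOverlapping() share one idea:
// because files on the same level may overlap (always true on L0), pulling a
// file into the compaction can widen the covered key range, which can pull in
// further files, so the input set must be grown to a fixed point. A compact
// restatement with integer keys (ExpandToFixedPoint and Range are
// hypothetical; the seed range is assumed to come from the level itself):
#include <algorithm>
#include <cstddef>
#include <vector>

struct Range {
  int smallest;
  int largest;
};

void ExpandToFixedPoint(const std::vector<Range>& level_files,
                        std::vector<Range>* picked) {
  std::size_t old_size;
  do {
    old_size = picked->size();
    int lo = picked->front().smallest;
    int hi = picked->front().largest;
    for (const Range& r : *picked) {  // current covered key range
      lo = std::min(lo, r.smallest);
      hi = std::max(hi, r.largest);
    }
    picked->clear();                  // re-pick everything that overlaps it
    for (const Range& r : level_files) {
      if (r.largest >= lo && r.smallest <= hi) {
        picked->push_back(r);
      }
    }
  } while (picked->size() > old_size);  // stop once no new file joined
}
// [End of aside.]
// ---------------------------------------------------------------------------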
if (!c->inputs_[1].empty()) { std::vector expanded0; - current_->GetOverlappingInputs(level, &all_start, &all_limit, &expanded0, - c->base_index_, nullptr); + c->input_version_->GetOverlappingInputs( + level, &all_start, &all_limit, &expanded0, c->base_index_, nullptr); const uint64_t inputs0_size = TotalFileSize(c->inputs_[0]); const uint64_t inputs1_size = TotalFileSize(c->inputs_[1]); const uint64_t expanded0_size = TotalFileSize(expanded0); @@ -2847,13 +2829,13 @@ void VersionSet::SetupOtherInputs(Compaction* c) { if (expanded0.size() > c->inputs_[0].size() && inputs1_size + expanded0_size < limit && !FilesInCompaction(expanded0) && - !current_->HasOverlappingUserKey(&expanded0, level)) { + !c->input_version_->HasOverlappingUserKey(&expanded0, level)) { InternalKey new_start, new_limit; GetRange(expanded0, &new_start, &new_limit); std::vector expanded1; - current_->GetOverlappingInputs(level+1, &new_start, &new_limit, - &expanded1, c->parent_index_, - &c->parent_index_); + c->input_version_->GetOverlappingInputs(level + 1, &new_start, &new_limit, + &expanded1, c->parent_index_, + &c->parent_index_); if (expanded1.size() == c->inputs_[1].size() && !FilesInCompaction(expanded1)) { Log(options_->info_log, @@ -2880,8 +2862,8 @@ void VersionSet::SetupOtherInputs(Compaction* c) { // Compute the set of grandparent files that overlap this compaction // (parent == level+1; grandparent == level+2) if (level + 2 < NumberLevels()) { - current_->GetOverlappingInputs(level + 2, &all_start, &all_limit, - &c->grandparents_); + c->input_version_->GetOverlappingInputs(level + 2, &all_start, &all_limit, + &c->grandparents_); } if (false) { @@ -2899,10 +2881,8 @@ void VersionSet::SetupOtherInputs(Compaction* c) { c->edit_->SetCompactPointer(level, largest); } -Status VersionSet::GetMetadataForFile( - uint64_t number, - int *filelevel, - FileMetaData **meta) { +Status VersionSet::GetMetadataForFile(uint64_t number, int* filelevel, + FileMetaData** meta) { for (int level = 0; level < NumberLevels(); level++) { const std::vector& files = current_->files_[level]; for (size_t i = 0; i < files.size(); i++) { @@ -2916,8 +2896,7 @@ Status VersionSet::GetMetadataForFile( return Status::NotFound("File not present in any level"); } -void VersionSet::GetLiveFilesMetaData( - std::vector * metadata) { +void VersionSet::GetLiveFilesMetaData(std::vector* metadata) { for (int level = 0; level < NumberLevels(); level++) { const std::vector& files = current_->files_[level]; for (size_t i = 0; i < files.size(); i++) { @@ -2941,11 +2920,13 @@ void VersionSet::GetObsoleteFiles(std::vector* files) { obsolete_files_.clear(); } -Compaction* VersionSet::CompactRange( - int level, - const InternalKey* begin, - const InternalKey* end) { +Compaction* VersionSet::CompactRange(int input_level, + int output_level, + const InternalKey* begin, + const InternalKey* end, + InternalKey** compaction_end) { std::vector inputs; + bool covering_the_whole_range = true; // All files are 'overlapping' in universal style compaction. // We have to compact the entire range in one shot. 
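A note before the remaining CompactRange() hunks: the reworked signature no longer promises to compact the whole requested range in one shot (outside universal style, per the comment above). Going by the new header comments later in this diff, a caller is expected to drive it in rounds, roughly as in the sketch below. VersionSet, Compaction and InternalKey are the types from this patch; RunCompactionToCompletion() is a hypothetical stand-in for the work DBImpl does with the returned compaction.

void CompactEntireRange(VersionSet* versions, int input_level,
                        int output_level, const InternalKey* begin,
                        const InternalKey* end) {
  InternalKey tmp_storage;  // caller owns the compaction_end storage
  while (true) {
    InternalKey* compaction_end = &tmp_storage;  // must be valid on entry
    Compaction* c = versions->CompactRange(input_level, output_level, begin,
                                           end, &compaction_end);
    if (c == nullptr) {
      return;  // nothing (left) to compact in the range
    }
    RunCompactionToCompletion(c);  // hypothetical helper
    if (compaction_end == nullptr) {
      return;  // this round covered the rest of the range
    }
    // tmp_storage now holds the next key that still needs compacting;
    // resume from there on the next round.
    begin = &tmp_storage;
  }
}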
@@ -2953,7 +2934,7 @@ Compaction* VersionSet::CompactRange( begin = nullptr; end = nullptr; } - current_->GetOverlappingInputs(level, begin, end, &inputs); + current_->GetOverlappingInputs(input_level, begin, end, &inputs); if (inputs.empty()) { return nullptr; } @@ -2962,24 +2943,24 @@ Compaction* VersionSet::CompactRange( // But we cannot do this for level-0 since level-0 files can overlap // and we must not pick one file and drop another older file if the // two files overlap. - if (level > 0) { - const uint64_t limit = MaxFileSizeForLevel(level) * - options_->source_compaction_factor; + if (input_level > 0) { + const uint64_t limit = + MaxFileSizeForLevel(input_level) * options_->source_compaction_factor; uint64_t total = 0; - for (size_t i = 0; i < inputs.size(); ++i) { + for (size_t i = 0; i + 1 < inputs.size(); ++i) { uint64_t s = inputs[i]->file_size; total += s; if (total >= limit) { + **compaction_end = inputs[i + 1]->smallest; + covering_the_whole_range = false; inputs.resize(i + 1); break; } } } - int out_level = (options_->compaction_style == kCompactionStyleUniversal) ? - level : level+1; - - Compaction* c = new Compaction(level, out_level, MaxFileSizeForLevel(out_level), - MaxGrandParentOverlapBytes(level), NumberLevels()); + Compaction* c = new Compaction(current_, input_level, output_level, + MaxFileSizeForLevel(output_level), + MaxGrandParentOverlapBytes(input_level)); c->inputs_[0] = inputs; ExpandWhileOverlapping(c); @@ -2988,10 +2969,12 @@ Compaction* VersionSet::CompactRange( return nullptr; } - c->input_version_ = current_; - c->input_version_->Ref(); SetupOtherInputs(c); + if (covering_the_whole_range) { + *compaction_end = nullptr; + } + // These files that are to be manaully compacted do not trample // upon other files because manual compactions are processed when // the system has a max of 1 background compaction thread. @@ -3002,191 +2985,4 @@ Compaction* VersionSet::CompactRange( return c; } -Compaction::Compaction(int level, int out_level, uint64_t target_file_size, - uint64_t max_grandparent_overlap_bytes, int number_levels, - bool seek_compaction, bool enable_compression) - : level_(level), - out_level_(out_level), - max_output_file_size_(target_file_size), - maxGrandParentOverlapBytes_(max_grandparent_overlap_bytes), - input_version_(nullptr), - number_levels_(number_levels), - seek_compaction_(seek_compaction), - enable_compression_(enable_compression), - grandparent_index_(0), - seen_key_(false), - overlapped_bytes_(0), - base_index_(-1), - parent_index_(-1), - score_(0), - bottommost_level_(false), - is_full_compaction_(false), - level_ptrs_(std::vector(number_levels)) { - edit_ = new VersionEdit(number_levels_); - for (int i = 0; i < number_levels_; i++) { - level_ptrs_[i] = 0; - } -} - -Compaction::~Compaction() { - delete edit_; - if (input_version_ != nullptr) { - input_version_->Unref(); - } -} - -bool Compaction::IsTrivialMove() const { - // Avoid a move if there is lots of overlapping grandparent data. - // Otherwise, the move could create a parent file that will require - // a very expensive merge later on. 
- return (num_input_files(0) == 1 && - num_input_files(1) == 0 && - TotalFileSize(grandparents_) <= maxGrandParentOverlapBytes_); -} - -void Compaction::AddInputDeletions(VersionEdit* edit) { - for (int which = 0; which < 2; which++) { - for (size_t i = 0; i < inputs_[which].size(); i++) { - edit->DeleteFile(level_ + which, inputs_[which][i]->number); - } - } -} - -bool Compaction::IsBaseLevelForKey(const Slice& user_key) { - if (input_version_->vset_->options_->compaction_style == - kCompactionStyleUniversal) { - return bottommost_level_; - } - // Maybe use binary search to find right entry instead of linear search? - const Comparator* user_cmp = input_version_->vset_->icmp_.user_comparator(); - for (int lvl = level_ + 2; lvl < number_levels_; lvl++) { - const std::vector& files = input_version_->files_[lvl]; - for (; level_ptrs_[lvl] < files.size(); ) { - FileMetaData* f = files[level_ptrs_[lvl]]; - if (user_cmp->Compare(user_key, f->largest.user_key()) <= 0) { - // We've advanced far enough - if (user_cmp->Compare(user_key, f->smallest.user_key()) >= 0) { - // Key falls in this file's range, so definitely not base level - return false; - } - break; - } - level_ptrs_[lvl]++; - } - } - return true; -} - -bool Compaction::ShouldStopBefore(const Slice& internal_key) { - // Scan to find earliest grandparent file that contains key. - const InternalKeyComparator* icmp = &input_version_->vset_->icmp_; - while (grandparent_index_ < grandparents_.size() && - icmp->Compare(internal_key, - grandparents_[grandparent_index_]->largest.Encode()) > 0) { - if (seen_key_) { - overlapped_bytes_ += grandparents_[grandparent_index_]->file_size; - } - assert(grandparent_index_ + 1 >= grandparents_.size() || - icmp->Compare(grandparents_[grandparent_index_]->largest.Encode(), - grandparents_[grandparent_index_+1]->smallest.Encode()) - < 0); - grandparent_index_++; - } - seen_key_ = true; - - if (overlapped_bytes_ > maxGrandParentOverlapBytes_) { - // Too much overlap for current output; start new output - overlapped_bytes_ = 0; - return true; - } else { - return false; - } -} - -// Mark (or clear) each file that is being compacted -void Compaction::MarkFilesBeingCompacted(bool value) { - for (int i = 0; i < 2; i++) { - std::vector v = inputs_[i]; - for (unsigned int j = 0; j < inputs_[i].size(); j++) { - assert(value ? !inputs_[i][j]->being_compacted : - inputs_[i][j]->being_compacted); - inputs_[i][j]->being_compacted = value; - } - } -} - -// Is this compaction producing files at the bottommost level? -void Compaction::SetupBottomMostLevel(bool isManual) { - if (input_version_->vset_->options_->compaction_style == - kCompactionStyleUniversal) { - // If universal compaction style is used and manual - // compaction is occuring, then we are guaranteed that - // all files will be picked in a single compaction - // run. We can safely set bottommost_level_ = true. - // If it is not manual compaction, then bottommost_level_ - // is already set when the Compaction was created. 
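// ---------------------------------------------------------------------------
// [Editor's aside, not part of this patch; the removed block continues
// below.] Context for IsBaseLevelForKey()/bottommost_level_ above: their
// payoff is in the compaction loop, which may drop a deletion tombstone
// outright when no level below can hold data it would still shadow --
// roughly (names are the usual compaction-loop locals, shown only for
// illustration):
//
//   bool drop = ikey.type == kTypeDeletion &&
//               ikey.sequence <= smallest_snapshot &&
//               compaction->IsBaseLevelForKey(ikey.user_key);
//
// ---------------------------------------------------------------------------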
- if (isManual) { - bottommost_level_ = true; - } - return; - } - bottommost_level_ = true; - int num_levels = input_version_->vset_->NumberLevels(); - for (int i = level() + 2; i < num_levels; i++) { - if (input_version_->vset_->NumLevelFiles(i) > 0) { - bottommost_level_ = false; - break; - } - } -} - -void Compaction::ReleaseInputs() { - if (input_version_ != nullptr) { - input_version_->Unref(); - input_version_ = nullptr; - } -} - -void Compaction::ResetNextCompactionIndex() { - input_version_->ResetNextCompactionIndex(level_); -} - -static void InputSummary(std::vector& files, - char* output, - int len) { - int write = 0; - for (unsigned int i = 0; i < files.size(); i++) { - int sz = len - write; - int ret = snprintf(output + write, sz, "%lu(%lu) ", - (unsigned long)files.at(i)->number, - (unsigned long)files.at(i)->file_size); - if (ret < 0 || ret >= sz) - break; - write += ret; - } -} - -void Compaction::Summary(char* output, int len) { - int write = snprintf(output, len, - "Base version %lu Base level %d, seek compaction:%d, inputs:", - (unsigned long)input_version_->GetVersionNumber(), - level_, - seek_compaction_); - if (write < 0 || write > len) { - return; - } - - char level_low_summary[100]; - InputSummary(inputs_[0], level_low_summary, sizeof(level_low_summary)); - char level_up_summary[100]; - if (inputs_[1].size()) { - InputSummary(inputs_[1], level_up_summary, sizeof(level_up_summary)); - } else { - level_up_summary[0] = '\0'; - } - - snprintf(output + write, len - write, "[%s],[%s]", - level_low_summary, level_up_summary); -} - } // namespace rocksdb diff --git a/db/version_set.h b/db/version_set.h index d5dc2cb13c87c69b04423365946516dae3473924..e059fc3afb78ec6f4f65c4178dad41c396065730 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -27,6 +27,7 @@ #include "db/version_edit.h" #include "port/port.h" #include "db/table_cache.h" +#include "db/compaction.h" namespace rocksdb { @@ -86,6 +87,11 @@ class Version { // REQUIRES: lock is held bool UpdateStats(const GetStats& stats); + // Updates internal structures that keep track of compaction scores + // We use compaction scores to figure out which compaction to do next + // Also pre-sorts level0 files for Get() + void Finalize(std::vector& size_being_compacted); + // Reference count management (so Versions do not disappear out from // under live iterators) void Ref(); @@ -137,15 +143,45 @@ class Version { int PickLevelForMemTableOutput(const Slice& smallest_user_key, const Slice& largest_user_key); - int NumFiles(int level) const { return files_[level].size(); } + int NumberLevels() const { return num_levels_; } + + // REQUIRES: lock is held + int NumLevelFiles(int level) const { return files_[level].size(); } + + // Return the combined file size of all files at the specified level. + int64_t NumLevelBytes(int level) const; + + // Return a human-readable short (single-line) summary of the number + // of files per level. Uses *scratch as backing store. + struct LevelSummaryStorage { + char buffer[100]; + }; + struct FileSummaryStorage { + char buffer[1000]; + }; + const char* LevelSummary(LevelSummaryStorage* scratch) const; + // Return a human-readable short (single-line) summary of files + // in a specified level. Uses *scratch as backing store. + const char* LevelFileSummary(FileSummaryStorage* scratch, int level) const; + + // Return the maximum overlapping data (in bytes) at next level for any + // file at a level >= 1. 
+ int64_t MaxNextLevelOverlappingBytes(); + + // Add all files listed in the current version to *live. + void AddLiveFiles(std::set* live); // Return a human readable string that describes this version's contents. std::string DebugString(bool hex = false) const; // Returns the version nuber of this version - uint64_t GetVersionNumber() { - return version_number_; - } + uint64_t GetVersionNumber() const { return version_number_; } + + // used to sort files by size + struct Fsize { + int index; + FileMetaData* file; + }; private: friend class Compaction; @@ -159,10 +195,15 @@ class Version { bool PrefixMayMatch(const ReadOptions& options, const EnvOptions& soptions, const Slice& internal_prefix, Iterator* level_iter) const; + // Sort all files for this version based on their file size and + // record results in files_by_size_. The largest files are listed first. + void UpdateFilesBySize(); + VersionSet* vset_; // VersionSet to which this Version belongs Version* next_; // Next version in linked list Version* prev_; // Previous version in linked list int refs_; // Number of live refs to this version + int num_levels_; // Number of levels // List of files per level, files in each level are arranged // in increasing order of keys @@ -199,9 +240,6 @@ class Version { double max_compaction_score_; // max score in l1 to ln-1 int max_compaction_score_level_; // level on which max score occurs - // The offset in the manifest file where this version is stored. - uint64_t offset_manifest_file_; - // A version number that uniquely represents this version. This is // used for debugging and logging purposes only. uint64_t version_number_; @@ -223,10 +261,8 @@ class Version { class VersionSet { public: - VersionSet(const std::string& dbname, - const Options* options, - const EnvOptions& storage_options, - TableCache* table_cache, + VersionSet(const std::string& dbname, const Options* options, + const EnvOptions& storage_options, TableCache* table_cache, const InternalKeyComparator*); ~VersionSet(); @@ -236,7 +272,7 @@ class VersionSet { // REQUIRES: *mu is held on entry. // REQUIRES: no other thread concurrently calls LogAndApply() Status LogAndApply(VersionEdit* edit, port::Mutex* mu, - bool new_descriptor_log = false); + bool new_descriptor_log = false); // Recover the last saved descriptor from persistent storage. Status Recover(); @@ -252,6 +288,12 @@ class VersionSet { // Return the current version. Version* current() const { return current_; } + // A Flag indicating whether write needs to slowdown because of there are + // too many number of level0 files. + bool NeedSlowdownForNumLevel0Files() const { + return need_slowdown_for_num_level0_files_; + } + // Return the current manifest file number uint64_t ManifestFileNumber() const { return manifest_file_number_; } @@ -267,12 +309,6 @@ class VersionSet { } } - // Return the number of Table files at the specified level. - int NumLevelFiles(int level) const; - - // Return the combined file size of all files at the specified level. - int64_t NumLevelBytes(int level) const; - // Return the last sequence number. uint64_t LastSequence() const { return last_sequence_.load(std::memory_order_acquire); @@ -306,14 +342,18 @@ class VersionSet { // the specified level. Returns nullptr if there is nothing in that // level that overlaps the specified range. Caller should delete // the result. 
- Compaction* CompactRange( - int level, - const InternalKey* begin, - const InternalKey* end); - - // Return the maximum overlapping data (in bytes) at next level for any - // file at a level >= 1. - int64_t MaxNextLevelOverlappingBytes(); + // + // The returned Compaction might not include the whole requested range. + // In that case, compaction_end will be set to the next key that needs + // compacting. In case the compaction will compact the whole range, + // compaction_end will be set to nullptr. + // Client is responsible for compaction_end storage -- when called, + // *compaction_end should point to valid InternalKey! + Compaction* CompactRange(int input_level, + int output_level, + const InternalKey* begin, + const InternalKey* end, + InternalKey** compaction_end); // Create an iterator that reads over the compaction inputs for "*c". // The caller should delete the iterator when no longer needed. @@ -358,37 +398,16 @@ class VersionSet { // Add all files listed in any live version to *live. void AddLiveFiles(std::vector* live_list); - // Add all files listed in the current version to *live. - void AddLiveFilesCurrentVersion(std::set* live); - // Return the approximate offset in the database of the data for // "key" as of version "v". uint64_t ApproximateOffsetOf(Version* v, const InternalKey& key); - // Return a human-readable short (single-line) summary of the number - // of files per level. Uses *scratch as backing store. - struct LevelSummaryStorage { - char buffer[100]; - }; - struct FileSummaryStorage { - char buffer[1000]; - }; - const char* LevelSummary(LevelSummaryStorage* scratch) const; - // printf contents (for debugging) Status DumpManifest(Options& options, std::string& manifestFileName, bool verbose, bool hex = false); - // Return a human-readable short (single-line) summary of the data size - // of files per level. Uses *scratch as backing store. - const char* LevelDataSizeSummary(LevelSummaryStorage* scratch) const; - - // Return a human-readable short (single-line) summary of files - // in a specified level. Uses *scratch as backing store. - const char* LevelFileSummary(FileSummaryStorage* scratch, int level) const; - // Return the size of the current manifest file - const uint64_t ManifestFileSize() { return current_->offset_manifest_file_; } + uint64_t ManifestFileSize() const { return manifest_file_size_; } // For the specfied level, pick a compaction. // Returns nullptr if there is no compaction to be done. @@ -415,16 +434,6 @@ class VersionSet { // pick the same files to compact. bool VerifyCompactionFileConsistency(Compaction* c); - // used to sort files by size - typedef struct fsize { - int index; - FileMetaData* file; - } Fsize; - - // Sort all files for this version based on their file size and - // record results in files_by_size_. The largest files are listed first. - void UpdateFilesBySize(Version *v); - // Get the max file size in a given level. uint64_t MaxFileSizeForLevel(int level); @@ -447,8 +456,6 @@ class VersionSet { void Init(int num_levels); - void Finalize(Version* v, std::vector&); - void GetRange(const std::vector& inputs, InternalKey* smallest, InternalKey* largest); @@ -491,6 +498,10 @@ class VersionSet { Version dummy_versions_; // Head of circular doubly-linked list of versions. 
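// ---------------------------------------------------------------------------
// [Editor's aside, not part of this patch.] The flag declared a few lines
// below caches the level-0 check so the write path reads one bool via
// NeedSlowdownForNumLevel0Files() instead of re-counting files on every
// write. Presumably it is recomputed whenever a new version is installed,
// along the lines of:
//
//   need_slowdown_for_num_level0_files_ =
//       options_->level0_slowdown_writes_trigger >= 0 &&
//       current_->NumLevelFiles(0) >=
//           options_->level0_slowdown_writes_trigger;
//
// ---------------------------------------------------------------------------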
Version* current_; // == dummy_versions_.prev_ + // A flag indicating whether we should delay writes because + // we have too many level 0 files + bool need_slowdown_for_num_level0_files_; + // Per-level key at which the next compaction at that level should start. // Either an empty string, or a valid InternalKey. std::string* compact_pointer_; @@ -510,9 +521,8 @@ class VersionSet { // Queue of writers to the manifest file std::deque manifest_writers_; - // Store the manifest file size when it is checked. - // Save us the cost of checking file size twice in LogAndApply - uint64_t last_observed_manifest_size_; + // Current size of manifest file + uint64_t manifest_file_size_; std::vector obsolete_files_; @@ -542,118 +552,4 @@ class VersionSet { VersionEdit* edit, port::Mutex* mu); }; -// A Compaction encapsulates information about a compaction. -class Compaction { - public: - ~Compaction(); - - // Return the level that is being compacted. Inputs from "level" - // will be merged. - int level() const { return level_; } - - // Outputs will go to this level - int output_level() const { return out_level_; } - - // Return the object that holds the edits to the descriptor done - // by this compaction. - VersionEdit* edit() { return edit_; } - - // "which" must be either 0 or 1 - int num_input_files(int which) const { return inputs_[which].size(); } - - // Return the ith input file at "level()+which" ("which" must be 0 or 1). - FileMetaData* input(int which, int i) const { return inputs_[which][i]; } - - // Maximum size of files to build during this compaction. - uint64_t MaxOutputFileSize() const { return max_output_file_size_; } - - // Whether compression will be enabled for compaction outputs - bool enable_compression() const { return enable_compression_; } - - // Is this a trivial compaction that can be implemented by just - // moving a single input file to the next level (no merging or splitting) - bool IsTrivialMove() const; - - // Add all inputs to this compaction as delete operations to *edit. - void AddInputDeletions(VersionEdit* edit); - - // Returns true if the information we have available guarantees that - // the compaction is producing data in "level+1" for which no data exists - // in levels greater than "level+1". - bool IsBaseLevelForKey(const Slice& user_key); - - // Returns true iff we should stop building the current output - // before processing "internal_key". - bool ShouldStopBefore(const Slice& internal_key); - - // Release the input version for the compaction, once the compaction - // is successful. - void ReleaseInputs(); - - void Summary(char* output, int len); - - // Return the score that was used to pick this compaction run. - double score() const { return score_; } - - // Is this compaction creating a file in the bottom most level? - bool BottomMostLevel() { return bottommost_level_; } - - // Does this compaction include all sst files? 
- bool IsFullCompaction() { return is_full_compaction_; } - - private: - friend class Version; - friend class VersionSet; - - explicit Compaction(int level, int out_level, uint64_t target_file_size, - uint64_t max_grandparent_overlap_bytes, int number_levels, - bool seek_compaction = false, bool enable_compression = true); - - int level_; - int out_level_; // levels to which output files are stored - uint64_t max_output_file_size_; - uint64_t maxGrandParentOverlapBytes_; - Version* input_version_; - VersionEdit* edit_; - int number_levels_; - - bool seek_compaction_; - bool enable_compression_; - - // Each compaction reads inputs from "level_" and "level_+1" - std::vector inputs_[2]; // The two sets of inputs - - // State used to check for number of of overlapping grandparent files - // (parent == level_ + 1, grandparent == level_ + 2) - std::vector grandparents_; - size_t grandparent_index_; // Index in grandparent_starts_ - bool seen_key_; // Some output key has been seen - uint64_t overlapped_bytes_; // Bytes of overlap between current output - // and grandparent files - int base_index_; // index of the file in files_[level_] - int parent_index_; // index of some file with same range in files_[level_+1] - double score_; // score that was used to pick this compaction. - - // Is this compaction creating a file in the bottom most level? - bool bottommost_level_; - // Does this compaction include all sst files? - bool is_full_compaction_; - - // level_ptrs_ holds indices into input_version_->levels_: our state - // is that we are positioned at one of the file ranges for each - // higher level than the ones involved in this compaction (i.e. for - // all L >= level_ + 2). - std::vector level_ptrs_; - - // mark (or clear) all files that are being compacted - void MarkFilesBeingCompacted(bool); - - // Initialize whether compaction producing files at the bottommost level - void SetupBottomMostLevel(bool isManual); - - // In case of compaction error, reset the nextIndex that is used - // to pick up the next file to be compacted from files_by_size_ - void ResetNextCompactionIndex(); -}; - } // namespace rocksdb diff --git a/db/version_set_reduce_num_levels.cc b/db/version_set_reduce_num_levels.cc index d13a4aed914230ae2525ee2f0b436f4e27908fe1..2ca689809dd581452e90955351cfec3a14d462be 100644 --- a/db/version_set_reduce_num_levels.cc +++ b/db/version_set_reduce_num_levels.cc @@ -25,7 +25,7 @@ Status VersionSet::ReduceNumberOfLevels(int new_levels, port::Mutex* mu) { } Version* current_version = current_; - int current_levels = NumberLevels(); + int current_levels = current_version->NumberLevels(); if (current_levels <= new_levels) { return Status::OK(); @@ -36,7 +36,7 @@ Status VersionSet::ReduceNumberOfLevels(int new_levels, port::Mutex* mu) { int first_nonempty_level = -1; int first_nonempty_level_filenum = 0; for (int i = new_levels - 1; i < current_levels; i++) { - int file_num = NumLevelFiles(i); + int file_num = current_version->NumLevelFiles(i); if (file_num != 0) { if (first_nonempty_level < 0) { first_nonempty_level = i; @@ -65,6 +65,7 @@ Status VersionSet::ReduceNumberOfLevels(int new_levels, port::Mutex* mu) { delete[] current_version->files_; current_version->files_ = new_files_list; + current_version->num_levels_ = new_levels; delete[] compact_pointer_; delete[] max_file_size_; @@ -72,8 +73,8 @@ Status VersionSet::ReduceNumberOfLevels(int new_levels, port::Mutex* mu) { num_levels_ = new_levels; compact_pointer_ = new std::string[new_levels]; Init(new_levels); - VersionEdit 
ve(new_levels); - st = LogAndApply(&ve , mu, true); + VersionEdit ve; + st = LogAndApply(&ve, mu, true); return st; } diff --git a/db/write_batch.cc b/db/write_batch.cc index 4f8392d843c34dacd4306e42a415092e1b55523a..6a427f1a63c4d067353577b58247e4c03892a0c2 100644 --- a/db/write_batch.cc +++ b/db/write_batch.cc @@ -21,6 +21,7 @@ #include "rocksdb/write_batch.h" #include "rocksdb/options.h" +#include "rocksdb/merge_operator.h" #include "db/dbformat.h" #include "db/db_impl.h" #include "db/memtable.h" @@ -234,7 +235,62 @@ class MemTableInserter : public WriteBatch::Handler { } virtual void Merge(const Slice& key, const Slice& value) { - mem_->Add(sequence_, kTypeMerge, key, value); + bool perform_merge = false; + + if (options_->max_successive_merges > 0 && db_ != nullptr) { + LookupKey lkey(key, sequence_); + + // Count the number of successive merges at the head + // of the key in the memtable + size_t num_merges = mem_->CountSuccessiveMergeEntries(lkey); + + if (num_merges >= options_->max_successive_merges) { + perform_merge = true; + } + } + + if (perform_merge) { + // 1) Get the existing value + std::string get_value; + + // Pass in the sequence number so that we also include previous merge + // operations in the same batch. + SnapshotImpl read_from_snapshot; + read_from_snapshot.number_ = sequence_; + ReadOptions read_options; + read_options.snapshot = &read_from_snapshot; + + db_->Get(read_options, key, &get_value); + Slice get_value_slice = Slice(get_value); + + // 2) Apply this merge + auto merge_operator = options_->merge_operator.get(); + assert(merge_operator); + + std::deque operands; + operands.push_front(value.ToString()); + std::string new_value; + if (!merge_operator->FullMerge(key, + &get_value_slice, + operands, + &new_value, + options_->info_log.get())) { + // Failed to merge! 
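// ---------------------------------------------------------------------------
// [Editor's aside, not part of this patch; the failed-merge path continues
// right below.] What the new Merge() logic above buys: with
// max_successive_merges set, a key can accumulate at most that many stacked
// merge operands in the memtable before the write path folds them into a
// plain value, bounding how many operands a later Get() must replay.
// Enabling it is two option fields (the add-operator is only an
// illustration -- any MergeOperator your application provides works):
//
//   rocksdb::Options options;
//   options.merge_operator = MergeOperators::CreateUInt64AddOperator();
//   options.max_successive_merges = 5;  // fold after 5 stacked operands
//
// The trade-off is a Get() inside the write path each time the threshold is
// hit, which is presumably why the feature defaults to 0 (disabled).
// ---------------------------------------------------------------------------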
+ RecordTick(options_->statistics.get(), NUMBER_MERGE_FAILURES); + + // Store the delta in memtable + perform_merge = false; + } else { + // 3) Add value to memtable + mem_->Add(sequence_, kTypeValue, key, new_value); + } + } + + if (!perform_merge) { + // Add merge operator to memtable + mem_->Add(sequence_, kTypeMerge, key, value); + } + sequence_++; } virtual void Delete(const Slice& key) { diff --git a/db/write_batch_test.cc b/db/write_batch_test.cc index ff9aa63eec2c2a5944e72afcb8a3bcfeafa5283f..931d8f3f596cf563b9ebeecdebbad1cd98be46ab 100644 --- a/db/write_batch_test.cc +++ b/db/write_batch_test.cc @@ -22,10 +22,11 @@ namespace rocksdb { static std::string PrintContents(WriteBatch* b) { InternalKeyComparator cmp(BytewiseComparator()); auto factory = std::make_shared(); - MemTable* mem = new MemTable(cmp, factory.get()); + Options options; + options.memtable_factory = factory; + MemTable* mem = new MemTable(cmp, options); mem->Ref(); std::string state; - Options options; Status s = WriteBatchInternal::InsertInto(b, mem, &options); int count = 0; Iterator* iter = mem->NewIterator(); diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h index a3b18084a828656b620c404e8f25c3679ea0c644..bd22e191b5384272b5ab1a66cb5a934a077fd539 100644 --- a/include/rocksdb/c.h +++ b/include/rocksdb/c.h @@ -311,6 +311,7 @@ extern void rocksdb_cache_destroy(rocksdb_cache_t* cache); extern rocksdb_env_t* rocksdb_create_default_env(); extern void rocksdb_env_set_background_threads(rocksdb_env_t* env, int n); +extern void rocksdb_env_set_high_priority_background_threads(rocksdb_env_t* env, int n); extern void rocksdb_env_destroy(rocksdb_env_t*); /* Universal Compaction options */ diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h index dd17d9e9b5c62c7888ae4db676feb917c70a5a62..4bf095756cd0a7542be232ca496eb09dd09d696d 100644 --- a/include/rocksdb/db.h +++ b/include/rocksdb/db.h @@ -199,6 +199,7 @@ class DB { uint64_t* sizes) = 0; // Compact the underlying storage for the key range [*begin,*end]. + // The actual compaction interval might be superset of [*begin, *end]. // In particular, deleted and overwritten versions are discarded, // and the data is rearranged to reduce the cost of operations // needed to access the data. This operation should typically only diff --git a/include/rocksdb/memtablerep.h b/include/rocksdb/memtablerep.h index e24030ddcd1a655685eb37abfca468ae32cc391e..1a16a30cc2866d48eaf81165f817414540eeead3 100644 --- a/include/rocksdb/memtablerep.h +++ b/include/rocksdb/memtablerep.h @@ -111,27 +111,23 @@ class MemTableRep { }; // Return an iterator over the keys in this representation. - virtual std::shared_ptr GetIterator() = 0; + virtual Iterator* GetIterator() = 0; // Return an iterator over at least the keys with the specified user key. The // iterator may also allow access to other keys, but doesn't have to. Default: // GetIterator(). - virtual std::shared_ptr GetIterator(const Slice& user_key) { - return GetIterator(); - } + virtual Iterator* GetIterator(const Slice& user_key) { return GetIterator(); } // Return an iterator over at least the keys with the specified prefix. The // iterator may also allow access to other keys, but doesn't have to. Default: // GetIterator(). - virtual std::shared_ptr GetPrefixIterator(const Slice& prefix) { + virtual Iterator* GetPrefixIterator(const Slice& prefix) { return GetIterator(); } // Return an iterator that has a special Seek semantics. The result of // a Seek might only include keys with the same prefix as the target key. 
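// ---------------------------------------------------------------------------
// [Editor's aside, not part of this patch; the interface continues below.]
// Every iterator factory in this header now hands back a raw Iterator*
// instead of a std::shared_ptr, presumably to drop the shared_ptr
// control-block and refcount overhead on these hot paths. Ownership moves to
// the caller; a call site keeps exception safety by wrapping the result
// immediately (rep is a hypothetical MemTableRep*):
//
//   std::unique_ptr<MemTableRep::Iterator> iter(
//       rep->GetDynamicPrefixIterator());
//   for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
//     // ... examine iter->key() ...
//   }
//
// ---------------------------------------------------------------------------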
- virtual std::shared_ptr GetDynamicPrefixIterator() { - return GetIterator(); - } + virtual Iterator* GetDynamicPrefixIterator() { return GetIterator(); } protected: // When *key is an internal key concatenated with the value, returns the @@ -144,8 +140,8 @@ class MemTableRep { class MemTableRepFactory { public: virtual ~MemTableRepFactory() { }; - virtual std::shared_ptr CreateMemTableRep( - MemTableRep::KeyComparator&, Arena*) = 0; + virtual MemTableRep* CreateMemTableRep(MemTableRep::KeyComparator&, + Arena*) = 0; virtual const char* Name() const = 0; }; @@ -161,8 +157,8 @@ class VectorRepFactory : public MemTableRepFactory { const size_t count_; public: explicit VectorRepFactory(size_t count = 0) : count_(count) { } - virtual std::shared_ptr CreateMemTableRep( - MemTableRep::KeyComparator&, Arena*) override; + virtual MemTableRep* CreateMemTableRep(MemTableRep::KeyComparator&, + Arena*) override; virtual const char* Name() const override { return "VectorRepFactory"; } @@ -171,8 +167,8 @@ public: // This uses a skip list to store keys. It is the default. class SkipListFactory : public MemTableRepFactory { public: - virtual std::shared_ptr CreateMemTableRep( - MemTableRep::KeyComparator&, Arena*) override; + virtual MemTableRep* CreateMemTableRep(MemTableRep::KeyComparator&, + Arena*) override; virtual const char* Name() const override { return "SkipListFactory"; } @@ -206,8 +202,8 @@ class TransformRepFactory : public MemTableRepFactory { virtual ~TransformRepFactory() { delete transform_; } - virtual std::shared_ptr CreateMemTableRep( - MemTableRep::KeyComparator&, Arena*) override; + virtual MemTableRep* CreateMemTableRep(MemTableRep::KeyComparator&, + Arena*) override; virtual const char* Name() const override { return "TransformRepFactory"; @@ -247,8 +243,8 @@ public: : TransformRepFactory(prefix_extractor, bucket_count, num_locks) { } - virtual std::shared_ptr CreateMemTableRep( - MemTableRep::KeyComparator&, Arena*) override; + virtual MemTableRep* CreateMemTableRep(MemTableRep::KeyComparator&, + Arena*) override; virtual const char* Name() const override { return "PrefixHashRepFactory"; diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index c742d932d601440e8908196fa4eb1c06fba5821c..8499f60257f45b873552b5ab8a7332d122537876 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -44,15 +44,15 @@ using std::shared_ptr; enum CompressionType : char { // NOTE: do not change the values of existing entries, as these are // part of the persistent format on disk. 
- kNoCompression = 0x0, + kNoCompression = 0x0, kSnappyCompression = 0x1, kZlibCompression = 0x2, kBZip2Compression = 0x3 }; enum CompactionStyle : char { - kCompactionStyleLevel = 0x0, // level based compaction style - kCompactionStyleUniversal = 0x1 // Universal compaction style + kCompactionStyleLevel = 0x0, // level based compaction style + kCompactionStyleUniversal = 0x1 // Universal compaction style }; // Compression options for different compression algorithms like Zlib @@ -60,12 +60,9 @@ struct CompressionOptions { int window_bits; int level; int strategy; - CompressionOptions():window_bits(-14), - level(-1), - strategy(0){} - CompressionOptions(int wbits, int lev, int strategy):window_bits(wbits), - level(lev), - strategy(strategy){} + CompressionOptions() : window_bits(-14), level(-1), strategy(0) {} + CompressionOptions(int wbits, int lev, int strategy) + : window_bits(wbits), level(lev), strategy(strategy) {} }; // Options to control the behavior of a database (passed to DB::Open) @@ -218,7 +215,6 @@ struct Options { // Default: 16 int block_restart_interval; - // Compress blocks using the specified compression algorithm. This // parameter can be changed dynamically. // @@ -249,7 +245,7 @@ struct Options { // java/C api hard to construct. std::vector compression_per_level; - //different options for compression algorithms + // different options for compression algorithms CompressionOptions compression_opts; // If non-nullptr, use the specified filter policy to reduce disk reads. @@ -328,7 +324,6 @@ struct Options { // will be 20MB, total file size for level-2 will be 200MB, // and total file size for level-3 will be 2GB. - // by default 'max_bytes_for_level_base' is 10MB. uint64_t max_bytes_for_level_base; // by default 'max_bytes_for_level_base' is 10. @@ -486,10 +481,19 @@ struct Options { // order. int table_cache_remove_scan_count_limit; - // size of one block in arena memory allocation. - // If <= 0, a proper value is automatically calculated (usually 1/10 of + // Size of one block in arena memory allocation. + // + // If <= 0, a proper value is automatically calculated (usually about 1/10 of // writer_buffer_size). // + // There are two additonal restriction of the The specified size: + // (1) size should be in the range of [4096, 2 << 30] and + // (2) be the multiple of the CPU word (which helps with the memory + // alignment). + // + // We'll automatically check and adjust the size number to make sure it + // conforms to the restrictions. + // // Default: 0 size_t arena_block_size; @@ -574,7 +578,12 @@ struct Options { // Specify the file access pattern once a compaction is started. // It will be applied to all input files of a compaction. // Default: NORMAL - enum { NONE, NORMAL, SEQUENTIAL, WILLNEED } access_hint_on_compaction_start; + enum { + NONE, + NORMAL, + SEQUENTIAL, + WILLNEED + } access_hint_on_compaction_start; // Use adaptive mutex, which spins in the user space before resorting // to kernel. This could reduce context switch when the mutex is not @@ -641,7 +650,6 @@ struct Options { // Default: 10000, if inplace_update_support = true, else 0. size_t inplace_update_num_locks; - // * existing_value - pointer to previous value (from both memtable and sst). // nullptr if key doesn't exist // * existing_value_size - sizeof(existing_value). 0 if key doesn't exist @@ -681,6 +689,17 @@ struct Options { // number of hash probes per key uint32_t memtable_prefix_bloom_probes; + + // Maximum number of successive merge operations on a key in the memtable. 
+ // + // When a merge operation is added to the memtable and the maximum number of + // successive merges is reached, the value of the key will be calculated and + // inserted into the memtable instead of the merge operation. This will + // ensure that there are never more than max_successive_merges merge + // operations in the memtable. + // + // Default: 0 (disabled) + size_t max_successive_merges; }; // @@ -691,7 +710,7 @@ struct Options { // the block cache. It will not page in data from the OS cache or data that // resides in storage. enum ReadTier { - kReadAllTier = 0x0, // data in memtable, block cache, OS cache or storage + kReadAllTier = 0x0, // data in memtable, block cache, OS cache or storage kBlockCacheTier = 0x1 // data in memtable or block cache }; @@ -744,13 +763,14 @@ struct ReadOptions { prefix_seek(false), snapshot(nullptr), prefix(nullptr), - read_tier(kReadAllTier) { - } - ReadOptions(bool cksum, bool cache) : - verify_checksums(cksum), fill_cache(cache), - prefix_seek(false), snapshot(nullptr), prefix(nullptr), - read_tier(kReadAllTier) { - } + read_tier(kReadAllTier) {} + ReadOptions(bool cksum, bool cache) + : verify_checksums(cksum), + fill_cache(cache), + prefix_seek(false), + snapshot(nullptr), + prefix(nullptr), + read_tier(kReadAllTier) {} }; // Options that control write operations @@ -777,10 +797,7 @@ struct WriteOptions { // and the write may got lost after a crash. bool disableWAL; - WriteOptions() - : sync(false), - disableWAL(false) { - } + WriteOptions() : sync(false), disableWAL(false) {} }; // Options that control flush operations @@ -789,9 +806,7 @@ struct FlushOptions { // Default: true bool wait; - FlushOptions() - : wait(true) { - } + FlushOptions() : wait(true) {} }; } // namespace rocksdb diff --git a/include/rocksdb/write_batch.h b/include/rocksdb/write_batch.h index 1c6210fe9f913f4640d204611dda2ad0c1e3808c..2cfb731f63fb7e2111eedb8a582b20eea71f36da 100644 --- a/include/rocksdb/write_batch.h +++ b/include/rocksdb/write_batch.h @@ -88,7 +88,10 @@ class WriteBatch { Status Iterate(Handler* handler) const; // Retrieve the serialized version of this batch. - std::string Data() { return rep_; } + const std::string& Data() const { return rep_; } + + // Retrieve data size of the batch. + size_t GetDataSize() const { return rep_.size(); } // Returns the number of updates in the batch int Count() const; diff --git a/include/utilities/backupable_db.h b/include/utilities/backupable_db.h index 335e0285762af93419a04999cf99bafece37c01b..fbe2ae8a37e2e9dd65a1def8ff59fb82a6e4ecb8 100644 --- a/include/utilities/backupable_db.h +++ b/include/utilities/backupable_db.h @@ -31,6 +31,14 @@ struct BackupableDBOptions { // Default: nullptr Env* backup_env; + // If share_table_files == true, backup will assume that table files with + // same name have the same contents. This enables incremental backups and + // avoids unnecessary data copies. + // If share_table_files == false, each backup will be on its own and will + // not share any data with other backups. + // default: true + bool share_table_files; + // Backup info and error messages will be written to info_log // if non-nullptr. 
// Default: nullptr @@ -49,6 +57,7 @@ struct BackupableDBOptions { explicit BackupableDBOptions(const std::string& _backup_dir, Env* _backup_env = nullptr, + bool _share_table_files = true, Logger* _info_log = nullptr, bool _sync = true, bool _destroy_old_data = false) : @@ -93,6 +102,14 @@ class BackupableDB : public StackableDB { Status PurgeOldBackups(uint32_t num_backups_to_keep); // deletes a specific backup Status DeleteBackup(BackupID backup_id); + // Call this from another thread if you want to stop the backup + // that is currently happening. It will return immediatelly, will + // not wait for the backup to stop. + // The backup will stop ASAP and the call to CreateNewBackup will + // return Status::Incomplete(). It will not clean up after itself, but + // the state will remain consistent. The state will be cleaned up + // next time you create BackupableDB or RestoreBackupableDB. + void StopBackup(); private: BackupEngine* backup_engine_; @@ -108,9 +125,10 @@ class RestoreBackupableDB { void GetBackupInfo(std::vector* backup_info); // restore from backup with backup_id - // IMPORTANT -- if you restore from some backup that is not the latest, - // and you start creating new backups from the new DB, all the backups - // that were newer than the backup you restored from will be deleted + // IMPORTANT -- if options_.share_table_files == true and you restore DB + // from some backup that is not the latest, and you start creating new + // backups from the new DB, all the backups that were newer than the + // backup you restored from will be deleted // // Example: Let's say you have backups 1, 2, 3, 4, 5 and you restore 3. // If you try creating a new backup now, old backups 4 and 5 will be deleted diff --git a/table/table_test.cc b/table/table_test.cc index c6f7d227512a770796deb23714a63e9aace00c05..290efea0d9e3352b42308089e479835430c7f2e0 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -394,7 +394,9 @@ class MemTableConstructor: public Constructor { : Constructor(cmp), internal_comparator_(cmp), table_factory_(new SkipListFactory) { - memtable_ = new MemTable(internal_comparator_, table_factory_.get()); + Options options; + options.memtable_factory = table_factory_; + memtable_ = new MemTable(internal_comparator_, options); memtable_->Ref(); } ~MemTableConstructor() { @@ -402,7 +404,9 @@ class MemTableConstructor: public Constructor { } virtual Status FinishImpl(const Options& options, const KVMap& data) { delete memtable_->Unref(); - memtable_ = new MemTable(internal_comparator_, table_factory_.get()); + Options memtable_options; + memtable_options.memtable_factory = table_factory_; + memtable_ = new MemTable(internal_comparator_, memtable_options); memtable_->Ref(); int seq = 1; for (KVMap::const_iterator it = data.begin(); @@ -1381,10 +1385,11 @@ class MemTableTest { }; TEST(MemTableTest, Simple) { InternalKeyComparator cmp(BytewiseComparator()); auto table_factory = std::make_shared(); - MemTable* memtable = new MemTable(cmp, table_factory.get()); + Options options; + options.memtable_factory = table_factory; + MemTable* memtable = new MemTable(cmp, options); memtable->Ref(); WriteBatch batch; - Options options; WriteBatchInternal::SetSequence(&batch, 100); batch.Put(std::string("k1"), std::string("v1")); batch.Put(std::string("k2"), std::string("v2")); diff --git a/util/arena_impl.cc b/util/arena_impl.cc index d5c2a537e2af3c9b722703ade9c61e3291c4da10..5125e2364113fffa33bc195d5c90c7c7e1eab919 100644 --- a/util/arena_impl.cc +++ b/util/arena_impl.cc @@ -8,71 +8,86 @@ // 
found in the LICENSE file. See the AUTHORS file for names of contributors. #include "util/arena_impl.h" +#include namespace rocksdb { -ArenaImpl::ArenaImpl(size_t block_size) { - if (block_size < kMinBlockSize) { - block_size_ = kMinBlockSize; - } else if (block_size > kMaxBlockSize) { - block_size_ = kMaxBlockSize; - } else { - block_size_ = block_size; +const size_t ArenaImpl::kMinBlockSize = 4096; +const size_t ArenaImpl::kMaxBlockSize = 2 << 30; +static const int kAlignUnit = sizeof(void*); + +size_t OptimizeBlockSize(size_t block_size) { + // Make sure block_size is in optimal range + block_size = std::max(ArenaImpl::kMinBlockSize, block_size); + block_size = std::min(ArenaImpl::kMaxBlockSize, block_size); + + // make sure block_size is the multiple of kAlignUnit + if (block_size % kAlignUnit != 0) { + block_size = (1 + block_size / kAlignUnit) * kAlignUnit; } - blocks_memory_ = 0; - alloc_ptr_ = nullptr; // First allocation will allocate a block - alloc_bytes_remaining_ = 0; + return block_size; +} + +ArenaImpl::ArenaImpl(size_t block_size) + : kBlockSize(OptimizeBlockSize(block_size)) { + assert(kBlockSize >= kMinBlockSize && kBlockSize <= kMaxBlockSize && + kBlockSize % kAlignUnit == 0); } ArenaImpl::~ArenaImpl() { - for (size_t i = 0; i < blocks_.size(); i++) { - delete[] blocks_[i]; + for (const auto& block : blocks_) { + delete[] block; } } -char* ArenaImpl::AllocateFallback(size_t bytes) { - if (bytes > block_size_ / 4) { +char* ArenaImpl::AllocateFallback(size_t bytes, bool aligned) { + if (bytes > kBlockSize / 4) { // Object is more than a quarter of our block size. Allocate it separately // to avoid wasting too much space in leftover bytes. - char* result = AllocateNewBlock(bytes); - return result; + return AllocateNewBlock(bytes); } // We waste the remaining space in the current block. - alloc_ptr_ = AllocateNewBlock(block_size_); - alloc_bytes_remaining_ = block_size_; + auto block_head = AllocateNewBlock(kBlockSize); + alloc_bytes_remaining_ = kBlockSize - bytes; - char* result = alloc_ptr_; - alloc_ptr_ += bytes; - alloc_bytes_remaining_ -= bytes; - return result; + if (aligned) { + aligned_alloc_ptr_ = block_head + bytes; + unaligned_alloc_ptr_ = block_head + kBlockSize; + return block_head; + } else { + aligned_alloc_ptr_ = block_head; + unaligned_alloc_ptr_ = block_head + kBlockSize - bytes; + return unaligned_alloc_ptr_; + } } char* ArenaImpl::AllocateAligned(size_t bytes) { - const int align = sizeof(void*); // We'll align to pointer size - assert((align & (align-1)) == 0); // Pointer size should be a power of 2 - size_t current_mod = reinterpret_cast(alloc_ptr_) & (align-1); - size_t slop = (current_mod == 0 ? 0 : align - current_mod); + assert((kAlignUnit & (kAlignUnit - 1)) == + 0); // Pointer size should be a power of 2 + size_t current_mod = + reinterpret_cast(aligned_alloc_ptr_) & (kAlignUnit - 1); + size_t slop = (current_mod == 0 ? 
0 : kAlignUnit - current_mod); size_t needed = bytes + slop; char* result; if (needed <= alloc_bytes_remaining_) { - result = alloc_ptr_ + slop; - alloc_ptr_ += needed; + result = aligned_alloc_ptr_ + slop; + aligned_alloc_ptr_ += needed; alloc_bytes_remaining_ -= needed; } else { // AllocateFallback always returned aligned memory - result = AllocateFallback(bytes); + result = AllocateFallback(bytes, true /* aligned */); } - assert((reinterpret_cast(result) & (align-1)) == 0); + assert((reinterpret_cast(result) & (kAlignUnit - 1)) == 0); return result; } char* ArenaImpl::AllocateNewBlock(size_t block_bytes) { - char* result = new char[block_bytes]; + char* block = new char[block_bytes]; blocks_memory_ += block_bytes; - blocks_.push_back(result); - return result; + blocks_.push_back(block); + return block; } } // namespace rocksdb diff --git a/util/arena_impl.h b/util/arena_impl.h index b5a6842472a37b847b31006f40a5202e502762ec..538385ccc03845b29ef313fc84fe216ce5814400 100644 --- a/util/arena_impl.h +++ b/util/arena_impl.h @@ -22,49 +22,54 @@ namespace rocksdb { class ArenaImpl : public Arena { public: + // No copying allowed + ArenaImpl(const ArenaImpl&) = delete; + void operator=(const ArenaImpl&) = delete; + + static const size_t kMinBlockSize; + static const size_t kMaxBlockSize; + explicit ArenaImpl(size_t block_size = kMinBlockSize); virtual ~ArenaImpl(); - virtual char* Allocate(size_t bytes); + virtual char* Allocate(size_t bytes) override; - virtual char* AllocateAligned(size_t bytes); + virtual char* AllocateAligned(size_t bytes) override; // Returns an estimate of the total memory usage of data allocated - // by the arena (including space allocated but not yet used for user + // by the arena (exclude the space allocated but not yet used for future // allocations). - // - // TODO: Do we need to exclude space allocated but not used? virtual const size_t ApproximateMemoryUsage() { - return blocks_memory_ + blocks_.capacity() * sizeof(char*); + return blocks_memory_ + blocks_.capacity() * sizeof(char*) - + alloc_bytes_remaining_; } - virtual const size_t MemoryAllocatedBytes() { + virtual const size_t MemoryAllocatedBytes() override { return blocks_memory_; } private: - char* AllocateFallback(size_t bytes); - char* AllocateNewBlock(size_t block_bytes); - - static const size_t kMinBlockSize = 4096; - static const size_t kMaxBlockSize = 2 << 30; - // Number of bytes allocated in one block - size_t block_size_; - - // Allocation state - char* alloc_ptr_; - size_t alloc_bytes_remaining_; - + const size_t kBlockSize; // Array of new[] allocated memory blocks - std::vector blocks_; + typedef std::vector Blocks; + Blocks blocks_; + + // Stats for current active block. + // For each block, we allocate aligned memory chucks from one end and + // allocate unaligned memory chucks from the other end. Otherwise the + // memory waste for alignment will be higher if we allocate both types of + // memory from one direction. + char* unaligned_alloc_ptr_ = nullptr; + char* aligned_alloc_ptr_ = nullptr; + // How many bytes left in currently active block? 
+ size_t alloc_bytes_remaining_ = 0; + + char* AllocateFallback(size_t bytes, bool aligned); + char* AllocateNewBlock(size_t block_bytes); // Bytes of memory in blocks allocated so far - size_t blocks_memory_; - - // No copying allowed - ArenaImpl(const ArenaImpl&); - void operator=(const ArenaImpl&); + size_t blocks_memory_ = 0; }; inline char* ArenaImpl::Allocate(size_t bytes) { @@ -73,12 +78,16 @@ inline char* ArenaImpl::Allocate(size_t bytes) { // them for our internal use). assert(bytes > 0); if (bytes <= alloc_bytes_remaining_) { - char* result = alloc_ptr_; - alloc_ptr_ += bytes; + unaligned_alloc_ptr_ -= bytes; alloc_bytes_remaining_ -= bytes; - return result; + return unaligned_alloc_ptr_; } - return AllocateFallback(bytes); + return AllocateFallback(bytes, false /* unaligned */); } +// check and adjust the block_size so that the return value is +// 1. in the range of [kMinBlockSize, kMaxBlockSize]. +// 2. the multiple of align unit. +extern size_t OptimizeBlockSize(size_t block_size); + } // namespace rocksdb diff --git a/util/arena_test.cc b/util/arena_test.cc index 12aa7f7fe51b3b525a57ea62bef614a5c6304ae2..ca6dfc99d6ea1208a92f2c0b6c43ab49e324eb33 100644 --- a/util/arena_test.cc +++ b/util/arena_test.cc @@ -57,8 +57,34 @@ TEST(ArenaImplTest, MemoryAllocatedBytes) { ASSERT_EQ(arena_impl.MemoryAllocatedBytes(), expected_memory_allocated); } +// Make sure we didn't count the allocate but not used memory space in +// Arena::ApproximateMemoryUsage() +TEST(ArenaImplTest, ApproximateMemoryUsageTest) { + const size_t kBlockSize = 4096; + const size_t kEntrySize = kBlockSize / 8; + const size_t kZero = 0; + ArenaImpl arena(kBlockSize); + ASSERT_EQ(kZero, arena.ApproximateMemoryUsage()); + + auto num_blocks = kBlockSize / kEntrySize; + + // first allocation + arena.AllocateAligned(kEntrySize); + auto mem_usage = arena.MemoryAllocatedBytes(); + ASSERT_EQ(mem_usage, kBlockSize); + auto usage = arena.ApproximateMemoryUsage(); + ASSERT_LT(usage, mem_usage); + for (size_t i = 1; i < num_blocks; ++i) { + arena.AllocateAligned(kEntrySize); + ASSERT_EQ(mem_usage, arena.MemoryAllocatedBytes()); + ASSERT_EQ(arena.ApproximateMemoryUsage(), usage + kEntrySize); + usage = arena.ApproximateMemoryUsage(); + } + ASSERT_GT(usage, mem_usage); +} + TEST(ArenaImplTest, Simple) { - std::vector > allocated; + std::vector> allocated; ArenaImpl arena_impl; const int N = 100000; size_t bytes = 0; @@ -68,8 +94,9 @@ TEST(ArenaImplTest, Simple) { if (i % (N / 10) == 0) { s = i; } else { - s = rnd.OneIn(4000) ? rnd.Uniform(6000) : - (rnd.OneIn(10) ? rnd.Uniform(100) : rnd.Uniform(20)); + s = rnd.OneIn(4000) + ? rnd.Uniform(6000) + : (rnd.OneIn(10) ? rnd.Uniform(100) : rnd.Uniform(20)); } if (s == 0) { // Our arena disallows size 0 allocations. 
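Between the arena hunks, a quick illustration of the OptimizeBlockSize() contract introduced above: the requested size is clamped into [kMinBlockSize, kMaxBlockSize] and rounded up to a multiple of the pointer size. The function and constants are the ones this diff adds; the asserts assume a 64-bit build where sizeof(void*) == 8.

#include <cassert>
#include <cstddef>
#include "util/arena_impl.h"

int main() {
  // Clamped up to kMinBlockSize (4096).
  assert(rocksdb::OptimizeBlockSize(0) == 4096);
  // 4097 is not a multiple of 8, so it is rounded up to 4104.
  assert(rocksdb::OptimizeBlockSize(4097) == 4104);
  // Anything above kMaxBlockSize (2 << 30) is clamped down.
  assert(rocksdb::OptimizeBlockSize(static_cast<size_t>(3) << 30) ==
         static_cast<size_t>(2) << 30);
  return 0;
}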
@@ -89,7 +116,7 @@ TEST(ArenaImplTest, Simple) {
     bytes += s;
     allocated.push_back(std::make_pair(s, r));
     ASSERT_GE(arena_impl.ApproximateMemoryUsage(), bytes);
-    if (i > N/10) {
+    if (i > N / 10) {
       ASSERT_LE(arena_impl.ApproximateMemoryUsage(), bytes * 1.10);
     }
   }
diff --git a/util/hash_linklist_rep.cc b/util/hash_linklist_rep.cc
index 214ac2c55d8a09d51a7096eeadcc8216cd1a6097..9c2d8f52c23683f4a9a9ee48e5a97862ba2487a3 100644
--- a/util/hash_linklist_rep.cc
+++ b/util/hash_linklist_rep.cc
@@ -66,17 +66,15 @@ class HashLinkListRep : public MemTableRep {
 
   virtual ~HashLinkListRep();
 
-  virtual std::shared_ptr<MemTableRep::Iterator> GetIterator() override;
+  virtual MemTableRep::Iterator* GetIterator() override;
 
-  virtual std::shared_ptr<MemTableRep::Iterator> GetIterator(
-      const Slice& slice) override;
+  virtual MemTableRep::Iterator* GetIterator(const Slice& slice) override;
 
-  virtual std::shared_ptr<MemTableRep::Iterator> GetPrefixIterator(
-      const Slice& prefix) override;
-
-  virtual std::shared_ptr<MemTableRep::Iterator> GetDynamicPrefixIterator()
+  virtual MemTableRep::Iterator* GetPrefixIterator(const Slice& prefix)
       override;
 
+  virtual MemTableRep::Iterator* GetDynamicPrefixIterator() override;
+
  private:
   friend class DynamicIterator;
   typedef SkipList<const char*, MemTableRep::KeyComparator&> FullList;
@@ -298,8 +296,6 @@ class HashLinkListRep : public MemTableRep {
     virtual void SeekToLast() { }
    private:
   };
-
-  std::shared_ptr<EmptyIterator> empty_iterator_;
 };
 
 HashLinkListRep::HashLinkListRep(MemTableRep::KeyComparator& compare,
@@ -308,9 +304,7 @@ HashLinkListRep::HashLinkListRep(MemTableRep::KeyComparator& compare,
     : bucket_size_(bucket_size),
       transform_(transform),
      compare_(compare),
-      arena_(arena),
-      empty_iterator_(std::make_shared<EmptyIterator>()) {
-
+      arena_(arena) {
   char* mem = arena_->AllocateAligned(
       sizeof(port::AtomicPointer) * bucket_size);
 
@@ -389,7 +383,7 @@ size_t HashLinkListRep::ApproximateMemoryUsage() {
   return 0;
 }
 
-std::shared_ptr<MemTableRep::Iterator> HashLinkListRep::GetIterator() {
+MemTableRep::Iterator* HashLinkListRep::GetIterator() {
   auto list = new FullList(compare_, arena_);
   for (size_t i = 0; i < bucket_size_; ++i) {
     auto bucket = GetBucket(i);
@@ -400,26 +394,24 @@ std::shared_ptr<MemTableRep::Iterator> HashLinkListRep::GetIterator() {
       }
     }
   }
-  return std::make_shared<FullListIterator>(list);
+  return new FullListIterator(list);
 }
 
-std::shared_ptr<MemTableRep::Iterator> HashLinkListRep::GetPrefixIterator(
+MemTableRep::Iterator* HashLinkListRep::GetPrefixIterator(
     const Slice& prefix) {
   auto bucket = GetBucket(prefix);
   if (bucket == nullptr) {
-    return empty_iterator_;
+    return new EmptyIterator();
   }
-  return std::make_shared<Iterator>(this, bucket);
+  return new Iterator(this, bucket);
 }
 
-std::shared_ptr<MemTableRep::Iterator> HashLinkListRep::GetIterator(
-    const Slice& slice) {
+MemTableRep::Iterator* HashLinkListRep::GetIterator(const Slice& slice) {
   return GetPrefixIterator(transform_->Transform(slice));
 }
 
-std::shared_ptr<MemTableRep::Iterator>
-  HashLinkListRep::GetDynamicPrefixIterator() {
-  return std::make_shared<DynamicIterator>(*this);
+MemTableRep::Iterator* HashLinkListRep::GetDynamicPrefixIterator() {
+  return new DynamicIterator(*this);
 }
 
 bool HashLinkListRep::BucketContains(Node* head, const Key& key) const {
@@ -450,10 +442,9 @@ Node* HashLinkListRep::FindGreaterOrEqualInBucket(Node* head,
 
 } // anon namespace
 
-std::shared_ptr<MemTableRep> HashLinkListRepFactory::CreateMemTableRep(
+MemTableRep* HashLinkListRepFactory::CreateMemTableRep(
     MemTableRep::KeyComparator& compare, Arena* arena) {
-  return std::make_shared<HashLinkListRep>(compare, arena, transform_,
-                                           bucket_count_);
+  return new HashLinkListRep(compare, arena, transform_, bucket_count_);
 }
 
 MemTableRepFactory* NewHashLinkListRepFactory(
diff --git a/util/hash_linklist_rep.h b/util/hash_linklist_rep.h
index 16d2517c250e66cc5bd8c88dfb250a9ff79cb75d..efa9d8f2ec31e49e971509943b319e5a594a88aa 100644
--- a/util/hash_linklist_rep.h
+++ b/util/hash_linklist_rep.h
@@ -22,8 +22,8 @@ class HashLinkListRepFactory : public MemTableRepFactory {
 
   virtual ~HashLinkListRepFactory() { delete transform_; }
 
-  virtual std::shared_ptr<MemTableRep> CreateMemTableRep(
-      MemTableRep::KeyComparator& compare, Arena* arena) override;
+  virtual MemTableRep* CreateMemTableRep(MemTableRep::KeyComparator& compare,
+                                         Arena* arena) override;
 
   virtual const char* Name() const override {
     return "HashLinkListRepFactory";
diff --git a/util/hash_skiplist_rep.cc b/util/hash_skiplist_rep.cc
index 95e6edfae947f348ac547ecd4de9b29b7aa85973..906f8303035fb96b443a2fc06928a1cc2a2f0301 100644
--- a/util/hash_skiplist_rep.cc
+++ b/util/hash_skiplist_rep.cc
@@ -33,17 +33,15 @@ class HashSkipListRep : public MemTableRep {
 
   virtual ~HashSkipListRep();
 
-  virtual std::shared_ptr<MemTableRep::Iterator> GetIterator() override;
+  virtual MemTableRep::Iterator* GetIterator() override;
 
-  virtual std::shared_ptr<MemTableRep::Iterator> GetIterator(
-      const Slice& slice) override;
+  virtual MemTableRep::Iterator* GetIterator(const Slice& slice) override;
 
-  virtual std::shared_ptr<MemTableRep::Iterator> GetPrefixIterator(
-      const Slice& prefix) override;
-
-  virtual std::shared_ptr<MemTableRep::Iterator> GetDynamicPrefixIterator()
+  virtual MemTableRep::Iterator* GetPrefixIterator(const Slice& prefix)
      override;
 
+  virtual MemTableRep::Iterator* GetDynamicPrefixIterator() override;
+
  private:
   friend class DynamicIterator;
   typedef SkipList<const char*, MemTableRep::KeyComparator&> Bucket;
@@ -216,22 +214,18 @@ class HashSkipListRep : public MemTableRep {
     virtual void SeekToLast() { }
    private:
   };
-
-  std::shared_ptr<EmptyIterator> empty_iterator_;
 };
 
 HashSkipListRep::HashSkipListRep(MemTableRep::KeyComparator& compare,
                                  Arena* arena, const SliceTransform* transform,
                                  size_t bucket_size, int32_t skiplist_height,
                                  int32_t skiplist_branching_factor)
-  : bucket_size_(bucket_size),
-    skiplist_height_(skiplist_height),
-    skiplist_branching_factor_(skiplist_branching_factor),
-    transform_(transform),
-    compare_(compare),
-    arena_(arena),
-    empty_iterator_(std::make_shared<EmptyIterator>()) {
-
+    : bucket_size_(bucket_size),
+      skiplist_height_(skiplist_height),
+      skiplist_branching_factor_(skiplist_branching_factor),
+      transform_(transform),
+      compare_(compare),
+      arena_(arena) {
   buckets_ = new port::AtomicPointer[bucket_size];
 
   for (size_t i = 0; i < bucket_size_; ++i) {
@@ -276,7 +270,7 @@ size_t HashSkipListRep::ApproximateMemoryUsage() {
   return sizeof(buckets_);
 }
 
-std::shared_ptr<MemTableRep::Iterator> HashSkipListRep::GetIterator() {
+MemTableRep::Iterator* HashSkipListRep::GetIterator() {
   auto list = new Bucket(compare_, arena_);
   for (size_t i = 0; i < bucket_size_; ++i) {
     auto bucket = GetBucket(i);
@@ -287,35 +281,31 @@ std::shared_ptr<MemTableRep::Iterator> HashSkipListRep::GetIterator() {
       }
     }
   }
-  return std::make_shared<Iterator>(list);
+  return new Iterator(list);
 }
 
-std::shared_ptr<MemTableRep::Iterator> HashSkipListRep::GetPrefixIterator(
-  const Slice& prefix) {
+MemTableRep::Iterator* HashSkipListRep::GetPrefixIterator(const Slice& prefix) {
   auto bucket = GetBucket(prefix);
   if (bucket == nullptr) {
-    return empty_iterator_;
+    return new EmptyIterator();
   }
-  return std::make_shared<Iterator>(bucket, false);
+  return new Iterator(bucket, false);
 }
 
-std::shared_ptr<MemTableRep::Iterator> HashSkipListRep::GetIterator(
-    const Slice& slice) {
+MemTableRep::Iterator* HashSkipListRep::GetIterator(const Slice& slice) {
   return GetPrefixIterator(transform_->Transform(slice));
 }
 
-std::shared_ptr<MemTableRep::Iterator>
-  HashSkipListRep::GetDynamicPrefixIterator() {
-  return std::make_shared<DynamicIterator>(*this);
+MemTableRep::Iterator* HashSkipListRep::GetDynamicPrefixIterator() {
+  return new DynamicIterator(*this);
 }
 
 } // anon namespace
 
-std::shared_ptr<MemTableRep> HashSkipListRepFactory::CreateMemTableRep(
+MemTableRep* HashSkipListRepFactory::CreateMemTableRep(
    MemTableRep::KeyComparator& compare, Arena* arena) {
-  return std::make_shared<HashSkipListRep>(compare, arena, transform_,
-                                           bucket_count_, skiplist_height_,
-                                           skiplist_branching_factor_);
+  return new HashSkipListRep(compare, arena, transform_, bucket_count_,
+                             skiplist_height_, skiplist_branching_factor_);
 }
 
 MemTableRepFactory* NewHashSkipListRepFactory(
diff --git a/util/hash_skiplist_rep.h b/util/hash_skiplist_rep.h
index 3411f66b158a5beb7f7a47b54a79ea20c7f0f3a0..1ea844eda54be73a22e0299f373eaa1d40700fff 100644
--- a/util/hash_skiplist_rep.h
+++ b/util/hash_skiplist_rep.h
@@ -26,8 +26,8 @@ class HashSkipListRepFactory : public MemTableRepFactory {
 
   virtual ~HashSkipListRepFactory() { delete transform_; }
 
-  virtual std::shared_ptr<MemTableRep> CreateMemTableRep(
-      MemTableRep::KeyComparator& compare, Arena* arena) override;
+  virtual MemTableRep* CreateMemTableRep(MemTableRep::KeyComparator& compare,
+                                         Arena* arena) override;
 
   virtual const char* Name() const override {
     return "HashSkipListRepFactory";
diff --git a/util/ldb_cmd.cc b/util/ldb_cmd.cc
index 58d81460e9d7572188565f70eb3e9a26ed841b03..65ecd61a261ce36ce2439b814d1037407204e531 100644
--- a/util/ldb_cmd.cc
+++ b/util/ldb_cmd.cc
@@ -1024,7 +1024,7 @@ Status ReduceDBLevelsCommand::GetOldNumOfLevels(Options& opt,
   }
   int max = -1;
   for (int i = 0; i < versions.NumberLevels(); i++) {
-    if (versions.NumLevelFiles(i)) {
+    if (versions.current()->NumLevelFiles(i)) {
       max = i;
     }
   }
diff --git a/util/manual_compaction_test.cc b/util/manual_compaction_test.cc
index ebe1339e53969300efb13a816716c919b70ca0eb..dd615f05704bed391d4d9b2372c68e6e2bf5bd60 100644
--- a/util/manual_compaction_test.cc
+++ b/util/manual_compaction_test.cc
@@ -9,9 +9,13 @@
 #include
 
 #include "rocksdb/db.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/slice.h"
 #include "rocksdb/write_batch.h"
 #include "util/testharness.h"
 
+using namespace rocksdb;
+
 namespace {
 
 const int kNumKeys = 1100000;
@@ -26,12 +30,71 @@ std::string Key2(int i) {
   return Key1(i) + "_xxx";
 }
 
-class ManualCompactionTest { };
+class ManualCompactionTest {
+ public:
+  ManualCompactionTest() {
+    // Get rid of any state from an old run.
+    dbname_ = rocksdb::test::TmpDir() + "/rocksdb_cbug_test";
+    DestroyDB(dbname_, rocksdb::Options());
+  }
+
+  std::string dbname_;
+};
+
+class DestroyAllCompactionFilter : public CompactionFilter {
+ public:
+  DestroyAllCompactionFilter() {}
+
+  virtual bool Filter(int level,
+                      const Slice& key,
+                      const Slice& existing_value,
+                      std::string* new_value,
+                      bool* value_changed) const {
+    return existing_value.ToString() == "destroy";
+  }
+
+  virtual const char* Name() const {
+    return "DestroyAllCompactionFilter";
+  }
+};
+
+TEST(ManualCompactionTest, CompactTouchesAllKeys) {
+  for (int iter = 0; iter < 2; ++iter) {
+    DB* db;
+    Options options;
+    if (iter == 0) { // level compaction
+      options.num_levels = 3;
+      options.compaction_style = kCompactionStyleLevel;
+    } else { // universal compaction
+      options.compaction_style = kCompactionStyleUniversal;
+    }
+    options.create_if_missing = true;
+    options.compression = rocksdb::kNoCompression;
+    options.compaction_filter = new DestroyAllCompactionFilter();
+    ASSERT_OK(DB::Open(options, dbname_, &db));
+
+    db->Put(WriteOptions(), Slice("key1"), Slice("destroy"));
+    db->Put(WriteOptions(), Slice("key2"), Slice("destroy"));
+    db->Put(WriteOptions(), Slice("key3"), Slice("value3"));
+    db->Put(WriteOptions(), Slice("key4"), Slice("destroy"));
+
+    Slice key4("key4");
+    db->CompactRange(nullptr, &key4);
+    Iterator* itr = db->NewIterator(ReadOptions());
+    itr->SeekToFirst();
+    ASSERT_TRUE(itr->Valid());
+    ASSERT_EQ("key3", itr->key().ToString());
+    itr->Next();
+    ASSERT_TRUE(!itr->Valid());
+    delete itr;
+
+    delete options.compaction_filter;
+    delete db;
+    DestroyDB(dbname_, options);
+  }
+}
 
 TEST(ManualCompactionTest, Test) {
-  // Get rid of any state from an old run.
-  std::string dbpath = rocksdb::test::TmpDir() + "/rocksdb_cbug_test";
-  DestroyDB(dbpath, rocksdb::Options());
-
   // Open database.  Disable compression since it affects the creation
   // of layers and the code below is trying to test against a very
@@ -40,7 +103,7 @@ TEST(ManualCompactionTest, Test) {
   rocksdb::Options db_options;
   db_options.create_if_missing = true;
   db_options.compression = rocksdb::kNoCompression;
-  ASSERT_OK(rocksdb::DB::Open(db_options, dbpath, &db));
+  ASSERT_OK(rocksdb::DB::Open(db_options, dbname_, &db));
 
   // create first key range
   rocksdb::WriteBatch batch;
@@ -83,7 +146,7 @@ TEST(ManualCompactionTest, Test) {
 
   // close database
   delete db;
-  DestroyDB(dbpath, rocksdb::Options());
+  DestroyDB(dbname_, rocksdb::Options());
 }
 
 }  // anonymous namespace
diff --git a/util/options.cc b/util/options.cc
index 93aa268f15d84f3faf35cc0972d5da07558c71b8..918b87281d9e70d05daac7987127ba0f638570eb 100644
--- a/util/options.cc
+++ b/util/options.cc
@@ -104,7 +104,8 @@ Options::Options()
       inplace_update_num_locks(10000),
      inplace_callback(nullptr),
       memtable_prefix_bloom_bits(0),
-      memtable_prefix_bloom_probes(6) {
+      memtable_prefix_bloom_probes(6),
+      max_successive_merges(0) {
   assert(memtable_factory.get() != nullptr);
 }
 
@@ -300,6 +301,8 @@ Options::Dump(Logger* log) const
             memtable_prefix_bloom_bits);
   Log(log, "      Options.memtable_prefix_bloom_probes: %d",
             memtable_prefix_bloom_probes);
+  Log(log, "            Options.max_successive_merges: %zd",
+            max_successive_merges);
 }   // Options::Dump
 
 //
diff --git a/util/skiplistrep.cc b/util/skiplistrep.cc
index f4c6e0c93f400a5c7d5497cd30b0218dc1f85a9b..6f1fb1a157122fb4fd82bf5a66ee080e589e78ef 100644
--- a/util/skiplistrep.cc
+++ b/util/skiplistrep.cc
@@ -97,15 +97,15 @@ public:
 
   // Unhide default implementations of GetIterator
   using MemTableRep::GetIterator;
 
-  virtual std::shared_ptr<MemTableRep::Iterator> GetIterator() override {
-    return std::make_shared<SkipListRep::Iterator>(&skip_list_);
+  virtual MemTableRep::Iterator* GetIterator() override {
+    return new SkipListRep::Iterator(&skip_list_);
   }
 };
 }
 
-std::shared_ptr<MemTableRep> SkipListFactory::CreateMemTableRep (
-    MemTableRep::KeyComparator& compare, Arena* arena) {
-  return std::shared_ptr<MemTableRep>(new SkipListRep(compare, arena));
+MemTableRep* SkipListFactory::CreateMemTableRep(
+    MemTableRep::KeyComparator& compare, Arena* arena) {
+  return new SkipListRep(compare, arena);
 }
 
 } // namespace rocksdb
diff --git a/util/vectorrep.cc b/util/vectorrep.cc
index 3887f356d587477bf8c4e2fdc7d16b4339cd6a21..bd2f0873d1c5144c21d6b175b1e5e70db5fd0251 100644
--- a/util/vectorrep.cc
+++ b/util/vectorrep.cc
@@ -90,7 +90,7 @@ class VectorRep : public MemTableRep {
   using MemTableRep::GetIterator;
 
   // Return an iterator over the keys in this representation.
-  virtual std::shared_ptr<MemTableRep::Iterator> GetIterator() override;
+  virtual MemTableRep::Iterator* GetIterator() override;
 
  private:
   friend class Iterator;
@@ -233,22 +233,22 @@ void VectorRep::Iterator::SeekToLast() {
   }
 }
 
-std::shared_ptr<MemTableRep::Iterator> VectorRep::GetIterator() {
+MemTableRep::Iterator* VectorRep::GetIterator() {
   ReadLock l(&rwlock_);
   // Do not sort here. The sorting would be done the first time
   // a Seek is performed on the iterator.
   if (immutable_) {
-    return std::make_shared<Iterator>(this, bucket_, compare_);
+    return new Iterator(this, bucket_, compare_);
   } else {
     std::shared_ptr<Bucket> tmp;
     tmp.reset(new Bucket(*bucket_)); // make a copy
-    return std::make_shared<Iterator>(nullptr, tmp, compare_);
+    return new Iterator(nullptr, tmp, compare_);
   }
 }
 
 } // anon namespace
 
-std::shared_ptr<MemTableRep> VectorRepFactory::CreateMemTableRep(
-    MemTableRep::KeyComparator& compare, Arena* arena) {
-  return std::make_shared<VectorRep>(compare, arena, count_);
+MemTableRep* VectorRepFactory::CreateMemTableRep(
+    MemTableRep::KeyComparator& compare, Arena* arena) {
+  return new VectorRep(compare, arena, count_);
 }
 
 } // namespace rocksdb
diff --git a/utilities/backupable/backupable_db.cc b/utilities/backupable/backupable_db.cc
index 61e009cd31e985060df171627eda5a1382213e78..26bdd254b52efe264c9730162dbe513cb832876e 100644
--- a/utilities/backupable/backupable_db.cc
+++ b/utilities/backupable/backupable_db.cc
@@ -20,6 +20,7 @@
 #include
 #include
 #include
+#include <atomic>
 
 namespace rocksdb {
 
@@ -31,6 +32,9 @@ class BackupEngine {
   Status CreateNewBackup(DB* db, bool flush_before_backup = false);
   Status PurgeOldBackups(uint32_t num_backups_to_keep);
   Status DeleteBackup(BackupID backup_id);
+  void StopBackup() {
+    stop_backup_.store(true, std::memory_order_release);
+  }
 
   void GetBackupInfo(std::vector<BackupInfo>* backup_info);
   Status RestoreDBFromBackup(BackupID backup_id, const std::string &db_dir,
@@ -106,13 +110,16 @@ class BackupEngine {
     return "private";
   }
   inline std::string GetPrivateFileRel(BackupID backup_id,
-                                       const std::string &file = "") const {
+                                       bool tmp = false,
+                                       const std::string& file = "") const {
     assert(file.size() == 0 || file[0] != '/');
-    return GetPrivateDirRel() + "/" + std::to_string(backup_id) + "/" + file;
+    return GetPrivateDirRel() + "/" + std::to_string(backup_id) +
+           (tmp ? ".tmp" : "") + "/" + file;
   }
-  inline std::string GetSharedFileRel(const std::string& file = "") const {
+  inline std::string GetSharedFileRel(const std::string& file = "",
+                                      bool tmp = false) const {
     assert(file.size() == 0 || file[0] != '/');
-    return "shared/" + file;
+    return "shared/" + file + (tmp ? ".tmp" : "");
   }
   inline std::string GetLatestBackupFile(bool tmp = false) const {
     return GetAbsolutePath(std::string("LATEST_BACKUP") + (tmp ? ".tmp" : ""));
@@ -151,6 +158,7 @@ class BackupEngine {
   std::map<BackupID, BackupMeta> backups_;
   std::unordered_map<std::string, int> backuped_file_refs_;
   std::vector<BackupID> obsolete_backups_;
+  std::atomic<bool> stop_backup_;
 
   // options data
   BackupableDBOptions options_;
@@ -161,13 +169,17 @@
 };
 
 BackupEngine::BackupEngine(Env* db_env, const BackupableDBOptions& options)
-    : options_(options),
-      db_env_(db_env),
-      backup_env_(options.backup_env != nullptr ? options.backup_env : db_env_) {
+    : stop_backup_(false),
+      options_(options),
+      db_env_(db_env),
+      backup_env_(options.backup_env != nullptr ? options.backup_env
+                                                : db_env_) {
 
   // create all the dirs we need
   backup_env_->CreateDirIfMissing(GetAbsolutePath());
-  backup_env_->CreateDirIfMissing(GetAbsolutePath(GetSharedFileRel()));
+  if (options_.share_table_files) {
+    backup_env_->CreateDirIfMissing(GetAbsolutePath(GetSharedFileRel()));
+  }
   backup_env_->CreateDirIfMissing(GetAbsolutePath(GetPrivateDirRel()));
   backup_env_->CreateDirIfMissing(GetBackupMetaDir());
 
@@ -298,8 +310,9 @@ Status BackupEngine::CreateNewBackup(DB* db, bool flush_before_backup) {
 
   Log(options_.info_log, "Started the backup process -- creating backup %u",
       new_backup_id);
-  // create private dir
-  s = backup_env_->CreateDir(GetAbsolutePath(GetPrivateFileRel(new_backup_id)));
+  // create temporary private dir
+  s = backup_env_->CreateDir(
+      GetAbsolutePath(GetPrivateFileRel(new_backup_id, true)));
 
   // copy live_files
   for (size_t i = 0; s.ok() && i < live_files.size(); ++i) {
@@ -320,7 +333,7 @@ Status BackupEngine::CreateNewBackup(DB* db, bool flush_before_backup) {
     // * if it's kDescriptorFile, limit the size to manifest_file_size
     s = BackupFile(new_backup_id,
                    &new_backup,
-                   type == kTableFile, /* shared */
+                   options_.share_table_files && type == kTableFile,
                    db->GetName(), /* src_dir */
                    live_files[i], /* src_fname */
                    (type == kDescriptorFile) ? manifest_file_size : 0);
@@ -342,6 +355,13 @@ Status BackupEngine::CreateNewBackup(DB* db, bool flush_before_backup) {
   // we copied all the files, enable file deletions
   db->EnableFileDeletions();
 
+  if (s.ok()) {
+    // move tmp private backup to real backup folder
+    s = backup_env_->RenameFile(
+        GetAbsolutePath(GetPrivateFileRel(new_backup_id, true)),  // tmp
+        GetAbsolutePath(GetPrivateFileRel(new_backup_id, false)));
+  }
+
   if (s.ok()) {
     // persist the backup metadata on the disk
     s = new_backup.StoreToFile(options_.sync);
@@ -561,6 +581,9 @@ Status BackupEngine::CopyFile(const std::string& src,
 
   Slice data;
   do {
+    if (stop_backup_.load(std::memory_order_acquire)) {
+      return Status::Incomplete("Backup stopped");
+    }
     size_t buffer_to_read = (copy_file_buffer_size_ < size_limit) ?
       copy_file_buffer_size_ : size_limit;
     s = src_file->Read(buffer_to_read, &data, buf.get());
@@ -590,12 +613,16 @@ Status BackupEngine::BackupFile(BackupID backup_id,
 
   assert(src_fname.size() > 0 && src_fname[0] == '/');
   std::string dst_relative = src_fname.substr(1);
+  std::string dst_relative_tmp;
   if (shared) {
-    dst_relative = GetSharedFileRel(dst_relative);
+    dst_relative_tmp = GetSharedFileRel(dst_relative, true);
+    dst_relative = GetSharedFileRel(dst_relative, false);
   } else {
-    dst_relative = GetPrivateFileRel(backup_id, dst_relative);
+    dst_relative_tmp = GetPrivateFileRel(backup_id, true, dst_relative);
+    dst_relative = GetPrivateFileRel(backup_id, false, dst_relative);
   }
   std::string dst_path = GetAbsolutePath(dst_relative);
+  std::string dst_path_tmp = GetAbsolutePath(dst_relative_tmp);
 
   Status s;
   uint64_t size;
@@ -607,12 +634,15 @@ Status BackupEngine::BackupFile(BackupID backup_id,
   } else {
     Log(options_.info_log, "Copying %s", src_fname.c_str());
     s = CopyFile(src_dir + src_fname,
-                 dst_path,
+                 dst_path_tmp,
                  db_env_,
                  backup_env_,
                 options_.sync,
                 &size,
                 size_limit);
+    if (s.ok() && shared) {
+      s = backup_env_->RenameFile(dst_path_tmp, dst_path);
+    }
   }
   if (s.ok()) {
     backup->AddFile(dst_relative, size);
@@ -671,14 +701,16 @@ void BackupEngine::GarbageCollection(bool full_scan) {
                             &private_children);
   for (auto& child : private_children) {
     BackupID backup_id = 0;
+    bool tmp_dir = child.find(".tmp") != std::string::npos;
     sscanf(child.c_str(), "%u", &backup_id);
-    if (backup_id == 0 || backups_.find(backup_id) != backups_.end()) {
+    if (!tmp_dir &&  // if it's tmp_dir, delete it
+        (backup_id == 0 || backups_.find(backup_id) != backups_.end())) {
      // it's either not a number or it's still alive. continue
      continue;
     }
     // here we have to delete the dir and all its children
     std::string full_private_path =
-        GetAbsolutePath(GetPrivateFileRel(backup_id));
+        GetAbsolutePath(GetPrivateFileRel(backup_id, tmp_dir));
     std::vector<std::string> subchildren;
     backup_env_->GetChildren(full_private_path, &subchildren);
     for (auto& subchild : subchildren) {
@@ -813,7 +845,9 @@ Status BackupEngine::BackupMeta::StoreToFile(bool sync) {
 
 BackupableDB::BackupableDB(DB* db, const BackupableDBOptions& options)
     : StackableDB(db),
       backup_engine_(new BackupEngine(db->GetEnv(), options)) {
-  backup_engine_->DeleteBackupsNewerThan(GetLatestSequenceNumber());
+  if (options.share_table_files) {
+    backup_engine_->DeleteBackupsNewerThan(GetLatestSequenceNumber());
+  }
 }
 
 BackupableDB::~BackupableDB() {
@@ -836,6 +870,10 @@ Status BackupableDB::DeleteBackup(BackupID backup_id) {
   return backup_engine_->DeleteBackup(backup_id);
 }
 
+void BackupableDB::StopBackup() {
+  backup_engine_->StopBackup();
+}
+
 // --- RestoreBackupableDB methods ------
 
 RestoreBackupableDB::RestoreBackupableDB(Env* db_env,
diff --git a/utilities/backupable/backupable_db_test.cc b/utilities/backupable/backupable_db_test.cc
index c64f0170b8bd7e11d927170547f6df987c027a55..de240558f97c4e0fa4ede80f25aef99a41350677 100644
--- a/utilities/backupable/backupable_db_test.cc
+++ b/utilities/backupable/backupable_db_test.cc
@@ -305,7 +305,7 @@ class BackupableDBTest {
     CreateLoggerFromOptions(dbname_, backupdir_, env_, Options(), &logger_);
 
     backupable_options_.reset(new BackupableDBOptions(
-        backupdir_, test_backup_env_.get(), logger_.get(), true));
+        backupdir_, test_backup_env_.get(), true, logger_.get(), true));
 
     // delete old files in db
     DestroyDB(dbname_, Options());
@@ -317,7 +317,8 @@ class BackupableDBTest {
     return db;
   }
 
-  void OpenBackupableDB(bool destroy_old_data = false,
-                        bool dummy = false) {
+  void OpenBackupableDB(bool destroy_old_data = false, bool dummy = false,
+                        bool share_table_files = true) {
     // reset all the defaults
     test_backup_env_->SetLimitWrittenFiles(1000000);
     test_db_env_->SetLimitWrittenFiles(1000000);
@@ -331,6 +332,7 @@ class BackupableDBTest {
       ASSERT_OK(DB::Open(options_, dbname_, &db));
     }
     backupable_options_->destroy_old_data = destroy_old_data;
+    backupable_options_->share_table_files = share_table_files;
     db_.reset(new BackupableDB(db, *backupable_options_));
   }
 
@@ -659,6 +661,38 @@ TEST(BackupableDBTest, DeleteNewerBackups) {
   CloseRestoreDB();
 }
 
+TEST(BackupableDBTest, NoShareTableFiles) {
+  const int keys_iteration = 5000;
+  OpenBackupableDB(true, false, false);
+  for (int i = 0; i < 5; ++i) {
+    FillDB(db_.get(), keys_iteration * i, keys_iteration * (i + 1));
+    ASSERT_OK(db_->CreateNewBackup(!!(i % 2)));
+  }
+  CloseBackupableDB();
+
+  for (int i = 0; i < 5; ++i) {
+    AssertBackupConsistency(i + 1, 0, keys_iteration * (i + 1),
+                            keys_iteration * 6);
+  }
+}
+
+TEST(BackupableDBTest, DeleteTmpFiles) {
+  OpenBackupableDB();
+  CloseBackupableDB();
+  std::string shared_tmp = backupdir_ + "/shared/00006.sst.tmp";
+  std::string private_tmp_dir = backupdir_ + "/private/10.tmp";
+  std::string private_tmp_file = private_tmp_dir + "/00003.sst";
+  file_manager_->WriteToFile(shared_tmp, "tmp");
+  file_manager_->CreateDir(private_tmp_dir);
+  file_manager_->WriteToFile(private_tmp_file, "tmp");
+  ASSERT_EQ(true, file_manager_->FileExists(private_tmp_dir));
+  OpenBackupableDB();
+  CloseBackupableDB();
+  ASSERT_EQ(false, file_manager_->FileExists(shared_tmp));
+  ASSERT_EQ(false, file_manager_->FileExists(private_tmp_file));
+  ASSERT_EQ(false, file_manager_->FileExists(private_tmp_dir));
+}
+
 }  // anon namespace
 
 }  // namespace rocksdb
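
Notes on the techniques used in this patch:

1. The reworked ArenaImpl serves aligned allocations from the front of the active block and unaligned allocations from the back, so alignment slop is paid on one side only. The following is a minimal, self-contained sketch of that two-ended scheme; the class and member names are illustrative rather than the actual RocksDB implementation, and it assumes every request fits in a single block.

    #include <cassert>
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Toy two-ended arena: aligned chunks grow from the front of the
    // active block, unaligned chunks grow from the back.
    class ToyArena {
     public:
      static constexpr size_t kAlignUnit = sizeof(void*);

      explicit ToyArena(size_t block_size) : block_size_(block_size) {}
      ~ToyArena() {
        for (char* b : blocks_) delete[] b;
      }

      // Unaligned: carve bytes off the back of the active block.
      char* Allocate(size_t bytes) {
        assert(bytes > 0 && bytes <= block_size_);  // toy-only assumption
        if (bytes > remaining_) NewBlock();
        unaligned_ptr_ -= bytes;
        remaining_ -= bytes;
        return unaligned_ptr_;
      }

      // Aligned: carve bytes (plus slop) off the front of the active block.
      char* AllocateAligned(size_t bytes) {
        assert(bytes > 0 && bytes + kAlignUnit <= block_size_);
        size_t mod =
            reinterpret_cast<uintptr_t>(aligned_ptr_) & (kAlignUnit - 1);
        size_t slop = (mod == 0 ? 0 : kAlignUnit - mod);
        if (bytes + slop > remaining_) {
          NewBlock();  // new[] storage is suitably aligned, no slop needed
          slop = 0;
        }
        char* result = aligned_ptr_ + slop;
        aligned_ptr_ += bytes + slop;
        remaining_ -= bytes + slop;
        assert((reinterpret_cast<uintptr_t>(result) & (kAlignUnit - 1)) == 0);
        return result;
      }

     private:
      void NewBlock() {
        char* block = new char[block_size_];
        blocks_.push_back(block);
        aligned_ptr_ = block;
        unaligned_ptr_ = block + block_size_;
        remaining_ = block_size_;
      }

      const size_t block_size_;
      std::vector<char*> blocks_;
      char* aligned_ptr_ = nullptr;
      char* unaligned_ptr_ = nullptr;
      size_t remaining_ = 0;
    };

This is also why ApproximateMemoryUsage() now subtracts alloc_bytes_remaining_: the unused middle of the active block should not be reported as used.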
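2. arena_impl.h now declares OptimizeBlockSize() with a two-part contract: clamp into [kMinBlockSize, kMaxBlockSize] and return a multiple of the align unit. Its definition is not part of this hunk, so the sketch below is only one plausible reading of that contract; the constants mirror the pre-patch header and the rounding direction is an assumption.

    #include <algorithm>
    #include <cstddef>

    const size_t kMinBlockSize = 4096;      // mirrors the pre-patch header
    const size_t kMaxBlockSize = 2u << 30;  // mirrors the pre-patch header
    const size_t kAlignUnit = sizeof(void*);

    // Clamp the requested size into [kMinBlockSize, kMaxBlockSize], then
    // round down to a multiple of kAlignUnit (kMinBlockSize is itself a
    // multiple of kAlignUnit, so the clamp still holds afterwards).
    size_t OptimizeBlockSizeSketch(size_t block_size) {
      size_t result =
          std::max(kMinBlockSize, std::min(block_size, kMaxBlockSize));
      return result - (result % kAlignUnit);
    }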
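3. Every GetIterator()/CreateMemTableRep() in the memtable hierarchy now returns a raw, heap-allocated pointer instead of std::shared_ptr, which moves ownership (and removes reference-counting overhead) to the caller. A hypothetical call site would pin the result immediately; the stand-in types below exist only to keep the sketch self-contained.

    #include <memory>

    // Minimal stand-ins for MemTableRep::Iterator and MemTableRep.
    struct Iterator {
      virtual ~Iterator() {}
      virtual void SeekToFirst() = 0;
      virtual bool Valid() const = 0;
      virtual void Next() = 0;
    };
    struct Rep {
      virtual ~Rep() {}
      virtual Iterator* GetIterator() = 0;  // caller owns the result
    };

    void ScanAll(Rep* rep) {
      // Wrap the caller-owned iterator so every exit path releases it.
      std::unique_ptr<Iterator> iter(rep->GetIterator());
      for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
        // ... consume the current entry ...
      }
    }

This ownership change also explains why the empty_iterator_ members were deleted: a shared singleton cannot be handed to callers who are now expected to delete what they receive, so GetPrefixIterator() returns a fresh EmptyIterator instead.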
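4. The backup changes write every destination file under a ".tmp" name and RenameFile() it into place only after the copy succeeds; GarbageCollection() then recognizes interrupted work by the ".tmp" suffix and deletes it. The same write-then-rename idea in a self-contained form, with std::filesystem standing in for rocksdb::Env (an assumption made purely for the sketch, since the patch itself predates C++17):

    #include <filesystem>
    #include <fstream>
    #include <string>

    namespace fs = std::filesystem;

    // Write to "<dst>.tmp", then rename over the final name. Readers never
    // observe a half-written file; a crash leaves only a "*.tmp" orphan
    // that a later cleanup pass can delete by suffix, as the patch does.
    bool WriteFileAtomically(const fs::path& dst, const std::string& contents) {
      fs::path tmp = dst;
      tmp += ".tmp";
      {
        std::ofstream out(tmp, std::ios::binary | std::ios::trunc);
        if (!out.write(contents.data(),
                       static_cast<std::streamsize>(contents.size()))) {
          return false;
        }
        // A durability-conscious version would also fsync the file (and
        // its directory) before renaming.
      }
      std::error_code ec;
      fs::rename(tmp, dst, ec);  // atomic on POSIX within one filesystem
      return !ec;
    }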
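5. StopBackup() signals the copy loop through a std::atomic<bool>: a release store on the caller's thread is paired with an acquire load at the top of each buffer-sized copy step, so cancellation needs no mutex on the hot path. A reduced, runnable model of that handshake (the chunked "work" is simulated; the real code returns Status::Incomplete("Backup stopped")):

    #include <atomic>
    #include <chrono>
    #include <iostream>
    #include <thread>

    std::atomic<bool> stop_backup(false);

    // Models BackupEngine::CopyFile: check the flag once per chunk and
    // bail out early if a stop was requested.
    bool CopyChunks(int total_chunks) {
      for (int i = 0; i < total_chunks; ++i) {
        if (stop_backup.load(std::memory_order_acquire)) return false;
        std::this_thread::sleep_for(std::chrono::milliseconds(1));  // "copy"
      }
      return true;
    }

    int main() {
      std::thread copier([] {
        std::cout << (CopyChunks(1000) ? "finished" : "stopped") << "\n";
      });
      std::this_thread::sleep_for(std::chrono::milliseconds(10));
      stop_backup.store(true, std::memory_order_release);  // StopBackup()
      copier.join();
      return 0;
    }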