Added bloom filter support.

In particular, we add a new FilterPolicy class. An instance of this class can be supplied in Options when opening a database. If supplied, the instance is used to generate summaries of keys (e.g., a bloom filter) which are placed in sstables. These summaries are consulted by DB::Get() so we can avoid reading sstable blocks that are guaranteed to not contain the key we are looking for. This change provides one implementation of FilterPolicy based on bloom filters. Other changes: - Updated version number to 1.4. - Some build tweaks. - C binding for CompactRange. - A few more benchmarks: deleteseq, deleterandom, readmissing, seekrandom. - Minor .gitignore update.

Added bloom filter support.
In particular, we add a new FilterPolicy class. An instance of this class can be supplied in Options when opening a database. If supplied, the instance is used to generate summaries of keys (e.g., a bloom filter) which are placed in sstables. These summaries are consulted by DB::Get() so we can avoid reading sstable blocks that are guaranteed to not contain the key we are looking for. This change provides one implementation of FilterPolicy based on bloom filters. Other changes: - Updated version number to 1.4. - Some build tweaks. - C binding for CompactRange. - A few more benchmarks: deleteseq, deleterandom, readmissing, seekrandom. - Minor .gitignore update.
85584d49 · Sanjay Ghemawat · bc1ee4d2 · 85584d49 · 85584d49 · 85584d49
38 changed file
--- a/.gitignore
+++ b/.gitignore
 build_config.mk
 *.a
 *.o
+*.dylib*
+*.so
+*.so.*
 *_test
 db_bench
--- a/Makefile
+++ b/Makefile
@@ -17,8 +17,8 @@ OPT ?= -O2 -DNDEBUG       # (A) Production use (optimized mode)
 #-----------------------------------------------

 # detect what platform we're building on
-$(shell sh ./build_detect_platform)
-# this file is generated by build_detect_platform to set build flags and sources
+$(shell ./build_detect_platform build_config.mk)
+# this file is generated by the previous line to set build flags and sources
 include build_config.mk

 CFLAGS += -I. -I./include $(PLATFORM_CCFLAGS) $(OPT)
@@ -34,6 +34,7 @@ TESTHARNESS = ./util/testharness.o $(TESTUTIL)

 TESTS = \
 	arena_test \
+	bloom_test \
 	c_test \
 	cache_test \
 	coding_test \
@@ -43,6 +44,7 @@ TESTS = \
 	dbformat_test \
 	env_test \
 	filename_test \
+	filter_block_test \
 	log_test \
 	memenv_test \
 	skiplist_test \
@@ -63,7 +65,7 @@ default: all
 ifneq ($(PLATFORM_SHARED_EXT),)
 # Update db.h if you change these.
 SHARED_MAJOR = 1
-SHARED_MINOR = 3
+SHARED_MINOR = 4
 SHARED1 = libleveldb.$(PLATFORM_SHARED_EXT)
 SHARED2 = $(SHARED1).$(SHARED_MAJOR)
 SHARED3 = $(SHARED1).$(SHARED_MAJOR).$(SHARED_MINOR)
@@ -101,6 +103,9 @@ db_bench_tree_db: doc/bench/db_bench_tree_db.o $(LIBOBJECTS) $(TESTUTIL)
 arena_test: util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS)
 	$(CXX) util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LDFLAGS)

+bloom_test: util/bloom_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) util/bloom_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LDFLAGS)
+
 c_test: db/c_test.o $(LIBOBJECTS) $(TESTHARNESS)
 	$(CXX) db/c_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LDFLAGS)

@@ -128,6 +133,9 @@ env_test: util/env_test.o $(LIBOBJECTS) $(TESTHARNESS)
 filename_test: db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS)
 	$(CXX) db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LDFLAGS)

+filter_block_test: table/filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) table/filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LDFLAGS)
+
 log_test: db/log_test.o $(LIBOBJECTS) $(TESTHARNESS)
 	$(CXX) db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LDFLAGS)


--- a/build_detect_platform
+++ b/build_detect_platform
 #!/bin/sh
 #
-# Detects OS we're compiling on and generates build_config.mk,
-# which in turn gets read while processing Makefile.
+# Detects OS we're compiling on and outputs a file specified by the first
+# argument, which in turn gets read while processing Makefile.
 #
-# build_config.mk will set the following variables:
+# The output will set the following variables:
 #   PLATFORM_LDFLAGS            Linker flags
 #   PLATFORM_SHARED_EXT         Extension for shared libraries
 #   PLATFORM_SHARED_LDFLAGS     Flags for building shared library
@@ -13,11 +13,15 @@
 #       -DLEVELDB_PLATFORM_POSIX if cstdatomic is present
 #       -DLEVELDB_PLATFORM_NOATOMIC if it is not

-SCRIPT_DIR=`dirname $0`
+OUTPUT=$1
+if test -z "$OUTPUT"; then
+  echo "usage: $0 <output-filename>"
+  exit 1
+fi

-# Delete existing build_config.mk
-rm -f build_config.mk
-touch build_config.mk
+# Delete existing output, if it exists
+rm -f $OUTPUT
+touch $OUTPUT

 if test -z "$CXX"; then
    CXX=g++
@@ -96,7 +100,7 @@ esac
 # except for the test and benchmark files. By default, find will output a list
 # of all files matching either rule, so we need to append -print to make the
 # prune take effect.
-DIRS="$SCRIPT_DIR/util $SCRIPT_DIR/db $SCRIPT_DIR/table"
+DIRS="util db table"
 set -f # temporarily disable globbing so that our patterns aren't expanded
 PRUNE_TEST="-name *test*.cc -prune"
 PRUNE_BENCH="-name *_bench.cc -prune"
@@ -105,8 +109,8 @@ set +f # re-enable globbing

 # The sources consist of the portable files, plus the platform-specific port
 # file.
-echo "SOURCES=$PORTABLE_FILES $PORT_FILE" >> build_config.mk
-echo "MEMENV_SOURCES=helpers/memenv/memenv.cc" >> build_config.mk
+echo "SOURCES=$PORTABLE_FILES $PORT_FILE" >> $OUTPUT
+echo "MEMENV_SOURCES=helpers/memenv/memenv.cc" >> $OUTPUT

 if [ "$PLATFORM" = "OS_ANDROID_CROSSCOMPILE" ]; then
    # Cross-compiling; do not try any compilation tests.
@@ -147,10 +151,10 @@ fi
 PLATFORM_CCFLAGS="$PLATFORM_CCFLAGS $COMMON_FLAGS"
 PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS $COMMON_FLAGS"

-echo "PLATFORM=$PLATFORM" >> build_config.mk
-echo "PLATFORM_LDFLAGS=$PLATFORM_LDFLAGS" >> build_config.mk
-echo "PLATFORM_CCFLAGS=$PLATFORM_CCFLAGS" >> build_config.mk
-echo "PLATFORM_CXXFLAGS=$PLATFORM_CXXFLAGS" >> build_config.mk
-echo "PLATFORM_SHARED_CFLAGS=$PLATFORM_SHARED_CFLAGS" >> build_config.mk
-echo "PLATFORM_SHARED_EXT=$PLATFORM_SHARED_EXT" >> build_config.mk
-echo "PLATFORM_SHARED_LDFLAGS=$PLATFORM_SHARED_LDFLAGS" >> build_config.mk
+echo "PLATFORM=$PLATFORM" >> $OUTPUT
+echo "PLATFORM_LDFLAGS=$PLATFORM_LDFLAGS" >> $OUTPUT
+echo "PLATFORM_CCFLAGS=$PLATFORM_CCFLAGS" >> $OUTPUT
+echo "PLATFORM_CXXFLAGS=$PLATFORM_CXXFLAGS" >> $OUTPUT
+echo "PLATFORM_SHARED_CFLAGS=$PLATFORM_SHARED_CFLAGS" >> $OUTPUT
+echo "PLATFORM_SHARED_EXT=$PLATFORM_SHARED_EXT" >> $OUTPUT
+echo "PLATFORM_SHARED_LDFLAGS=$PLATFORM_SHARED_LDFLAGS" >> $OUTPUT
--- a/db/c.cc
+++ b/db/c.cc
@@ -10,6 +10,7 @@
 #include "leveldb/comparator.h"
 #include "leveldb/db.h"
 #include "leveldb/env.h"
+#include "leveldb/filter_policy.h"
 #include "leveldb/iterator.h"
 #include "leveldb/options.h"
 #include "leveldb/status.h"
@@ -21,8 +22,10 @@ using leveldb::CompressionType;
 using leveldb::DB;
 using leveldb::Env;
 using leveldb::FileLock;
+using leveldb::FilterPolicy;
 using leveldb::Iterator;
 using leveldb::Logger;
+using leveldb::NewBloomFilterPolicy;
 using leveldb::NewLRUCache;
 using leveldb::Options;
 using leveldb::RandomAccessFile;
@@ -78,6 +81,47 @@ struct leveldb_comparator_t : public Comparator {
  virtual void FindShortSuccessor(std::string* key) const { }
 };

+struct leveldb_filterpolicy_t : public FilterPolicy {
+  void* state_;
+  void (*destructor_)(void*);
+  const char* (*name_)(void*);
+  char* (*create_)(
+      void*,
+      const char* const* key_array, const size_t* key_length_array,
+      int num_keys,
+      size_t* filter_length);
+  unsigned char (*key_match_)(
+      void*,
+      const char* key, size_t length,
+      const char* filter, size_t filter_length);
+
+  virtual ~leveldb_filterpolicy_t() {
+    (*destructor_)(state_);
+  }
+
+  virtual const char* Name() const {
+    return (*name_)(state_);
+  }
+
+  virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const {
+    std::vector<const char*> key_pointers(n);
+    std::vector<size_t> key_sizes(n);
+    for (int i = 0; i < n; i++) {
+      key_pointers[i] = keys[i].data();
+      key_sizes[i] = keys[i].size();
+    }
+    size_t len;
+    char* filter = (*create_)(state_, &key_pointers[0], &key_sizes[0], n, &len);
+    dst->append(filter, len);
+    free(filter);
+  }
+
+  virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const {
+    return (*key_match_)(state_, key.data(), key.size(),
+                         filter.data(), filter.size());
+  }
+};
+
 struct leveldb_env_t {
  Env* rep;
  bool is_default;
@@ -218,6 +262,17 @@ void leveldb_approximate_sizes(
  delete[] ranges;
 }

+void leveldb_compact_range(
+    leveldb_t* db,
+    const char* start_key, size_t start_key_len,
+    const char* limit_key, size_t limit_key_len) {
+  Slice a, b;
+  db->rep->CompactRange(
+      // Pass NULL Slice if corresponding "const char*" is NULL
+      (start_key ? (a = Slice(start_key, start_key_len), &a) : NULL),
+      (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : NULL));
+}
+
 void leveldb_destroy_db(
    const leveldb_options_t* options,
    const char* name,
@@ -340,6 +395,12 @@ void leveldb_options_set_comparator(
  opt->rep.comparator = cmp;
 }

+void leveldb_options_set_filter_policy(
+    leveldb_options_t* opt,
+    leveldb_filterpolicy_t* policy) {
+  opt->rep.filter_policy = policy;
+}
+
 void leveldb_options_set_create_if_missing(
    leveldb_options_t* opt, unsigned char v) {
  opt->rep.create_if_missing = v;
@@ -407,6 +468,55 @@ void leveldb_comparator_destroy(leveldb_comparator_t* cmp) {
  delete cmp;
 }

+leveldb_filterpolicy_t* leveldb_filterpolicy_create(
+    void* state,
+    void (*destructor)(void*),
+    char* (*create_filter)(
+        void*,
+        const char* const* key_array, const size_t* key_length_array,
+        int num_keys,
+        size_t* filter_length),
+    unsigned char (*key_may_match)(
+        void*,
+        const char* key, size_t length,
+        const char* filter, size_t filter_length),
+    const char* (*name)(void*)) {
+  leveldb_filterpolicy_t* result = new leveldb_filterpolicy_t;
+  result->state_ = state;
+  result->destructor_ = destructor;
+  result->create_ = create_filter;
+  result->key_match_ = key_may_match;
+  result->name_ = name;
+  return result;
+}
+
+void leveldb_filterpolicy_destroy(leveldb_filterpolicy_t* filter) {
+  delete filter;
+}
+
+leveldb_filterpolicy_t* leveldb_filterpolicy_create_bloom(int bits_per_key) {
+  // Make a leveldb_filterpolicy_t, but override all of its methods so
+  // they delegate to a NewBloomFilterPolicy() instead of user
+  // supplied C functions.
+  struct Wrapper : public leveldb_filterpolicy_t {
+    const FilterPolicy* rep_;
+    ~Wrapper() { delete rep_; }
+    const char* Name() const { return rep_->Name(); }
+    void CreateFilter(const Slice* keys, int n, std::string* dst) const {
+      return rep_->CreateFilter(keys, n, dst);
+    }
+    bool KeyMayMatch(const Slice& key, const Slice& filter) const {
+      return rep_->KeyMayMatch(key, filter);
+    }
+    static void DoNothing(void*) { }
+  };
+  Wrapper* wrapper = new Wrapper;
+  wrapper->rep_ = NewBloomFilterPolicy(bits_per_key);
+  wrapper->state_ = NULL;
+  wrapper->destructor_ = &Wrapper::DoNothing;
+  return wrapper;
+}
+
 leveldb_readoptions_t* leveldb_readoptions_create() {
  return new leveldb_readoptions_t;
 }

--- a/db/c_test.c
+++ b/db/c_test.c
@@ -122,6 +122,31 @@ static const char* CmpName(void* arg) {
  return "foo";
 }

+// Custom filter policy
+static unsigned char fake_filter_result = 1;
+static void FilterDestroy(void* arg) { }
+static const char* FilterName(void* arg) {
+  return "TestFilter";
+}
+static char* FilterCreate(
+    void* arg,
+    const char* const* key_array, const size_t* key_length_array,
+    int num_keys,
+    size_t* filter_length) {
+  *filter_length = 4;
+  char* result = malloc(4);
+  memcpy(result, "fake", 4);
+  return result;
+}
+unsigned char FilterKeyMatch(
+    void* arg,
+    const char* key, size_t length,
+    const char* filter, size_t filter_length) {
+  CheckCondition(filter_length == 4);
+  CheckCondition(memcmp(filter, "fake", 4) == 0);
+  return fake_filter_result;
+}
+
 int main(int argc, char** argv) {
  leveldb_t* db;
  leveldb_comparator_t* cmp;
@@ -131,6 +156,7 @@ int main(int argc, char** argv) {
  leveldb_readoptions_t* roptions;
  leveldb_writeoptions_t* woptions;
  char* err = NULL;
+  int run = -1;

  snprintf(dbname, sizeof(dbname), "/tmp/leveldb_c_test-%d",
           ((int) geteuid()));
@@ -180,6 +206,14 @@ int main(int argc, char** argv) {
  CheckNoError(err);
  CheckGet(db, roptions, "foo", "hello");

+  StartPhase("compactall");
+  leveldb_compact_range(db, NULL, 0, NULL, 0);
+  CheckGet(db, roptions, "foo", "hello");
+
+  StartPhase("compactrange");
+  leveldb_compact_range(db, "a", 1, "z", 1);
+  CheckGet(db, roptions, "foo", "hello");
+
  StartPhase("writebatch");
  {
    leveldb_writebatch_t* wb = leveldb_writebatch_create();
@@ -279,6 +313,49 @@ int main(int argc, char** argv) {
    CheckGet(db, roptions, "foo", NULL);
    CheckGet(db, roptions, "bar", NULL);
    CheckGet(db, roptions, "box", "c");
+    leveldb_options_set_create_if_missing(options, 1);
+    leveldb_options_set_error_if_exists(options, 1);
+  }
+
+  StartPhase("filter");
+  for (run = 0; run < 2; run++) {
+    // First run uses custom filter, second run uses bloom filter
+    CheckNoError(err);
+    leveldb_filterpolicy_t* policy;
+    if (run == 0) {
+      policy = leveldb_filterpolicy_create(
+          NULL, FilterDestroy, FilterCreate, FilterKeyMatch, FilterName);
+    } else {
+      policy = leveldb_filterpolicy_create_bloom(10);
+    }
+
+    // Create new database
+    leveldb_close(db);
+    leveldb_destroy_db(options, dbname, &err);
+    leveldb_options_set_filter_policy(options, policy);
+    db = leveldb_open(options, dbname, &err);
+    CheckNoError(err);
+    leveldb_put(db, woptions, "foo", 3, "foovalue", 8, &err);
+    CheckNoError(err);
+    leveldb_put(db, woptions, "bar", 3, "barvalue", 8, &err);
+    CheckNoError(err);
+    leveldb_compact_range(db, NULL, 0, NULL, 0);
+
+    fake_filter_result = 1;
+    CheckGet(db, roptions, "foo", "foovalue");
+    CheckGet(db, roptions, "bar", "barvalue");
+    if (phase == 0) {
+      // Must not find value when custom filter returns false
+      fake_filter_result = 0;
+      CheckGet(db, roptions, "foo", NULL);
+      CheckGet(db, roptions, "bar", NULL);
+      fake_filter_result = 1;
+
+      CheckGet(db, roptions, "foo", "foovalue");
+      CheckGet(db, roptions, "bar", "barvalue");
+    }
+    leveldb_options_set_filter_policy(options, NULL);
+    leveldb_filterpolicy_destroy(policy);
  }

  StartPhase("cleanup");

--- a/db/db_bench.cc
+++ b/db/db_bench.cc
@@ -25,15 +25,20 @@
 //      overwrite     -- overwrite N values in random key order in async mode
 //      fillsync      -- write N/100 values in random key order in sync mode
 //      fill100K      -- write N/1000 100K values in random order in async mode
+//      deleteseq     -- delete N keys in sequential order
+//      deleterandom  -- delete N keys in random order
 //      readseq       -- read N times sequentially
 //      readreverse   -- read N times in reverse order
 //      readrandom    -- read N times in random order
+//      readmissing   -- read N missing keys in random order
 //      readhot       -- read N times in random order from 1% section of DB
+//      seekrandom    -- N random seeks
 //      crc32c        -- repeated crc32c of 4K of data
 //      acquireload   -- load N*1000 times
 //   Meta operations:
 //      compact     -- Compact the entire DB
 //      stats       -- Print DB stats
+//      sstables    -- Print sstable info
 //      heapprofile -- Dump a heap profile (if supported by this port)
 static const char* FLAGS_benchmarks =
    "fillseq,"
@@ -85,6 +90,10 @@ static int FLAGS_cache_size = -1;
 // Maximum number of files to keep open at the same time (use default if == 0)
 static int FLAGS_open_files = 0;

+// Bloom filter bits per key.
+// Negative means use default settings.
+static int FLAGS_bloom_bits = -1;
+
 // If true, do not destroy the existing database.  If you set this
 // flag and also specify a benchmark that wants a fresh database, that
 // benchmark will fail.
@@ -293,6 +302,7 @@ struct ThreadState {
 class Benchmark {
 private:
  Cache* cache_;
+  const FilterPolicy* filter_policy_;
  DB* db_;
  int num_;
  int value_size_;
@@ -378,6 +388,9 @@ class Benchmark {
 public:
  Benchmark()
  : cache_(FLAGS_cache_size >= 0 ? NewLRUCache(FLAGS_cache_size) : NULL),
+    filter_policy_(FLAGS_bloom_bits >= 0
+                   ? NewBloomFilterPolicy(FLAGS_bloom_bits)
+                   : NULL),
    db_(NULL),
    num_(FLAGS_num),
    value_size_(FLAGS_value_size),
@@ -399,6 +412,7 @@ class Benchmark {
  ~Benchmark() {
    delete db_;
    delete cache_;
+    delete filter_policy_;
  }

  void Run() {
@@ -457,11 +471,19 @@ class Benchmark {
        method = &Benchmark::ReadReverse;
      } else if (name == Slice("readrandom")) {
        method = &Benchmark::ReadRandom;
+      } else if (name == Slice("readmissing")) {
+        method = &Benchmark::ReadMissing;
+      } else if (name == Slice("seekrandom")) {
+        method = &Benchmark::SeekRandom;
      } else if (name == Slice("readhot")) {
        method = &Benchmark::ReadHot;
      } else if (name == Slice("readrandomsmall")) {
        reads_ /= 1000;
        method = &Benchmark::ReadRandom;
+      } else if (name == Slice("deleteseq")) {
+        method = &Benchmark::DeleteSeq;
+      } else if (name == Slice("deleterandom")) {
+        method = &Benchmark::DeleteRandom;
      } else if (name == Slice("readwhilewriting")) {
        num_threads++;  // Add extra thread for writing
        method = &Benchmark::ReadWhileWriting;
@@ -478,7 +500,9 @@ class Benchmark {
      } else if (name == Slice("heapprofile")) {
        HeapProfile();
      } else if (name == Slice("stats")) {
-        PrintStats();
+        PrintStats("leveldb.stats");
+      } else if (name == Slice("sstables")) {
+        PrintStats("leveldb.sstables");
      } else {
        if (name != Slice()) {  // No error message for empty name
          fprintf(stderr, "unknown benchmark '%s'\n", name.ToString().c_str());
@@ -669,6 +693,7 @@ class Benchmark {
    options.create_if_missing = !FLAGS_use_existing_db;
    options.block_cache = cache_;
    options.write_buffer_size = FLAGS_write_buffer_size;
+    options.filter_policy = filter_policy_;
    Status s = DB::Open(options, FLAGS_db, &db_);
    if (!s.ok()) {
      fprintf(stderr, "open error: %s\n", s.ToString().c_str());
@@ -743,10 +768,28 @@ class Benchmark {
  void ReadRandom(ThreadState* thread) {
    ReadOptions options;
    std::string value;
+    int found = 0;
    for (int i = 0; i < reads_; i++) {
      char key[100];
      const int k = thread->rand.Next() % FLAGS_num;
      snprintf(key, sizeof(key), "%016d", k);
+      if (db_->Get(options, key, &value).ok()) {
+        found++;
+      }
+      thread->stats.FinishedSingleOp();
+    }
+    char msg[100];
+    snprintf(msg, sizeof(msg), "(%d of %d found)", found, num_);
+    thread->stats.AddMessage(msg);
+  }
+
+  void ReadMissing(ThreadState* thread) {
+    ReadOptions options;
+    std::string value;
+    for (int i = 0; i < reads_; i++) {
+      char key[100];
+      const int k = thread->rand.Next() % FLAGS_num;
+      snprintf(key, sizeof(key), "%016d.", k);
      db_->Get(options, key, &value);
      thread->stats.FinishedSingleOp();
    }
@@ -765,6 +808,54 @@ class Benchmark {
    }
  }

+  void SeekRandom(ThreadState* thread) {
+    ReadOptions options;
+    std::string value;
+    int found = 0;
+    for (int i = 0; i < reads_; i++) {
+      Iterator* iter = db_->NewIterator(options);
+      char key[100];
+      const int k = thread->rand.Next() % FLAGS_num;
+      snprintf(key, sizeof(key), "%016d", k);
+      iter->Seek(key);
+      if (iter->Valid() && iter->key() == key) found++;
+      delete iter;
+      thread->stats.FinishedSingleOp();
+    }
+    char msg[100];
+    snprintf(msg, sizeof(msg), "(%d of %d found)", found, num_);
+    thread->stats.AddMessage(msg);
+  }
+
+  void DoDelete(ThreadState* thread, bool seq) {
+    RandomGenerator gen;
+    WriteBatch batch;
+    Status s;
+    for (int i = 0; i < num_; i += entries_per_batch_) {
+      batch.Clear();
+      for (int j = 0; j < entries_per_batch_; j++) {
+        const int k = seq ? i+j : (thread->rand.Next() % FLAGS_num);
+        char key[100];
+        snprintf(key, sizeof(key), "%016d", k);
+        batch.Delete(key);
+        thread->stats.FinishedSingleOp();
+      }
+      s = db_->Write(write_options_, &batch);
+      if (!s.ok()) {
+        fprintf(stderr, "del error: %s\n", s.ToString().c_str());
+        exit(1);
+      }
+    }
+  }
+
+  void DeleteSeq(ThreadState* thread) {
+    DoDelete(thread, true);
+  }
+
+  void DeleteRandom(ThreadState* thread) {
+    DoDelete(thread, false);
+  }
+
  void ReadWhileWriting(ThreadState* thread) {
    if (thread->tid > 0) {
      ReadRandom(thread);
@@ -799,9 +890,9 @@ class Benchmark {
    db_->CompactRange(NULL, NULL);
  }

-  void PrintStats() {
+  void PrintStats(const char* key) {
    std::string stats;
-    if (!db_->GetProperty("leveldb.stats", &stats)) {
+    if (!db_->GetProperty(key, &stats)) {
      stats = "(failed)";
    }
    fprintf(stdout, "\n%s\n", stats.c_str());
@@ -861,6 +952,8 @@ int main(int argc, char** argv) {
      FLAGS_write_buffer_size = n;
    } else if (sscanf(argv[i], "--cache_size=%d%c", &n, &junk) == 1) {
      FLAGS_cache_size = n;
+    } else if (sscanf(argv[i], "--bloom_bits=%d%c", &n, &junk) == 1) {
+      FLAGS_bloom_bits = n;
    } else if (sscanf(argv[i], "--open_files=%d%c", &n, &junk) == 1) {
      FLAGS_open_files = n;
    } else if (strncmp(argv[i], "--db=", 5) == 0) {

--- a/db/db_impl.cc
+++ b/db/db_impl.cc
@@ -87,12 +87,14 @@ static void ClipToRange(T* ptr, V minvalue, V maxvalue) {
 }
 Options SanitizeOptions(const std::string& dbname,
                        const InternalKeyComparator* icmp,
+                        const InternalFilterPolicy* ipolicy,
                        const Options& src) {
  Options result = src;
  result.comparator = icmp;
-  ClipToRange(&result.max_open_files,           20,     50000);
-  ClipToRange(&result.write_buffer_size,        64<<10, 1<<30);
-  ClipToRange(&result.block_size,               1<<10,  4<<20);
+  result.filter_policy = (src.filter_policy != NULL) ? ipolicy : NULL;
+  ClipToRange(&result.max_open_files,            20,     50000);
+  ClipToRange(&result.write_buffer_size,         64<<10, 1<<30);
+  ClipToRange(&result.block_size,                1<<10,  4<<20);
  if (result.info_log == NULL) {
    // Open a log file in the same directory as the db
    src.env->CreateDir(dbname);  // In case it does not exist
@@ -112,7 +114,9 @@ Options SanitizeOptions(const std::string& dbname,
 DBImpl::DBImpl(const Options& options, const std::string& dbname)
    : env_(options.env),
      internal_comparator_(options.comparator),
-      options_(SanitizeOptions(dbname, &internal_comparator_, options)),
+      internal_filter_policy_(options.filter_policy),
+      options_(SanitizeOptions(
+          dbname, &internal_comparator_, &internal_filter_policy_, options)),
      owns_info_log_(options_.info_log != options.info_log),
      owns_cache_(options_.block_cache != options.block_cache),
      dbname_(dbname),

--- a/db/db_impl.h
+++ b/db/db_impl.h
@@ -105,6 +105,7 @@ class DBImpl : public DB {
  // Constant after construction
  Env* const env_;
  const InternalKeyComparator internal_comparator_;
+  const InternalFilterPolicy internal_filter_policy_;
  const Options options_;  // options_.comparator == &internal_comparator_
  bool owns_info_log_;
  bool owns_cache_;
@@ -185,6 +186,7 @@ class DBImpl : public DB {
 // it is not equal to src.info_log.
 extern Options SanitizeOptions(const std::string& db,
                               const InternalKeyComparator* icmp,
+                               const InternalFilterPolicy* ipolicy,
                               const Options& src);

 }  // namespace leveldb

--- a/db/db_test.cc
+++ b/db/db_test.cc
--- a/db/dbformat.cc
+++ b/db/dbformat.cc
@@ -98,6 +98,26 @@ void InternalKeyComparator::FindShortSuccessor(std::string* key) const {
  }
 }

+const char* InternalFilterPolicy::Name() const {
+  return user_policy_->Name();
+}
+
+void InternalFilterPolicy::CreateFilter(const Slice* keys, int n,
+                                        std::string* dst) const {
+  // We rely on the fact that the code in table.cc does not mind us
+  // adjusting keys[].
+  Slice* mkey = const_cast<Slice*>(keys);
+  for (int i = 0; i < n; i++) {
+    mkey[i] = ExtractUserKey(keys[i]);
+    // TODO(sanjay): Suppress dups?
+  }
+  user_policy_->CreateFilter(keys, n, dst);
+}
+
+bool InternalFilterPolicy::KeyMayMatch(const Slice& key, const Slice& f) const {
+  return user_policy_->KeyMayMatch(ExtractUserKey(key), f);
+}
+
 LookupKey::LookupKey(const Slice& user_key, SequenceNumber s) {
  size_t usize = user_key.size();
  size_t needed = usize + 13;  // A conservative estimate

--- a/db/dbformat.h
+++ b/db/dbformat.h
@@ -8,6 +8,7 @@
 #include <stdio.h>
 #include "leveldb/comparator.h"
 #include "leveldb/db.h"
+#include "leveldb/filter_policy.h"
 #include "leveldb/slice.h"
 #include "leveldb/table_builder.h"
 #include "util/coding.h"
@@ -123,6 +124,17 @@ class InternalKeyComparator : public Comparator {
  int Compare(const InternalKey& a, const InternalKey& b) const;
 };

+// Filter policy wrapper that converts from internal keys to user keys
+class InternalFilterPolicy : public FilterPolicy {
+ private:
+  const FilterPolicy* const user_policy_;
+ public:
+  explicit InternalFilterPolicy(const FilterPolicy* p) : user_policy_(p) { }
+  virtual const char* Name() const;
+  virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const;
+  virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const;
+};
+
 // Modules in this directory should keep internal keys wrapped inside
 // the following class instead of plain strings so that we do not
 // incorrectly use string comparisons instead of an InternalKeyComparator.

--- a/db/repair.cc
+++ b/db/repair.cc
@@ -48,7 +48,8 @@ class Repairer {
      : dbname_(dbname),
        env_(options.env),
        icmp_(options.comparator),
-        options_(SanitizeOptions(dbname, &icmp_, options)),
+        ipolicy_(options.filter_policy),
+        options_(SanitizeOptions(dbname, &icmp_, &ipolicy_, options)),
        owns_info_log_(options_.info_log != options.info_log),
        owns_cache_(options_.block_cache != options.block_cache),
        next_file_number_(1) {
@@ -99,6 +100,7 @@ class Repairer {
  std::string const dbname_;
  Env* const env_;
  InternalKeyComparator const icmp_;
+  InternalFilterPolicy const ipolicy_;
  Options const options_;
  bool owns_info_log_;
  bool owns_cache_;

--- a/db/table_cache.cc
+++ b/db/table_cache.cc
@@ -42,23 +42,18 @@ TableCache::~TableCache() {
  delete cache_;
 }

-Iterator* TableCache::NewIterator(const ReadOptions& options,
-                                  uint64_t file_number,
-                                  uint64_t file_size,
-                                  Table** tableptr) {
-  if (tableptr != NULL) {
-    *tableptr = NULL;
-  }
-
+Status TableCache::FindTable(uint64_t file_number, uint64_t file_size,
+                             Cache::Handle** handle) {
+  Status s;
  char buf[sizeof(file_number)];
  EncodeFixed64(buf, file_number);
  Slice key(buf, sizeof(buf));
-  Cache::Handle* handle = cache_->Lookup(key);
-  if (handle == NULL) {
+  *handle = cache_->Lookup(key);
+  if (*handle == NULL) {
    std::string fname = TableFileName(dbname_, file_number);
    RandomAccessFile* file = NULL;
    Table* table = NULL;
-    Status s = env_->NewRandomAccessFile(fname, &file);
+    s = env_->NewRandomAccessFile(fname, &file);
    if (s.ok()) {
      s = Table::Open(*options_, file, file_size, &table);
    }
@@ -68,13 +63,28 @@ Iterator* TableCache::NewIterator(const ReadOptions& options,
      delete file;
      // We do not cache error results so that if the error is transient,
      // or somebody repairs the file, we recover automatically.
-      return NewErrorIterator(s);
+    } else {
+      TableAndFile* tf = new TableAndFile;
+      tf->file = file;
+      tf->table = table;
+      *handle = cache_->Insert(key, tf, 1, &DeleteEntry);
    }
+  }
+  return s;
+}

-    TableAndFile* tf = new TableAndFile;
-    tf->file = file;
-    tf->table = table;
-    handle = cache_->Insert(key, tf, 1, &DeleteEntry);
+Iterator* TableCache::NewIterator(const ReadOptions& options,
+                                  uint64_t file_number,
+                                  uint64_t file_size,
+                                  Table** tableptr) {
+  if (tableptr != NULL) {
+    *tableptr = NULL;
+  }
+
+  Cache::Handle* handle = NULL;
+  Status s = FindTable(file_number, file_size, &handle);
+  if (!s.ok()) {
+    return NewErrorIterator(s);
  }

  Table* table = reinterpret_cast<TableAndFile*>(cache_->Value(handle))->table;
@@ -86,6 +96,22 @@ Iterator* TableCache::NewIterator(const ReadOptions& options,
  return result;
 }

+Status TableCache::Get(const ReadOptions& options,
+                       uint64_t file_number,
+                       uint64_t file_size,
+                       const Slice& k,
+                       void* arg,
+                       void (*saver)(void*, const Slice&, const Slice&)) {
+  Cache::Handle* handle = NULL;
+  Status s = FindTable(file_number, file_size, &handle);
+  if (s.ok()) {
+    Table* t = reinterpret_cast<TableAndFile*>(cache_->Value(handle))->table;
+    s = t->InternalGet(options, k, arg, saver);
+    cache_->Release(handle);
+  }
+  return s;
+}
+
 void TableCache::Evict(uint64_t file_number) {
  char buf[sizeof(file_number)];
  EncodeFixed64(buf, file_number);

--- a/db/table_cache.h
+++ b/db/table_cache.h
@@ -35,6 +35,15 @@ class TableCache {
                        uint64_t file_size,
                        Table** tableptr = NULL);

+  // If a seek to internal key "k" in specified file finds an entry,
+  // call (*handle_result)(arg, found_key, found_value).
+  Status Get(const ReadOptions& options,
+             uint64_t file_number,
+             uint64_t file_size,
+             const Slice& k,
+             void* arg,
+             void (*handle_result)(void*, const Slice&, const Slice&));
+
  // Evict any entry for the specified file number
  void Evict(uint64_t file_number);

@@ -43,6 +52,8 @@ class TableCache {
  const std::string dbname_;
  const Options* options_;
  Cache* cache_;
+
+  Status FindTable(uint64_t file_number, uint64_t file_size, Cache::Handle**);
 };

 }  // namespace leveldb

--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -255,35 +255,34 @@ void Version::AddIterators(const ReadOptions& options,
  }
 }

-// If "*iter" points at a value or deletion for user_key, store
-// either the value, or a NotFound error and return true.
-// Else return false.
-static bool GetValue(const Comparator* cmp,
-                     Iterator* iter, const Slice& user_key,
-                     std::string* value,
-                     Status* s) {
-  if (!iter->Valid()) {
-    return false;
-  }
+// Callback from TableCache::Get()
+namespace {
+enum SaverState {
+  kNotFound,
+  kFound,
+  kDeleted,
+  kCorrupt,
+};
+struct Saver {
+  SaverState state;
+  const Comparator* ucmp;
+  Slice user_key;
+  std::string* value;
+};
+}
+static void SaveValue(void* arg, const Slice& ikey, const Slice& v) {
+  Saver* s = reinterpret_cast<Saver*>(arg);
  ParsedInternalKey parsed_key;
-  if (!ParseInternalKey(iter->key(), &parsed_key)) {
-    *s = Status::Corruption("corrupted key for ", user_key);
-    return true;
-  }
-  if (cmp->Compare(parsed_key.user_key, user_key) != 0) {
-    return false;
-  }
-  switch (parsed_key.type) {
-    case kTypeDeletion:
-      *s = Status::NotFound(Slice());  // Use an empty error message for speed
-      break;
-    case kTypeValue: {
-      Slice v = iter->value();
-      value->assign(v.data(), v.size());
-      break;
+  if (!ParseInternalKey(ikey, &parsed_key)) {
+    s->state = kCorrupt;
+  } else {
+    if (s->ucmp->Compare(parsed_key.user_key, s->user_key) == 0) {
+      s->state = (parsed_key.type == kTypeValue) ? kFound : kDeleted;
+      if (s->state == kFound) {
+        s->value->assign(v.data(), v.size());
+      }
    }
  }
-  return true;
 }

 static bool NewestFirst(FileMetaData* a, FileMetaData* b) {
@@ -361,21 +360,27 @@ Status Version::Get(const ReadOptions& options,
      last_file_read = f;
      last_file_read_level = level;

-      Iterator* iter = vset_->table_cache_->NewIterator(
-          options,
-          f->number,
-          f->file_size);
-      iter->Seek(ikey);
-      const bool done = GetValue(ucmp, iter, user_key, value, &s);
-      if (!iter->status().ok()) {
-        s = iter->status();
-        delete iter;
+      Saver saver;
+      saver.state = kNotFound;
+      saver.ucmp = ucmp;
+      saver.user_key = user_key;
+      saver.value = value;
+      s = vset_->table_cache_->Get(options, f->number, f->file_size,
+                                   ikey, &saver, SaveValue);
+      if (!s.ok()) {
        return s;
-      } else {
-        delete iter;
-        if (done) {
+      }
+      switch (saver.state) {
+        case kNotFound:
+          break;      // Keep searching in other files
+        case kFound:
+          return s;
+        case kDeleted:
+          s = Status::NotFound(Slice());  // Use empty error message for speed
+          return s;
+        case kCorrupt:
+          s = Status::Corruption("corrupted key for ", user_key);
          return s;
-        }
      }
    }
  }

--- a/doc/index.html
+++ b/doc/index.html
@@ -400,6 +400,69 @@ We might want to prefix <code>filename</code> keys with one letter (say '/') and
 over just the metadata do not force us to fetch and cache bulky file
 contents.
 <p>
+<h2>Filters</h2>
+<p>
+Because of the way <code>leveldb</code> data is organized on disk,
+a single <code>Get()</code> call may involve multiple reads from disk.
+The optional <code>FilterPolicy</code> mechanism can be used to reduce
+the number of disk reads substantially.
+<pre>
+   leveldb::Options options;
+   options.filter_policy = NewBloomFilter(10);
+   leveldb::DB* db;
+   leveldb::DB::Open(options, "/tmp/testdb", &amp;db);
+   ... use the database ...
+   delete db;
+   delete options.filter_policy;
+</pre>
+The preceding code associates a
+<a href="http://en.wikipedia.org/wiki/Bloom_filter">Bloom filter</a>
+based filtering policy with the database.  Bloom filter based
+filtering relies on keeping some number of bits of data in memory per
+key (in this case 10 bits per key since that is the argument we passed
+to NewBloomFilter).  This filter will reduce the number of unnecessary
+disk reads needed for <code>Get()</code> calls by a factor of
+approximately a 100.  Increasing the bits per key will lead to a
+larger reduction at the cost of more memory usage.  We recommend that
+applications whose working set does not fit in memory and that do a
+lot of random reads set a filter policy.
+<p>
+If you are using a custom comparator, you should ensure that the filter
+policy you are using is compatible with your comparator.  For example,
+consider a comparator that ignores trailing spaces when comparing keys.
+<code>NewBloomFilter</code> must not be used with such a comparator.
+Instead, the application should provide a custom filter policy that
+also ignores trailing spaces.  For example:
+<pre>
+  class CustomFilterPolicy : public leveldb::FilterPolicy {
+   private:
+    FilterPolicy* builtin_policy_;
+   public:
+    CustomFilterPolicy() : builtin_policy_(NewBloomFilter(10)) { }
+    ~CustomFilterPolicy() { delete builtin_policy_; }
+
+    const char* Name() const { return "IgnoreTrailingSpacesFilter"; }
+
+    void CreateFilter(const Slice* keys, int n, std::string* dst) const {
+      // Use builtin bloom filter code after removing trailing spaces
+      std::vector&lt;Slice&gt; trimmed(n);
+      for (int i = 0; i &lt; n; i++) {
+        trimmed[i] = RemoveTrailingSpaces(keys[i]);
+      }
+      return builtin_policy_-&gt;CreateFilter(&amp;trimmed[i], n, dst);
+    }
+
+    bool KeyMayMatch(const Slice& key, const Slice& filter) const {
+      // Use builtin bloom filter code after removing trailing spaces
+      return builtin_policy_-&gt;KeyMayMatch(RemoveTrailingSpaces(key), filter);
+    }
+  };
+</pre>
+<p>
+Advanced applications may provide a filter policy that does not use
+a bloom filter but uses some other mechanism for summarizing a set
+of keys.  See <code>leveldb/filter_policy.h</code> for detail.
+<p>
 <h1>Checksums</h1>
 <p>
 <code>leveldb</code> associates checksums with all data it stores in the file system.

--- a/doc/table_format.txt
+++ b/doc/table_format.txt
@@ -47,6 +47,47 @@ the BlockHandle of the metaindex and index blocks as well as a magic number.
       			 	       // (40==2*BlockHandle::kMaxEncodedLength)
       magic:		       fixed64;    // == 0xdb4775248b80fb57

+"filter" Meta Block
+-------------------
+
+If a "FilterPolicy" was specified when the database was opened, a
+filter block is stored in each table.  The "metaindex" block contains
+an entry that maps from "filter.<N>" to the BlockHandle for the filter
+block where "<N>" is the string returned by the filter policy's
+"Name()" method.
+
+The filter block stores a sequence of filters, where filter i contains
+the output of FilterPolicy::CreateFilter() on all keys that are stored
+in a block whose file offset falls within the range
+
+    [ i*base ... (i+1)*base-1 ]
+
+Currently, "base" is 2KB.  So for example, if blocks X and Y start in
+the range [ 0KB .. 2KB-1 ], all of the keys in X and Y will be
+converted to a filter by calling FilterPolicy::CreateFilter(), and the
+resulting filter will be stored as the first filter in the filter
+block.
+
+The filter block is formatted as follows:
+
+     [filter 0]
+     [filter 1]
+     [filter 2]
+     ...
+     [filter N-1]
+
+     [offset of filter 0]                  : 4 bytes
+     [offset of filter 1]                  : 4 bytes
+     [offset of filter 2]                  : 4 bytes
+     ...
+     [offset of filter N-1]                : 4 bytes
+
+     [offset of beginning of offset array] : 4 bytes
+     lg(base)                              : 1 byte
+
+The offset array at the end of the filter block allows efficient
+mapping from a data block offset to the corresponding filter.
+
 "stats" Meta Block
 ------------------


--- a/include/leveldb/c.h
+++ b/include/leveldb/c.h
@@ -55,6 +55,7 @@ typedef struct leveldb_cache_t         leveldb_cache_t;
 typedef struct leveldb_comparator_t    leveldb_comparator_t;
 typedef struct leveldb_env_t           leveldb_env_t;
 typedef struct leveldb_filelock_t      leveldb_filelock_t;
+typedef struct leveldb_filterpolicy_t  leveldb_filterpolicy_t;
 typedef struct leveldb_iterator_t      leveldb_iterator_t;
 typedef struct leveldb_logger_t        leveldb_logger_t;
 typedef struct leveldb_options_t       leveldb_options_t;
@@ -127,6 +128,11 @@ extern void leveldb_approximate_sizes(
    const char* const* range_limit_key, const size_t* range_limit_key_len,
    uint64_t* sizes);

+extern void leveldb_compact_range(
+    leveldb_t* db,
+    const char* start_key, size_t start_key_len,
+    const char* limit_key, size_t limit_key_len);
+
 /* Management operations */

 extern void leveldb_destroy_db(
@@ -177,6 +183,9 @@ extern void leveldb_options_destroy(leveldb_options_t*);
 extern void leveldb_options_set_comparator(
    leveldb_options_t*,
    leveldb_comparator_t*);
+extern void leveldb_options_set_filter_policy(
+    leveldb_options_t*,
+    leveldb_filterpolicy_t*);
 extern void leveldb_options_set_create_if_missing(
    leveldb_options_t*, unsigned char);
 extern void leveldb_options_set_error_if_exists(
@@ -209,6 +218,26 @@ extern leveldb_comparator_t* leveldb_comparator_create(
    const char* (*name)(void*));
 extern void leveldb_comparator_destroy(leveldb_comparator_t*);

+/* Filter policy */
+
+extern leveldb_filterpolicy_t* leveldb_filterpolicy_create(
+    void* state,
+    void (*destructor)(void*),
+    char* (*create_filter)(
+        void*,
+        const char* const* key_array, const size_t* key_length_array,
+        int num_keys,
+        size_t* filter_length),
+    unsigned char (*key_may_match)(
+        void*,
+        const char* key, size_t length,
+        const char* filter, size_t filter_length),
+    const char* (*name)(void*));
+extern void leveldb_filterpolicy_destroy(leveldb_filterpolicy_t*);
+
+extern leveldb_filterpolicy_t* leveldb_filterpolicy_create_bloom(
+    int bits_per_key);
+
 /* Read options */

 extern leveldb_readoptions_t* leveldb_readoptions_create();

--- a/include/leveldb/db.h
+++ b/include/leveldb/db.h
@@ -14,7 +14,7 @@ namespace leveldb {

 // Update Makefile if you change these
 static const int kMajorVersion = 1;
-static const int kMinorVersion = 3;
+static const int kMinorVersion = 4;

 struct Options;
 struct ReadOptions;

--- a/include/leveldb/filter_policy.h
+++ b/include/leveldb/filter_policy.h
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// A database can be configured with a custom FilterPolicy object.
+// This object is responsible for creating a small filter from a set
+// of keys.  These filters are stored in leveldb and are consulted
+// automatically by leveldb to decide whether or not to read some
+// information from disk. In many cases, a filter can cut down the
+// number of disk seeks form a handful to a single disk seek per
+// DB::Get() call.
+//
+// Most people will want to use the builtin bloom filter support (see
+// NewBloomFilterPolicy() below).
+
+#ifndef STORAGE_LEVELDB_INCLUDE_FILTER_POLICY_H_
+#define STORAGE_LEVELDB_INCLUDE_FILTER_POLICY_H_
+
+#include <string>
+
+namespace leveldb {
+
+class Slice;
+
+class FilterPolicy {
+ public:
+  virtual ~FilterPolicy();
+
+  // Return the name of this policy.  Note that if the filter encoding
+  // changes in an incompatible way, the name returned by this method
+  // must be changed.  Otherwise, old incompatible filters may be
+  // passed to methods of this type.
+  virtual const char* Name() const = 0;
+
+  // keys[0,n-1] contains a list of keys (potentially with duplicates)
+  // that are ordered according to the user supplied comparator.
+  // Append a filter that summarizes keys[0,n-1] to *dst.
+  //
+  // Warning: do not change the initial contents of *dst.  Instead,
+  // append the newly constructed filter to *dst.
+  virtual void CreateFilter(const Slice* keys, int n, std::string* dst)
+      const = 0;
+
+  // "filter" contains the data appended by a preceding call to
+  // CreateFilter() on this class.  This method must return true if
+  // the key was in the list of keys passed to CreateFilter().
+  // This method may return true or false if the key was not on the
+  // list, but it should aim to return false with a high probability.
+  virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const = 0;
+};
+
+// Return a new filter policy that uses a bloom filter with approximately
+// the specified number of bits per key.  A good value for bits_per_key
+// is 10, which yields a filter with ~ 1% false positive rate.
+//
+// Callers must delete the result after any database that is using the
+// result has been closed.
+//
+// Note: if you are using a custom comparator that ignores some parts
+// of the keys being compared, you must not use NewBloomFilterPolicy()
+// and must provide your own FilterPolicy that also ignores the
+// corresponding parts of the keys.  For example, if the comparator
+// ignores trailing spaces, it would be incorrect to use a
+// FilterPolicy (like NewBloomFilterPolicy) that does not ignore
+// trailing spaces in keys.
+extern const FilterPolicy* NewBloomFilterPolicy(int bits_per_key);
+
+}
+
+#endif  // STORAGE_LEVELDB_INCLUDE_FILTER_POLICY_H_
--- a/include/leveldb/options.h
+++ b/include/leveldb/options.h
@@ -12,6 +12,7 @@ namespace leveldb {
 class Cache;
 class Comparator;
 class Env;
+class FilterPolicy;
 class Logger;
 class Snapshot;

@@ -127,6 +128,13 @@ struct Options {
  // efficiently detect that and will switch to uncompressed mode.
  CompressionType compression;

+  // If non-NULL, use the specified filter policy to reduce disk reads.
+  // Many applications will benefit from passing the result of
+  // NewBloomFilterPolicy() here.
+  //
+  // Default: NULL
+  const FilterPolicy* filter_policy;
+
  // Create an Options object with default values for all fields.
  Options();
 };

--- a/include/leveldb/table.h
+++ b/include/leveldb/table.h
@@ -12,9 +12,11 @@ namespace leveldb {

 class Block;
 class BlockHandle;
+class Footer;
 struct Options;
 class RandomAccessFile;
 struct ReadOptions;
+class TableCache;

 // A Table is a sorted map from strings to strings.  Tables are
 // immutable and persistent.  A Table may be safely accessed from
@@ -60,6 +62,19 @@ class Table {
  explicit Table(Rep* rep) { rep_ = rep; }
  static Iterator* BlockReader(void*, const ReadOptions&, const Slice&);

+  // Calls (*handle_result)(arg, ...) with the entry found after a call
+  // to Seek(key).  May not make such a call if filter policy says
+  // that key is not present.
+  friend class TableCache;
+  Status InternalGet(
+      const ReadOptions&, const Slice& key,
+      void* arg,
+      void (*handle_result)(void* arg, const Slice& k, const Slice& v));
+
+
+  void ReadMeta(const Footer& footer);
+  void ReadFilter(const Slice& filter_handle_value);
+
  // No copying allowed
  Table(const Table&);
  void operator=(const Table&);

--- a/include/leveldb/table_builder.h
+++ b/include/leveldb/table_builder.h
@@ -77,6 +77,7 @@ class TableBuilder {
 private:
  bool ok() const { return status().ok(); }
  void WriteBlock(BlockBuilder* block, BlockHandle* handle);
+  void WriteRawBlock(const Slice& data, CompressionType, BlockHandle* handle);

  struct Rep;
  Rep* rep_;

--- a/port/port_android.h
+++ b/port/port_android.h
@@ -78,6 +78,9 @@ class CondVar {
 // On ARM chipsets <V6, 0xffff0fa0 is the hard coded address of a 
 // memory barrier function provided by the kernel.
 typedef void (*LinuxKernelMemoryBarrierFunc)(void);
+// TODO(user): ATTRIBUTE_WEAK is undefined, so this fails to build on
+// non-ARMV6_OR_7. We may be able to replace it with __attribute__((weak)) for
+// older ARM builds, but x86 builds will require a different memory barrier.
 LinuxKernelMemoryBarrierFunc pLinuxKernelMemoryBarrier ATTRIBUTE_WEAK =
    (LinuxKernelMemoryBarrierFunc) 0xffff0fa0;
 #endif

--- a/table/block.cc
+++ b/table/block.cc
@@ -9,6 +9,7 @@
 #include <vector>
 #include <algorithm>
 #include "leveldb/comparator.h"
+#include "table/format.h"
 #include "util/coding.h"
 #include "util/logging.h"

@@ -19,10 +20,10 @@ inline uint32_t Block::NumRestarts() const {
  return DecodeFixed32(data_ + size_ - sizeof(uint32_t));
 }

-Block::Block(const char* data, size_t size, bool take_ownership)
-    : data_(data),
-      size_(size),
-      owned_(take_ownership) {
+Block::Block(const BlockContents& contents)
+    : data_(contents.data.data()),
+      size_(contents.data.size()),
+      owned_(contents.heap_allocated) {
  if (size_ < sizeof(uint32_t)) {
    size_ = 0;  // Error marker
  } else {

--- a/table/block.h
+++ b/table/block.h
@@ -11,14 +11,13 @@

 namespace leveldb {

+struct BlockContents;
 class Comparator;

 class Block {
 public:
  // Initialize the block with the specified contents.
-  // Takes ownership of data[] and will delete[] it when done iff
-  // "take_ownership is true.
-  Block(const char* data, size_t size, bool take_ownership);
+  explicit Block(const BlockContents& contents);

  ~Block();


--- a/table/filter_block.cc
+++ b/table/filter_block.cc
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "table/filter_block.h"
+
+#include "leveldb/filter_policy.h"
+#include "util/coding.h"
+
+namespace leveldb {
+
+// See doc/table_format.txt for an explanation of the filter block format.
+
+// Generate new filter every 2KB of data
+static const size_t kFilterBaseLg = 11;
+static const size_t kFilterBase = 1 << kFilterBaseLg;
+
+FilterBlockBuilder::FilterBlockBuilder(const FilterPolicy* policy)
+    : policy_(policy) {
+}
+
+void FilterBlockBuilder::StartBlock(uint64_t block_offset) {
+  uint64_t filter_index = (block_offset / kFilterBase);
+  assert(filter_index >= filter_offsets_.size());
+  while (filter_index > filter_offsets_.size()) {
+    GenerateFilter();
+  }
+}
+
+void FilterBlockBuilder::AddKey(const Slice& key) {
+  Slice k = key;
+  start_.push_back(keys_.size());
+  keys_.append(k.data(), k.size());
+}
+
+Slice FilterBlockBuilder::Finish() {
+  if (!start_.empty()) {
+    GenerateFilter();
+  }
+
+  // Append array of per-filter offsets
+  const uint32_t array_offset = result_.size();
+  for (size_t i = 0; i < filter_offsets_.size(); i++) {
+    PutFixed32(&result_, filter_offsets_[i]);
+  }
+
+  PutFixed32(&result_, array_offset);
+  result_.push_back(kFilterBaseLg);  // Save encoding parameter in result
+  return Slice(result_);
+}
+
+void FilterBlockBuilder::GenerateFilter() {
+  const size_t num_keys = start_.size();
+  if (num_keys == 0) {
+    // Fast path if there are no keys for this filter
+    filter_offsets_.push_back(result_.size());
+    return;
+  }
+
+  // Make list of keys from flattened key structure
+  start_.push_back(keys_.size());  // Simplify length computation
+  tmp_keys_.resize(num_keys);
+  for (size_t i = 0; i < num_keys; i++) {
+    const char* base = keys_.data() + start_[i];
+    size_t length = start_[i+1] - start_[i];
+    tmp_keys_[i] = Slice(base, length);
+  }
+
+  // Generate filter for current set of keys and append to result_.
+  filter_offsets_.push_back(result_.size());
+  policy_->CreateFilter(&tmp_keys_[0], num_keys, &result_);
+
+  tmp_keys_.clear();
+  keys_.clear();
+  start_.clear();
+}
+
+FilterBlockReader::FilterBlockReader(const FilterPolicy* policy,
+                                     const Slice& contents)
+    : policy_(policy),
+      data_(NULL),
+      offset_(NULL),
+      num_(0),
+      base_lg_(0) {
+  size_t n = contents.size();
+  if (n < 5) return;  // 1 byte for base_lg_ and 4 for start of offset array
+  base_lg_ = contents[n-1];
+  uint32_t last_word = DecodeFixed32(contents.data() + n - 5);
+  if (last_word > n - 5) return;
+  data_ = contents.data();
+  offset_ = data_ + last_word;
+  num_ = (n - 5 - last_word) / 4;
+}
+
+bool FilterBlockReader::KeyMayMatch(uint64_t block_offset, const Slice& key) {
+  uint64_t index = block_offset >> base_lg_;
+  if (index < num_) {
+    uint32_t start = DecodeFixed32(offset_ + index*4);
+    uint32_t limit = DecodeFixed32(offset_ + index*4 + 4);
+    if (start <= limit && limit <= (offset_ - data_)) {
+      Slice filter = Slice(data_ + start, limit - start);
+      return policy_->KeyMayMatch(key, filter);
+    } else if (start == limit) {
+      // Empty filters do not match any keys
+      return false;
+    }
+  }
+  return true;  // Errors are treated as potential matches
+}
+
+}
--- a/table/filter_block.h
+++ b/table/filter_block.h
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// A filter block is stored near the end of a Table file.  It contains
+// filters (e.g., bloom filters) for all data blocks in the table combined
+// into a single filter block.
+
+#ifndef STORAGE_LEVELDB_TABLE_FILTER_BLOCK_H_
+#define STORAGE_LEVELDB_TABLE_FILTER_BLOCK_H_
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string>
+#include <vector>
+#include "leveldb/slice.h"
+#include "util/hash.h"
+
+namespace leveldb {
+
+class FilterPolicy;
+
+// A FilterBlockBuilder is used to construct all of the filters for a
+// particular Table.  It generates a single string which is stored as
+// a special block in the Table.
+//
+// The sequence of calls to FilterBlockBuilder must match the regexp:
+//      (StartBlock AddKey*)* Finish
+class FilterBlockBuilder {
+ public:
+  explicit FilterBlockBuilder(const FilterPolicy*);
+
+  void StartBlock(uint64_t block_offset);
+  void AddKey(const Slice& key);
+  Slice Finish();
+
+ private:
+  void GenerateFilter();
+
+  const FilterPolicy* policy_;
+  std::string keys_;              // Flattened key contents
+  std::vector<size_t> start_;     // Starting index in keys_ of each key
+  std::string result_;            // Filter data computed so far
+  std::vector<Slice> tmp_keys_;   // policy_->CreateFilter() argument
+  std::vector<uint32_t> filter_offsets_;
+
+  // No copying allowed
+  FilterBlockBuilder(const FilterBlockBuilder&);
+  void operator=(const FilterBlockBuilder&);
+};
+
+class FilterBlockReader {
+ public:
+ // REQUIRES: "contents" and *policy must stay live while *this is live.
+  FilterBlockReader(const FilterPolicy* policy, const Slice& contents);
+  bool KeyMayMatch(uint64_t block_offset, const Slice& key);
+
+ private:
+  const FilterPolicy* policy_;
+  const char* data_;    // Pointer to filter data (at block-start)
+  const char* offset_;  // Pointer to beginning of offset array (at block-end)
+  size_t num_;          // Number of entries in offset array
+  size_t base_lg_;      // Encoding parameter (see kFilterBaseLg in .cc file)
+};
+
+}
+
+#endif  // STORAGE_LEVELDB_TABLE_FILTER_BLOCK_H_
--- a/table/filter_block_test.cc
+++ b/table/filter_block_test.cc
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "table/filter_block.h"
+
+#include "leveldb/filter_policy.h"
+#include "util/coding.h"
+#include "util/hash.h"
+#include "util/logging.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+namespace leveldb {
+
+// For testing: emit an array with one hash value per key
+class TestHashFilter : public FilterPolicy {
+ public:
+  virtual const char* Name() const {
+    return "TestHashFilter";
+  }
+
+  virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const {
+    for (int i = 0; i < n; i++) {
+      uint32_t h = Hash(keys[i].data(), keys[i].size(), 1);
+      PutFixed32(dst, h);
+    }
+  }
+
+  virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const {
+    uint32_t h = Hash(key.data(), key.size(), 1);
+    for (int i = 0; i + 4 <= filter.size(); i += 4) {
+      if (h == DecodeFixed32(filter.data() + i)) {
+        return true;
+      }
+    }
+    return false;
+  }
+};
+
+class FilterBlockTest {
+ public:
+  TestHashFilter policy_;
+};
+
+TEST(FilterBlockTest, EmptyBuilder) {
+  FilterBlockBuilder builder(&policy_);
+  Slice block = builder.Finish();
+  ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(block));
+  FilterBlockReader reader(&policy_, block);
+  ASSERT_TRUE(reader.KeyMayMatch(0, "foo"));
+  ASSERT_TRUE(reader.KeyMayMatch(100000, "foo"));
+}
+
+TEST(FilterBlockTest, SingleChunk) {
+  FilterBlockBuilder builder(&policy_);
+  builder.StartBlock(100);
+  builder.AddKey("foo");
+  builder.AddKey("bar");
+  builder.AddKey("box");
+  builder.StartBlock(200);
+  builder.AddKey("box");
+  builder.StartBlock(300);
+  builder.AddKey("hello");
+  Slice block = builder.Finish();
+  FilterBlockReader reader(&policy_, block);
+  ASSERT_TRUE(reader.KeyMayMatch(100, "foo"));
+  ASSERT_TRUE(reader.KeyMayMatch(100, "bar"));
+  ASSERT_TRUE(reader.KeyMayMatch(100, "box"));
+  ASSERT_TRUE(reader.KeyMayMatch(100, "hello"));
+  ASSERT_TRUE(reader.KeyMayMatch(100, "foo"));
+  ASSERT_TRUE(! reader.KeyMayMatch(100, "missing"));
+  ASSERT_TRUE(! reader.KeyMayMatch(100, "other"));
+}
+
+TEST(FilterBlockTest, MultiChunk) {
+  FilterBlockBuilder builder(&policy_);
+
+  // First filter
+  builder.StartBlock(0);
+  builder.AddKey("foo");
+  builder.StartBlock(2000);
+  builder.AddKey("bar");
+
+  // Second filter
+  builder.StartBlock(3100);
+  builder.AddKey("box");
+
+  // Third filter is empty
+
+  // Last filter
+  builder.StartBlock(9000);
+  builder.AddKey("box");
+  builder.AddKey("hello");
+
+  Slice block = builder.Finish();
+  FilterBlockReader reader(&policy_, block);
+
+  // Check first filter
+  ASSERT_TRUE(reader.KeyMayMatch(0, "foo"));
+  ASSERT_TRUE(reader.KeyMayMatch(2000, "bar"));
+  ASSERT_TRUE(! reader.KeyMayMatch(0, "box"));
+  ASSERT_TRUE(! reader.KeyMayMatch(0, "hello"));
+
+  // Check second filter
+  ASSERT_TRUE(reader.KeyMayMatch(3100, "box"));
+  ASSERT_TRUE(! reader.KeyMayMatch(3100, "foo"));
+  ASSERT_TRUE(! reader.KeyMayMatch(3100, "bar"));
+  ASSERT_TRUE(! reader.KeyMayMatch(3100, "hello"));
+
+  // Check third filter (empty)
+  ASSERT_TRUE(! reader.KeyMayMatch(4100, "foo"));
+  ASSERT_TRUE(! reader.KeyMayMatch(4100, "bar"));
+  ASSERT_TRUE(! reader.KeyMayMatch(4100, "box"));
+  ASSERT_TRUE(! reader.KeyMayMatch(4100, "hello"));
+
+  // Check last filter
+  ASSERT_TRUE(reader.KeyMayMatch(9000, "box"));
+  ASSERT_TRUE(reader.KeyMayMatch(9000, "hello"));
+  ASSERT_TRUE(! reader.KeyMayMatch(9000, "foo"));
+  ASSERT_TRUE(! reader.KeyMayMatch(9000, "bar"));
+}
+
+}  // namespace leveldb
+
+int main(int argc, char** argv) {
+  return leveldb::test::RunAllTests();
+}
--- a/table/format.cc
+++ b/table/format.cc
@@ -66,10 +66,10 @@ Status Footer::DecodeFrom(Slice* input) {
 Status ReadBlock(RandomAccessFile* file,
                 const ReadOptions& options,
                 const BlockHandle& handle,
-                 Block** block,
-                 bool* may_cache) {
-  *block = NULL;
-  *may_cache = false;
+                 BlockContents* result) {
+  result->data = Slice();
+  result->cachable = false;
+  result->heap_allocated = false;

  // Read the block contents as well as the type/crc footer.
  // See table_builder.cc for the code that built this structure.
@@ -105,11 +105,13 @@ Status ReadBlock(RandomAccessFile* file,
        // Use it directly under the assumption that it will be live
        // while the file is open.
        delete[] buf;
-        *block = new Block(data, n, false /* do not take ownership */);
-        *may_cache = false;  // Do not double-cache
+        result->data = Slice(data, n);
+        result->heap_allocated = false;
+        result->cachable = false;  // Do not double-cache
      } else {
-        *block = new Block(buf, n, true /* take ownership */);
-        *may_cache = true;
+        result->data = Slice(buf, n);
+        result->heap_allocated = true;
+        result->cachable = true;
      }

      // Ok
@@ -127,8 +129,9 @@ Status ReadBlock(RandomAccessFile* file,
        return Status::Corruption("corrupted compressed block contents");
      }
      delete[] buf;
-      *block = new Block(ubuf, ulength, true /* take ownership */);
-      *may_cache = true;
+      result->data = Slice(ubuf, ulength);
+      result->heap_allocated = true;
+      result->cachable = true;
      break;
    }
    default:

--- a/table/format.h
+++ b/table/format.h
@@ -83,16 +83,18 @@ static const uint64_t kTableMagicNumber = 0xdb4775248b80fb57ull;
 // 1-byte type + 32-bit crc
 static const size_t kBlockTrailerSize = 5;

-// Read the block identified by "handle" from "file".  On success,
-// store a pointer to the heap-allocated result in *block and return
-// OK.  On failure store NULL in *block and return non-OK.
-// On success, stores true in *may_cache if the result may be
-// cached, false if it must not be cached.
+struct BlockContents {
+  Slice data;           // Actual contents of data
+  bool cachable;        // True iff data can be cached
+  bool heap_allocated;  // True iff caller should delete[] data.data()
+};
+
+// Read the block identified by "handle" from "file".  On failure
+// return non-OK.  On success fill *result and return OK.
 extern Status ReadBlock(RandomAccessFile* file,
                        const ReadOptions& options,
                        const BlockHandle& handle,
-                        Block** block,
-                        bool* may_cache);
+                        BlockContents* result);

 // Implementation details follow.  Clients should ignore,


--- a/table/table.cc
+++ b/table/table.cc
@@ -5,8 +5,12 @@
 #include "leveldb/table.h"

 #include "leveldb/cache.h"
+#include "leveldb/comparator.h"
 #include "leveldb/env.h"
+#include "leveldb/filter_policy.h"
+#include "leveldb/options.h"
 #include "table/block.h"
+#include "table/filter_block.h"
 #include "table/format.h"
 #include "table/two_level_iterator.h"
 #include "util/coding.h"
@@ -15,6 +19,8 @@ namespace leveldb {

 struct Table::Rep {
  ~Rep() {
+    delete filter;
+    delete [] filter_data;
    delete index_block;
  }

@@ -22,6 +28,8 @@ struct Table::Rep {
  Status status;
  RandomAccessFile* file;
  uint64_t cache_id;
+  FilterBlockReader* filter;
+  const char* filter_data;

  BlockHandle metaindex_handle;  // Handle to metaindex_block: saved from footer
  Block* index_block;
@@ -47,11 +55,13 @@ Status Table::Open(const Options& options,
  if (!s.ok()) return s;

  // Read the index block
+  BlockContents contents;
  Block* index_block = NULL;
  if (s.ok()) {
-    bool may_cache;  // Ignored result
-    s = ReadBlock(file, ReadOptions(), footer.index_handle(), &index_block,
-                  &may_cache);
+    s = ReadBlock(file, ReadOptions(), footer.index_handle(), &contents);
+    if (s.ok()) {
+      index_block = new Block(contents);
+    }
  }

  if (s.ok()) {
@@ -63,7 +73,10 @@ Status Table::Open(const Options& options,
    rep->metaindex_handle = footer.metaindex_handle();
    rep->index_block = index_block;
    rep->cache_id = (options.block_cache ? options.block_cache->NewId() : 0);
+    rep->filter_data = NULL;
+    rep->filter = NULL;
    *table = new Table(rep);
+    (*table)->ReadMeta(footer);
  } else {
    if (index_block) delete index_block;
  }
@@ -71,6 +84,52 @@ Status Table::Open(const Options& options,
  return s;
 }

+void Table::ReadMeta(const Footer& footer) {
+  if (rep_->options.filter_policy == NULL) {
+    return;  // Do not need any metadata
+  }
+
+  // TODO(sanjay): Skip this if footer.metaindex_handle() size indicates
+  // it is an empty block.
+  ReadOptions opt;
+  BlockContents contents;
+  if (!ReadBlock(rep_->file, opt, footer.metaindex_handle(), &contents).ok()) {
+    // Do not propagate errors since meta info is not needed for operation
+    return;
+  }
+  Block* meta = new Block(contents);
+
+  Iterator* iter = meta->NewIterator(BytewiseComparator());
+  std::string key = "filter.";
+  key.append(rep_->options.filter_policy->Name());
+  iter->Seek(key);
+  if (iter->Valid() && iter->key() == Slice(key)) {
+    ReadFilter(iter->value());
+  }
+  delete iter;
+  delete meta;
+}
+
+void Table::ReadFilter(const Slice& filter_handle_value) {
+  Slice v = filter_handle_value;
+  BlockHandle filter_handle;
+  if (!filter_handle.DecodeFrom(&v).ok()) {
+    return;
+  }
+
+  // We might want to unify with ReadBlock() if we start
+  // requiring checksum verification in Table::Open.
+  ReadOptions opt;
+  BlockContents block;
+  if (!ReadBlock(rep_->file, opt, filter_handle, &block).ok()) {
+    return;
+  }
+  if (block.heap_allocated) {
+    rep_->filter_data = block.data.data();     // Will need to delete later
+  }
+  rep_->filter = new FilterBlockReader(rep_->options.filter_policy, block.data);
+}
+
 Table::~Table() {
  delete rep_;
 }
@@ -107,7 +166,7 @@ Iterator* Table::BlockReader(void* arg,
  // can add more features in the future.

  if (s.ok()) {
-    bool may_cache;
+    BlockContents contents;
    if (block_cache != NULL) {
      char cache_key_buffer[16];
      EncodeFixed64(cache_key_buffer, table->rep_->cache_id);
@@ -117,14 +176,20 @@ Iterator* Table::BlockReader(void* arg,
      if (cache_handle != NULL) {
        block = reinterpret_cast<Block*>(block_cache->Value(cache_handle));
      } else {
-        s = ReadBlock(table->rep_->file, options, handle, &block, &may_cache);
-        if (s.ok() && may_cache && options.fill_cache) {
-          cache_handle = block_cache->Insert(
-              key, block, block->size(), &DeleteCachedBlock);
+        s = ReadBlock(table->rep_->file, options, handle, &contents);
+        if (s.ok()) {
+          block = new Block(contents);
+          if (contents.cachable && options.fill_cache) {
+            cache_handle = block_cache->Insert(
+                key, block, block->size(), &DeleteCachedBlock);
+          }
        }
      }
    } else {
-      s = ReadBlock(table->rep_->file, options, handle, &block, &may_cache);
+      s = ReadBlock(table->rep_->file, options, handle, &contents);
+      if (s.ok()) {
+        block = new Block(contents);
+      }
    }
  }

@@ -148,6 +213,39 @@ Iterator* Table::NewIterator(const ReadOptions& options) const {
      &Table::BlockReader, const_cast<Table*>(this), options);
 }

+Status Table::InternalGet(const ReadOptions& options, const Slice& k,
+                          void* arg,
+                          void (*saver)(void*, const Slice&, const Slice&)) {
+  Status s;
+  Iterator* iiter = rep_->index_block->NewIterator(rep_->options.comparator);
+  iiter->Seek(k);
+  if (iiter->Valid()) {
+    Slice handle_value = iiter->value();
+    FilterBlockReader* filter = rep_->filter;
+    BlockHandle handle;
+    if (filter != NULL &&
+        handle.DecodeFrom(&handle_value).ok() &&
+        !filter->KeyMayMatch(handle.offset(), k)) {
+      // Not found
+    } else {
+      Slice handle = iiter->value();
+      Iterator* block_iter = BlockReader(this, options, iiter->value());
+      block_iter->Seek(k);
+      if (block_iter->Valid()) {
+        (*saver)(arg, block_iter->key(), block_iter->value());
+      }
+      s = block_iter->status();
+      delete block_iter;
+    }
+  }
+  if (s.ok()) {
+    s = iiter->status();
+  }
+  delete iiter;
+  return s;
+}
+
+
 uint64_t Table::ApproximateOffsetOf(const Slice& key) const {
  Iterator* index_iter =
      rep_->index_block->NewIterator(rep_->options.comparator);

--- a/table/table_builder.cc
+++ b/table/table_builder.cc
@@ -5,14 +5,15 @@
 #include "leveldb/table_builder.h"

 #include <assert.h>
-#include <stdio.h>
 #include "leveldb/comparator.h"
 #include "leveldb/env.h"
+#include "leveldb/filter_policy.h"
+#include "leveldb/options.h"
 #include "table/block_builder.h"
+#include "table/filter_block.h"
 #include "table/format.h"
 #include "util/coding.h"
 #include "util/crc32c.h"
-#include "util/logging.h"

 namespace leveldb {

@@ -27,6 +28,7 @@ struct TableBuilder::Rep {
  std::string last_key;
  int64_t num_entries;
  bool closed;          // Either Finish() or Abandon() has been called.
+  FilterBlockBuilder* filter_block;

  // We do not emit the index entry for a block until we have seen the
  // first key for the next data block.  This allows us to use shorter
@@ -51,6 +53,8 @@ struct TableBuilder::Rep {
        index_block(&index_block_options),
        num_entries(0),
        closed(false),
+        filter_block(opt.filter_policy == NULL ? NULL
+                     : new FilterBlockBuilder(opt.filter_policy)),
        pending_index_entry(false) {
    index_block_options.block_restart_interval = 1;
  }
@@ -58,10 +62,14 @@ struct TableBuilder::Rep {

 TableBuilder::TableBuilder(const Options& options, WritableFile* file)
    : rep_(new Rep(options, file)) {
+  if (rep_->filter_block != NULL) {
+    rep_->filter_block->StartBlock(0);
+  }
 }

 TableBuilder::~TableBuilder() {
  assert(rep_->closed);  // Catch errors where caller forgot to call Finish()
+  delete rep_->filter_block;
  delete rep_;
 }

@@ -98,6 +106,10 @@ void TableBuilder::Add(const Slice& key, const Slice& value) {
    r->pending_index_entry = false;
  }

+  if (r->filter_block != NULL) {
+    r->filter_block->AddKey(key);
+  }
+
  r->last_key.assign(key.data(), key.size());
  r->num_entries++;
  r->data_block.Add(key, value);
@@ -119,6 +131,9 @@ void TableBuilder::Flush() {
    r->pending_index_entry = true;
    r->status = r->file->Flush();
  }
+  if (r->filter_block != NULL) {
+    r->filter_block->StartBlock(r->offset);
+  }
 }

 void TableBuilder::WriteBlock(BlockBuilder* block, BlockHandle* handle) {
@@ -152,6 +167,15 @@ void TableBuilder::WriteBlock(BlockBuilder* block, BlockHandle* handle) {
      break;
    }
  }
+  WriteRawBlock(block_contents, type, handle);
+  r->compressed_output.clear();
+  block->Reset();
+}
+
+void TableBuilder::WriteRawBlock(const Slice& block_contents,
+                                 CompressionType type,
+                                 BlockHandle* handle) {
+  Rep* r = rep_;
  handle->set_offset(r->offset);
  handle->set_size(block_contents.size());
  r->status = r->file->Append(block_contents);
@@ -166,8 +190,6 @@ void TableBuilder::WriteBlock(BlockBuilder* block, BlockHandle* handle) {
      r->offset += block_contents.size() + kBlockTrailerSize;
    }
  }
-  r->compressed_output.clear();
-  block->Reset();
 }

 Status TableBuilder::status() const {
@@ -179,13 +201,32 @@ Status TableBuilder::Finish() {
  Flush();
  assert(!r->closed);
  r->closed = true;
-  BlockHandle metaindex_block_handle;
-  BlockHandle index_block_handle;
+
+  BlockHandle filter_block_handle, metaindex_block_handle, index_block_handle;
+
+  // Write filter block
+  if (ok() && r->filter_block != NULL) {
+    WriteRawBlock(r->filter_block->Finish(), kNoCompression,
+                  &filter_block_handle);
+  }
+
+  // Write metaindex block
  if (ok()) {
    BlockBuilder meta_index_block(&r->options);
+    if (r->filter_block != NULL) {
+      // Add mapping from "filter.Name" to location of filter data
+      std::string key = "filter.";
+      key.append(r->options.filter_policy->Name());
+      std::string handle_encoding;
+      filter_block_handle.EncodeTo(&handle_encoding);
+      meta_index_block.Add(key, handle_encoding);
+    }
+
    // TODO(postrelease): Add stats and other meta blocks
    WriteBlock(&meta_index_block, &metaindex_block_handle);
  }
+
+  // Write index block
  if (ok()) {
    if (r->pending_index_entry) {
      r->options.comparator->FindShortSuccessor(&r->last_key);
@@ -196,6 +237,8 @@ Status TableBuilder::Finish() {
    }
    WriteBlock(&r->index_block, &index_block_handle);
  }
+
+  // Write footer
  if (ok()) {
    Footer footer;
    footer.set_metaindex_handle(metaindex_block_handle);

--- a/table/table_test.cc
+++ b/table/table_test.cc
@@ -168,8 +168,6 @@ class Constructor {
  // Construct the data structure from the data in "data"
  virtual Status FinishImpl(const Options& options, const KVMap& data) = 0;

-  virtual size_t NumBytes() const = 0;
-
  virtual Iterator* NewIterator() const = 0;

  virtual const KVMap& data() { return data_; }
@@ -185,7 +183,6 @@ class BlockConstructor: public Constructor {
  explicit BlockConstructor(const Comparator* cmp)
      : Constructor(cmp),
        comparator_(cmp),
-        block_size_(-1),
        block_(NULL) { }
  ~BlockConstructor() {
    delete block_;
@@ -201,22 +198,21 @@ class BlockConstructor: public Constructor {
      builder.Add(it->first, it->second);
    }
    // Open the block
-    Slice block_data = builder.Finish();
-    block_size_ = block_data.size();
-    char* block_data_copy = new char[block_size_];
-    memcpy(block_data_copy, block_data.data(), block_size_);
-    block_ = new Block(block_data_copy, block_size_, true /* take ownership */);
+    data_ = builder.Finish().ToString();
+    BlockContents contents;
+    contents.data = data_;
+    contents.cachable = false;
+    contents.heap_allocated = false;
+    block_ = new Block(contents);
    return Status::OK();
  }
-  virtual size_t NumBytes() const { return block_size_; }
-
  virtual Iterator* NewIterator() const {
    return block_->NewIterator(comparator_);
  }

 private:
  const Comparator* comparator_;
-  int block_size_;
+  std::string data_;
  Block* block_;

  BlockConstructor();
@@ -253,7 +249,6 @@ class TableConstructor: public Constructor {
    table_options.comparator = options.comparator;
    return Table::Open(table_options, source_, sink.contents().size(), &table_);
  }
-  virtual size_t NumBytes() const { return source_->Size(); }

  virtual Iterator* NewIterator() const {
    return table_->NewIterator(ReadOptions());
@@ -342,10 +337,6 @@ class MemTableConstructor: public Constructor {
    }
    return Status::OK();
  }
-  virtual size_t NumBytes() const {
-    return memtable_->ApproximateMemoryUsage();
-  }
-
  virtual Iterator* NewIterator() const {
    return new KeyConvertingIterator(memtable_->NewIterator());
  }
@@ -379,13 +370,6 @@ class DBConstructor: public Constructor {
    }
    return Status::OK();
  }
-  virtual size_t NumBytes() const {
-    Range r("", "\xff\xff");
-    uint64_t size;
-    db_->GetApproximateSizes(&r, 1, &size);
-    return size;
-  }
-
  virtual Iterator* NewIterator() const {
    return db_->NewIterator(ReadOptions());
  }
@@ -809,7 +793,7 @@ TEST(TableTest, ApproximateOffsetOfPlain) {
  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k05"),  210000, 211000));
  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k06"),  510000, 511000));
  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k07"),  510000, 511000));
-  ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"),  610000, 611000));
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"),  610000, 612000));

 }


--- a/util/bloom.cc
+++ b/util/bloom.cc
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "leveldb/filter_policy.h"
+
+#include "leveldb/slice.h"
+#include "util/hash.h"
+
+namespace leveldb {
+
+namespace {
+static uint32_t BloomHash(const Slice& key) {
+  return Hash(key.data(), key.size(), 0xbc9f1d34);
+}
+
+class BloomFilterPolicy : public FilterPolicy {
+ private:
+  size_t bits_per_key_;
+  size_t k_;
+
+ public:
+  explicit BloomFilterPolicy(int bits_per_key)
+      : bits_per_key_(bits_per_key) {
+    // We intentionally round down to reduce probing cost a little bit
+    k_ = static_cast<size_t>(bits_per_key * 0.69);  // 0.69 =~ ln(2)
+    if (k_ < 1) k_ = 1;
+    if (k_ > 30) k_ = 30;
+  }
+
+  virtual const char* Name() const {
+    return "leveldb.BuiltinBloomFilter";
+  }
+
+  virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const {
+    // Compute bloom filter size (in both bits and bytes)
+    size_t bits = n * bits_per_key_;
+
+    // For small n, we can see a very high false positive rate.  Fix it
+    // by enforcing a minimum bloom filter length.
+    if (bits < 64) bits = 64;
+
+    size_t bytes = (bits + 7) / 8;
+    bits = bytes * 8;
+
+    const size_t init_size = dst->size();
+    dst->resize(init_size + bytes, 0);
+    dst->push_back(static_cast<char>(k_));  // Remember # of probes in filter
+    char* array = &(*dst)[init_size];
+    for (size_t i = 0; i < n; i++) {
+      // Use double-hashing to generate a sequence of hash values.
+      // See analysis in [Kirsch,Mitzenmacher 2006].
+      uint32_t h = BloomHash(keys[i]);
+      const uint32_t delta = (h >> 17) | (h << 15);  // Rotate right 17 bits
+      for (size_t j = 0; j < k_; j++) {
+        const uint32_t bitpos = h % bits;
+        array[bitpos/8] |= (1 << (bitpos % 8));
+        h += delta;
+      }
+    }
+  }
+
+  virtual bool KeyMayMatch(const Slice& key, const Slice& bloom_filter) const {
+    const size_t len = bloom_filter.size();
+    if (len < 2) return false;
+
+    const char* array = bloom_filter.data();
+    const size_t bits = (len - 1) * 8;
+
+    // Use the encoded k so that we can read filters generated by
+    // bloom filters created using different parameters.
+    const size_t k = array[len-1];
+    if (k > 30) {
+      // Reserved for potentially new encodings for short bloom filters.
+      // Consider it a match.
+      return true;
+    }
+
+    uint32_t h = BloomHash(key);
+    const uint32_t delta = (h >> 17) | (h << 15);  // Rotate right 17 bits
+    for (size_t j = 0; j < k; j++) {
+      const uint32_t bitpos = h % bits;
+      if ((array[bitpos/8] & (1 << (bitpos % 8))) == 0) return false;
+      h += delta;
+    }
+    return true;
+  }
+};
+}
+
+const FilterPolicy* NewBloomFilterPolicy(int bits_per_key) {
+  return new BloomFilterPolicy(bits_per_key);
+}
+
+}  // namespace leveldb
--- a/util/bloom_test.cc
+++ b/util/bloom_test.cc
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "leveldb/filter_policy.h"
+
+#include "util/logging.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+namespace leveldb {
+
+static const int kVerbose = 1;
+
+static Slice Key(int i, char* buffer) {
+  memcpy(buffer, &i, sizeof(i));
+  return Slice(buffer, sizeof(i));
+}
+
+class BloomTest {
+ private:
+  const FilterPolicy* policy_;
+  std::string filter_;
+  std::vector<std::string> keys_;
+
+ public:
+  BloomTest() : policy_(NewBloomFilterPolicy(10)) { }
+
+  ~BloomTest() {
+    delete policy_;
+  }
+
+  void Reset() {
+    keys_.clear();
+    filter_.clear();
+  }
+
+  void Add(const Slice& s) {
+    keys_.push_back(s.ToString());
+  }
+
+  void Build() {
+    std::vector<Slice> key_slices;
+    for (size_t i = 0; i < keys_.size(); i++) {
+      key_slices.push_back(Slice(keys_[i]));
+    }
+    filter_.clear();
+    policy_->CreateFilter(&key_slices[0], key_slices.size(), &filter_);
+    keys_.clear();
+    if (kVerbose >= 2) DumpFilter();
+  }
+
+  size_t FilterSize() const {
+    return filter_.size();
+  }
+
+  void DumpFilter() {
+    fprintf(stderr, "F(");
+    for (size_t i = 0; i+1 < filter_.size(); i++) {
+      const unsigned int c = static_cast<unsigned int>(filter_[i]);
+      for (int j = 0; j < 8; j++) {
+        fprintf(stderr, "%c", (c & (1 <<j)) ? '1' : '.');
+      }
+    }
+    fprintf(stderr, ")\n");
+  }
+
+  bool Matches(const Slice& s) {
+    if (!keys_.empty()) {
+      Build();
+    }
+    return policy_->KeyMayMatch(s, filter_);
+  }
+
+  double FalsePositiveRate() {
+    char buffer[sizeof(int)];
+    int result = 0;
+    for (int i = 0; i < 10000; i++) {
+      if (Matches(Key(i + 1000000000, buffer))) {
+        result++;
+      }
+    }
+    return result / 10000.0;
+  }
+};
+
+TEST(BloomTest, EmptyFilter) {
+  ASSERT_TRUE(! Matches("hello"));
+  ASSERT_TRUE(! Matches("world"));
+}
+
+TEST(BloomTest, Small) {
+  Add("hello");
+  Add("world");
+  ASSERT_TRUE(Matches("hello"));
+  ASSERT_TRUE(Matches("world"));
+  ASSERT_TRUE(! Matches("x"));
+  ASSERT_TRUE(! Matches("foo"));
+}
+
+static int NextLength(int length) {
+  if (length < 10) {
+    length += 1;
+  } else if (length < 100) {
+    length += 10;
+  } else if (length < 1000) {
+    length += 100;
+  } else {
+    length += 1000;
+  }
+  return length;
+}
+
+TEST(BloomTest, VaryingLengths) {
+  char buffer[sizeof(int)];
+
+  // Count number of filters that significantly exceed the false positive rate
+  int mediocre_filters = 0;
+  int good_filters = 0;
+
+  for (int length = 1; length <= 10000; length = NextLength(length)) {
+    Reset();
+    for (int i = 0; i < length; i++) {
+      Add(Key(i, buffer));
+    }
+    Build();
+
+    ASSERT_LE(FilterSize(), (length * 10 / 8) + 40) << length;
+
+    // All added keys must match
+    for (int i = 0; i < length; i++) {
+      ASSERT_TRUE(Matches(Key(i, buffer)))
+          << "Length " << length << "; key " << i;
+    }
+
+    // Check false positive rate
+    double rate = FalsePositiveRate();
+    if (kVerbose >= 1) {
+      fprintf(stderr, "False positives: %5.2f%% @ length = %6d ; bytes = %6d\n",
+              rate*100.0, length, static_cast<int>(FilterSize()));
+    }
+    ASSERT_LE(rate, 0.02);   // Must not be over 2%
+    if (rate > 0.0125) mediocre_filters++;  // Allowed, but not too often
+    else good_filters++;
+  }
+  if (kVerbose >= 1) {
+    fprintf(stderr, "Filters: %d good, %d mediocre\n",
+            good_filters, mediocre_filters);
+  }
+  ASSERT_LE(mediocre_filters, good_filters/5);
+}
+
+// Different bits-per-byte
+
+}  // namespace leveldb
+
+int main(int argc, char** argv) {
+  return leveldb::test::RunAllTests();
+}
--- a/util/filter_policy.cc
+++ b/util/filter_policy.cc
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "leveldb/filter_policy.h"
+
+namespace leveldb {
+
+FilterPolicy::~FilterPolicy() { }
+
+}  // namespace leveldb
--- a/util/options.cc
+++ b/util/options.cc
@@ -21,7 +21,8 @@ Options::Options()
      block_cache(NULL),
      block_size(4096),
      block_restart_interval(16),
-      compression(kSnappyCompression) {
+      compression(kSnappyCompression),
+      filter_policy(NULL) {
 }