diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
index 85c1db059d8e5e0503ca0e6534e2934ac12098b5..b7eaff37dc82ad437d4bd30e834a5e0fe61fab20 100644
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -44,15 +44,15 @@ using std::shared_ptr;
 enum CompressionType : char {
   // NOTE: do not change the values of existing entries, as these are
   // part of the persistent format on disk.
-  kNoCompression = 0x0,
+  kNoCompression = 0x0,
   kSnappyCompression = 0x1,
   kZlibCompression = 0x2,
   kBZip2Compression = 0x3
 };
 
 enum CompactionStyle : char {
-  kCompactionStyleLevel = 0x0, // level based compaction style
-  kCompactionStyleUniversal = 0x1 // Universal compaction style
+  kCompactionStyleLevel = 0x0,     // level based compaction style
+  kCompactionStyleUniversal = 0x1  // Universal compaction style
 };
 
 // Compression options for different compression algorithms like Zlib
@@ -60,12 +60,9 @@ struct CompressionOptions {
   int window_bits;
   int level;
   int strategy;
-  CompressionOptions():window_bits(-14),
-                       level(-1),
-                       strategy(0){}
-  CompressionOptions(int wbits, int lev, int strategy):window_bits(wbits),
-                                                       level(lev),
-                                                       strategy(strategy){}
+  CompressionOptions() : window_bits(-14), level(-1), strategy(0) {}
+  CompressionOptions(int wbits, int lev, int strategy)
+      : window_bits(wbits), level(lev), strategy(strategy) {}
 };
 
 // Options to control the behavior of a database (passed to DB::Open)
@@ -216,7 +213,6 @@ struct Options {
   // Default: 16
   int block_restart_interval;
 
-
   // Compress blocks using the specified compression algorithm.  This
   // parameter can be changed dynamically.
   //
@@ -247,7 +243,7 @@ struct Options {
   // java/C api hard to construct.
   std::vector<CompressionType> compression_per_level;
 
-  //different options for compression algorithms
+  // different options for compression algorithms
   CompressionOptions compression_opts;
 
   // If non-nullptr, use the specified filter policy to reduce disk reads.
@@ -326,7 +322,6 @@ struct Options {
   // will be 20MB, total file size for level-2 will be 200MB,
   // and total file size for level-3 will be 2GB.
 
-
   // by default 'max_bytes_for_level_base' is 10MB.
   uint64_t max_bytes_for_level_base;
   // by default 'max_bytes_for_level_base' is 10.
@@ -484,10 +479,19 @@ struct Options {
   // order.
   int table_cache_remove_scan_count_limit;
 
-  // size of one block in arena memory allocation.
-  // If <= 0, a proper value is automatically calculated (usually 1/10 of
+  // Size of one block in arena memory allocation.
+  //
+  // If <= 0, a proper value is automatically calculated (usually about 1/10 of
   // writer_buffer_size).
   //
+  // There are two additional restrictions on the specified size:
+  //   (1) size should be in the range of [4096, 2 << 30], and
+  //   (2) be a multiple of the CPU word size (which helps with memory
+  //       alignment).
+  //
+  // We'll automatically check and adjust the size to make sure it
+  // conforms to these restrictions.
+  //
   // Default: 0
   size_t arena_block_size;
 
@@ -572,7 +576,12 @@ struct Options {
   // Specify the file access pattern once a compaction is started.
   // It will be applied to all input files of a compaction.
   // Default: NORMAL
-  enum { NONE, NORMAL, SEQUENTIAL, WILLNEED } access_hint_on_compaction_start;
+  enum {
+    NONE,
+    NORMAL,
+    SEQUENTIAL,
+    WILLNEED
+  } access_hint_on_compaction_start;
 
   // Use adaptive mutex, which spins in the user space before resorting
   // to kernel. This could reduce context switch when the mutex is not
@@ -622,7 +631,7 @@ struct Options {
   // Default: emtpy vector -- no user-defined statistics collection will be
   // performed.
   std::vector<std::shared_ptr<TablePropertiesCollector>>
-    table_properties_collectors;
+      table_properties_collectors;
 
   // Allows thread-safe inplace updates. Requires Updates iff
   // * key exists in current memtable
@@ -644,7 +653,7 @@ struct Options {
 // the block cache. It will not page in data from the OS cache or data that
 // resides in storage.
 enum ReadTier {
-  kReadAllTier = 0x0,    // data in memtable, block cache, OS cache or storage
+  kReadAllTier = 0x0,     // data in memtable, block cache, OS cache or storage
   kBlockCacheTier = 0x1  // data in memtable or block cache
 };
 
@@ -697,13 +706,14 @@ struct ReadOptions {
         prefix_seek(false),
         snapshot(nullptr),
         prefix(nullptr),
-        read_tier(kReadAllTier) {
-  }
-  ReadOptions(bool cksum, bool cache) :
-      verify_checksums(cksum), fill_cache(cache),
-      prefix_seek(false), snapshot(nullptr), prefix(nullptr),
-      read_tier(kReadAllTier) {
-  }
+        read_tier(kReadAllTier) {}
+  ReadOptions(bool cksum, bool cache)
+      : verify_checksums(cksum),
+        fill_cache(cache),
+        prefix_seek(false),
+        snapshot(nullptr),
+        prefix(nullptr),
+        read_tier(kReadAllTier) {}
 };
 
 // Options that control write operations
@@ -730,10 +740,7 @@ struct WriteOptions {
   // and the write may got lost after a crash.
   bool disableWAL;
 
-  WriteOptions()
-      : sync(false),
-        disableWAL(false) {
-  }
+  WriteOptions() : sync(false), disableWAL(false) {}
 };
 
 // Options that control flush operations
@@ -742,9 +749,7 @@ struct FlushOptions {
   // Default: true
   bool wait;
 
-  FlushOptions()
-      : wait(true) {
-  }
+  FlushOptions() : wait(true) {}
 };
 
 } // namespace rocksdb
diff --git a/util/arena_impl.cc b/util/arena_impl.cc
index d5c2a537e2af3c9b722703ade9c61e3291c4da10..5125e2364113fffa33bc195d5c90c7c7e1eab919 100644
--- a/util/arena_impl.cc
+++ b/util/arena_impl.cc
@@ -8,71 +8,86 @@
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
 #include "util/arena_impl.h"
+#include <algorithm>
 
 namespace rocksdb {
 
-ArenaImpl::ArenaImpl(size_t block_size) {
-  if (block_size < kMinBlockSize) {
-    block_size_ = kMinBlockSize;
-  } else if (block_size > kMaxBlockSize) {
-    block_size_ = kMaxBlockSize;
-  } else {
-    block_size_ = block_size;
+const size_t ArenaImpl::kMinBlockSize = 4096;
+const size_t ArenaImpl::kMaxBlockSize = 2 << 30;
+static const int kAlignUnit = sizeof(void*);
+
+size_t OptimizeBlockSize(size_t block_size) {
+  // Make sure block_size is in optimal range
+  block_size = std::max(ArenaImpl::kMinBlockSize, block_size);
+  block_size = std::min(ArenaImpl::kMaxBlockSize, block_size);
+
+  // make sure block_size is a multiple of kAlignUnit
+  if (block_size % kAlignUnit != 0) {
+    block_size = (1 + block_size / kAlignUnit) * kAlignUnit;
   }
 
-  blocks_memory_ = 0;
-  alloc_ptr_ = nullptr;  // First allocation will allocate a block
-  alloc_bytes_remaining_ = 0;
+  return block_size;
+}
+
+ArenaImpl::ArenaImpl(size_t block_size)
+    : kBlockSize(OptimizeBlockSize(block_size)) {
+  assert(kBlockSize >= kMinBlockSize && kBlockSize <= kMaxBlockSize &&
+         kBlockSize % kAlignUnit == 0);
 }
 
 ArenaImpl::~ArenaImpl() {
-  for (size_t i = 0; i < blocks_.size(); i++) {
-    delete[] blocks_[i];
+  for (const auto& block : blocks_) {
+    delete[] block;
   }
 }
 
-char* ArenaImpl::AllocateFallback(size_t bytes) {
-  if (bytes > block_size_ / 4) {
+char* ArenaImpl::AllocateFallback(size_t bytes, bool aligned) {
+  if (bytes > kBlockSize / 4) {
     // Object is more than a quarter of our block size.  Allocate it separately
     // to avoid wasting too much space in leftover bytes.
-    char* result = AllocateNewBlock(bytes);
-    return result;
+    return AllocateNewBlock(bytes);
   }
 
   // We waste the remaining space in the current block.
-  alloc_ptr_ = AllocateNewBlock(block_size_);
-  alloc_bytes_remaining_ = block_size_;
+  auto block_head = AllocateNewBlock(kBlockSize);
+  alloc_bytes_remaining_ = kBlockSize - bytes;
 
-  char* result = alloc_ptr_;
-  alloc_ptr_ += bytes;
-  alloc_bytes_remaining_ -= bytes;
-  return result;
+  if (aligned) {
+    aligned_alloc_ptr_ = block_head + bytes;
+    unaligned_alloc_ptr_ = block_head + kBlockSize;
+    return block_head;
+  } else {
+    aligned_alloc_ptr_ = block_head;
+    unaligned_alloc_ptr_ = block_head + kBlockSize - bytes;
+    return unaligned_alloc_ptr_;
+  }
 }
 
 char* ArenaImpl::AllocateAligned(size_t bytes) {
-  const int align = sizeof(void*);    // We'll align to pointer size
-  assert((align & (align-1)) == 0);   // Pointer size should be a power of 2
-  size_t current_mod = reinterpret_cast<uintptr_t>(alloc_ptr_) & (align-1);
-  size_t slop = (current_mod == 0 ? 0 : align - current_mod);
+  assert((kAlignUnit & (kAlignUnit - 1)) ==
+         0);  // Pointer size should be a power of 2
+  size_t current_mod =
+      reinterpret_cast<uintptr_t>(aligned_alloc_ptr_) & (kAlignUnit - 1);
+  size_t slop = (current_mod == 0 ? 0 : kAlignUnit - current_mod);
   size_t needed = bytes + slop;
   char* result;
   if (needed <= alloc_bytes_remaining_) {
-    result = alloc_ptr_ + slop;
-    alloc_ptr_ += needed;
+    result = aligned_alloc_ptr_ + slop;
+    aligned_alloc_ptr_ += needed;
     alloc_bytes_remaining_ -= needed;
   } else {
     // AllocateFallback always returned aligned memory
-    result = AllocateFallback(bytes);
+    result = AllocateFallback(bytes, true /* aligned */);
  }
-  assert((reinterpret_cast<uintptr_t>(result) & (align-1)) == 0);
+  assert((reinterpret_cast<uintptr_t>(result) & (kAlignUnit - 1)) == 0);
   return result;
 }
 
 char* ArenaImpl::AllocateNewBlock(size_t block_bytes) {
-  char* result = new char[block_bytes];
+  char* block = new char[block_bytes];
   blocks_memory_ += block_bytes;
-  blocks_.push_back(result);
-  return result;
+  blocks_.push_back(block);
+  return block;
 }
 
 } // namespace rocksdb
diff --git a/util/arena_impl.h b/util/arena_impl.h
index b5a6842472a37b847b31006f40a5202e502762ec..538385ccc03845b29ef313fc84fe216ce5814400 100644
--- a/util/arena_impl.h
+++ b/util/arena_impl.h
@@ -22,49 +22,54 @@ namespace rocksdb {
 class ArenaImpl : public Arena {
  public:
+  // No copying allowed
+  ArenaImpl(const ArenaImpl&) = delete;
+  void operator=(const ArenaImpl&) = delete;
+
+  static const size_t kMinBlockSize;
+  static const size_t kMaxBlockSize;
+
   explicit ArenaImpl(size_t block_size = kMinBlockSize);
   virtual ~ArenaImpl();
 
-  virtual char* Allocate(size_t bytes);
+  virtual char* Allocate(size_t bytes) override;
 
-  virtual char* AllocateAligned(size_t bytes);
+  virtual char* AllocateAligned(size_t bytes) override;
 
   // Returns an estimate of the total memory usage of data allocated
-  // by the arena (including space allocated but not yet used for user
+  // by the arena (excluding space allocated but not yet used for future
   // allocations).
-  //
-  // TODO: Do we need to exclude space allocated but not used?
   virtual const size_t ApproximateMemoryUsage() {
-    return blocks_memory_ + blocks_.capacity() * sizeof(char*);
+    return blocks_memory_ + blocks_.capacity() * sizeof(char*) -
+           alloc_bytes_remaining_;
   }
 
-  virtual const size_t MemoryAllocatedBytes() {
+  virtual const size_t MemoryAllocatedBytes() override {
     return blocks_memory_;
   }
 
 private:
-  char* AllocateFallback(size_t bytes);
-  char* AllocateNewBlock(size_t block_bytes);
-
-  static const size_t kMinBlockSize = 4096;
-  static const size_t kMaxBlockSize = 2 << 30;
-  // Number of bytes allocated in one block
-  size_t block_size_;
-
-  // Allocation state
-  char* alloc_ptr_;
-  size_t alloc_bytes_remaining_;
-
+  const size_t kBlockSize;
   // Array of new[] allocated memory blocks
-  std::vector<char*> blocks_;
+  typedef std::vector<char*> Blocks;
+  Blocks blocks_;
+
+  // Stats for current active block.
+  // For each block, we allocate aligned memory chunks from one end and
+  // allocate unaligned memory chunks from the other end. Otherwise the
+  // memory waste for alignment will be higher if we allocate both types of
+  // memory from one direction.
+  char* unaligned_alloc_ptr_ = nullptr;
+  char* aligned_alloc_ptr_ = nullptr;
+  // How many bytes are left in the currently active block?
+  size_t alloc_bytes_remaining_ = 0;
+
+  char* AllocateFallback(size_t bytes, bool aligned);
+  char* AllocateNewBlock(size_t block_bytes);
 
   // Bytes of memory in blocks allocated so far
-  size_t blocks_memory_;
-
-  // No copying allowed
-  ArenaImpl(const ArenaImpl&);
-  void operator=(const ArenaImpl&);
+  size_t blocks_memory_ = 0;
 };
 
 inline char* ArenaImpl::Allocate(size_t bytes) {
@@ -73,12 +78,16 @@ inline char* ArenaImpl::Allocate(size_t bytes) {
   // them for our internal use).
   assert(bytes > 0);
   if (bytes <= alloc_bytes_remaining_) {
-    char* result = alloc_ptr_;
-    alloc_ptr_ += bytes;
+    unaligned_alloc_ptr_ -= bytes;
     alloc_bytes_remaining_ -= bytes;
-    return result;
+    return unaligned_alloc_ptr_;
   }
-  return AllocateFallback(bytes);
+  return AllocateFallback(bytes, false /* unaligned */);
 }
 
+// Check and adjust the block_size so that the return value is
+//  1. in the range of [kMinBlockSize, kMaxBlockSize], and
+//  2. a multiple of the align unit.
+extern size_t OptimizeBlockSize(size_t block_size);
+
 } // namespace rocksdb
diff --git a/util/arena_test.cc b/util/arena_test.cc
index 12aa7f7fe51b3b525a57ea62bef614a5c6304ae2..4a3d1bd433950f4e286b76b7237cde0dcc00081e 100644
--- a/util/arena_test.cc
+++ b/util/arena_test.cc
@@ -57,8 +57,33 @@ TEST(ArenaImplTest, MemoryAllocatedBytes) {
   ASSERT_EQ(arena_impl.MemoryAllocatedBytes(), expected_memory_allocated);
 }
 
+// Make sure we don't count the allocated-but-not-yet-used memory space in
+// Arena::ApproximateMemoryUsage()
+TEST(ArenaImplTest, ApproximateMemoryUsageTest) {
+  const size_t kBlockSize = 4096;
+  const size_t kEntrySize = kBlockSize / 8;
+  ArenaImpl arena(kBlockSize);
+  ASSERT_EQ(0, arena.ApproximateMemoryUsage());
+
+  auto num_blocks = kBlockSize / kEntrySize;
+
+  // first allocation
+  arena.AllocateAligned(kEntrySize);
+  auto mem_usage = arena.MemoryAllocatedBytes();
+  ASSERT_EQ(mem_usage, kBlockSize);
+  auto usage = arena.ApproximateMemoryUsage();
+  ASSERT_LT(usage, mem_usage);
+  for (size_t i = 1; i < num_blocks; ++i) {
+    arena.AllocateAligned(kEntrySize);
+    ASSERT_EQ(mem_usage, arena.MemoryAllocatedBytes());
+    ASSERT_EQ(arena.ApproximateMemoryUsage(), usage + kEntrySize);
+    usage = arena.ApproximateMemoryUsage();
+  }
+  ASSERT_GT(usage, mem_usage);
+}
+
 TEST(ArenaImplTest, Simple) {
-  std::vector<std::pair<size_t, char*> > allocated;
+  std::vector<std::pair<size_t, char*>> allocated;
   ArenaImpl arena_impl;
   const int N = 100000;
   size_t bytes = 0;
@@ -68,8 +93,9 @@
     if (i % (N / 10) == 0) {
       s = i;
     } else {
-      s = rnd.OneIn(4000) ? rnd.Uniform(6000) :
-          (rnd.OneIn(10) ? rnd.Uniform(100) : rnd.Uniform(20));
+      s = rnd.OneIn(4000)
+              ? rnd.Uniform(6000)
+              : (rnd.OneIn(10) ? rnd.Uniform(100) : rnd.Uniform(20));
     }
     if (s == 0) {
       // Our arena disallows size 0 allocations.
@@ -89,7 +115,7 @@
     bytes += s;
     allocated.push_back(std::make_pair(s, r));
     ASSERT_GE(arena_impl.ApproximateMemoryUsage(), bytes);
-    if (i > N/10) {
+    if (i > N / 10) {
       ASSERT_LE(arena_impl.ApproximateMemoryUsage(), bytes * 1.10);
     }
   }
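Usage sketch (not part of the patch): a minimal standalone program showing the new arena behavior, assuming the patched util/arena_impl.h above is on the include path and a 64-bit platform where sizeof(void*) == 8; the concrete numbers below are illustrative only.

    #include <cassert>
    #include <cstdint>
    #include <cstdio>
    #include "util/arena_impl.h"

    int main() {
      // OptimizeBlockSize() clamps the request into [kMinBlockSize, kMaxBlockSize]
      // and rounds it up to a multiple of the pointer size.
      assert(rocksdb::OptimizeBlockSize(100) == 4096);   // clamped to the minimum
      assert(rocksdb::OptimizeBlockSize(5001) == 5008);  // rounded up to 8-byte unit

      rocksdb::ArenaImpl arena(5000);  // 5000 is already a multiple of 8, kept as is

      // Unaligned allocations are carved from the tail of the current block and
      // aligned allocations from its head, so mixing the two wastes little space
      // on alignment padding.
      char* unaligned = arena.Allocate(17);
      char* aligned = arena.AllocateAligned(64);
      assert(reinterpret_cast<std::uintptr_t>(aligned) % sizeof(void*) == 0);
      assert(unaligned > aligned);  // tail vs. head of the same 5000-byte block

      // MemoryAllocatedBytes() counts whole new[]-ed blocks, while
      // ApproximateMemoryUsage() now subtracts the unused remainder of the
      // active block.
      std::printf("allocated: %zu, approx used: %zu\n",
                  arena.MemoryAllocatedBytes(), arena.ApproximateMemoryUsage());
      return 0;
    }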