diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6aa2e1715b92d73aa4e5e97d5e52ffac51451d80..d3379a663db4613e529cdba4ce22111765ff59cc 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -69,6 +69,7 @@ option(WITH_ANAKIN      "Compile with Anakin library"                   OFF)
 option(WITH_GRPC     "Use grpc as the default rpc framework"            ${WITH_DISTRIBUTE})
 option(WITH_BRPC_RDMA     "Use brpc rdma as the rpc protocal"           OFF)
 option(WITH_INFERENCE    "Compile fluid inference library"              ON)
+option(ON_INFER         "Turn on inference optimization."               OFF)
 option(WITH_INFERENCE_API_TEST   "Test fluid inference high-level api interface"  OFF)
 option(WITH_SYSTEM_BLAS   "Use system blas library"           OFF)
 option(PY_VERSION       "Compile PaddlePaddle with python3 support"     ${PY_VERSION})
@@ -179,6 +180,7 @@ include(external/eigen)     # download eigen3
 include(external/pybind11)  # download pybind11
 include(external/cares)
 include(external/cub)
+include(external/xxhash)    # download xxhash
 
 if (NOT WIN32)
 # there is no official support of snappystream, warpctc, nccl, cupti in windows
@@ -301,3 +303,8 @@ if(WITH_DOC)
     find_python_module(recommonmark REQUIRED)
     add_subdirectory(doc)
 endif()
+
+if (ON_INFER)
+    message(WARNING "On inference mode, will take place some specific optimization.")
+    add_definitions(-DPADDLE_ON_INFERENCE)
+endif()
diff --git a/cmake/external/xxhash.cmake b/cmake/external/xxhash.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..4deaab7545c20002fedcad1cca6df54fe9783eb0
--- /dev/null
+++ b/cmake/external/xxhash.cmake
@@ -0,0 +1,55 @@
+# Downloads and builds xxHash v0.6.5 as an external project and exposes it to
+# the rest of the build as the imported static library target xxhash.
+#
+# Variables set for the other CMake files of the project:
+#   XXHASH_INCLUDE_DIR - include directory of the installed xxHash headers
+#   XXHASH_LIBRARIES   - full path of the installed libxxhash.a
+INCLUDE(ExternalProject)
+
+set(XXHASH_SOURCE_DIR ${THIRD_PARTY_PATH}/xxhash)
+set(XXHASH_INSTALL_DIR ${THIRD_PARTY_PATH}/install/xxhash)
+set(XXHASH_INCLUDE_DIR "${XXHASH_INSTALL_DIR}/include")
+
+# When WITH_STATIC_LIB is off, the Makefile of the checked-out sources is
+# edited in place to insert -fPIC after its -Wundef flag before running make.
+IF(WITH_STATIC_LIB)
+  SET(BUILD_CMD make lib)
+ELSE()
+  SET(BUILD_CMD sed -i "s/-Wstrict-prototypes -Wundef/-Wstrict-prototypes -Wundef -fPIC/g" ${XXHASH_SOURCE_DIR}/src/extern_xxhash/Makefile && make lib)
+ENDIF()
+
+# The install prefix is handed to make as a command-line variable, so the
+# install step is a single command and needs no shell built-ins.
+ExternalProject_Add(
+    extern_xxhash
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    GIT_REPOSITORY  "https://github.com/Cyan4973/xxHash"
+    GIT_TAG         "v0.6.5"
+    PREFIX          ${XXHASH_SOURCE_DIR}
+    DOWNLOAD_NAME   "xxhash"
+    UPDATE_COMMAND  ""
+    CONFIGURE_COMMAND ""
+    BUILD_IN_SOURCE 1
+    PATCH_COMMAND     ""
+    BUILD_COMMAND     ${BUILD_CMD}
+    INSTALL_COMMAND   make install PREFIX=${XXHASH_INSTALL_DIR}
+    TEST_COMMAND      ""
+)
+
+set(XXHASH_LIBRARIES "${XXHASH_INSTALL_DIR}/lib/libxxhash.a")
+INCLUDE_DIRECTORIES(${XXHASH_INCLUDE_DIR})
+
+add_library(xxhash STATIC IMPORTED GLOBAL)
+set_property(TARGET xxhash PROPERTY IMPORTED_LOCATION ${XXHASH_LIBRARIES})
+add_dependencies(xxhash extern_xxhash)
+
+LIST(APPEND external_project_dependencies xxhash)
+
+IF(WITH_C_API)
+  INSTALL(DIRECTORY ${XXHASH_INCLUDE_DIR} DESTINATION third_party/xxhash)
+  IF(ANDROID)
+    INSTALL(FILES ${XXHASH_LIBRARIES} DESTINATION third_party/xxhash/lib/${ANDROID_ABI})
+  ELSE()
+    INSTALL(FILES ${XXHASH_LIBRARIES} DESTINATION third_party/xxhash/lib)
+  ENDIF()
+ENDIF()
diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index 67cca09b64c1ed7a503a886e78347d786eae0de7..1047b6f998a74e42114b9deab4f0e7ba1af36835 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -14,6 +14,9 @@
 
 # make package for paddle fluid shared and static library
 function(copy TARGET)
+    if (NOT ON_INFER)
+      message(WARNING "Turn on the ON_INFER flag when building inference_lib only.")
+    endif()
     set(options "")
     set(oneValueArgs "")
     set(multiValueArgs SRCS DSTS DEPS)
@@ -31,7 +34,7 @@ function(copy TARGET)
     foreach(index RANGE ${len})
         list(GET copy_lib_SRCS ${index} src)
         list(GET copy_lib_DSTS ${index} dst)
-        add_custom_command(TARGET ${TARGET} PRE_BUILD 
+        add_custom_command(TARGET ${TARGET} PRE_BUILD
           COMMAND mkdir -p "${dst}"
           COMMAND cp -r "${src}" "${dst}"
           COMMENT "copying ${src} -> ${dst}")
@@ -67,6 +70,13 @@ copy(boost_lib
   DEPS boost
 )
 
+set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/xxhash")
+copy(xxhash_lib
+  SRCS ${XXHASH_INCLUDE_DIR} ${XXHASH_LIBRARIES}
+  DSTS ${dst_dir} ${dst_dir}/lib
+  DEPS xxhash
+)
+
 if(NOT PROTOBUF_FOUND)
     set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/protobuf")
     copy(protobuf_lib
@@ -186,7 +196,7 @@ copy(cmake_cache
   DSTS ${FLUID_INSTALL_DIR})
 
 # This command generates a complete fluid library for both train and inference
-add_custom_target(fluid_lib_dist DEPENDS ${fluid_lib_dist_dep}) 
+add_custom_target(fluid_lib_dist DEPENDS ${fluid_lib_dist_dep})
 
 # Following commands generate a inference-only fluid library
 # third_party, version.txt and CMakeCache.txt are the same position with ${FLUID_INSTALL_DIR}
diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index 587632ef435ed58dce1bfec141d7dd93e794810d..0d90bf3cc12e285f5cafd80180c12ddeb4ad8b51 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -176,6 +176,7 @@ paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label'
 paddle.fluid.layers.maxout ArgSpec(args=['x', 'groups', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.sequence_reverse ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.affine_channel ArgSpec(args=['x', 'scale', 'bias', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(None, None, 'NCHW', None))
+paddle.fluid.layers.hash ArgSpec(args=['input', 'hash_size', 'num_hash', 'name'], varargs=None, keywords=None, defaults=(1, None))
 paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
 paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
 paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None)
diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc
index cb22403de5e05a2f83438c854590376497fda918..11102bc776c2643946dd98219a3ca9ea6a1928cf 100644
--- a/paddle/fluid/framework/ir/graph.cc
+++ b/paddle/fluid/framework/ir/graph.cc
@@ -24,74 +24,6 @@ namespace paddle {
 namespace framework {
 namespace ir {
 
-std::vector<std::string> FindDistTrainSendVars(
-    const std::vector<ir::Node *> &nodes) {
-  std::vector<std::string> send_vars;
-  // since parameters are all in block 0,
-  // it's enough to only scan send ops in block 0
-  for (auto &node : nodes) {
-    auto op_vars = node->Op()->InputArgumentNames();
-    send_vars.reserve(send_vars.size() +
-                      std::distance(op_vars.begin(), op_vars.end()));
-    send_vars.insert(send_vars.end(), op_vars.begin(), op_vars.end());
-  }
-  return send_vars;
-}
-
-std::vector<std::string> FindDistTrainRecvVars(
-    const std::vector<ir::Node *> &nodes) {
-  std::vector<std::string> recv_vars;
-  for (auto &node : nodes) {
-    auto op_vars = node->Op()->OutputArgumentNames();
-    recv_vars.reserve(recv_vars.size() +
-                      std::distance(op_vars.begin(), op_vars.end()));
-    recv_vars.insert(recv_vars.end(), op_vars.begin(), op_vars.end());
-  }
-  return recv_vars;
-}
-
-bool IsDistTrainOp(ir::Node *node, const std::vector<std::string> &send_vars,
-                   const std::vector<std::string> &recv_vars) {
-  if (send_vars.size() == 0 || recv_vars.size() == 0) {
-    return false;
-  }
-
-  /**
-   * Check any of opvars contains `.block` and in sendvars
-   */
-  auto checker = [](const std::vector<std::string> &opvars,
-                    const std::vector<std::string> &rpc_vars) -> bool {
-    for (auto &var : opvars) {
-      // a variable name with the suffix `.block` means it's a splited
-      // variable by (DistributeTranspiler)
-      // [python/paddle/fluid/transpiler/distribute_transpiler.py]
-      if (var.find(".block") != std::string::npos &&
-          std::find(rpc_vars.begin(), rpc_vars.end(), var) != rpc_vars.end()) {
-        return true;
-      }
-
-      if (!(var.find(".block") == std::string::npos &&
-            var.find(".pserver") == std::string::npos) &&
-          std::find(rpc_vars.begin(), rpc_vars.end(), var) != rpc_vars.end()) {
-        return true;
-      }
-    }
-    return false;
-  };
-
-  std::vector<std::string> input_var_names;
-  std::vector<std::string> output_var_names;
-  for (ir::Node *input : node->inputs) {
-    input_var_names.push_back(input->Name());
-  }
-  for (ir::Node *output : node->outputs) {
-    output_var_names.push_back(output->Name());
-  }
-
-  return checker(output_var_names, send_vars) ||
-         checker(input_var_names, recv_vars);
-}
-
 Graph::Graph(const ProgramDesc &program) : program_(program) {
   // Make the nodes id start from 0.
   Node::ResetId();
diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h
index 5d6da9f1d76a3c0fc64b7ff35264e385cf19a14b..d6d42f5e92080aa57445e2d6ce59aa3faa89d22d 100644
--- a/paddle/fluid/framework/ir/node.h
+++ b/paddle/fluid/framework/ir/node.h
@@ -44,6 +44,7 @@ class Node {
     return op_desc_.get();
   }
 
+  // Please don't use this API!
   int id() const { return id_; }
 
   bool IsOp() const { return type_ == Type::kOperation; }
@@ -92,6 +93,7 @@ class Node {
   Node() = delete;
 
   static int count_;
+  // Please don't use this API or make this public.
   static void ResetId() { count_ = 0; }
   DISABLE_COPY_AND_ASSIGN(Node);
 };
diff --git a/paddle/fluid/framework/lod_tensor_array.h b/paddle/fluid/framework/lod_tensor_array.h
index 6d7b6a4ada8729e3698dab5d2b1861aac632be79..0ad6a709008406257d6c0a220bce38bb24e188cd 100644
--- a/paddle/fluid/framework/lod_tensor_array.h
+++ b/paddle/fluid/framework/lod_tensor_array.h
@@ -18,6 +18,107 @@ limitations under the License. */
 
 namespace paddle {
 namespace framework {
+
+// NOTE The vector<LoDTensor> can't be replaced with the class LoDTensorArray
+// directly, because there are many vector<LoDTensor> used across the project,
+// and some of them are treated as LoDTensorArray.
+#if !defined(PADDLE_ON_INFERENCE)
+
 using LoDTensorArray = std::vector<LoDTensor>;
-}
+
+#else  // !PADDLE_ON_INFERENCE
+
+#pragma message "LoDTensorArray is replaced with the inference one."
+/*
+ * A LoDTensorArray which will not deallocate buffer when resized, fix the data
+ * diff in inference, and more performance friendly in the concurrency
+ * scenarios.
+ *
+ * The array tracks a logical length (size_) that may be smaller than the
+ * number of tensors held by the backing vector (array_). clear() and a
+ * shrinking resize() only lower size_, so the tensors past the logical end
+ * stay in array_ and their slots are used again when the array grows.
+ */
+class LoDTensorArray {
+ public:
+  LoDTensorArray() = default;
+
+  using iterator = std::vector<LoDTensor>::iterator;
+  using const_iterator = std::vector<LoDTensor>::const_iterator;
+
+  // Iteration covers the logical range [0, size()) only.
+  const_iterator begin() const { return array_.begin(); }
+  const_iterator end() const { return array_.begin() + size_; }
+  iterator begin() { return array_.begin(); }
+  iterator end() { return array_.begin() + size_; }
+
+  // Appends a copy of x at the logical end. A slot kept from an earlier,
+  // longer state is overwritten before the backing vector is grown.
+  void push_back(const LoDTensor& x) {
+    if (size_ < array_.size()) {
+      array_[size_++] = x;
+    } else {
+      array_.push_back(x);
+      ++size_;
+    }
+  }
+
+  // Sets the logical length. The backing vector is enlarged when needed but
+  // never shrunk.
+  void resize(size_t size) {
+    if (array_.size() < size) {
+      array_.resize(size);
+    }
+    size_ = size;
+  }
+
+  // Appends a default-constructed tensor at the logical end.
+  void emplace_back() { push_back(LoDTensor()); }
+
+  // Moves x to the logical end, reusing a kept slot when one is available.
+  void emplace_back(LoDTensor&& x) {
+    if (size_ < array_.size()) {
+      array_[size_] = std::move(x);
+    } else {
+      array_.emplace_back(std::move(x));
+    }
+    ++size_;
+  }
+
+  // Last element of the logical range; must not be called on an empty array.
+  LoDTensor& back() { return array_[size_ - 1]; }
+
+  // Number of tensors held by the backing vector, which may exceed size().
+  size_t space() const { return array_.size(); }
+
+  void reserve(size_t size) {
+    // Naive warning to tell user this array might be too large. The memory and
+    // buffer used by this TensorArray will not be deleted during the training
+    // and inference phase, so attention not to make it expand too long.
+    if (size > 800UL) {
+      LOG(WARNING) << "TensorArray has more than 800 items";
+    }
+    array_.reserve(size);
+  }
+
+  bool empty() const { return size_ == 0UL; }
+  // Resets the logical length only; the tensors stay in the backing vector.
+  void clear() { size_ = 0UL; }
+
+  // NOTE: element access indexes the backing vector, so at() checks the index
+  // against space() rather than size().
+  LoDTensor& operator[](size_t id) { return array_[id]; }
+  const LoDTensor& operator[](size_t id) const { return array_[id]; }
+  LoDTensor& at(size_t id) { return array_.at(id); }
+  const LoDTensor& at(size_t id) const { return array_.at(id); }
+
+  size_t size() const { return size_; }
+
+ private:
+  size_t size_{0};
+  std::vector<LoDTensor> array_;
+};
+#endif  // !PADDLE_ON_INFERENCE
+
+}  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/op_desc.h b/paddle/fluid/framework/op_desc.h
index 440e0509be727ec2b84abc76fca44edda11f8a0a..30c8a26c3d2f0068674aa70b4ff875a2f73c1dca 100644
--- a/paddle/fluid/framework/op_desc.h
+++ b/paddle/fluid/framework/op_desc.h
@@ -121,10 +121,6 @@ class OpDesc {
 
   BlockDesc *Block() { return this->block_; }
 
-  const BlockDesc &BlockRef() const { return *this->block_; }
-
-  void SetBlock(BlockDesc *block) { this->block_ = block; }
-
  private:
   template <typename MapType>
   static std::vector<typename MapType::key_type> MapKeys(const MapType &map) {
diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h
index 14f9f36812d690fc4a7440f2e7e6a85e9993a535..9462620e829ec815e1553f6378a67463ea3b8aa3 100644
--- a/paddle/fluid/framework/scope.h
+++ b/paddle/fluid/framework/scope.h
@@ -78,6 +78,9 @@ class Scope {
   /// Drop all kids scopes belonged to this scope.
   void DropKids();
 
+  /// Read-only access to the kid scopes of this scope.
+  const std::list<Scope*>& kids() const { return kids_; }
+
   /// Find if a scope exists in the kid scopes
   bool HasKid(const Scope* scope) const;
 
diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt
index 9794a193bcfaae19552b1f6fbdf2dab2898033d5..dbbe8bcba69a1d87e21c8eae18834fb708e8b1e4 100644
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -30,7 +30,7 @@ if (WITH_GPU AND TENSORRT_FOUND)
 endif()
 
 # Create static library
-cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor)
+cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array)
 
 if(NOT APPLE)
   # TODO(liuyiqu: Temporarily disable the link flag because it is not support on Mac.
@@ -40,7 +40,7 @@ endif()
 
 # Create shared library
 cc_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS}
-    DEPS ${fluid_modules} paddle_fluid_api)
+    DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array)
 
 set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid)
 if(NOT APPLE)
diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt
index 0ddd5d53f836131fe37d412fc867cb38f11ee2b5..e2027b7cb4d584ffcc48624d2c01e65a61829975 100644
--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
@@ -18,7 +18,8 @@ if(APPLE)
 endif(APPLE)
 
 
-set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager naive_executor ${GLOB_PASS_LIB})
+set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager naive_executor ${GLOB_PASS_LIB}
+        )
 
 if(WITH_GPU AND TENSORRT_FOUND)
     set(inference_deps ${inference_deps} paddle_inference_tensorrt_subgraph_engine analysis_predictor)
@@ -31,10 +32,17 @@ function(inference_api_test TARGET_NAME)
         set(multiValueArgs ARGS)
         cmake_parse_arguments(inference_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
 
-        cc_test(${TARGET_NAME}
-                SRCS ${inference_test_SRC}
-                DEPS "${inference_deps}"
-                ARGS --dirname=${PYTHON_TESTS_DIR}/book/)
+	if (WITH_GPU)
+		cc_test(${TARGET_NAME}
+			SRCS ${inference_test_SRC}
+			DEPS "${inference_deps}"
+			ARGS --dirname=${PYTHON_TESTS_DIR}/book/ --fraction_of_gpu_memory_to_use=0.15)
+        else()
+		cc_test(${TARGET_NAME}
+			SRCS ${inference_test_SRC}
+			DEPS "${inference_deps}"
+			ARGS --dirname=${PYTHON_TESTS_DIR}/book/)
+	endif()
         if(inference_test_ARGS)
             set_tests_properties(${TARGET_NAME}
                     PROPERTIES DEPENDS "${inference_test_ARGS}")
@@ -42,7 +50,8 @@ function(inference_api_test TARGET_NAME)
     endif(WITH_TESTING)
 endfunction(inference_api_test)
 
-cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope)
+cc_library(reset_tensor_array SRCS details/reset_tensor_array.cc DEPS lod_tensor scope)
+cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS reset_tensor_array lod_tensor scope)
 cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis naive_executor zero_copy_tensor)
 cc_library(zero_copy_tensor SRCS details/zero_copy_tensor.cc DEPS paddle_inference_api)
 cc_library(zero_copy_tensor_dummy SRCS details/zero_copy_tensor_dummy.cc DEPS paddle_inference_api)
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index eec665767164dc6e79738890947c54d7f7217037..54c37fe64590aa82d7100c93c4c5c4ee36491421 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -82,6 +82,7 @@ bool AnalysisPredictor::Init(
 
   // Get the feed_target_names and fetch_target_names
   PrepareFeedFetch();
+
   return true;
 }
 
@@ -109,6 +110,10 @@ bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
     return false;
   }
   VLOG(3) << "predict cost: " << timer.toc() << "ms";
+
+  // Fix TensorArray reuse not cleaned bug.
+  tensor_array_batch_cleaner_.CollectTensorArrays(scope_.get());
+  tensor_array_batch_cleaner_.ResetTensorArray();
   return true;
 }
 
@@ -322,6 +327,9 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetOutputTensor(
 
 bool AnalysisPredictor::ZeroCopyRun() {
   executor_->Run();
+  // Fix TensorArray reuse not cleaned bug.
+  tensor_array_batch_cleaner_.CollectTensorArrays(scope_.get());
+  tensor_array_batch_cleaner_.ResetTensorArray();
   return true;
 }
 
diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h
index 5a9f4d36959d4ee7ca16dec769d9d1283b8787cb..b7dc2067332278c1c38df4beefb5059efe76417f 100644
--- a/paddle/fluid/inference/api/analysis_predictor.h
+++ b/paddle/fluid/inference/api/analysis_predictor.h
@@ -18,6 +18,7 @@
 #include "paddle/fluid/framework/naive_executor.h"
 #include "paddle/fluid/inference/analysis/analyzer.h"
 #include "paddle/fluid/inference/api/api_impl.h"
+#include "paddle/fluid/inference/api/details/reset_tensor_array.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/string/printf.h"
 
@@ -88,6 +89,7 @@ class AnalysisPredictor : public PaddlePredictor {
   // Memory buffer for feed inputs. The temporary LoDTensor will cause serious
   // concurrency problems, so cache them.
   std::vector<framework::LoDTensor> feed_tensors_;
+  details::TensorArrayBatchCleaner tensor_array_batch_cleaner_;
 };
 
 }  // namespace paddle
diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc
index 7cda9c5d8a8366bd097491f37f5352a10e4fb16c..d06ab8f8c8e3c0adf4a4074eb1450012126e83ea 100644
--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@@ -22,6 +22,7 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/feed_fetch_method.h"
 #include "paddle/fluid/inference/api/api_impl.h"
+#include "paddle/fluid/inference/api/details/reset_tensor_array.h"
 #include "paddle/fluid/inference/api/helper.h"
 #include "paddle/fluid/platform/cpu_helper.h"
 #include "paddle/fluid/platform/profiler.h"
@@ -157,6 +158,10 @@ bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
     return false;
   }
   VLOG(3) << "predict cost: " << timer.toc() << "ms";
+
+  // Fix TensorArray reuse not cleaned bug.
+  tensor_array_batch_cleaner_.CollectTensorArrays(scope_.get());
+  tensor_array_batch_cleaner_.ResetTensorArray();
   return true;
 }
 
diff --git a/paddle/fluid/inference/api/api_impl.h b/paddle/fluid/inference/api/api_impl.h
index 7882f6a53c7ce9a2486158ea9b50c018d1814091..4e4ab47ca9c5e37f2714ebd48d250c23c7e9b117 100644
--- a/paddle/fluid/inference/api/api_impl.h
+++ b/paddle/fluid/inference/api/api_impl.h
@@ -26,11 +26,11 @@ limitations under the License. */
 #include <string>
 #include <vector>
 
-#include "paddle/fluid/inference/api/paddle_inference_api.h"
-
 #include "paddle/fluid/framework/ddim.h"
 #include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/lod_tensor_array.h"
 #include "paddle/fluid/framework/naive_executor.h"
+#include "paddle/fluid/inference/api/details/reset_tensor_array.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/inference/io.h"
 #include "paddle/fluid/platform/init.h"
@@ -77,6 +77,7 @@ class NativePaddlePredictor : public PaddlePredictor {
   std::vector<framework::OpDesc *> fetchs_;
   // Do not use unique_ptr, use parent scope to delete
   framework::Scope *sub_scope_{nullptr};
+  details::TensorArrayBatchCleaner tensor_array_batch_cleaner_;
 };
 
 }  // namespace paddle
diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
index 03f0f726eb61c2619c7719a865383090f86b5b7f..49683eab07a2f5bc008272038a27bdb277396284 100644
--- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
+++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
@@ -52,6 +52,7 @@ include_directories("${PADDLE_LIB}")
 include_directories("${PADDLE_LIB}/third_party/install/protobuf/include")
 include_directories("${PADDLE_LIB}/third_party/install/glog/include")
 include_directories("${PADDLE_LIB}/third_party/install/gflags/include")
+include_directories("${PADDLE_LIB}/third_party/install/xxhash/include")
 if (NOT WIN32)
 include_directories("${PADDLE_LIB}/third_party/install/snappy/include")
 include_directories("${PADDLE_LIB}/third_party/install/snappystream/include")
@@ -61,8 +62,8 @@ endif(NOT WIN32)
 include_directories("${PADDLE_LIB}/third_party/boost")
 include_directories("${PADDLE_LIB}/third_party/eigen3")
 
-if (NOT WIN32) 
-  if (USE_TENSORRT AND WITH_GPU) 
+if (NOT WIN32)
+  if (USE_TENSORRT AND WITH_GPU)
       include_directories("${TENSORRT_INCLUDE_DIR}")
       link_directories("${TENSORRT_LIB_DIR}")
   endif()
@@ -77,13 +78,14 @@ endif(NOT WIN32)
 link_directories("${PADDLE_LIB}/third_party/install/protobuf/lib")
 link_directories("${PADDLE_LIB}/third_party/install/glog/lib")
 link_directories("${PADDLE_LIB}/third_party/install/gflags/lib")
+link_directories("${PADDLE_LIB}/third_party/install/xxhash/lib")
 link_directories("${PADDLE_LIB}/paddle/lib")
 
 add_executable(${DEMO_NAME} ${DEMO_NAME}.cc)
 
 if(WITH_MKL)
   include_directories("${PADDLE_LIB}/third_party/install/mklml/include")
-  set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX} 
+  set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX}
                ${PADDLE_LIB}/third_party/install/mklml/lib/libiomp5${CMAKE_SHARED_LIBRARY_SUFFIX})
   set(MKLDNN_PATH "${PADDLE_LIB}/third_party/install/mkldnn")
   if(EXISTS ${MKLDNN_PATH})
@@ -107,7 +109,7 @@ if (NOT WIN32)
 set(EXTERNAL_LIB "-lrt -ldl -lpthread")
 set(DEPS ${DEPS}
     ${MATH_LIB} ${MKLDNN_LIB}
-    glog gflags protobuf snappystream snappy z
+    glog gflags protobuf snappystream snappy z xxhash
     ${EXTERNAL_LIB})
 else()
 set(DEPS ${DEPS}
@@ -120,7 +122,7 @@ endif(NOT WIN32)
 
 if(WITH_GPU)
   if(NOT WIN32)
-    if (USE_TENSORRT) 
+    if (USE_TENSORRT)
       set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/libnvinfer${CMAKE_STATIC_LIBRARY_SUFFIX})
       set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/libnvinfer_plugin${CMAKE_STATIC_LIBRARY_SUFFIX})
     endif()
diff --git a/paddle/fluid/inference/api/demo_ci/run.sh b/paddle/fluid/inference/api/demo_ci/run.sh
index 6e682b69583e00ab1bbe1c0d22e21ae114a61a76..340e84d9312c20e2d10eb4c0a13066511d5d2eb5 100755
--- a/paddle/fluid/inference/api/demo_ci/run.sh
+++ b/paddle/fluid/inference/api/demo_ci/run.sh
@@ -16,7 +16,7 @@ if [ $2 == ON ]; then
 fi
 if [ $3 == ON ]; then
   use_gpu_list='true false'
-else    
+else
   use_gpu_list='false'
 fi
 
@@ -60,7 +60,8 @@ for WITH_STATIC_LIB in ON OFF; do
     -DWITH_MKL=$TURN_ON_MKL \
     -DDEMO_NAME=simple_on_word2vec \
     -DWITH_GPU=$TEST_GPU_CPU \
-    -DWITH_STATIC_LIB=$WITH_STATIC_LIB
+    -DWITH_STATIC_LIB=$WITH_STATIC_LIB \
+    -DON_INFER=ON
   make -j
   word2vec_model=${PADDLE_ROOT}'/build/python/paddle/fluid/tests/book/word2vec.inference.model'
   if [ -d $word2vec_model ]; then
@@ -80,10 +81,11 @@ for WITH_STATIC_LIB in ON OFF; do
     -DWITH_MKL=$TURN_ON_MKL \
     -DDEMO_NAME=vis_demo \
     -DWITH_GPU=$TEST_GPU_CPU \
-    -DWITH_STATIC_LIB=$WITH_STATIC_LIB
+    -DWITH_STATIC_LIB=$WITH_STATIC_LIB \
+    -DON_INFER=ON
   make -j
   for use_gpu in $use_gpu_list; do
-    for vis_demo_name in $vis_demo_list; do 
+    for vis_demo_name in $vis_demo_list; do
       ./vis_demo \
         --modeldir=$DATA_DIR/$vis_demo_name/model \
         --data=$DATA_DIR/$vis_demo_name/data.txt \
@@ -95,7 +97,7 @@ for WITH_STATIC_LIB in ON OFF; do
       fi
     done
   done
-  
+
   # --------tensorrt mobilenet------
   if [ $USE_TENSORRT == ON -a $TEST_GPU_CPU == ON ]; then
     rm -rf *
@@ -106,8 +108,9 @@ for WITH_STATIC_LIB in ON OFF; do
       -DWITH_STATIC_LIB=$WITH_STATIC_LIB \
       -DUSE_TENSORRT=$USE_TENSORRT \
       -DTENSORRT_INCLUDE_DIR=$TENSORRT_INCLUDE_DIR \
-      -DTENSORRT_LIB_DIR=$TENSORRT_LIB_DIR
-    make -j 
+      -DTENSORRT_LIB_DIR=$TENSORRT_LIB_DIR \
+      -DON_INFER=ON
+    make -j
     ./trt_mobilenet_demo \
       --modeldir=$DATA_DIR/mobilenet/model \
       --data=$DATA_DIR/mobilenet/data.txt \
diff --git a/paddle/fluid/inference/api/details/reset_tensor_array.cc b/paddle/fluid/inference/api/details/reset_tensor_array.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4ae6c6dc9f44650c1c62f5be5448864d817513b1
--- /dev/null
+++ b/paddle/fluid/inference/api/details/reset_tensor_array.cc
@@ -0,0 +1,62 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/api/details/reset_tensor_array.h"
+
+namespace paddle {
+namespace details {
+
+namespace {
+
+// Appends every LoDTensorArray variable owned by scope, and by all of the
+// scopes below it, to arrays. The "feed" and "fetch" variables are skipped.
+void CollectFromScopeTree(framework::Scope *scope,
+                          std::vector<framework::LoDTensorArray *> *arrays) {
+  for (auto &var_name : scope->LocalVarNames()) {
+    // TODO(Superjomn) should avoid the case when a TensorArray is a
+    // parameter.
+    if (var_name == "feed" || var_name == "fetch") continue;
+    auto *var = scope->FindVar(var_name);
+    if (var->Type() == typeid(framework::LoDTensorArray)) {
+      VLOG(4) << "collect " << var_name;
+      arrays->push_back(var->GetMutable<framework::LoDTensorArray>());
+    }
+  }
+  for (auto *kid : scope->kids()) {
+    CollectFromScopeTree(kid, arrays);
+  }
+}
+
+}  // namespace
+
+// Should be called after the parameters are loaded. Only the first call scans
+// the scope tree; later calls return immediately.
+void TensorArrayBatchCleaner::CollectTensorArrays(framework::Scope *scope) {
+  if (!flag_) return;
+  // flag_ is cleared only after the whole tree was walked; clearing it inside
+  // the recursion would stop the scan after the first kid scope.
+  CollectFromScopeTree(scope, &arrays_);
+  VLOG(3) << "Collect " << arrays_.size() << " arrays";
+  flag_ = false;
+}
+
+// Should be called when `Run` finished.
+void TensorArrayBatchCleaner::ResetTensorArray() {
+  for (auto *arr : arrays_) {
+    arr->clear();
+  }
+}
+
+}  // namespace details
+}  // namespace paddle
diff --git a/paddle/fluid/inference/api/details/reset_tensor_array.h b/paddle/fluid/inference/api/details/reset_tensor_array.h
new file mode 100644
index 0000000000000000000000000000000000000000..a39449ff0e67786815dfb8d2d30d79dcdba757d7
--- /dev/null
+++ b/paddle/fluid/inference/api/details/reset_tensor_array.h
@@ -0,0 +1,37 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <vector>
+#include "paddle/fluid/framework/lod_tensor_array.h"
+#include "paddle/fluid/framework/scope.h"
+
+namespace paddle {
+namespace details {
+
+// Clears the LoDTensorArray variables found under a scope after each batch,
+// to make the behavior the same with the training phase.
+struct TensorArrayBatchCleaner {
+  // Collect the arrays once (later calls are no-ops), then clear them per run.
+  void CollectTensorArrays(framework::Scope *scope);
+  void ResetTensorArray();
+
+ private:
+  bool flag_{true};
+  std::vector<framework::LoDTensorArray *> arrays_;
+};
+
+}  // namespace details
+}  // namespace paddle
diff --git a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
index 6399476680c0af83a6d26aea952c58543bdce9ae..e0416ff953b61f56a2ca1a45cb382d40a6cffa4a 100644
--- a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
@@ -228,6 +228,7 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
 TEST(Analyzer_rnn1, profile) {
   contrib::AnalysisConfig cfg;
   SetConfig(&cfg);
+  cfg.use_gpu = false;
   std::vector<PaddleTensor> outputs;
 
   std::vector<std::vector<PaddleTensor>> input_slots_all;
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 78ef6f207eadea6799864fe22889103b468d1780..0d51cb92618170cb422cb49ba63ba54ae6608ef4 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -268,6 +268,7 @@ if (WITH_GPU AND TENSORRT_FOUND)
 else()
     set(DEPS_OPS ${DEPS_OPS} tensorrt_engine_op)
 endif()
+op_library(hash_op DEPS xxhash)
 op_library(clip_by_norm_op DEPS selected_rows_functor selected_rows)
 op_library(sum_op DEPS selected_rows_functor)
 op_library(sgd_op DEPS selected_rows_functor)
diff --git a/paddle/fluid/operators/beam_search_decode_op.cc b/paddle/fluid/operators/beam_search_decode_op.cc
index b6cb935814e25b31d4104f9ce24fe952680cb491..0d32cae0e1e5ff274793df50e854283d8e2f7bf8 100644
--- a/paddle/fluid/operators/beam_search_decode_op.cc
+++ b/paddle/fluid/operators/beam_search_decode_op.cc
@@ -79,6 +79,9 @@ struct BeamSearchDecodeFunctor {
   bool tensor_on_gpu_;
   size_t beam_size_;
   int end_id_;
+  // TODO(Superjomn): this might cause serious performance issues in
+  // concurrent scenarios; it still needs to be measured and, if confirmed,
+  // addressed.
   const LoDTensorArray& step_ids_origin_;
   const LoDTensorArray& step_scores_origin_;
   LoDTensorArray step_ids_ = LoDTensorArray();
diff --git a/paddle/fluid/operators/hash_op.cc b/paddle/fluid/operators/hash_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b9ebe71a3d7ae270a10a45f4805652415078b363
--- /dev/null
+++ b/paddle/fluid/operators/hash_op.cc
@@ -0,0 +1,74 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/hash_op.h"
+#include <string>
+#include <vector>
+
+namespace paddle {
+namespace operators {
+
+class HashOp : public framework::OperatorWithKernel {
+ public:
+  HashOp(const std::string &type, const framework::VariableNameMap &inputs,
+         const framework::VariableNameMap &outputs,
+         const framework::AttributeMap &attrs)
+      : OperatorWithKernel(type, inputs, outputs, attrs) {}
+
+  // Out keeps every dim of X but the last, then appends [num_hash, 1].
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of HashOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of HashOp should not be null.");
+
+    const auto x_dims = ctx->GetInputDim("X");
+    PADDLE_ENFORCE_EQ(x_dims.size(), 2UL,
+                      "The input of hash_op's dimensions must be 2");
+    const int num_hash = ctx->Attrs().Get<int>("num_hash");
+    std::vector<int64_t> out_shape;
+    out_shape.reserve(x_dims.size() + 1);
+    // Leading dims of X are carried over unchanged.
+    for (int axis = 0; axis + 1 < x_dims.size(); ++axis) {
+      out_shape.push_back(x_dims[axis]);
+    }
+    out_shape.push_back(num_hash);  // one slot per requested hash
+    out_shape.push_back(1);         // trailing dim is always 1
+
+    ctx->SetOutputDim("Out", framework::make_ddim(out_shape));
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+};
+
+class HashOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {  // declares the op's I/O, docs and attributes
+    AddInput("X", "(Tensor) Input tensor of hash operator.");
+    AddOutput("Out", "(Tensor) Output tensor of hash operator.");
+    AddComment(R"DOC(
+**Hash Operator**
+Out[i][k] = XXH64(row i of X, seed = k) % mod_by, for k in [0, num_hash).
+)DOC");
+    AddAttr<int>("num_hash", "Hash count per input row.").SetDefault(1);
+    AddAttr<int>("mod_by", "Modulus applied to each hash.").SetDefault(100000);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_WITHOUT_GRADIENT(hash, ops::HashOp, ops::HashOpMaker);
+REGISTER_OP_CPU_KERNEL(hash, ops::HashKerel<int>, ops::HashKerel<int64_t>);
diff --git a/paddle/fluid/operators/hash_op.h b/paddle/fluid/operators/hash_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..9781bb0f453642cefb3eb59a05389c339a7de39d
--- /dev/null
+++ b/paddle/fluid/operators/hash_op.h
@@ -0,0 +1,56 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+extern "C" {
+#include <xxhash.h>
+}
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+// Hashes each row of X num_hash times with XXH64; Out holds digest % mod_by.
+template <typename T>
+class HashKerel : public framework::OpKernel<T> {
+ public:
+  virtual void Compute(const framework::ExecutionContext& context) const {
+    auto* out_t = context.Output<framework::LoDTensor>("Out");
+    auto* in_t = context.Input<framework::LoDTensor>("X");
+    int mod_by = context.Attr<int>("mod_by");
+    int num_hash = context.Attr<int>("num_hash");
+    auto* output = out_t->mutable_data<T>(context.GetPlace());
+
+    auto in_dims = in_t->dims();
+    const auto& in_lod = in_t->lod();  // bind by reference, no LoD copy
+    PADDLE_ENFORCE_EQ(
+        static_cast<uint64_t>(in_dims[0]), in_lod[0].back(),
+        "The actual input data's size mismatched with LoD information.");
+    auto seq_length = in_dims[0];
+    auto last_dim = in_dims[in_dims.size() - 1];
+    auto* input = in_t->data<T>();
+    for (int idx = 0; idx < seq_length; ++idx) {
+      for (int ihash = 0; ihash != num_hash; ++ihash) {
+        // Hash the whole row: last_dim elements of type T, not of int.
+        output[idx * num_hash + ihash] =
+            XXH64(input, sizeof(T) * last_dim, ihash) % mod_by;
+      }
+      input += last_dim;
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/lookup_table_op.cc b/paddle/fluid/operators/lookup_table_op.cc
index a4d1e812a54e8d92750c991d09860ab974e3e56d..3226a727b1f5f6de9e97ce2068381be7c9b69ff3 100644
--- a/paddle/fluid/operators/lookup_table_op.cc
+++ b/paddle/fluid/operators/lookup_table_op.cc
@@ -81,6 +81,12 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
                      "Otherwise the given value indicates padding the output "
                      "with zeros whenever lookup encounters it in Ids.")
         .SetDefault(kNoPadding);
+    // NOTE(minqiyang): grad_inplace is a temporary attribute,
+    // please do NOT set this attribute in the python layer.
+    AddAttr<bool>("grad_inplace",
+                  "(boolean, default false) "
+                  "If the grad op reuse the input's variable.")
+        .SetDefault(false);
     AddComment(R"DOC(
 Lookup Table Operator.
 
diff --git a/paddle/fluid/operators/lookup_table_op.h b/paddle/fluid/operators/lookup_table_op.h
index 58463dc4d6fd7cc3454de766814a947fee161070..e504c4f0cd5c0feaef4a251fad57b389a10a2ce7 100644
--- a/paddle/fluid/operators/lookup_table_op.h
+++ b/paddle/fluid/operators/lookup_table_op.h
@@ -21,6 +21,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/operators/math/blas.h"
 
 namespace paddle {
 namespace operators {
@@ -68,6 +69,7 @@ class LookupTableKernel : public framework::OpKernel<T> {
       const auto *table = table_t.value().data<T>();
       auto *output = output_t->mutable_data<T>(context.GetPlace());
 
+      auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
       for (int64_t i = 0; i < ids_numel; ++i) {
         if (padding_idx != kNoPadding && ids[i] == padding_idx) {
           memset(output + i * row_width, 0, row_width * sizeof(T));
@@ -75,8 +77,8 @@ class LookupTableKernel : public framework::OpKernel<T> {
           PADDLE_ENFORCE_GE(ids[i], 0);
           auto id_index = table_t.Index(ids[i]);
           PADDLE_ENFORCE_GE(id_index, 0, "the input key should be exists.");
-          memcpy(output + i * row_width, table + id_index * row_width,
-                 row_width * sizeof(T));
+          blas.VCOPY(row_width, table + id_index * row_width,
+                     output + i * row_width);
         }
       }
     }
@@ -111,27 +113,37 @@ class LookupTableGradKernel : public framework::OpKernel<T> {
       auto *ids_data = ids->data<int64_t>();
       int64_t ids_num = ids->numel();
 
-      framework::Vector<int64_t> new_rows;
-      new_rows.reserve(ids_num);
-      for (int64_t i = 0; i < ids_num; i++) {
-        new_rows.push_back(ids_data[i]);
-      }
+      std::vector<int64_t> new_rows;
+      new_rows.resize(ids_num);
+      std::memcpy(&new_rows[0], ids_data, ids_num * sizeof(int64_t));
       d_table->set_rows(new_rows);
 
       auto *d_table_value = d_table->mutable_value();
       d_table_value->Resize({ids_num, table_dim[1]});
-      d_table_value->mutable_data<T>(context.GetPlace());
-
-      d_table->set_height(table_dim[0]);
-
-      auto *d_output_data = d_output->data<T>();
-      auto *d_table_data = d_table_value->data<T>();
-
-      auto d_output_dims = d_output->dims();
-      PADDLE_ENFORCE_EQ(
-          d_table_value->dims(),
-          framework::flatten_to_2d(d_output_dims, d_output_dims.size() - 1));
-      memcpy(d_table_data, d_output_data, sizeof(T) * d_output->numel());
+      // FIXME(minqiyang):
+      // memory optimization will NOT reuse Tensor with SelectedRows
+      // so we could just share the tensor here directly.
+      // However, the InferVarType method will infer the output SelectedRows
+      // to Tensor sometimes, which is a bug, so we will add an attribute
+      // here to indicate the inplace and remove this attribute after
+      // the InferVarType's bug was fixed
+      bool grad_inplace = context.Attr<bool>("grad_inplace");
+      if (grad_inplace) {
+        d_table_value->ShareDataWith(*d_output);
+      } else {
+        d_table_value->mutable_data<T>(context.GetPlace());
+
+        d_table->set_height(table_dim[0]);
+
+        auto *d_output_data = d_output->data<T>();
+        auto *d_table_data = d_table_value->data<T>();
+
+        auto d_output_dims = d_output->dims();
+        PADDLE_ENFORCE_EQ(
+            d_table_value->dims(),
+            framework::flatten_to_2d(d_output_dims, d_output_dims.size() - 1));
+        memcpy(d_table_data, d_output_data, sizeof(T) * d_output->numel());
+      }
     } else {
       auto *ids = context.Input<LoDTensor>("Ids");
       auto *d_output = context.Input<LoDTensor>(framework::GradVarName("Out"));
diff --git a/paddle/fluid/train/demo/CMakeLists.txt b/paddle/fluid/train/demo/CMakeLists.txt
index 78d6e5ff554b9cd9facae85be166a697e0b75337..eabb51d370aff709e289e1fc727aa2dbb551d82e 100644
--- a/paddle/fluid/train/demo/CMakeLists.txt
+++ b/paddle/fluid/train/demo/CMakeLists.txt
@@ -15,6 +15,7 @@ include_directories("${PADDLE_LIB}")
 include_directories("${PADDLE_LIB}/third_party/install/protobuf/include")
 include_directories("${PADDLE_LIB}/third_party/install/glog/include")
 include_directories("${PADDLE_LIB}/third_party/install/gflags/include")
+include_directories("${PADDLE_LIB}/third_party/install/xxhash/include")
 include_directories("${PADDLE_LIB}/third_party/install/snappy/include")
 include_directories("${PADDLE_LIB}/third_party/install/snappystream/include")
 include_directories("${PADDLE_LIB}/third_party/install/zlib/include")
@@ -27,6 +28,7 @@ link_directories("${PADDLE_LIB}/third_party/install/snappystream/lib")
 link_directories("${PADDLE_LIB}/third_party/install/protobuf/lib")
 link_directories("${PADDLE_LIB}/third_party/install/glog/lib")
 link_directories("${PADDLE_LIB}/third_party/install/gflags/lib")
+link_directories("${PADDLE_LIB}/third_party/install/xxhash/lib")
 link_directories("${PADDLE_LIB}/third_party/install/zlib/lib")
 
 add_executable(demo_trainer demo_trainer.cc)
@@ -62,5 +64,5 @@ target_link_libraries(demo_trainer
         ${ARCHIVE_END}
         ${MATH_LIB}
         ${MKLDNN_LIB}
-        glog gflags protobuf snappystream snappy z
+        glog gflags protobuf snappystream snappy z xxhash
         ${EXTERNAL_LIB})
diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index 85493c10549c290330ed09b9f28accb7a980de6a..5a71382fb14b64989502c34d8ac0aa13c62bc7d0 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -95,9 +95,9 @@ function cmake_gen() {
                 exit 1
             fi
         fi
-    else 
+    else
         if [ "$1" != "" ]; then
-            echo "using python abi: $1"     
+            echo "using python abi: $1"
             if [ "$1" == "cp27-cp27m" ]; then
                 export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.11-ucs4/lib:}
                 export PATH=/opt/python/cp27-cp27m/bin/:${PATH}
@@ -119,7 +119,7 @@ function cmake_gen() {
            fi
         fi
     fi
-    
+
     if [ "$SYSTEM" == "Darwin" ]; then
         WITH_DISTRIBUTE=${WITH_DISTRIBUTE:-ON}
         WITH_AVX=${WITH_AVX:-ON}
@@ -127,7 +127,7 @@ function cmake_gen() {
     else
         INFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR:-/root/.cache/inference_demo}
     fi
-    
+
     cat <<EOF
     ========================================
     Configuring cmake in /paddle/build ...
@@ -394,8 +394,8 @@ EOF
         export http_proxy=
         export https_proxy=
         # TODO: jiabin need to refine this part when these tests fixed on mac
-        ctest --output-on-failure -j $1     
-        # make install should also be test when unittest 
+        ctest --output-on-failure -j $1
+        # make install should also be test when unittest
         make install -j 8
         pip install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl
         if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]] ; then
@@ -659,7 +659,7 @@ function gen_fluid_lib() {
     Generating fluid library for train and inference ...
     ========================================
 EOF
-        cmake .. -DWITH_DISTRIBUTE=OFF
+        cmake .. -DWITH_DISTRIBUTE=OFF -DON_INFER=ON
         make -j `nproc` fluid_lib_dist
         make -j `nproc` inference_lib_dist
       fi
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 48d2857a678856924e324c1188c378dfa333dc5c..4bfa89d9facf1d368e3018a248dc090c81c3402e 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -156,6 +156,7 @@ __all__ = [
     'maxout',
     'sequence_reverse',
     'affine_channel',
+    'hash',
 ]
 
 
@@ -1991,17 +1992,17 @@ def sequence_slice(input, offset, length, name=None):
     """
     **Sequence Slice Layer**
 
-    The layer crops a subsequence from given sequence with given start 
+    The layer crops a subsequence from given sequence with given start
     offset and subsequence length.
 
     It only supports sequence data (LoDTensor with lod_level equal to 1).
 
     .. code-block:: text
-    
+
 	- Case:
 
             Given the input Variable **input**:
-                
+
                 input.data = [[a1, a2], [b1, b2], [c1, c2], [d1, d2], [e1, e2]],
                 input.lod = [[3, 2]],
                 input.dims = (5, 2),
@@ -2009,16 +2010,16 @@ def sequence_slice(input, offset, length, name=None):
             with offset.data = [[0], [1]] and length.data = [[2], [1]],
 
             the output Variable will be
-                
+
                 out.data = [[a1, a2], [b1, b2], [e1, e2]],
                 out.lod = [[2, 1]],
                 out.dims = (3, 2).
-	
-    NOTE: The first dimension size of **input**, **offset** and **length** 
+
+    NOTE: The first dimension size of **input**, **offset** and **length**
           should be equal. The **offset** should start from 0.
-    
+
     Args:
-        input(Variable): The input Variable which consists of the complete 
+        input(Variable): The input Variable which consists of the complete
                          sequences.
         offset(Variable): The offset to slice each sequence.
         length(Variable): The length of each subsequence.
@@ -2037,7 +2038,7 @@ def sequence_slice(input, offset, length, name=None):
                               dtype='float32', lod_level=1)
              offset = fluid.layers.assign(input=np.array([[0, 1]]).astype("int32"))
              length = fluid.layers.assign(input=np.array([[2, 1]]).astype("int32"))
-             subseqs = fluid.layers.sequence_slice(input=seqs, offset=offset, 
+             subseqs = fluid.layers.sequence_slice(input=seqs, offset=offset,
                                                    length=length)
     """
     helper = LayerHelper("sequence_slice", **locals())
@@ -2420,12 +2421,12 @@ def layer_norm(input,
         param_attr(ParamAttr|None): The parameter attribute for the learnable
             gain :math:`g`. If :attr:`scale` is False, :attr:`param_attr` is
             omitted. If :attr:`scale` is True and :attr:`param_attr` is None,
-            a default :code:`ParamAttr` would be added as scale. The 
-            :attr:`param_attr` is initialized as 1 if it is added. Default None. 
+            a default :code:`ParamAttr` would be added as scale. The
+            :attr:`param_attr` is initialized as 1 if it is added. Default None.
         bias_attr(ParamAttr|None): The parameter attribute for the learnable
             bias :math:`b`. If :attr:`shift` is False, :attr:`bias_attr` is
             omitted. If :attr:`shift` is True and :attr:`param_attr` is None,
-            a default :code:`ParamAttr` would be added as bias. The 
+            a default :code:`ParamAttr` would be added as bias. The
             :attr:`bias_attr` is initialized as 0 if it is added. Default None.
         act(str): Activation to be applied to the output of layer normalizaiton.
                   Default None.
@@ -3043,8 +3044,8 @@ def sequence_unpad(x, length, name=None):
     """
     **Sequence Unpad Layer**
 
-    This layer removes the padding data in the input sequences and convert 
-    them into sequences with actual length as output, identitied by lod 
+    This layer removes the padding data in the input sequences and convert
+    them into sequences with actual length as output, identitied by lod
     information.
 
     .. code-block:: text
@@ -3054,9 +3055,9 @@ def sequence_unpad(x, length, name=None):
 	Given input Variable **x**:
 	    x.data = [[ 1.0,  2.0,  3.0,  4.0,  5.0],
 		      [ 6.0,  7.0,  8.0,  9.0, 10.0],
-		      [11.0, 12.0, 13.0, 14.0, 15.0]], 
-     
-	in which there are 3 sequences padded to length 5, and the acutal length 
+		      [11.0, 12.0, 13.0, 14.0, 15.0]],
+
+	in which there are 3 sequences padded to length 5, and the acutal length
 	specified by input Variable **length**:
 
 	    length.data = [[2], [3], [4]],
@@ -3064,7 +3065,7 @@ def sequence_unpad(x, length, name=None):
 	after unpadding, the output Variable will be:
 
 	    out.data = [[1.0, 2.0, 6.0, 7.0, 8.0, 11.0, 12.0, 13.0, 14.0]]
-	    out.lod = [[2, 3, 4]]      
+	    out.lod = [[2, 3, 4]]
 
     Args:
         x(Variable): Input Variable which contains the padded sequences with
@@ -5499,9 +5500,9 @@ def roi_align(input,
     Examples:
         .. code-block:: python
 
-            align_out = fluid.layers.roi_align(input=x, 
-                                               rois=rois, 
-                                               pooled_height=7, 
+            align_out = fluid.layers.roi_align(input=x,
+                                               rois=rois,
+                                               pooled_height=7,
                                                pooled_width=7,
                                                spatial_scale=0.5,
                                                sampling_ratio=-1)
@@ -7518,7 +7519,7 @@ def affine_channel(x, scale=None, bias=None, data_layout='NCHW', name=None):
     Useful for replacing spatial batch norm with its equivalent fixed
     transformation. The input also can be 2D tensor and applies a affine
     transformation in second dimension.
-    
+
     Args:
         x (Variable): Feature map input can be a 4D tensor with order NCHW
             or NHWC. It also can be a 2D tensor and the affine transformation
@@ -7551,3 +7552,31 @@ def affine_channel(x, scale=None, bias=None, data_layout='NCHW', name=None):
         attrs={"data_layout": data_layout},
         outputs={"Out": out})
     return out
+
+
+def hash(input, hash_size, num_hash=1, name=None):
+    """
+    Hash each row of the input num_hash times into the range [0, hash_size).
+    Args:
+        input (Variable): A 2-D LoDTensor of integer ids (int32 or int64).
+        hash_size (int): The space size for hash algorithm.
+        num_hash (int): The times of hash, default 1.
+        name (str, default None): The name of this layer.
+    Returns:
+        Variable: A LoDTensor of hash values with shape [N, num_hash, 1].
+    Examples:
+        .. code-block:: python
+
+            word_dict = paddle.dataset.imdb.word_dict()
+            x = fluid.layers.data('x', shape=[1], dtype='int32', lod_level=1)
+            out = fluid.layers.hash(input=x, hash_size=len(word_dict))
+    """
+    helper = LayerHelper('hash', **locals())
+    out = helper.create_variable_for_type_inference(
+        helper.input_dtype(), stop_gradient=True)
+    helper.append_op(
+        type='hash',
+        inputs={'X': input},
+        outputs={'Out': out},
+        attrs={'num_hash': num_hash, 'mod_by': hash_size})
+    return out
diff --git a/python/paddle/fluid/metrics.py b/python/paddle/fluid/metrics.py
index 25d43be3b7520b53cb2ff2af8583cea2330acfbf..a4503e75671d7d12ff84bb538776f8e6c832b9d1 100644
--- a/python/paddle/fluid/metrics.py
+++ b/python/paddle/fluid/metrics.py
@@ -478,7 +478,7 @@ class EditDistance(MetricBase):
                 "There is no data in EditDistance Metric. Please check layers.edit_distance output has been added to EditDistance."
             )
         avg_distance = self.total_distance / self.seq_num
-        avg_instance_error = self.instance_error / self.seq_num
+        avg_instance_error = self.instance_error / float(self.seq_num)
         return avg_distance, avg_instance_error
 
 
diff --git a/python/paddle/fluid/tests/unittests/dist_transformer.py b/python/paddle/fluid/tests/unittests/dist_transformer.py
index ab44954811562b8f74e368a551e855948f90af87..27c67edf4f62dd3c5d396826348f8da4513667ba 100644
--- a/python/paddle/fluid/tests/unittests/dist_transformer.py
+++ b/python/paddle/fluid/tests/unittests/dist_transformer.py
@@ -1159,6 +1159,7 @@ def prepare_encoder(src_word,
             name=pos_enc_param_name,
             trainable=False,
             initializer=fluid.initializer.ConstantInitializer(0.001)))
+    src_pos_enc.stop_gradient = True
     enc_input = src_word_emb + src_pos_enc
     return layers.dropout(
         enc_input,
diff --git a/python/paddle/fluid/tests/unittests/test_hash_op.py b/python/paddle/fluid/tests/unittests/test_hash_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..1130ea39c42204283885ab1072a52db8c22f8b2e
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_hash_op.py
@@ -0,0 +1,57 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestScaleOp(OpTest):
+    def setUp(self):
+        self.op_type = "hash"
+        self.init_test_case()
+        self.inputs = {'X': (self.in_seq, self.lod)}
+        self.attrs = {'num_hash': 4, 'mod_by': 10000}
+        self.outputs = {'Out': (self.out_seq, self.lod)}
+
+    def init_test_case(self):
+        # NOTE(review): out_seq presumably relies on OpTest's RNG seed; confirm.
+        self.in_seq = np.random.randint(0, 10, (30, 1)).astype("int32")
+        self.lod = [[9, 4, 11, 6]]
+        # Expected output, shape (30, 4, 1): num_hash hashes for each input row.
+        self.out_seq = [
+            [[9662], [9217], [1129], [8487]], [[9662], [9217], [1129], [8487]],
+            [[8310], [1327], [1654], [4567]], [[6897], [3218], [2013], [1241]],
+            [[9407], [6715], [6949], [8094]], [[8473], [694], [5142], [2479]],
+            [[8310], [1327], [1654], [4567]], [[6897], [3218], [2013], [1241]],
+            [[4372], [9456], [8204], [6695]], [[6897], [3218], [2013], [1241]],
+            [[8473], [694], [5142], [2479]], [[4372], [9456], [8204], [6695]],
+            [[4372], [9456], [8204], [6695]], [[8473], [694], [5142], [2479]],
+            [[9407], [6715], [6949], [8094]], [[9369], [4525], [8935], [9210]],
+            [[4372], [9456], [8204], [6695]], [[4372], [9456], [8204], [6695]],
+            [[9369], [4525], [8935], [9210]], [[6897], [3218], [2013], [1241]],
+            [[9038], [7951], [5953], [8657]], [[9407], [6715], [6949], [8094]],
+            [[9662], [9217], [1129], [8487]], [[9369], [4525], [8935], [9210]],
+            [[9038], [7951], [5953], [8657]], [[9662], [9217], [1129], [8487]],
+            [[9369], [4525], [8935], [9210]], [[1719], [5986], [9919], [3421]],
+            [[4372], [9456], [8204], [6695]], [[9038], [7951], [5953], [8657]]
+        ]
+        self.out_seq = np.array(self.out_seq)
+
+    def test_check_output(self):
+        self.check_output()
+
+
+if __name__ == "__main__":
+    unittest.main()