diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index 077072f6eadb0c48f4ae32f94828613d89ed01c9..a3e682e54ac496e37ed4a33a7b30d9fdca381d9d 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -18,7 +18,7 @@ function(copy TARGET)
     set(oneValueArgs "")
     set(multiValueArgs SRCS DSTS DEPS)
     cmake_parse_arguments(copy_lib "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-    set(inference_lib_dist_dep ${TARGET} ${inference_lib_dist_dep} PARENT_SCOPE)
+    set(fluid_lib_dist_dep ${TARGET} ${fluid_lib_dist_dep} PARENT_SCOPE)
 
     list(LENGTH copy_lib_SRCS copy_lib_SRCS_len)
     list(LENGTH copy_lib_DSTS copy_lib_DSTS_len)
@@ -185,7 +185,8 @@ copy(cmake_cache
   SRCS ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt
   DSTS ${FLUID_INSTALL_DIR})
 
-add_custom_target(inference_lib_dist DEPENDS ${inference_lib_dist_dep}) 
+# This command generates a complete fluid library for both train and inference
+add_custom_target(fluid_lib_dist DEPENDS ${fluid_lib_dist_dep}) 
 
 # paddle fluid version
 execute_process(
diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index c6dd919a93d119723b389d3a695f0af82d711a06..212724a0c7fc06edf0599ddcaad33aba1fead879 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -127,6 +127,7 @@ paddle.fluid.layers.relu ArgSpec(args=['x', 'name'], varargs=None, keywords=None
 paddle.fluid.layers.log ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.crop ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.layers.rank_loss ArgSpec(args=['label', 'left', 'right', 'name'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.margin_rank_loss ArgSpec(args=['label', 'left', 'right', 'margin', 'name'], varargs=None, keywords=None, defaults=(0.1, None))
 paddle.fluid.layers.elu ArgSpec(args=['x', 'alpha', 'name'], varargs=None, keywords=None, defaults=(1.0, None))
 paddle.fluid.layers.relu6 ArgSpec(args=['x', 'threshold', 'name'], varargs=None, keywords=None, defaults=(6.0, None))
 paddle.fluid.layers.pow ArgSpec(args=['x', 'factor', 'name'], varargs=None, keywords=None, defaults=(1.0, None))
diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt
index 519a00fb073b08f6c88de8186de187476b548fd3..48b36df6499e59fe742766b5f81fd30a9fb8b900 100644
--- a/paddle/fluid/CMakeLists.txt
+++ b/paddle/fluid/CMakeLists.txt
@@ -12,6 +12,5 @@ endif(NOT WIN32)
 if(WITH_INFERENCE)
   # NOTE: please add subdirectory inference at last.
   add_subdirectory(inference)
+  add_subdirectory(train)
 endif()
-
-add_subdirectory(train)
diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h
index 9fbefabc841e3f6940860f60d959fee97495e4c9..d09b94a3fd32952985a37cf4246c7640d2db4f56 100644
--- a/paddle/fluid/framework/details/op_handle_base.h
+++ b/paddle/fluid/framework/details/op_handle_base.h
@@ -64,7 +64,8 @@ class OpHandleBase {
   virtual bool IsMultiDeviceTransfer() { return false; }
 
   const platform::DeviceContext *DeviceContext(platform::Place place) {
-    return dev_ctxes_[place];
+    auto it = dev_ctxes_.find(place);
+    return it != dev_ctxes_.end() ? it->second : nullptr;
   }
 
   void SetDeviceContext(platform::Place place, platform::DeviceContext *ctx_) {
diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc
index 70ec6e90a4d0106b7f838e51b8357798daa4b10d..4576999c8ec756ee662ed91df198e1a96f9897fc 100644
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -46,6 +46,41 @@ ExecutorPrepareContext::~ExecutorPrepareContext() {
   VLOG(5) << "destroy ExecutorPrepareContext";
 }
 
+template <typename RefCntMap>
+static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op,
+                                GarbageCollector<Tensor>* gc,
+                                RefCntMap* ref_cnts) {
+  std::unordered_set<Tensor*> erase_tensors;
+
+  auto handler = [&](const VariableNameMap& name_map) {
+    for (auto& name_pair : name_map) {
+      for (auto& name : name_pair.second) {
+        auto it = ref_cnts->find(name);
+        if (it == ref_cnts->end()) continue;
+        if ((it->second)-- == 1) {
+          auto* var = scope.FindVar(name);
+          if (var != nullptr) {
+            VLOG(10) << "Erase tensor \'" << name << "\'";
+            if (var->IsType<LoDTensor>()) {
+              erase_tensors.insert(var->GetMutable<LoDTensor>());
+            } else if (var->IsType<SelectedRows>()) {
+              erase_tensors.insert(
+                  var->GetMutable<SelectedRows>()->mutable_value());
+            }
+          }
+        }
+      }
+    }
+  };
+
+  handler(op->Inputs());
+  handler(op->Outputs());
+
+  if (!erase_tensors.empty()) {
+    gc->Add(erase_tensors);
+  }
+}
+
 Executor::Executor(const platform::Place& place) : place_(place) {}
 
 void Executor::Close() {
@@ -331,9 +366,13 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
   }
 
   int64_t max_memory_size = GetEagerDeletionThreshold();
-
   std::unique_ptr<GarbageCollector<Tensor>> gc;
-  if (max_memory_size >= 0) {
+  // WhileOp would set keep_kids to false
+  // WhileGradOp would need the scopes created in WhileOp
+  // Perhaps, we should not perform eager deletion in WhileOp
+  // The scopes and variables created by WhileOp would be deleted
+  // in WhileGradOp.
+  if (max_memory_size >= 0 && !keep_kids) {
     ctx->ResetReferenceCount();
 #ifdef PADDLE_WITH_CUDA
     if (platform::is_gpu_place(place_)) {
@@ -352,45 +391,8 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
     op->Run(*local_scope, place_);
 
     if (gc != nullptr) {
-      std::vector<std::string> erase_vars;
-      for (auto& input : op->Inputs()) {
-        for (auto& input_name : input.second) {
-          auto it = ctx->cur_ref_cnts_.find(input_name);
-          if (it == ctx->cur_ref_cnts_.end()) continue;
-          if (it->second == 1) {  // should delete it
-            erase_vars.emplace_back(input_name);
-            ctx->cur_ref_cnts_.erase(input_name);
-          } else {
-            --(it->second);
-          }
-        }
-      }
-
-      for (auto& output : op->Outputs()) {
-        for (auto& output_name : output.second) {
-          auto it = ctx->cur_ref_cnts_.find(output_name);
-          if (it == ctx->cur_ref_cnts_.end()) continue;
-          if (it->second == 1) {
-            erase_vars.emplace_back(output_name);
-            ctx->cur_ref_cnts_.erase(output_name);
-          } else {
-            --(it->second);
-          }
-        }
-      }
-
-      if (!erase_vars.empty()) {
-        std::vector<framework::LoDTensor*> erase_tensors;
-        for (auto& name : erase_vars) {
-          auto* var = local_scope->FindVar(name);
-          if (var == nullptr) continue;
-          if (var->IsType<framework::LoDTensor>()) {
-            auto* tensor = var->GetMutable<framework::LoDTensor>();
-            erase_tensors.push_back(tensor);
-          }
-        }
-        if (!erase_tensors.empty()) gc->Add(erase_tensors);
-      }
+      DeleteUnusedTensors(*local_scope, op.get(), gc.get(),
+                          &(ctx->cur_ref_cnts_));
     }
 
     if (FLAGS_benchmark) {
diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h
index f0cc1338a8af50030a70a9797cbcd1b0567272b5..36b36d49c2728dbef93042158dffa26d8f56d529 100644
--- a/paddle/fluid/framework/executor.h
+++ b/paddle/fluid/framework/executor.h
@@ -32,38 +32,32 @@ template <typename T>
 std::unordered_map<std::string, T> GetNonPersistableReferenceCount(
     const ProgramDesc& prog, size_t block_id) {
   auto& block = prog.Block(block_id);
-  std::unordered_set<std::string> ignored_vars;
   std::unordered_map<std::string, T> ref_cnts;
 
-  for (auto var_desc : block.AllVars()) {
-    auto type = var_desc->Proto()->type().type();
-    if (type != proto::VarType::LOD_TENSOR || var_desc->Persistable()) {
-      ignored_vars.insert(var_desc->Name());  // ignore persistable vars
-    }
-  }
-
-  for (auto op_desc : block.AllOps()) {
-    for (auto& input : op_desc->Inputs()) {
-      for (auto& input_name : input.second) {
-        if (!ignored_vars.count(input_name)) {
-          if (ref_cnts.count(input_name))
-            ++ref_cnts[input_name];
-          else
-            ref_cnts[input_name] = 1;
+  auto update_ref_cnts = [&](OpDesc* op_desc, const VariableNameMap& name_map) {
+    for (auto& name_pair : name_map) {
+      for (auto& name : name_pair.second) {
+        auto* var_desc = block.FindVar(name);
+        if (var_desc == nullptr || var_desc->Persistable()) continue;
+        auto type = var_desc->Proto()->type().type();
+        if (type != proto::VarType::LOD_TENSOR &&
+            type != proto::VarType::SELECTED_ROWS) {
+          continue;
         }
-      }
-    }
 
-    for (auto& output : op_desc->Outputs()) {
-      for (auto output_name : output.second) {
-        if (!ignored_vars.count(output_name)) {
-          if (ref_cnts.count(output_name))
-            ++ref_cnts[output_name];
-          else
-            ref_cnts[output_name] = 1;
+        auto it = ref_cnts.find(name);
+        if (it != ref_cnts.end()) {
+          ++it->second;
+        } else {
+          ref_cnts[name] = 1;
         }
       }
     }
+  };
+
+  for (auto op_desc : block.AllOps()) {
+    update_ref_cnts(op_desc, op_desc->Inputs());
+    update_ref_cnts(op_desc, op_desc->Outputs());
   }
   return ref_cnts;
 }
diff --git a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc
index 95d7138381baec17d4969ef8b5287a1e70f2ac81..04459612a726bcb60f9d752dfd8927b6f5c2500d 100644
--- a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc
@@ -44,89 +44,6 @@ namespace ir {
   GET_IR_NODE_FROM_SUBGRAPH(bn_saved_mean, bn_saved_mean, pattern_name);     \
   GET_IR_NODE_FROM_SUBGRAPH(bn_saved_variance, bn_saved_variance, pattern_name)
 
-template <typename UnaryOperation>
-LoDTensor tensor_apply(const LoDTensor& vec, UnaryOperation f) {
-  LoDTensor vec_y;
-  vec_y.Resize(vec.dims());
-  const float* x = vec.data<float>();
-  float* y = vec_y.mutable_data<float>(platform::CPUPlace());
-  for (int64_t i = 0; i < vec.numel(); i++) {
-    y[i] = f(x[i]);
-  }
-  return vec_y;
-}
-
-void tensor_apply_inplace(LoDTensor* vec, float (*f)(float)) {
-  float* data = vec->mutable_data<float>(platform::CPUPlace());
-  for (int64_t i = 0; i < vec->numel(); i++) {
-    data[i] = f(data[i]);
-  }
-}
-
-template <typename BinaryOperation>
-LoDTensor tensor_apply_eltwise(const LoDTensor& vec_a, const LoDTensor& vec_b,
-                               BinaryOperation f) {
-  PADDLE_ENFORCE_EQ(vec_a.dims(), vec_b.dims());
-  LoDTensor vec_y;
-  vec_y.Resize(vec_a.dims());
-  const float* a = vec_a.data<float>();
-  const float* b = vec_b.data<float>();
-  float* y = vec_y.mutable_data<float>(platform::CPUPlace());
-  for (int64_t i = 0; i < vec_a.numel(); i++) {
-    y[i] = f(a[i], b[i]);
-  }
-  return vec_y;
-}
-
-template <typename BinaryOperation>
-LoDTensor tensor_apply_eltwise_broadcast(const LoDTensor& vec_a,
-                                         const LoDTensor& vec_b,
-                                         BinaryOperation f) {
-  PADDLE_ENFORCE_EQ(vec_a.dims().size(), 2);
-  PADDLE_ENFORCE_EQ(vec_b.dims().size(), 2);
-  PADDLE_ENFORCE_EQ(vec_a.dims()[0], vec_b.dims()[0]);
-  PADDLE_ENFORCE_EQ(vec_b.dims()[1], 1);
-  LoDTensor vec_y;
-  vec_y.Resize(vec_a.dims());
-  const float* a = vec_a.data<float>();
-  const float* b = vec_b.data<float>();
-  float* y = vec_y.mutable_data<float>(platform::CPUPlace());
-  size_t a_height = vec_a.dims()[0];
-  size_t a_width = vec_a.dims()[1];
-  for (size_t h = 0; h < a_height; h++) {
-    for (size_t w = 0; w < a_width; ++w) {
-      *(y++) = f(*(a++), b[h]);
-    }
-  }
-  return vec_y;
-}
-
-// reshape to two dimensions {A, B * C * ...}
-void make_tensor_2d(LoDTensor* tensor_to_reshape) {
-  auto dims_count = tensor_to_reshape->dims().size();
-  PADDLE_ENFORCE_GT(dims_count, 0);
-
-  int size2 = 1;
-  for (int i = 1; i < dims_count; i++) {
-    size2 *= tensor_to_reshape->dims()[i];
-  }
-  tensor_to_reshape->Resize(make_ddim({tensor_to_reshape->dims()[0], size2}));
-}
-
-void recompute_conv_weights(LoDTensor* weights, LoDTensor* tmp) {
-  // remember the weights tensor shape {A, B, C, ...}
-  auto weights_shape = weights->dims();
-  // reduce the weights to 2d {A, B * C * ...}
-  make_tensor_2d(weights);
-  // make tmp tensor 2d by adding 1 as second dim {A, 1}
-  make_tensor_2d(tmp);
-
-  *weights =
-      tensor_apply_eltwise_broadcast(*weights, *tmp, std::multiplies<float>());
-  // reshape weights to the original dims {A, B, C, ...}
-  weights->Resize(weights_shape);
-}
-
 void recompute_bias_and_weights(const Scope* scope,
                                 ir::Node* conv_weight,            //
                                 const ir::Node& bn_scale,         //
@@ -135,6 +52,13 @@ void recompute_bias_and_weights(const Scope* scope,
                                 const ir::Node& bn_variance,      //
                                 LoDTensor* eltwise_y_in_tensor,   //
                                 float epsilon) {
+  using EigenVectorArrayMap =
+      Eigen::Map<Eigen::Array<float, Eigen::Dynamic, 1>>;
+  using ConstEigenVectorArrayMap =
+      Eigen::Map<const Eigen::Array<float, Eigen::Dynamic, 1>>;
+  using EigenMatrixArrayMap = Eigen::Map<
+      Eigen::Array<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>;
+
   // Re-compute bias of conv2d from BN
   PADDLE_ENFORCE_EQ(eltwise_y_in_tensor->dims(), bn_bias_tensor.dims());
 
@@ -143,31 +67,38 @@ void recompute_bias_and_weights(const Scope* scope,
       scope->FindVar(bn_variance.Name())->GetMutable<LoDTensor>();
   auto* mean_tensor = scope->FindVar(bn_mean.Name())->GetMutable<LoDTensor>();
 
-  auto std_tensor = LoDTensor();
-  std_tensor.Resize(bn_bias_tensor.dims());
-  std_tensor =
-      tensor_apply(*variance_tensor, [&](float x) { return x + epsilon; });
+  ConstEigenVectorArrayMap scale_array(scale_tensor->data<float>(),
+                                       scale_tensor->numel(), 1);
+  EigenVectorArrayMap variance_array(
+      variance_tensor->mutable_data<float>(platform::CPUPlace()),
+      variance_tensor->numel(), 1);
+  ConstEigenVectorArrayMap mean_array(mean_tensor->data<float>(),
+                                      mean_tensor->numel(), 1);
+  ConstEigenVectorArrayMap bn_bias_array(bn_bias_tensor.data<float>(),
+                                         bn_bias_tensor.numel(), 1);
 
-  using EigenVectorArrayMap =
-      Eigen::Map<Eigen::Array<float, Eigen::Dynamic, 1>>;
+  // variance will not be used anymore, so make it std_array and then tmp_array
+  variance_array += epsilon;
+  variance_array = variance_array.sqrt();
+  variance_array = scale_array / variance_array;
+
+  EigenVectorArrayMap eltwise_y_in_array(
+      eltwise_y_in_tensor->mutable_data<float>(platform::CPUPlace()),
+      eltwise_y_in_tensor->numel(), 1);
 
-  EigenVectorArrayMap std_vec(
-      std_tensor.mutable_data<float>(platform::CPUPlace()), std_tensor.numel(),
-      1);
-  std_vec = std_vec.sqrt();
-  auto tmp_tensor =
-      tensor_apply_eltwise(*scale_tensor, std_tensor, std::divides<float>());
-  auto tensor_minus = tensor_apply_eltwise(*eltwise_y_in_tensor, *mean_tensor,
-                                           std::minus<float>());
-  auto tensor_mul =
-      tensor_apply_eltwise(tensor_minus, tmp_tensor, std::multiplies<float>());
-  *eltwise_y_in_tensor =
-      tensor_apply_eltwise(tensor_mul, bn_bias_tensor, std::plus<float>());
+  eltwise_y_in_array =
+      ((eltwise_y_in_array - mean_array) * variance_array) + bn_bias_array;
 
   // Re-compute weight of conv2d from BN
-  auto* current_param =
-      scope->FindVar(conv_weight->Name())->GetMutable<LoDTensor>();
-  recompute_conv_weights(current_param, &tmp_tensor);
+  auto* weights = scope->FindVar(conv_weight->Name())->GetMutable<LoDTensor>();
+  auto weights_shape = weights->dims();
+  auto weights_shape_2d = flatten_to_2d(weights_shape, 1);
+
+  EigenMatrixArrayMap weights_array_2d(
+      weights->mutable_data<float>(platform::CPUPlace()), weights_shape_2d[0],
+      weights_shape_2d[1]);
+
+  weights_array_2d.colwise() *= variance_array;
 }
 
 std::unique_ptr<ir::Graph> ConvBNFusePass::ApplyImpl(
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index f06bad6c78c05804e583f859906b88fb7b500372..e8adabd26540754d5b9206294eeeed79757220bf 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -307,6 +307,10 @@ ParallelExecutor::~ParallelExecutor() {
       }
     }
   }
+
+  // member_ must be destructed before gcs_ since the destructor of
+  // ReferenceCountOpHandle use raw pointers of gcs_ inside.
+  member_.reset();
 }
 
 }  // namespace framework
diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h
index fd386a5987f11ff64964e95eb7e9b83572dc790c..ef09b98b2aa91a9d729b94d15dbb676dde4092b6 100644
--- a/paddle/fluid/framework/parallel_executor.h
+++ b/paddle/fluid/framework/parallel_executor.h
@@ -75,7 +75,7 @@ class ParallelExecutor {
  private:
   void BCastParamsToDevices(const std::unordered_set<std::string> &vars) const;
 
-  ParallelExecutorPrivate *member_;
+  std::unique_ptr<ParallelExecutorPrivate> member_;
 
 #ifdef PADDLE_WITH_CUDA
   // ref_cnts_ is only initialized when ParallelExecutor constructs, and then
diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc
index 1a727a2c8c759d010606d5b605823b7252b35c69..a4abd1b1283f08fb8431fbeea0cea17c8439fdd7 100644
--- a/paddle/fluid/framework/scope.cc
+++ b/paddle/fluid/framework/scope.cc
@@ -49,18 +49,18 @@ int64_t GetEagerDeletionThreshold() {
 Scope::~Scope() { DropKids(); }
 
 Scope& Scope::NewScope() const {
-  std::unique_lock<std::mutex> lock(mutex_);
+  std::lock_guard<std::mutex> lock(mutex_);
   kids_.push_back(new Scope(this));
   return *kids_.back();
 }
 
 Variable* Scope::Var(const std::string& name) {
-  std::unique_lock<std::mutex> lock(mutex_);
+  std::lock_guard<std::mutex> lock(mutex_);
   return VarInternal(name);
 }
 
 Variable* Scope::Var(std::string* name) {
-  std::unique_lock<std::mutex> lock(mutex_);
+  std::lock_guard<std::mutex> lock(mutex_);
   auto new_name = string::Sprintf("%p.%d", this, vars_.size());
   if (name != nullptr) {
     *name = new_name;
@@ -69,29 +69,34 @@ Variable* Scope::Var(std::string* name) {
 }
 
 Variable* Scope::FindVar(const std::string& name) const {
-  std::unique_lock<std::mutex> lock(mutex_);
+  std::lock_guard<std::mutex> lock(mutex_);
   return FindVarInternal(name);
 }
 
+Variable* Scope::FindLocalVar(const std::string& name) const {
+  std::lock_guard<std::mutex> lock(mutex_);
+  return FindVarLocally(name);
+}
+
 const Scope* Scope::FindScope(const Variable* var) const {
-  std::unique_lock<std::mutex> lock(mutex_);
+  std::lock_guard<std::mutex> lock(mutex_);
   return FindScopeInternal(var);
 }
 
 void Scope::DropKids() {
-  std::unique_lock<std::mutex> lock(mutex_);
+  std::lock_guard<std::mutex> lock(mutex_);
   for (Scope* s : kids_) delete s;
   kids_.clear();
 }
 
 bool Scope::HasKid(const Scope* scope) const {
-  std::unique_lock<std::mutex> lock(mutex_);
+  std::lock_guard<std::mutex> lock(mutex_);
   auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
   return it != this->kids_.end();
 }
 
 std::vector<std::string> Scope::LocalVarNames() const {
-  std::unique_lock<std::mutex> lock(mutex_);
+  std::lock_guard<std::mutex> lock(mutex_);
   std::vector<std::string> known_vars;
   known_vars.reserve(this->vars_.size());
   for (auto& p : vars_) {
@@ -101,7 +106,7 @@ std::vector<std::string> Scope::LocalVarNames() const {
 }
 
 void Scope::DeleteScope(Scope* scope) const {
-  std::unique_lock<std::mutex> lock(mutex_);
+  std::lock_guard<std::mutex> lock(mutex_);
   auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
   PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope);
   this->kids_.erase(it);
@@ -114,7 +119,7 @@ void Scope::DeleteScope(Scope* scope) const {
 }
 
 void Scope::EraseVars(const std::vector<std::string>& var_names) {
-  std::unique_lock<std::mutex> lock(mutex_);
+  std::lock_guard<std::mutex> lock(mutex_);
   std::set<std::string> var_set(var_names.begin(), var_names.end());
   for (auto it = vars_.begin(); it != vars_.end();) {
     if (var_set.find(it->first) != var_set.end()) {
@@ -127,12 +132,12 @@ void Scope::EraseVars(const std::vector<std::string>& var_names) {
 
 void Scope::Rename(const std::string& origin_name,
                    const std::string& new_name) const {
-  std::unique_lock<std::mutex> lock(mutex_);
+  std::lock_guard<std::mutex> lock(mutex_);
   RenameInternal(origin_name, new_name);
 }
 
 std::string Scope::Rename(const std::string& origin_name) const {
-  std::unique_lock<std::mutex> lock(mutex_);
+  std::lock_guard<std::mutex> lock(mutex_);
   auto new_name = string::Sprintf("%p.%d", this, vars_.size());
   RenameInternal(origin_name, new_name);
   return new_name;
diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h
index e42fff1d79d92fb7ed61768a614d8cd98f6775a0..14f9f36812d690fc4a7440f2e7e6a85e9993a535 100644
--- a/paddle/fluid/framework/scope.h
+++ b/paddle/fluid/framework/scope.h
@@ -63,6 +63,11 @@ class Scope {
   /// Caller doesn't own the returned Variable.
   Variable* FindVar(const std::string& name) const;
 
+  /// Find a variable in the current scope.
+  /// Return nullptr if cannot find.
+  /// Caller doesn't own the returned Variable.
+  Variable* FindLocalVar(const std::string& name) const;
+
   const Scope* parent() const { return parent_; }
 
   /// Find the scope or an ancestor scope that contains the given variable.
diff --git a/paddle/fluid/operators/adadelta_op.cc b/paddle/fluid/operators/adadelta_op.cc
index d1970515f58969948b1d2db5847e4344112f77f9..89a7a49e0fa8427826f5d91274912a68f2316b61 100644
--- a/paddle/fluid/operators/adadelta_op.cc
+++ b/paddle/fluid/operators/adadelta_op.cc
@@ -18,6 +18,7 @@ namespace paddle {
 namespace operators {
 
 using Tensor = framework::Tensor;
+
 class AdadeltaOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
@@ -31,6 +32,16 @@ class AdadeltaOp : public framework::OperatorWithKernel {
                    "Input(AvgSquaredGrad) of AdadeltaOp should not be null.");
     PADDLE_ENFORCE(ctx->HasInput("AvgSquaredUpdate"),
                    "Input(AvgSquaredUpdate) of AdadeltaOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->GetInputsVarType("Param").front() ==
+            framework::proto::VarType::LOD_TENSOR,
+        "The input var's type should be LoDTensor, but the received is %s",
+        ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front());
+    PADDLE_ENFORCE(
+        ctx->GetInputsVarType("Grad").front() ==
+            framework::proto::VarType::LOD_TENSOR,
+        "The input var's type should be LoDTensor, but the received is %s",
+        ctx->Inputs("Grad").front(), ctx->GetInputsVarType("Grad").front());
 
     PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
                    "Output(ParamOut) of AdadeltaOp should not be null.");
@@ -56,6 +67,7 @@ class AdadeltaOp : public framework::OperatorWithKernel {
     ctx->SetOutputDim("AvgSquaredGradOut", param_dim);
     ctx->SetOutputDim("AvgSquaredUpdateOut", param_dim);
   }
+
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
     auto input_data_type =
diff --git a/paddle/fluid/operators/adadelta_op.h b/paddle/fluid/operators/adadelta_op.h
index 822458daf663d99bbb38d99205f51163a0df4c4d..6c616aa03d9809e9b7725a700c7edd5ff5d6dc42 100644
--- a/paddle/fluid/operators/adadelta_op.h
+++ b/paddle/fluid/operators/adadelta_op.h
@@ -23,6 +23,17 @@ template <typename DeviceContext, typename T>
 class AdadeltaOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
+    const auto* param_var = ctx.InputVar("Param");
+    PADDLE_ENFORCE(param_var->IsType<framework::LoDTensor>(),
+                   "The Var(%s)'s type should be LoDTensor, "
+                   "but the received is %s",
+                   ctx.Inputs("Param").front(), param_var->Type().name());
+    const auto* grad_var = ctx.InputVar("Grad");
+    PADDLE_ENFORCE(grad_var->IsType<framework::LoDTensor>(),
+                   "The Var(%s)'s type should be LoDTensor, "
+                   "but the received is %s",
+                   ctx.Inputs("Grad").front(), grad_var->Type().name());
+
     auto param_out_tensor = ctx.Output<framework::Tensor>("ParamOut");
     auto avg_squared_grad_out_tensor =
         ctx.Output<framework::Tensor>("AvgSquaredGradOut");
diff --git a/paddle/fluid/operators/adagrad_op.h b/paddle/fluid/operators/adagrad_op.h
index df520fcc898ff5514927dbdd845ecaecdcf3c147..0a16ce00f71586ef55007c3753e024be29d0ed56 100644
--- a/paddle/fluid/operators/adagrad_op.h
+++ b/paddle/fluid/operators/adagrad_op.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 
@@ -21,25 +22,31 @@ namespace operators {
 
 template <typename DeviceContext, typename T>
 struct SparseAdagradFunctor {
-  void operator()(const DeviceContext& context,
-                  const framework::SelectedRows& grad,
-                  const framework::Tensor& learning_rate, T epsilon,
-                  framework::Tensor* moment, framework::Tensor* param);
+  void operator()(const DeviceContext &context,
+                  const framework::SelectedRows &grad,
+                  const framework::Tensor &learning_rate, T epsilon,
+                  framework::Tensor *moment, framework::Tensor *param);
 };
 
 template <typename DeviceContext, typename T>
 class AdagradOpKernel : public framework::OpKernel<T> {
  public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* param_out_tensor = ctx.Output<framework::Tensor>("ParamOut");
-    auto* moment_out_tensor = ctx.Output<framework::Tensor>("MomentOut");
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    const auto *param_var = ctx.InputVar("Param");
+    PADDLE_ENFORCE(param_var->IsType<framework::LoDTensor>(),
+                   "The Var(%s)'s type should be LoDTensor, "
+                   "but the received is %s",
+                   ctx.Inputs("Param").front(), param_var->Type().name());
+
+    auto *param_out_tensor = ctx.Output<framework::Tensor>("ParamOut");
+    auto *moment_out_tensor = ctx.Output<framework::Tensor>("MomentOut");
 
     param_out_tensor->mutable_data<T>(ctx.GetPlace());
     moment_out_tensor->mutable_data<T>(ctx.GetPlace());
 
     T epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
 
-    auto* grad_var = ctx.InputVar("Grad");
+    auto *grad_var = ctx.InputVar("Grad");
     if (grad_var->IsType<framework::LoDTensor>()) {
       auto param = framework::EigenVector<T>::Flatten(
           *ctx.Input<framework::Tensor>("Param"));
@@ -47,16 +54,16 @@ class AdagradOpKernel : public framework::OpKernel<T> {
           *ctx.Input<framework::Tensor>("Grad"));
       auto moment = framework::EigenVector<T>::Flatten(
           *ctx.Input<framework::Tensor>("Moment"));
-      auto* learning_rate = ctx.Input<framework::Tensor>("LearningRate");
+      auto *learning_rate = ctx.Input<framework::Tensor>("LearningRate");
 
       auto param_out = framework::EigenVector<T>::Flatten(*param_out_tensor);
       auto moment_out = framework::EigenVector<T>::Flatten(*moment_out_tensor);
-      auto* place = ctx.template device_context<DeviceContext>().eigen_device();
+      auto *place = ctx.template device_context<DeviceContext>().eigen_device();
 
       moment_out.device(*place) = moment + grad * grad;
       Eigen::DSizes<int, 1> m_dsize(moment_out_tensor->numel());
       if (platform::is_cpu_place(ctx.GetPlace())) {
-        auto* lr = learning_rate->data<T>();
+        auto *lr = learning_rate->data<T>();
         param_out.device(*place) =
             param - lr[0] * grad / (moment_out.sqrt() + epsilon);
       } else {
@@ -66,10 +73,10 @@ class AdagradOpKernel : public framework::OpKernel<T> {
             lr.broadcast(m_dsize) * grad / (moment_out.sqrt() + epsilon);
       }
     } else if (grad_var->IsType<framework::SelectedRows>()) {
-      auto* param_tensor = ctx.Input<framework::Tensor>("Param");
+      auto *param_tensor = ctx.Input<framework::Tensor>("Param");
       PADDLE_ENFORCE_EQ(param_tensor, param_out_tensor);
 
-      auto* moment_tensor = ctx.Input<framework::Tensor>("Moment");
+      auto *moment_tensor = ctx.Input<framework::Tensor>("Moment");
       PADDLE_ENFORCE_EQ(moment_tensor, moment_out_tensor);
 
       SparseAdagradFunctor<DeviceContext, T> functor;
diff --git a/paddle/fluid/operators/adam_op.h b/paddle/fluid/operators/adam_op.h
index 4cb1f3a80e95bdda79e6451dc3cc87e899b11779..3455d1ee54e8e6e498d0b0e6932ec099af9c0b30 100644
--- a/paddle/fluid/operators/adam_op.h
+++ b/paddle/fluid/operators/adam_op.h
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/detail/safe_ref.h"
+#include "paddle/fluid/operators/math/algorithm.h"
 #include "paddle/fluid/operators/math/selected_rows_functor.h"
 #include "paddle/fluid/platform/for_range.h"
 
@@ -199,23 +200,9 @@ struct SparseAdamFunctor {
         row_numel_(row_numel),
         row_count_(row_count) {}
 
-  inline HOSTDEVICE int64_t BinarySearchInRows(int64_t row) const {
-    int64_t beg = 0, end = row_count_ - 1;
-    while (beg <= end) {
-      auto mid = ((beg + end) >> 1);
-      if (rows_[mid] == row)
-        return mid;
-      else if (rows_[mid] < row)
-        beg = mid + 1;
-      else
-        end = mid - 1;
-    }
-    return -1;
-  }
-
   inline HOSTDEVICE void operator()(size_t i) const {
-    int64_t row = i / row_numel_;
-    auto row_idx = BinarySearchInRows(row);
+    auto row_idx =
+        math::BinarySearch<int64_t>(rows_, row_count_, i / row_numel_);
     T g = row_idx >= 0 ? grad_[row_idx * row_numel_ + i % row_numel_] : 0;
 
     // The following code is the same as dense
@@ -244,6 +231,12 @@ template <typename DeviceContext, typename T>
 class AdamOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
+    const auto* param_var = ctx.InputVar("Param");
+    PADDLE_ENFORCE(param_var->IsType<framework::LoDTensor>(),
+                   "The Var(%s)'s type should be LoDTensor, "
+                   "but the received is %s",
+                   ctx.Inputs("Param").front(), param_var->Type().name());
+
     using paddle::framework::LoDTensor;
     using paddle::operators::detail::Ref;
 
diff --git a/paddle/fluid/operators/adamax_op.cc b/paddle/fluid/operators/adamax_op.cc
index 32062574bcf71ff96e451eaa6865b6bbfc3b1c80..d4aa4d338a2379adf985ba7f89b528bc402eda06 100644
--- a/paddle/fluid/operators/adamax_op.cc
+++ b/paddle/fluid/operators/adamax_op.cc
@@ -35,6 +35,16 @@ class AdamaxOp : public framework::OperatorWithKernel {
                    "Input(LearningRate) of AdamaxOp should not be null.");
     PADDLE_ENFORCE(ctx->HasInput("Beta1Pow"),
                    "Input(Beta1Pow) of AdamaxOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->GetInputsVarType("Param").front() ==
+            framework::proto::VarType::LOD_TENSOR,
+        "The input var's type should be LoDTensor, but the received is %s",
+        ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front());
+    PADDLE_ENFORCE(
+        ctx->GetInputsVarType("Grad").front() ==
+            framework::proto::VarType::LOD_TENSOR,
+        "The input var's type should be LoDTensor, but the received is %s",
+        ctx->Inputs("Grad").front(), ctx->GetInputsVarType("Grad").front());
 
     PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
                    "Output(ParamOut) of AdamaxOp should not be null.");
diff --git a/paddle/fluid/operators/adamax_op.h b/paddle/fluid/operators/adamax_op.h
index de644676fd9c3fabdbf01d2fd9c69858c2627ed3..7137fbd9651b4523f6d1609a0595b30758aa40df 100644
--- a/paddle/fluid/operators/adamax_op.h
+++ b/paddle/fluid/operators/adamax_op.h
@@ -23,6 +23,17 @@ template <typename DeviceContext, typename T>
 class AdamaxOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
+    const auto* param_var = ctx.InputVar("Param");
+    PADDLE_ENFORCE(param_var->IsType<framework::LoDTensor>(),
+                   "The Var(%s)'s type should be LoDTensor, "
+                   "but the received is %s",
+                   ctx.Inputs("Param").front(), param_var->Type().name());
+    const auto* grad_var = ctx.InputVar("Grad");
+    PADDLE_ENFORCE(grad_var->IsType<framework::LoDTensor>(),
+                   "The Var(%s)'s type should be LoDTensor, "
+                   "but the received is %s",
+                   ctx.Inputs("Grad").front(), grad_var->Type().name());
+
     auto param_out_tensor = ctx.Output<framework::Tensor>("ParamOut");
     auto moment_out_tensor = ctx.Output<framework::Tensor>("MomentOut");
     auto inf_norm_out_tensor = ctx.Output<framework::Tensor>("InfNormOut");
diff --git a/paddle/fluid/operators/decayed_adagrad_op.cc b/paddle/fluid/operators/decayed_adagrad_op.cc
index c0f2b49a04d9e88502c4b63bca493cd2b7ad1c5c..d73ae9e2721b388212cb6efa354eb4b480df9cad 100644
--- a/paddle/fluid/operators/decayed_adagrad_op.cc
+++ b/paddle/fluid/operators/decayed_adagrad_op.cc
@@ -32,6 +32,16 @@ class DecayedAdagradOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE(
         ctx->HasInput("LearningRate"),
         "Input(LearningRate) of DecayedAdagradOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->GetInputsVarType("Param").front() ==
+            framework::proto::VarType::LOD_TENSOR,
+        "The input var's type should be LoDTensor, but the received is %s",
+        ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front());
+    PADDLE_ENFORCE(
+        ctx->GetInputsVarType("Grad").front() ==
+            framework::proto::VarType::LOD_TENSOR,
+        "The input var's type should be LoDTensor, but the received is %s",
+        ctx->Inputs("Grad").front(), ctx->GetInputsVarType("Grad").front());
 
     PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
                    "Output(ParamOut) of DecayedAdagradOp should not be null.");
diff --git a/paddle/fluid/operators/decayed_adagrad_op.h b/paddle/fluid/operators/decayed_adagrad_op.h
index a46af078e0c6b4bf1faca0570b6a97b026864f13..5df43d33ef9f720fd20d57c53ff37cc85440b24e 100644
--- a/paddle/fluid/operators/decayed_adagrad_op.h
+++ b/paddle/fluid/operators/decayed_adagrad_op.h
@@ -23,6 +23,17 @@ template <typename DeviceContext, typename T>
 class DecayedAdagradOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
+    const auto* param_var = ctx.InputVar("Param");
+    PADDLE_ENFORCE(param_var->IsType<framework::LoDTensor>(),
+                   "The Var(%s)'s type should be LoDTensor, "
+                   "but the received is %s",
+                   ctx.Inputs("Param").front(), param_var->Type().name());
+    const auto* grad_var = ctx.InputVar("Grad");
+    PADDLE_ENFORCE(grad_var->IsType<framework::LoDTensor>(),
+                   "The Var(%s)'s type should be LoDTensor, "
+                   "but the received is %s",
+                   ctx.Inputs("Grad").front(), grad_var->Type().name());
+
     auto param_out_tensor = ctx.Output<framework::Tensor>("ParamOut");
     auto moment_out_tensor = ctx.Output<framework::Tensor>("MomentOut");
 
diff --git a/paddle/fluid/operators/ftrl_op.cc b/paddle/fluid/operators/ftrl_op.cc
index 70ba25c213046cc934f46be067080d5fdbb42f9e..b77e12d6508eb07ae137b313ca91eac951afbcbe 100644
--- a/paddle/fluid/operators/ftrl_op.cc
+++ b/paddle/fluid/operators/ftrl_op.cc
@@ -34,6 +34,16 @@ class FTRLOp : public framework::OperatorWithKernel {
                    "Input(Grad) of FTRL should not be null.");
     PADDLE_ENFORCE(ctx->HasInput("LearningRate"),
                    "Input(LearningRate) of FTRL should not be null.");
+    PADDLE_ENFORCE(
+        ctx->GetInputsVarType("Param").front() ==
+            framework::proto::VarType::LOD_TENSOR,
+        "The input var's type should be LoDTensor, but the received is %s",
+        ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front());
+    PADDLE_ENFORCE(
+        ctx->GetInputsVarType("Grad").front() ==
+            framework::proto::VarType::LOD_TENSOR,
+        "The input var's type should be LoDTensor, but the received is %s",
+        ctx->Inputs("Grad").front(), ctx->GetInputsVarType("Grad").front());
 
     PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
                    "Output(ParamOut) of FTRL should not be null.");
diff --git a/paddle/fluid/operators/ftrl_op.h b/paddle/fluid/operators/ftrl_op.h
index 6f821e7e9944214fc5ebdf6bc7db8789b8ada6b9..8f812c9a037bfac8c1e29e32a5ad5b077c8153d1 100644
--- a/paddle/fluid/operators/ftrl_op.h
+++ b/paddle/fluid/operators/ftrl_op.h
@@ -28,6 +28,17 @@ template <typename DeviceContext, typename T>
 class FTRLOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
+    const auto* param_var = ctx.InputVar("Param");
+    PADDLE_ENFORCE(param_var->IsType<framework::LoDTensor>(),
+                   "The Var(%s)'s type should be LoDTensor, "
+                   "but the received is %s",
+                   ctx.Inputs("Param").front(), param_var->Type().name());
+    const auto* grad_var = ctx.InputVar("Grad");
+    PADDLE_ENFORCE(grad_var->IsType<framework::LoDTensor>(),
+                   "The Var(%s)'s type should be LoDTensor, "
+                   "but the received is %s",
+                   ctx.Inputs("Grad").front(), grad_var->Type().name());
+
     auto* param_out = ctx.Output<Tensor>("ParamOut");
     auto* sq_accum_out = ctx.Output<Tensor>("SquaredAccumOut");
     auto* lin_accum_out = ctx.Output<Tensor>("LinearAccumOut");
diff --git a/paddle/fluid/operators/math/algorithm.h b/paddle/fluid/operators/math/algorithm.h
new file mode 100644
index 0000000000000000000000000000000000000000..262469beea7449eb5820b86de1ac4f790a833e79
--- /dev/null
+++ b/paddle/fluid/operators/math/algorithm.h
@@ -0,0 +1,44 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <cstdint>  // for int64_t
+#include <numeric>
+
+#include "paddle/fluid/platform/hostdevice.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+template <typename T>
+HOSTDEVICE inline int64_t BinarySearch(const T *x, int64_t num, const T &val) {
+  int64_t beg = 0, end = num - 1;
+  while (beg <= end) {
+    auto mid = ((beg + end) >> 1);
+    if (x[mid] == val)
+      return mid;
+    else if (x[mid] < val)
+      beg = mid + 1;
+    else
+      end = mid - 1;
+  }
+  return -1;
+}
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc
index f6fe2bc2f674e333c4c65f3301f9eae55cbb0fe9..1c0e88f0752a9bfe5ced7a012724fc546088d068 100644
--- a/paddle/fluid/operators/math/selected_rows_functor.cc
+++ b/paddle/fluid/operators/math/selected_rows_functor.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include <set>
 #include <unordered_map>
 
+#include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/selected_rows_functor.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/math/selected_rows_functor.h b/paddle/fluid/operators/math/selected_rows_functor.h
index 8dc17478e6a4e8c60741b72233eee3cd8d33428e..521c53dd0d71707c13c4364c5ee59943a03d4a2d 100644
--- a/paddle/fluid/operators/math/selected_rows_functor.h
+++ b/paddle/fluid/operators/math/selected_rows_functor.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
 
+#include <map>
 #include <vector>
 
 #include "paddle/fluid/framework/eigen.h"
diff --git a/paddle/fluid/operators/math/sequence_pooling.cc b/paddle/fluid/operators/math/sequence_pooling.cc
index 69318a6598c8c69eceab7216df6382537153d34f..235b5405fb7d016f4bd8c738f75b303522183116 100644
--- a/paddle/fluid/operators/math/sequence_pooling.cc
+++ b/paddle/fluid/operators/math/sequence_pooling.cc
@@ -12,9 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/math/sequence_pooling.h"
 #include <string>
+
+#include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/operators/math/sequence_pooling.h"
 
 namespace paddle {
 namespace operators {
@@ -180,6 +182,7 @@ class SequencePoolFunctor<platform::CPUDeviceContext, T> {
     }
     auto lod = input.lod()[0];
     auto& place = *context.eigen_device();
+    auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
     for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) {
       Tensor in_t =
           input.Slice(static_cast<int>(lod[i]), static_cast<int>(lod[i + 1]));
@@ -191,7 +194,14 @@ class SequencePoolFunctor<platform::CPUDeviceContext, T> {
       if (pooltype == "AVERAGE") {
         out_e.device(place) = in_e.mean(Eigen::array<int, 1>({{0}}));
       } else if (pooltype == "SUM") {
-        out_e.device(place) = in_e.sum(Eigen::array<int, 1>({{0}}));
+        if (h > 0) {
+          const T* in_data = in_t.data<T>();
+          T* out_data = out_t.mutable_data<T>(context.GetPlace());
+          blas.VCOPY(w, in_data, out_data);
+          for (int64_t r = 1; r != h; ++r) {
+            blas.AXPY(w, 1., in_data + r * w, out_data);
+          }
+        }
       } else if (pooltype == "SQRT") {
         out_e.device(place) = in_e.sum(Eigen::array<int, 1>({{0}})) /
                               std::sqrt(static_cast<T>(h));
@@ -223,6 +233,7 @@ class SequencePoolGradFunctor<platform::CPUDeviceContext, T> {
     }
     auto lod = in_grad->lod()[0];
     auto& place = *context.eigen_device();
+    auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
     for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) {
       auto in_g_t = in_grad->Slice(static_cast<int>(lod[i]),
                                    static_cast<int>(lod[i + 1]));
@@ -237,7 +248,11 @@ class SequencePoolGradFunctor<platform::CPUDeviceContext, T> {
       if (pooltype == "AVERAGE") {
         in_g_e.device(place) = (out_g_e / static_cast<T>(h)).broadcast(bcast);
       } else if (pooltype == "SUM") {
-        in_g_e.device(place) = (out_g_e).broadcast(bcast);
+        const T* out_g_data = out_g_t.data<T>();
+        T* in_g_data = in_g_t.mutable_data<T>(context.GetPlace());
+        for (int r = 0; r != h; ++r) {
+          blas.VCOPY(w, out_g_data, in_g_data + r * w);
+        }
       } else if (pooltype == "SQRT") {
         in_g_e.device(place) =
             (out_g_e / std::sqrt(static_cast<T>(h))).broadcast(bcast);
diff --git a/paddle/fluid/operators/momentum_op.cc b/paddle/fluid/operators/momentum_op.cc
index 5f43c5810812260c4384349bdb709716c9a182f5..c8079a99fb8c8e05144c5390794db7757e74f6ae 100644
--- a/paddle/fluid/operators/momentum_op.cc
+++ b/paddle/fluid/operators/momentum_op.cc
@@ -33,6 +33,11 @@ class MomentumOp : public framework::OperatorWithKernel {
                    "Input(velocity) of Momentum should not be null.");
     PADDLE_ENFORCE(ctx->HasInput("LearningRate"),
                    "Input(LearningRate) of Momentum should not be null.");
+    PADDLE_ENFORCE(
+        ctx->GetInputsVarType("Param").front() ==
+            framework::proto::VarType::LOD_TENSOR,
+        "The input var's type should be LoDTensor, but the received is %s",
+        ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front());
 
     PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
                    "Output(ParamOut) of Momentum should not be null.");
diff --git a/paddle/fluid/operators/momentum_op.cu b/paddle/fluid/operators/momentum_op.cu
index a3932db1f3a50305d585cd3d5e86fa1b527df78b..5dc920c70979ad5c3d4fb3acc8d2dacaffe386c0 100644
--- a/paddle/fluid/operators/momentum_op.cu
+++ b/paddle/fluid/operators/momentum_op.cu
@@ -46,6 +46,17 @@ template <typename T>
 class MomentumOpCUDAKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
+    const auto* param_var = ctx.InputVar("Param");
+    PADDLE_ENFORCE(param_var->IsType<framework::LoDTensor>(),
+                   "The Var(%s)'s type should be LoDTensor, "
+                   "but the received is %s",
+                   ctx.Inputs("Param").front(), param_var->Type().name());
+    const auto* grad_var = ctx.InputVar("Grad");
+    PADDLE_ENFORCE(grad_var->IsType<framework::LoDTensor>(),
+                   "The Var(%s)'s type should be LoDTensor, "
+                   "but the received is %s",
+                   ctx.Inputs("Grad").front(), grad_var->Type().name());
+
     auto param_out = ctx.Output<framework::Tensor>("ParamOut");
     auto velocity_out = ctx.Output<framework::Tensor>("VelocityOut");
     auto param = ctx.Input<framework::Tensor>("Param");
diff --git a/paddle/fluid/operators/momentum_op.h b/paddle/fluid/operators/momentum_op.h
index 264726040fb566a52b8c0cdee0a1524197d2a675..40073d21b7186ad319e4988e667750c267581300 100644
--- a/paddle/fluid/operators/momentum_op.h
+++ b/paddle/fluid/operators/momentum_op.h
@@ -23,6 +23,12 @@ template <typename T>
 class MomentumOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
+    const auto* param_var = ctx.InputVar("Param");
+    PADDLE_ENFORCE(param_var->IsType<framework::LoDTensor>(),
+                   "The Var(%s)'s type should be LoDTensor, "
+                   "but the received is %s",
+                   ctx.Inputs("Param").front(), param_var->Type().name());
+
     auto param_out = ctx.Output<framework::Tensor>("ParamOut");
     auto velocity_out = ctx.Output<framework::Tensor>("VelocityOut");
     auto param = ctx.Input<framework::Tensor>("Param");
diff --git a/paddle/fluid/operators/reader/blocking_queue.h b/paddle/fluid/operators/reader/blocking_queue.h
index 28cc91a5ed5d74994e5b960a0a4dd3c6a5e6cdcc..51b980acb5a08d431d96a3a92479dec09119c27e 100644
--- a/paddle/fluid/operators/reader/blocking_queue.h
+++ b/paddle/fluid/operators/reader/blocking_queue.h
@@ -31,8 +31,8 @@ class BlockingQueue {
   // is a workaround and a simplified version of framework::Channel as it
   // doesn't support GPU and it implements on buffered blocking queue.
  public:
-  explicit BlockingQueue(size_t capacity)
-      : capacity_(capacity), closed_(false) {
+  explicit BlockingQueue(size_t capacity, bool speed_test_mode = false)
+      : capacity_(capacity), speed_test_mode_(speed_test_mode), closed_(false) {
     PADDLE_ENFORCE_GT(
         capacity_, 0,
         "The capacity of a reader::BlockingQueue must be greater than 0.");
@@ -72,7 +72,9 @@ class BlockingQueue {
     if (!queue_.empty()) {
       PADDLE_ENFORCE_NOT_NULL(elem);
       *elem = queue_.front();
-      queue_.pop_front();
+      if (LIKELY(!speed_test_mode_)) {
+        queue_.pop_front();
+      }
       send_cv_.notify_one();
       return true;
     } else {
@@ -114,6 +116,7 @@ class BlockingQueue {
 
  private:
   size_t capacity_;
+  bool speed_test_mode_;
   bool closed_;
   std::deque<T> queue_;
 
diff --git a/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h b/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h
index 4f7cfc24ec035349f3c85e84d876ad9b5b5493a6..3f041ff7e4e32b407729a22aab25d3aab199fee0 100644
--- a/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h
+++ b/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h
@@ -33,8 +33,9 @@ class LoDTensorBlockingQueue {
 
  private:
   LoDTensorBlockingQueue(size_t capacity,
-                         const std::vector<framework::DDim>& dims)
-      : queue_(capacity), dims_(dims) {}
+                         const std::vector<framework::DDim>& dims,
+                         bool speed_test_mode = false)
+      : queue_(capacity, speed_test_mode), dims_(dims) {}
 
  public:
   bool Push(const std::vector<framework::LoDTensor>& lod_tensor_vec) {
@@ -69,11 +70,12 @@ class LoDTensorBlockingQueue {
 
 class LoDTensorBlockingQueueHolder {
  public:
-  void InitOnce(size_t capacity, const std::vector<framework::DDim>& dims) {
+  void InitOnce(size_t capacity, const std::vector<framework::DDim>& dims,
+                bool speed_test_mode = false) {
     PADDLE_ENFORCE(
         queue_ == nullptr,
         "LoDTensorBlockingQueueHolder::InitOnce() can only be called once");
-    queue_.reset(new LoDTensorBlockingQueue(capacity, dims));
+    queue_.reset(new LoDTensorBlockingQueue(capacity, dims, speed_test_mode));
   }
 
   inline const std::shared_ptr<LoDTensorBlockingQueue>& GetQueue() const {
diff --git a/paddle/fluid/operators/reader/reader_blocking_queue_test.cc b/paddle/fluid/operators/reader/reader_blocking_queue_test.cc
index 7d1b381d56c8cdc1e79e594b18c1a1ed59ab5284..bd7ac64b2fce2452744e4756b149ee7f291d38aa 100644
--- a/paddle/fluid/operators/reader/reader_blocking_queue_test.cc
+++ b/paddle/fluid/operators/reader/reader_blocking_queue_test.cc
@@ -217,3 +217,27 @@ TEST(BlockingQueue, MyClassTest) {
   q.Receive(&b);
   EXPECT_EQ(a.val_, b.val_);
 }
+
+TEST(BlockingQueue, speed_test_mode) {
+  size_t queue_size = 10;
+  BlockingQueue<size_t> q1(queue_size, false);
+  for (size_t i = 0; i < queue_size; ++i) {
+    q1.Send(i);
+  }
+  size_t b;
+  for (size_t i = 0; i < queue_size; ++i) {
+    q1.Receive(&b);
+    EXPECT_EQ(b, i);
+  }
+  EXPECT_EQ(q1.Size(), 0);
+
+  BlockingQueue<size_t> q2(queue_size, true);
+  for (size_t i = 0; i < queue_size; ++i) {
+    q2.Send(i);
+  }
+  for (size_t i = 0; i < queue_size; ++i) {
+    q2.Receive(&b);
+    EXPECT_EQ(b, 0);
+  }
+  EXPECT_EQ(q2.Size(), queue_size);
+}
diff --git a/paddle/fluid/operators/rmsprop_op.cc b/paddle/fluid/operators/rmsprop_op.cc
index 2f773f222e50a440801b06a4fd997bf237b34772..f06f87e61d3a4d1fc8b864b9dd84e697fb12a006 100644
--- a/paddle/fluid/operators/rmsprop_op.cc
+++ b/paddle/fluid/operators/rmsprop_op.cc
@@ -32,6 +32,11 @@ class RmspropOp : public framework::OperatorWithKernel {
                    "Input(Grad) of RmspropOp should not be null.");
     PADDLE_ENFORCE(ctx->HasInput("Moment"),
                    "Input(Moment) of RmspropOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->GetInputsVarType("Param").front() ==
+            framework::proto::VarType::LOD_TENSOR,
+        "The input var's type should be LoDTensor, but the received is %s",
+        ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front());
 
     PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
                    "Output(param_out) of RmspropOp should not be null.");
diff --git a/paddle/fluid/operators/rmsprop_op.h b/paddle/fluid/operators/rmsprop_op.h
index 25ed32c5ebb2ff5be962ac1e3e38c970623d705c..797cd45fdcdbd5c3567d1676f37e148304ee6e2d 100644
--- a/paddle/fluid/operators/rmsprop_op.h
+++ b/paddle/fluid/operators/rmsprop_op.h
@@ -13,66 +13,254 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <math.h>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/algorithm.h"
+#include "paddle/fluid/operators/math/selected_rows_functor.h"
+#include "paddle/fluid/platform/for_range.h"
 
 namespace paddle {
 namespace operators {
 
-using Tensor = framework::Tensor;
 template <typename T, int MajorType = Eigen::RowMajor,
           typename IndexType = Eigen::DenseIndex>
 using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
 
+template <typename T>
+struct DenseRmspropGradFunctor {
+  inline explicit DenseRmspropGradFunctor(const T *grad) : grad_(grad) {}
+
+  HOSTDEVICE inline T operator()(int64_t idx) const { return grad_[idx]; }
+
+  const T *grad_;
+};
+
+template <typename T>
+struct SparseRmspropGradFunctor {
+  inline SparseRmspropGradFunctor(const T *grad, const int64_t *rows,
+                                  int64_t row_numel, int64_t row_count)
+      : grad_(grad),
+        rows_(rows),
+        row_numel_(row_numel),
+        row_count_(row_count) {}
+
+  HOSTDEVICE inline T operator()(int64_t idx) const {
+    auto row_idx = math::BinarySearch(rows_, row_count_, idx / row_numel_);
+    return row_idx >= 0 ? grad_[row_idx * row_numel_ + idx % row_numel_] : 0;
+  }
+
+  const T *grad_;
+  const int64_t *rows_;
+  int64_t row_numel_;
+  int64_t row_count_;
+};
+
+template <typename T, typename GradFunctor>
+struct UncenteredRmspropFunctor {
+  UncenteredRmspropFunctor(T *param, T *ms, T *mom, const T *lr, T rho,
+                           T epsilon, T momentum,
+                           const GradFunctor &grad_functor)
+      : param_(param),
+        ms_(ms),
+        mom_(mom),
+        lr_(lr),
+        rho_(rho),
+        epsilon_(epsilon),
+        momentum_(momentum),
+        grad_functor_(grad_functor) {}
+
+  HOSTDEVICE inline void operator()(int64_t idx) const {
+    T g = grad_functor_(idx);
+    T ms_out = rho_ * ms_[idx] + (1 - rho_) * g * g;
+    T mom_out = momentum_ * mom_[idx] + lr_[0] * g / sqrt(ms_out + epsilon_);
+    param_[idx] -= mom_out;
+    ms_[idx] = ms_out;
+    mom_[idx] = mom_out;
+  }
+
+  T *param_;
+  T *ms_;
+  T *mom_;
+  const T *lr_;
+  T rho_;
+  T epsilon_;
+  T momentum_;
+  GradFunctor grad_functor_;
+};
+
+template <typename T, typename GradFunctor>
+struct CenteredRmspropFunctor {
+  CenteredRmspropFunctor(T *param, T *ms, T *mom, T *mean_grad, const T *lr,
+                         T rho, T epsilon, T momentum,
+                         const GradFunctor &grad_functor)
+      : param_(param),
+        ms_(ms),
+        mom_(mom),
+        mean_grad_(mean_grad),
+        lr_(lr),
+        rho_(rho),
+        epsilon_(epsilon),
+        momentum_(momentum),
+        grad_functor_(grad_functor) {}
+
+  HOSTDEVICE inline void operator()(int64_t idx) const {
+    T g = grad_functor_(idx);
+    T ms_out = rho_ * ms_[idx] + (1 - rho_) * g * g;
+    T mg_out = rho_ * mean_grad_[idx] + (1 - rho_) * g;
+    T mom_out = momentum_ * mom_[idx] +
+                lr_[0] * g / sqrt(ms_out - mg_out * mg_out + epsilon_);
+    param_[idx] -= mom_out;
+    ms_[idx] = ms_out;
+    mom_[idx] = mom_out;
+    mean_grad_[idx] = mg_out;
+  }
+
+  T *param_;
+  T *ms_;
+  T *mom_;
+  T *mean_grad_;
+  const T *lr_;
+  T rho_;
+  T epsilon_;
+  T momentum_;
+  GradFunctor grad_functor_;
+};
+
 template <typename DeviceContext, typename T>
 class RmspropOpKernel : public framework::OpKernel<T> {
  public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* param_out = ctx.Output<Tensor>("ParamOut");
-    auto* moment_out = ctx.Output<Tensor>("MomentOut");
-    auto* mean_square_out = ctx.Output<Tensor>("MeanSquareOut");
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    using LoDTensor = framework::LoDTensor;
+    auto *grad_var = ctx.InputVar("Grad");
+    auto *param_out = ctx.Output<LoDTensor>("ParamOut");
+    auto *moment_out = ctx.Output<LoDTensor>("MomentOut");
+    auto *mean_square_out = ctx.Output<LoDTensor>("MeanSquareOut");
 
-    auto grad = ctx.Input<Tensor>("Grad");
+    auto epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
+    auto rho = static_cast<T>(ctx.Attr<float>("decay"));
+    auto momentum = static_cast<T>(ctx.Attr<float>("momentum"));
+    bool centered = ctx.Attr<bool>("centered");
 
-    param_out->mutable_data<T>(ctx.GetPlace());
-    moment_out->mutable_data<T>(ctx.GetPlace());
-    mean_square_out->mutable_data<T>(ctx.GetPlace());
+    auto &p_tensor = *ctx.Input<LoDTensor>("Param");
+    auto &ms_tensor = *ctx.Input<LoDTensor>("MeanSquare");
+    auto &lr_tensor = *ctx.Input<LoDTensor>("LearningRate");
+    auto &mom_tensor = *ctx.Input<LoDTensor>("Moment");
 
-    float epsilon = ctx.Attr<float>("epsilon");
-    float rho = ctx.Attr<float>("decay");
-    float momentum = ctx.Attr<float>("momentum");
-    bool centered = ctx.Attr<bool>("centered");
+    PADDLE_ENFORCE_EQ(&p_tensor, param_out,
+                      "Param and ParamOut must be the same Tensor");
+    PADDLE_ENFORCE_EQ(&mom_tensor, moment_out,
+                      "Moment and MomentOut must be the same Tensor");
+    PADDLE_ENFORCE_EQ(&ms_tensor, mean_square_out,
+                      "MeanSquare and MeanSquareOut must be the same Tensor");
+
+    auto &dev_ctx = ctx.template device_context<DeviceContext>();
+    size_t limit = static_cast<size_t>(ms_tensor.numel());
+
+    if (grad_var->IsType<LoDTensor>()) {
+      auto &grad_tensor = grad_var->Get<LoDTensor>();
+
+      if (std::is_same<DeviceContext, platform::CPUDeviceContext>::value) {
+        auto &place =
+            *ctx.template device_context<DeviceContext>().eigen_device();
+        auto lr_value = lr_tensor.data<T>()[0];
+
+        auto p = EigenVector<T>::Flatten(p_tensor);
+        auto ms = EigenVector<T>::Flatten(ms_tensor);
+        auto g = EigenVector<T>::Flatten(grad_tensor);
+        auto mom = EigenVector<T>::Flatten(mom_tensor);
+
+        auto p_out = EigenVector<T>::Flatten(*param_out);
+        auto mom_out = EigenVector<T>::Flatten(*moment_out);
+        auto ms_out = EigenVector<T>::Flatten(*mean_square_out);
+
+        ms_out.device(place) = rho * ms + (1 - rho) * g * g;
+        if (centered) {
+          auto &mg_tensor = *ctx.Input<LoDTensor>("MeanGrad");
+          auto mg = EigenVector<T>::Flatten(mg_tensor);
+          auto *mean_grad_out = ctx.Output<LoDTensor>("MeanGradOut");
+          PADDLE_ENFORCE(&mg_tensor, mean_grad_out,
+                         "MeanGrad and MeanGradOut must be the same Tensor");
+          auto mg_out = EigenVector<T>::Flatten(*mean_grad_out);
+
+          mg_out.device(place) = rho * mg + (1 - rho) * g;
+          mom_out.device(place) =
+              momentum * mom +
+              lr_value * g / (ms_out - mg_out.square() + epsilon).sqrt();
+        } else {
+          mom_out.device(place) =
+              momentum * mom + lr_value * g / (ms_out + epsilon).sqrt();
+        }
+        p_out.device(place) = p - mom_out;
+      } else {
+        DenseRmspropGradFunctor<T> grad_func(grad_tensor.data<T>());
+        platform::ForRange<DeviceContext> for_range(dev_ctx, limit);
+        if (centered) {
+          auto &mg_tensor = *ctx.Input<LoDTensor>("MeanGrad");
+          auto *mean_grad_out = ctx.Output<LoDTensor>("MeanGradOut");
+          PADDLE_ENFORCE(&mg_tensor, mean_grad_out,
+                         "MeanGrad and MeanGradOut must be the same Tensor");
+          for_range(CenteredRmspropFunctor<T, DenseRmspropGradFunctor<T>>(
+              param_out->mutable_data<T>(ctx.GetPlace()),
+              mean_square_out->mutable_data<T>(ctx.GetPlace()),
+              moment_out->mutable_data<T>(ctx.GetPlace()),
+              mean_grad_out->mutable_data<T>(ctx.GetPlace()),
+              lr_tensor.data<T>(), rho, epsilon, momentum, grad_func));
+        } else {
+          for_range(UncenteredRmspropFunctor<T, DenseRmspropGradFunctor<T>>(
+              param_out->mutable_data<T>(ctx.GetPlace()),
+              mean_square_out->mutable_data<T>(ctx.GetPlace()),
+              moment_out->mutable_data<T>(ctx.GetPlace()), lr_tensor.data<T>(),
+              rho, epsilon, momentum, grad_func));
+        }
+      }
+    } else if (grad_var->IsType<framework::SelectedRows>()) {
+      auto &grad = grad_var->Get<framework::SelectedRows>();
+      auto *merged_grad = const_cast<framework::Scope &>(ctx.scope())
+                              .Var()
+                              ->GetMutable<framework::SelectedRows>();
+
+      math::scatter::MergeAdd<DeviceContext, T> merge_func;
+      merge_func(dev_ctx, grad, merged_grad);
+
+      platform::ForRange<DeviceContext> for_range(dev_ctx, limit);
+      const int64_t *rows;
+#ifdef PADDLE_WITH_CUDA
+      if (platform::is_gpu_place(ctx.GetPlace())) {
+        rows = merged_grad->rows().CUDAData(ctx.GetPlace());
+      } else {
+#endif
+        rows = merged_grad->rows().data();
+#ifdef PADDLE_WITH_CUDA
+      }
+#endif
+      auto &merged_tensor = merged_grad->value();
+      int64_t row_count = merged_grad->rows().size();
+      int64_t row_numel = merged_tensor.numel() / row_count;
+      SparseRmspropGradFunctor<T> grad_func(merged_tensor.data<T>(), rows,
+                                            row_numel, row_count);
 
-    auto p = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Param"));
-    auto ms = EigenVector<T>::Flatten(*ctx.Input<Tensor>("MeanSquare"));
-    auto lr = EigenVector<T>::Flatten(*ctx.Input<Tensor>("LearningRate"));
-    auto g = EigenVector<T>::Flatten(*grad);
-    auto mom = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Moment"));
-
-    auto p_out = EigenVector<T>::Flatten(*param_out);
-    auto mom_out = EigenVector<T>::Flatten(*moment_out);
-    auto ms_out = EigenVector<T>::Flatten(*mean_square_out);
-    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
-
-    Eigen::DSizes<int, 1> grad_dsize(static_cast<int>(grad->numel()));
-
-    ms_out.device(place) = rho * ms + (1 - rho) * g * g;
-    if (centered) {
-      auto mg = EigenVector<T>::Flatten(*ctx.Input<Tensor>("MeanGrad"));
-      auto* mean_grad_out = ctx.Output<Tensor>("MeanGradOut");
-      mean_grad_out->mutable_data<T>(ctx.GetPlace());
-      auto mg_out = EigenVector<T>::Flatten(*mean_grad_out);
-
-      mg_out.device(place) = rho * mg + (1 - rho) * g;
-      mom_out.device(place) = momentum * mom +
-                              lr.broadcast(grad_dsize) * g /
-                                  (ms_out - mg_out.square() + epsilon).sqrt();
+      if (centered) {
+        auto &mg_tensor = *ctx.Input<LoDTensor>("MeanGrad");
+        auto *mean_grad_out = ctx.Output<LoDTensor>("MeanGradOut");
+        PADDLE_ENFORCE(&mg_tensor, mean_grad_out,
+                       "MeanGrad and MeanGradOut must be the same Tensor");
+        for_range(CenteredRmspropFunctor<T, SparseRmspropGradFunctor<T>>(
+            param_out->mutable_data<T>(ctx.GetPlace()),
+            mean_square_out->mutable_data<T>(ctx.GetPlace()),
+            moment_out->mutable_data<T>(ctx.GetPlace()),
+            mean_grad_out->mutable_data<T>(ctx.GetPlace()), lr_tensor.data<T>(),
+            rho, epsilon, momentum, grad_func));
+      } else {
+        for_range(UncenteredRmspropFunctor<T, SparseRmspropGradFunctor<T>>(
+            param_out->mutable_data<T>(ctx.GetPlace()),
+            mean_square_out->mutable_data<T>(ctx.GetPlace()),
+            moment_out->mutable_data<T>(ctx.GetPlace()), lr_tensor.data<T>(),
+            rho, epsilon, momentum, grad_func));
+      }
     } else {
-      mom_out.device(place) =
-          momentum * mom +
-          lr.broadcast(grad_dsize) * g / (ms_out + epsilon).sqrt();
+      PADDLE_THROW("RMSProp only supports LoDTensor or SelectedRows gradient");
     }
-    p_out.device(place) = p - mom_out;
   }
 };
 
diff --git a/paddle/fluid/operators/sgd_op.cc b/paddle/fluid/operators/sgd_op.cc
index fef230e42d07a5ed73b7a7a6ab682694675bb9d2..411a126bc8e2b3a8d25f436489c13970568ccae4 100644
--- a/paddle/fluid/operators/sgd_op.cc
+++ b/paddle/fluid/operators/sgd_op.cc
@@ -21,7 +21,7 @@ class SGDOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext* ctx) const override {
+  void InferShape(framework::InferShapeContext *ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("Param"),
                    "Input(Param) of SGDOp should not be null.");
     PADDLE_ENFORCE(ctx->HasInput("Grad"),
@@ -42,7 +42,7 @@ class SGDOp : public framework::OperatorWithKernel {
 
  protected:
   framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
+      const framework::ExecutionContext &ctx) const override {
     auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("Param"));
     return framework::OpKernelType(data_type, ctx.device_context());
   }
@@ -50,17 +50,20 @@ class SGDOp : public framework::OperatorWithKernel {
 
 class SGDOpInferVarType : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDesc& op_desc,
-                  framework::BlockDesc* block) const override {
-    auto input_var = op_desc.Input("Param")[0];
-    for (auto& out_var : op_desc.Output("ParamOut")) {
-      if (block->FindRecursiveOrCreateVar(input_var).GetType() ==
-          framework::proto::VarType::SELECTED_ROWS) {
-        block->FindRecursiveOrCreateVar(out_var).SetType(
-            framework::proto::VarType::SELECTED_ROWS);
-      } else {
-        block->FindRecursiveOrCreateVar(out_var).SetType(
-            framework::proto::VarType::LOD_TENSOR);
+  void operator()(const framework::OpDesc &op_desc,
+                  framework::BlockDesc *block) const override {
+    auto input_var_n = op_desc.Input("Param")[0];
+    auto in_var_type = block->FindRecursiveOrCreateVar(input_var_n).GetType();
+    PADDLE_ENFORCE(in_var_type == framework::proto::VarType::SELECTED_ROWS ||
+                       in_var_type == framework::proto::VarType::LOD_TENSOR,
+                   "The input Var's type should be LoDtensor or SelectedRows,"
+                   " but the received var(%s)'s type is %s",
+                   input_var_n, in_var_type);
+
+    for (auto &out_var_n : op_desc.Output("ParamOut")) {
+      auto &out_var = block->FindRecursiveOrCreateVar(out_var_n);
+      if (out_var.GetType() != in_var_type) {
+        out_var.SetType(in_var_type);
       }
     }
   }
diff --git a/paddle/fluid/operators/sgd_op.cu b/paddle/fluid/operators/sgd_op.cu
index 243609075713305a90dc162991166ba24d54e835..d3f4eba3b24ec1ac0328ef270256cdf3abe499db 100644
--- a/paddle/fluid/operators/sgd_op.cu
+++ b/paddle/fluid/operators/sgd_op.cu
@@ -56,6 +56,12 @@ template <typename T>
 class SGDOpCUDAKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
+    const auto* param_var = ctx.InputVar("Param");
+    PADDLE_ENFORCE(param_var->IsType<framework::LoDTensor>(),
+                   "The Var(%s)'s type should be LoDTensor, "
+                   "but the received is %s",
+                   ctx.Inputs("Param").front(), param_var->Type().name());
+
     auto* param = ctx.Input<framework::Tensor>("Param");
     auto* param_out = ctx.Output<framework::Tensor>("ParamOut");
     auto* learning_rate = ctx.Input<framework::Tensor>("LearningRate");
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index dfc079e986e93c7f02f17b299e5d6293edbedd05..4286242b2a93d7046e7349a99d1d1a09dca09113 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -198,9 +198,9 @@ class CudnnHolder {
 CUDADeviceContext::CUDADeviceContext(CUDAPlace place)
     : place_(place), cudnn_holder_(nullptr) {
   SetDeviceId(place_.device);
-  compute_capability = GetCUDAComputeCapability(place_.device);
-  multi_process = GetCUDAMultiProcessors(place_.device);
-  max_threads_per_mp = GetCUDAMaxThreadsPerMultiProcessor(place_.device);
+  compute_capability_ = GetCUDAComputeCapability(place_.device);
+  multi_process_ = GetCUDAMultiProcessors(place_.device);
+  max_threads_per_mp_ = GetCUDAMaxThreadsPerMultiProcessor(place_.device);
   PADDLE_ENFORCE(cudaStreamCreate(&stream_));
   eigen_stream_.reset(new EigenCudaStreamDevice());
   eigen_stream_->Reinitialize(&stream_, place);
@@ -211,6 +211,16 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place)
     cudnn_holder_.reset(new CudnnHolder(&stream_, place));
   }
 
+  driver_version_ = GetCUDADriverVersion(place_.device);
+  runtime_version_ = GetCUDARuntimeVersion(place_.device);
+
+  LOG(INFO) << "device: " << place_.device
+            << ", CUDA Capability: " << compute_capability_
+            << ", Driver Version: " << driver_version_ / 1000 << "."
+            << (driver_version_ % 100) / 10
+            << ", Runtime Version: " << runtime_version_ / 1000 << "."
+            << (runtime_version_ % 100) / 10;
+
   callback_manager_.reset(new StreamCallbackManager(stream_));
 }
 
@@ -232,11 +242,11 @@ void CUDADeviceContext::Wait() const {
 }
 
 int CUDADeviceContext::GetComputeCapability() const {
-  return compute_capability;
+  return compute_capability_;
 }
 
 int CUDADeviceContext::GetMaxPhysicalThreadCount() const {
-  return multi_process * max_threads_per_mp;
+  return multi_process_ * max_threads_per_mp_;
 }
 
 Eigen::GpuDevice* CUDADeviceContext::eigen_device() const {
diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h
index 79539195157d74d4d757edee5e008cbb76c93ee2..e1ff1a1746952de5aa4bead361b50af4e99bc9bc 100644
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -135,9 +135,11 @@ class CUDADeviceContext : public DeviceContext {
   cudaStream_t stream_;
   cublasHandle_t cublas_handle_;
 
-  int compute_capability;
-  int multi_process;
-  int max_threads_per_mp;
+  int compute_capability_;
+  int runtime_version_;
+  int driver_version_;
+  int multi_process_;
+  int max_threads_per_mp_;
 
   mutable std::mutex mtx_;
 
diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h
index f04395a8ac00f33501008aa12f22773ddda9b138..a251bfcd9914422cb6300adbbcdef3dfa79f441c 100644
--- a/paddle/fluid/platform/enforce.h
+++ b/paddle/fluid/platform/enforce.h
@@ -130,6 +130,13 @@ struct EOFException : public std::exception {
 #define UNLIKELY(condition) (condition == 0)
 #endif
 
+#if !defined(_WIN32)
+#define LIKELY(condition) __builtin_expect(static_cast<bool>(condition), 1)
+#else
+// there is no equivalent intrinsics in msvc.
+#define LIKELY(condition) (condition != 0)
+#endif
+
 template <typename... Args>
 inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
     bool stat, const Args&... args) {
diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc
index f599e7fbc886a60394ae4690e4160275b55b8596..8fff9844db738dbd6508569a8aaeed044e445e5f 100644
--- a/paddle/fluid/platform/gpu_info.cc
+++ b/paddle/fluid/platform/gpu_info.cc
@@ -46,6 +46,24 @@ int GetCUDAComputeCapability(int id) {
   return device_prop.major * 10 + device_prop.minor;
 }
 
+int GetCUDARuntimeVersion(int id) {
+  PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
+  int runtime_version = 0;
+  PADDLE_ENFORCE(cudaRuntimeGetVersion(&runtime_version),
+                 "cudaRuntimeGetVersion failed in "
+                 "paddle::platform::cudaRuntimeGetVersion");
+  return runtime_version;
+}
+
+int GetCUDADriverVersion(int id) {
+  PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
+  int driver_version = 0;
+  PADDLE_ENFORCE(cudaDriverGetVersion(&driver_version),
+                 "cudaDriverGetVersion failed in "
+                 "paddle::platform::GetCUDADriverVersion");
+  return driver_version;
+}
+
 int GetCUDAMultiProcessors(int id) {
   PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
   int count;
diff --git a/paddle/fluid/platform/gpu_info.h b/paddle/fluid/platform/gpu_info.h
index f4640d3eaa2165c35e8e14690d83e9e7e7168c0b..be44158431ff80a41f7fdf4dfd4d070667f2ac63 100644
--- a/paddle/fluid/platform/gpu_info.h
+++ b/paddle/fluid/platform/gpu_info.h
@@ -29,6 +29,12 @@ int GetCUDADeviceCount();
 //! Get the compute capability of the ith GPU (format: major * 10 + minor)
 int GetCUDAComputeCapability(int i);
 
+//! Get the runtime version of the ith GPU
+int GetCUDARuntimeVersion(int id);
+
+//! Get the driver version of the ith GPU
+int GetCUDADriverVersion(int id);
+
 //! Get the MultiProcessors of the ith GPU.
 int GetCUDAMultiProcessors(int i);
 
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 8d58e018c345f7c0ca188d4380f675d2bce2a847..339a7c98c6a2bba2cd46790cecc169ef447c63ce 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -57,6 +57,10 @@ limitations under the License. */
 
 #include "pybind11/stl.h"
 
+DEFINE_bool(reader_queue_speed_test_mode, false,
+            "If set true, the queue.pop will only get data from queue but not "
+            "remove the data from queue for speed testing");
+
 // disable auto conversion to list in Python
 PYBIND11_MAKE_OPAQUE(paddle::framework::LoDTensorArray);
 
@@ -170,14 +174,14 @@ PYBIND11_PLUGIN(core) {
      A LoDTensor X can look like the example below. It contains 2 sequences.
      The first has length 2 and the second has length 3, as described by x.lod.
 
-     The first tensor dimension 6=2+3 is calculated from LoD if it's available.
+     The first tensor dimension 5=2+3 is calculated from LoD if it's available.
      It means the total number of sequence element. In X, each element has 2
-     columns, hence [6, 2].
+     columns, hence [5, 2].
 
       x.lod  = [[2, 3]]
       x.data = [[1, 2], [3, 4],
-                [5, 6], [7, 8], [9, 10], [11, 12]]
-      x.shape = [6, 2]
+                [5, 6], [7, 8], [9, 10]]
+      x.shape = [5, 2]
 
       LoD can have multiple levels (for example, a paragraph can have multiple
       sentences and a sentence can have multiple words). In the following
@@ -380,7 +384,8 @@ All parameter, weight, gradient are variables in Paddle.
                                return make_ddim(shape);
                              });
               auto *holder = var.GetMutable<LoDTensorBlockingQueueHolder>();
-              holder->InitOnce(capacity, dims);
+              holder->InitOnce(capacity, dims,
+                               FLAGS_reader_queue_speed_test_mode);
               return holder->GetQueue();
             },
         py::return_value_policy::copy);
diff --git a/paddle/fluid/train/demo/README.md b/paddle/fluid/train/demo/README.md
index 41b01d33828f750f67bba5f82cb7ed6fe4d4ea0a..191da20669e185d819ec5eed55427461cc0b10e4 100644
--- a/paddle/fluid/train/demo/README.md
+++ b/paddle/fluid/train/demo/README.md
@@ -15,7 +15,7 @@ cmake .. -DFLUID_INSTALL_DIR=$PADDLE_LIB \
          -DWITH_MKL=OFF \
          -DWITH_MKLDNN=OFF
 make -j8
-make -j8 inference_lib_dist
+make -j8 fluid_lib_dist
 ```
 
 ### step 2. generate program desc
diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index b97e63ecc85872ebccb21f37bb4f2f00d49b4599..da6f5ca1586a570fb548d7b987330a8d58156e24 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -648,25 +648,25 @@ function gen_capi_package() {
     fi
 }
 
-function gen_fluid_inference_lib() {
+function gen_fluid_lib() {
     mkdir -p ${PADDLE_ROOT}/build
     cd ${PADDLE_ROOT}/build
     if [[ ${WITH_C_API:-OFF} == "OFF" && ${WITH_INFERENCE:-ON} == "ON" ]] ; then
         cat <<EOF
     ========================================
-    Generating fluid inference library ...
+    Generating fluid library for train and inference ...
     ========================================
 EOF
         cmake .. -DWITH_DISTRIBUTE=OFF
-        make -j `nproc` inference_lib_dist
+        make -j `nproc` fluid_lib_dist
       fi
 }
 
-function tar_fluid_inference_lib() {
+function tar_fluid_lib() {
     if [[ ${WITH_C_API:-OFF} == "OFF" && ${WITH_INFERENCE:-ON} == "ON" ]] ; then
         cat <<EOF
     ========================================
-    Taring fluid inference library ...
+    Taring fluid library for train and inference ...
     ========================================
 EOF
         cd ${PADDLE_ROOT}/build
@@ -675,11 +675,11 @@ EOF
       fi
 }
 
-function test_fluid_inference_lib() {
+function test_fluid_lib() {
     if [[ ${WITH_C_API:-OFF} == "OFF" && ${WITH_INFERENCE:-ON} == "ON" ]] ; then
         cat <<EOF
     ========================================
-    Testing fluid inference library ...
+    Testing fluid library for inference ...
     ========================================
 EOF
         cd ${PADDLE_ROOT}/paddle/fluid/inference/api/demo_ci
@@ -731,9 +731,9 @@ function main() {
         ;;
       fluid_inference_lib)
         cmake_gen ${PYTHON_ABI:-""}
-        gen_fluid_inference_lib
-        tar_fluid_inference_lib
-        test_fluid_inference_lib
+        gen_fluid_lib
+        tar_fluid_lib
+        test_fluid_lib
         ;;
       check_style)
         check_style
@@ -744,8 +744,8 @@ function main() {
         assert_api_not_changed ${PYTHON_ABI:-""}
         run_test
         gen_capi_package
-        gen_fluid_inference_lib
-        test_fluid_inference_lib
+        gen_fluid_lib
+        test_fluid_lib
         assert_api_spec_approvals
         ;;
       maccheck)
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index 7bbdf7de89cc932e0023952e3c8e102f92b06855..41678918b8bb54078091f892ce7a519dfc8a0014 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -113,7 +113,8 @@ def __bootstrap__():
         'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir',
         'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb',
         'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads',
-        "dist_threadpool_size", 'cpu_deterministic', 'eager_delete_tensor_gb'
+        'dist_threadpool_size', 'cpu_deterministic', 'eager_delete_tensor_gb',
+        'reader_queue_speed_test_mode'
     ]
     if core.is_compiled_with_dist():
         read_env_flags.append('rpc_deadline')
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 8c0ef7a82421ffc04bf669e6850e075226c09d27..43aa4a9e7c4ffe96a8f1346cd1bc3e4f309ba309 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -107,6 +107,7 @@ __all__ = [
     'log',
     'crop',
     'rank_loss',
+    'margin_rank_loss',
     'elu',
     'relu6',
     'pow',
@@ -5827,6 +5828,54 @@ def rank_loss(label, left, right, name=None):
     return out
 
 
+def margin_rank_loss(label, left, right, margin=0.1, name=None):
+    """
+    Margin Ranking Loss Layer for ranking problem,
+    which compares left score and right score passed in.
+    The ranking loss can be defined as following equation:
+
+    .. math::
+
+        rank\_loss &= max(0, -label * (left - right) + margin)
+
+    Args:
+       label (Variable): Indicates whether the left is ranked higher than the right or not.
+       left (Variable): Ranking score for left.
+       right (Variable): Ranking score for right.
+       margin (float): Indicates the given margin.
+       name (str|None): A name for this layer (optional). If set None, the layer
+                       will be named automatically.
+    Returns:
+       Variable: The ranking loss.
+    Raises:
+       ValueError: Any of label, left, and right is not a Variable.
+    Examples:
+        .. code-block:: python
+           label = fluid.layers.data(name="label", shape=[4, 1], dtype="float32")
+           left = fluid.layers.data(name="left", shape=[4, 1], dtype="float32")
+           right = fluid.layers.data(name="right", shape=[4, 1], dtype="float32")
+           out = fluid.layers.margin_rank_loss(label, left, right)
+    """
+    helper = LayerHelper('margin_rank_loss', **locals())
+    if not isinstance(label, Variable):
+        raise ValueError("The label should be a Variable.")
+    if not isinstance(left, Variable):
+        raise ValueError("The left should be a Variable.")
+    if not isinstance(right, Variable):
+        raise ValueError("The right should be a Variable.")
+    out = helper.create_tmp_variable(left.dtype)
+    act = helper.create_tmp_variable(left.dtype)
+    helper.append_op(
+        type='margin_rank_loss',
+        inputs={"Label": label,
+                "X1": left,
+                "X2": right},
+        outputs={'Out': out,
+                 'Activated': act},
+        attrs={'margin': margin})
+    return out
+
+
 def pad2d(input,
           paddings=[0, 0, 0, 0],
           mode='constant',
@@ -6290,6 +6339,7 @@ def sequence_enumerate(input, win_size, pad_value=0, name=None):
         outputs={'Out': out},
         attrs={'win_size': win_size,
                'pad_value': pad_value})
+    return out
 
 
 def sequence_mask(x, maxlen=None, dtype='int64', name=None):
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index 1b9571f6d3a6a69d1ac35f6be74b80eaa2ce6251..ed1784bd27697ed8acf235fb3a72cc201cc99bc6 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -659,6 +659,9 @@ class AdamaxOptimizer(Optimizer):
 
             optimizer = fluid.optimizer.Adamax(learning_rate=0.2)
             optimizer.minimize(cost)
+
+    Notes:
+       Currently, AdamaxOptimizer doesn't support sparse parameter optimization.
     """
     _moment_acc_str = "moment"
     _inf_norm_acc_str = "inf_norm"
@@ -778,6 +781,9 @@ class DecayedAdagradOptimizer(Optimizer):
 
             optimizer = fluid.optimizer.DecayedAdagrad(learning_rate=0.2)
             optimizer.minimize(cost)
+
+    Notes:
+       Currently, DecayedAdagradOptimizer doesn't support sparse parameter optimization.
     """
     _moment_acc_str = "moment"
 
@@ -858,6 +864,9 @@ class AdadeltaOptimizer(Optimizer):
             optimizer = fluid.optimizer.Adadelta(
                 learning_rate=0.0003, epsilon=1.0e-6, rho=0.95)
             _, params_grads = optimizer.minimize(cost)
+
+    Notes:
+       Currently, AdadeltaOptimizer doesn't support sparse parameter optimization.
     """
 
     _avg_squared_grad_acc_str = "_avg_squared_grad"
@@ -1126,6 +1135,9 @@ class FtrlOptimizer(Optimizer):
 
               optimizer = fluid.optimizer.Ftrl(0.0001)
               _, params_grads = optimizer.minimize(cost)
+
+    Notes:
+       Currently, FtrlOptimizer doesn't support sparse parameter optimization.
     """
 
     _squared_acc_str = "squared"
diff --git a/python/paddle/fluid/tests/unittests/test_rmsprop_op.py b/python/paddle/fluid/tests/unittests/test_rmsprop_op.py
index 70848e4e2239e2be160bb0c1a28a5aecd01a87dc..eb12bc741767340a3e7e3580a8b95065d4267693 100644
--- a/python/paddle/fluid/tests/unittests/test_rmsprop_op.py
+++ b/python/paddle/fluid/tests/unittests/test_rmsprop_op.py
@@ -19,33 +19,76 @@ import unittest
 import numpy as np
 import paddle.fluid.core as core
 from paddle.fluid.op import Operator
+import paddle.fluid as fluid
+
+
+def create_selected_rows_and_tensor(scope, place, height, row_num,
+                                    embedding_size):
+    sr = scope.var("@selected_rows@").get_selected_rows()
+    tensor = scope.var("grad").get_tensor()
+
+    rows = np.random.random_integers(
+        low=0, high=height - 1, size=[row_num, ]).astype('int64')
+    sr_val = np.random.random(size=[row_num, embedding_size]).astype('float32')
+
+    sr.set_height(height)
+    sr.set_rows(rows)
+    sr.get_tensor().set(sr_val, place)
+
+    tensor_val = np.zeros(shape=[height, embedding_size], dtype='float32')
+    for i in range(row_num):
+        row = rows[i]
+        tensor_val[row, :] = tensor_val[row, :] + sr_val[i, :]
+
+    tensor.set(tensor_val, place)
+    return tensor_val, sr_val
 
 
 class TestBase(unittest.TestCase):
-    def setup(self, centered, epsilon=1e-6):
+    def setup(self,
+              place,
+              is_sparse,
+              centered,
+              size,
+              row_num=None,
+              epsilon=1e-6):
         np.random.seed(5)  # fix seed
 
+        self.scope = fluid.global_scope()
+        self.place = place
+
         self.param_name = "param"
-        self.param = np.random.random((123, 321)).astype("float32")
+        self.param = np.random.random(size).astype("float32")
 
         self.mean_square_name = "mean_square"
-        self.mean_square = np.random.random((123, 321)).astype("float32")
+        self.mean_square = np.random.uniform(
+            low=1, high=2, size=size).astype("float32")
 
         self.mean_grad_name = "mean_grad"
-        self.mean_grad = np.random.random((123, 321)).astype("float32")
+        self.mean_grad = np.random.random(size).astype("float32")
 
         self.lr_name = "lr"
         self.learning_rate = np.array([0.01]).astype("float32")
 
         self.grad_name = "grad"
-        self.grad = np.random.random((123, 321)).astype("float32")
+
+        self.is_sparse = is_sparse
+        if self.is_sparse:
+            self.grad_sr_name = "@selected_rows@"
+            self.grad, self.grad_sr = create_selected_rows_and_tensor(
+                self.scope, place, size[0], row_num, size[1])
+        else:
+            self.grad = np.random.random(size).astype("float32")
+            grad_tensor = self.scope.var(self.grad_name).get_tensor()
+            grad_tensor.set(self.grad, place)
 
         self.moment_name = "moment"
-        self.moment = np.zeros((123, 321)).astype("float32")
+        self.moment = np.random.uniform(
+            low=0, high=1, size=size).astype("float32")
 
         self.epsilon = epsilon
         self.decay = 0.9
-        self.momentum = 0.0
+        self.momentum = 0.1
         self.centered = centered
 
         self.ms_out = self.decay * self.mean_square + (1 - self.decay
@@ -61,118 +104,122 @@ class TestBase(unittest.TestCase):
 
         self.param_out = self.param - self.moment_out
 
-    def check(self,
-              actual_t,
-              expect_t,
-              place,
-              out_name,
-              atol=1e-5,
-              equal_nan=False):
-        self.assertTrue(
-            np.allclose(
-                actual_t, expect_t, atol=atol, equal_nan=equal_nan),
-            "Output (" + out_name + ") has diff at " + str(place) + "\nExpect "
-            + str(expect_t) + "\n" + "But Got" + str(actual_t))
-
-
-class TestRmspropOp(TestBase):
-    def check_with_place(self, place, centered, epsilon):
-        self.setup(centered, epsilon)
-        scope = core.Scope()
-
         # create and initialize Param Variable
-        param = scope.var(self.param_name).get_tensor()
-        param.set(self.param, place)
+        self.param_tensor = self.scope.var(self.param_name).get_tensor()
+        self.param_tensor.set(self.param, place)
 
-        mean_square = scope.var(self.mean_square_name).get_tensor()
-        mean_square.set(self.mean_square, place)
+        self.mean_square_tensor = self.scope.var(
+            self.mean_square_name).get_tensor()
+        self.mean_square_tensor.set(self.mean_square, place)
 
-        lr = scope.var(self.lr_name).get_tensor()
+        lr = self.scope.var(self.lr_name).get_tensor()
         lr.set(self.learning_rate, place)
 
-        grad = scope.var(self.grad_name).get_tensor()
-        grad.set(self.grad, place)
+        self.moment_tensor = self.scope.var(self.moment_name).get_tensor()
+        self.moment_tensor.set(self.moment, place)
 
-        moment = scope.var(self.moment_name).get_tensor()
-        moment.set(self.moment, place)
+        if self.centered:
+            self.mean_grad_tensor = self.scope.var(
+                self.mean_grad_name).get_tensor()
+            self.mean_grad_tensor.set(self.mean_grad, place)
 
-        # create and run sgd operator
+    def check(self, actual_t, expect_t, place, out_name, atol=1e-5):
+        self.assertTrue(
+            np.allclose(
+                actual_t, expect_t, atol=atol),
+            "Output (" + out_name + ") has diff at " + str(place) + "\nExpect "
+            + str(expect_t) + "\n" + "But Got" + str(actual_t))
 
-        if self.centered:
-            mean_grad = scope.var(self.mean_grad_name).get_tensor()
-            mean_grad.set(self.mean_grad, place)
-
-            rmsprop_op = Operator(
-                "rmsprop",
-                Param=self.param_name,
-                Grad=self.grad_name,
-                MeanSquare=self.mean_square_name,
-                MeanGrad=self.mean_grad_name,
-                Moment=self.moment_name,
-                LearningRate=self.lr_name,
-                ParamOut=self.param_name,
-                MeanSquareOut=self.mean_square_name,
-                MomentOut=self.moment_name,
-                MeanGradOut=self.mean_grad_name,
-                epsilon=self.epsilon,
-                decay=self.decay,
-                momentum=self.momentum,
-                centered=True)
-        else:
-            rmsprop_op = Operator(
-                "rmsprop",
-                Param=self.param_name,
-                Grad=self.grad_name,
-                MeanSquare=self.mean_square_name,
-                Moment=self.moment_name,
-                LearningRate=self.lr_name,
-                ParamOut=self.param_name,
-                MeanSquareOut=self.mean_square_name,
-                MomentOut=self.moment_name,
-                epsilon=self.epsilon,
-                decay=self.decay,
-                momentum=self.momentum,
-                centered=False)
-
-        rmsprop_op.run(scope, place)
-
-        atol = 1e-5
-        equal_nan = False
+
+class TestRmspropOp(TestBase):
+    def check_with_place(self,
+                         place,
+                         is_sparse,
+                         centered,
+                         size,
+                         row_num=None,
+                         epsilon=1e-6):
+        self.setup(place, is_sparse, centered, size, row_num, epsilon)
+        self.run_and_check()
+
+    def run_and_check(self):
+        grad_name = self.grad_sr_name if self.is_sparse else self.grad_name
+
+        kwargs = {
+            'Param': self.param_name,
+            'Grad': grad_name,
+            'MeanSquare': self.mean_square_name,
+            'Moment': self.moment_name,
+            'LearningRate': self.lr_name,
+            'ParamOut': self.param_name,
+            'MeanSquareOut': self.mean_square_name,
+            'MomentOut': self.moment_name,
+            'epsilon': self.epsilon,
+            'decay': self.decay,
+            'momentum': self.momentum,
+            'centered': self.centered
+        }
 
         if self.centered:
-            atol = 1e-3
-            equal_nan = True
+            kwargs['MeanGrad'] = self.mean_grad_name
+            kwargs['MeanGradOut'] = self.mean_grad_name
+
+        rmsprop_op = Operator('rmsprop', **kwargs)
+        atol = 1e-6
+
+        rmsprop_op.run(self.scope, self.place)
 
         self.check(
-            np.array(mean_square), self.ms_out, place, self.mean_square_name)
+            np.array(self.mean_square_tensor),
+            self.ms_out,
+            self.place,
+            self.mean_square_name,
+            atol=atol)
         self.check(
-            np.array(moment),
+            np.array(self.moment_tensor),
             self.moment_out,
-            place,
+            self.place,
             self.moment_name,
-            atol=atol,
-            equal_nan=equal_nan)
+            atol=atol)
         self.check(
-            np.array(param),
+            np.array(self.param_tensor),
             self.param_out,
-            place,
+            self.place,
             self.param_name,
-            atol=atol,
-            equal_nan=equal_nan)
+            atol=atol)
 
         if self.centered:
             self.check(
-                np.array(mean_grad), self.mg_out, place, self.mean_grad_name)
+                np.array(self.mean_grad_tensor), self.mg_out, self.place,
+                self.mean_grad_name)
 
     def test_rmsprop(self):
         places = [core.CPUPlace()]
         if core.is_compiled_with_cuda():
             places.append(core.CUDAPlace(0))
+
+        size = (128, 320)
         for place in places:
-            self.check_with_place(place, False, 1e-6)
-            self.check_with_place(place, False, 1e-10)
-            self.check_with_place(place, True, 1e-6)
-            self.check_with_place(place, True, 1e-10)
+            for centered in [False, True]:
+                with fluid.scope_guard(core.Scope()):
+                    self.check_with_place(
+                        place, is_sparse=False, centered=centered, size=size)
+
+                with fluid.scope_guard(core.Scope()):
+                    self.check_with_place(
+                        place,
+                        is_sparse=True,
+                        centered=centered,
+                        row_num=512,
+                        size=size)
+
+                with fluid.scope_guard(core.Scope()):
+                    self.check_with_place(
+                        place,
+                        is_sparse=True,
+                        centered=centered,
+                        row_num=60,
+                        size=size)
 
 
 if __name__ == "__main__":