Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into optimize-sum-seq-pooling-op

test=develop

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into optimize-sum-seq-pooling-op
test=develop
b4a32eaf · Qiao Longfei · 936926aa · af91d41a · b4a32eaf · b4a32eaf
47 changed file
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -18,7 +18,7 @@ function(copy TARGET)
    set(oneValueArgs "")
    set(multiValueArgs SRCS DSTS DEPS)
    cmake_parse_arguments(copy_lib "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-    set(inference_lib_dist_dep ${TARGET} ${inference_lib_dist_dep} PARENT_SCOPE)
+    set(fluid_lib_dist_dep ${TARGET} ${fluid_lib_dist_dep} PARENT_SCOPE)
    list(LENGTH copy_lib_SRCS copy_lib_SRCS_len)
    list(LENGTH copy_lib_DSTS copy_lib_DSTS_len)
@@ -185,7 +185,8 @@ copy(cmake_cache
  SRCS ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt
  DSTS ${FLUID_INSTALL_DIR})
-add_custom_target(inference_lib_dist DEPENDS ${inference_lib_dist_dep}) 
+# This command generates a complete fluid library for both train and inference
+add_custom_target(fluid_lib_dist DEPENDS ${fluid_lib_dist_dep}) 
 # paddle fluid version
 execute_process(

--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -127,6 +127,7 @@ paddle.fluid.layers.relu ArgSpec(args=['x', 'name'], varargs=None, keywords=None
 paddle.fluid.layers.log ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.crop ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.layers.rank_loss ArgSpec(args=['label', 'left', 'right', 'name'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.margin_rank_loss ArgSpec(args=['label', 'left', 'right', 'margin', 'name'], varargs=None, keywords=None, defaults=(0.1, None))
 paddle.fluid.layers.elu ArgSpec(args=['x', 'alpha', 'name'], varargs=None, keywords=None, defaults=(1.0, None))
 paddle.fluid.layers.relu6 ArgSpec(args=['x', 'threshold', 'name'], varargs=None, keywords=None, defaults=(6.0, None))
 paddle.fluid.layers.pow ArgSpec(args=['x', 'factor', 'name'], varargs=None, keywords=None, defaults=(1.0, None))

--- a/paddle/fluid/CMakeLists.txt
+++ b/paddle/fluid/CMakeLists.txt
@@ -12,6 +12,5 @@ endif(NOT WIN32)
 if(WITH_INFERENCE)
  # NOTE: please add subdirectory inference at last.
  add_subdirectory(inference)
+  add_subdirectory(train)
 endif()
-add_subdirectory(train)
--- a/paddle/fluid/framework/details/op_handle_base.h
+++ b/paddle/fluid/framework/details/op_handle_base.h
@@ -64,7 +64,8 @@ class OpHandleBase {
  virtual bool IsMultiDeviceTransfer() { return false; }
  const platform::DeviceContext *DeviceContext(platform::Place place) {
-    return dev_ctxes_[place];
+    auto it = dev_ctxes_.find(place);
+    return it != dev_ctxes_.end() ? it->second : nullptr;
  }
  void SetDeviceContext(platform::Place place, platform::DeviceContext *ctx_) {

--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -46,6 +46,41 @@ ExecutorPrepareContext::~ExecutorPrepareContext() {
  VLOG(5) << "destroy ExecutorPrepareContext";
 }
+template <typename RefCntMap>
+static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op,
+                                GarbageCollector<Tensor>* gc,
+                                RefCntMap* ref_cnts) {
+  std::unordered_set<Tensor*> erase_tensors;
+  auto handler = [&](const VariableNameMap& name_map) {
+    for (auto& name_pair : name_map) {
+      for (auto& name : name_pair.second) {
+        auto it = ref_cnts->find(name);
+        if (it == ref_cnts->end()) continue;
+        if ((it->second)-- == 1) {
+          auto* var = scope.FindVar(name);
+          if (var != nullptr) {
+            VLOG(10) << "Erase tensor \'" << name << "\'";
+            if (var->IsType<LoDTensor>()) {
+              erase_tensors.insert(var->GetMutable<LoDTensor>());
+            } else if (var->IsType<SelectedRows>()) {
+              erase_tensors.insert(
+                  var->GetMutable<SelectedRows>()->mutable_value());
+            }
+          }
+        }
+      }
+    }
+  };
+  handler(op->Inputs());
+  handler(op->Outputs());
+  if (!erase_tensors.empty()) {
+    gc->Add(erase_tensors);
+  }
+}
 Executor::Executor(const platform::Place& place) : place_(place) {}
 void Executor::Close() {
@@ -331,9 +366,13 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
  }
  int64_t max_memory_size = GetEagerDeletionThreshold();
  std::unique_ptr<GarbageCollector<Tensor>> gc;
-  if (max_memory_size >= 0) {
+  // WhileOp would set keep_kids to false
+  // WhileGradOp would need the scopes created in WhileOp
+  // Perhaps, we should not perform eager deletion in WhileOp
+  // The scopes and variables created by WhileOp would be deleted
+  // in WhileGradOp.
+  if (max_memory_size >= 0 && !keep_kids) {
    ctx->ResetReferenceCount();
 #ifdef PADDLE_WITH_CUDA
    if (platform::is_gpu_place(place_)) {
@@ -352,45 +391,8 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
    op->Run(*local_scope, place_);
    if (gc != nullptr) {
-      std::vector<std::string> erase_vars;
+      DeleteUnusedTensors(*local_scope, op.get(), gc.get(),
-      for (auto& input : op->Inputs()) {
+                          &(ctx->cur_ref_cnts_));
-        for (auto& input_name : input.second) {
-          auto it = ctx->cur_ref_cnts_.find(input_name);
-          if (it == ctx->cur_ref_cnts_.end()) continue;
-          if (it->second == 1) {  // should delete it
-            erase_vars.emplace_back(input_name);
-            ctx->cur_ref_cnts_.erase(input_name);
-          } else {
-            --(it->second);
-          }
-        }
-      }
-      for (auto& output : op->Outputs()) {
-        for (auto& output_name : output.second) {
-          auto it = ctx->cur_ref_cnts_.find(output_name);
-          if (it == ctx->cur_ref_cnts_.end()) continue;
-          if (it->second == 1) {
-            erase_vars.emplace_back(output_name);
-            ctx->cur_ref_cnts_.erase(output_name);
-          } else {
-            --(it->second);
-          }
-        }
-      }
-      if (!erase_vars.empty()) {
-        std::vector<framework::LoDTensor*> erase_tensors;
-        for (auto& name : erase_vars) {
-          auto* var = local_scope->FindVar(name);
-          if (var == nullptr) continue;
-          if (var->IsType<framework::LoDTensor>()) {
-            auto* tensor = var->GetMutable<framework::LoDTensor>();
-            erase_tensors.push_back(tensor);
-          }
-        }
-        if (!erase_tensors.empty()) gc->Add(erase_tensors);
-      }
    }
    if (FLAGS_benchmark) {

--- a/paddle/fluid/framework/executor.h
+++ b/paddle/fluid/framework/executor.h
@@ -32,38 +32,32 @@ template <typename T>
 std::unordered_map<std::string, T> GetNonPersistableReferenceCount(
    const ProgramDesc& prog, size_t block_id) {
  auto& block = prog.Block(block_id);
-  std::unordered_set<std::string> ignored_vars;
  std::unordered_map<std::string, T> ref_cnts;
-  for (auto var_desc : block.AllVars()) {
+  auto update_ref_cnts = [&](OpDesc* op_desc, const VariableNameMap& name_map) {
-    auto type = var_desc->Proto()->type().type();
+    for (auto& name_pair : name_map) {
-    if (type != proto::VarType::LOD_TENSOR || var_desc->Persistable()) {
+      for (auto& name : name_pair.second) {
-      ignored_vars.insert(var_desc->Name());  // ignore persistable vars
+        auto* var_desc = block.FindVar(name);
-    }
+        if (var_desc == nullptr || var_desc->Persistable()) continue;
-  }
+        auto type = var_desc->Proto()->type().type();
+        if (type != proto::VarType::LOD_TENSOR &&
-  for (auto op_desc : block.AllOps()) {
+            type != proto::VarType::SELECTED_ROWS) {
-    for (auto& input : op_desc->Inputs()) {
+          continue;
-      for (auto& input_name : input.second) {
-        if (!ignored_vars.count(input_name)) {
-          if (ref_cnts.count(input_name))
-            ++ref_cnts[input_name];
-          else
-            ref_cnts[input_name] = 1;
        }
-      }
-    }
-    for (auto& output : op_desc->Outputs()) {
+        auto it = ref_cnts.find(name);
-      for (auto output_name : output.second) {
+        if (it != ref_cnts.end()) {
-        if (!ignored_vars.count(output_name)) {
+          ++it->second;
-          if (ref_cnts.count(output_name))
+        } else {
-            ++ref_cnts[output_name];
+          ref_cnts[name] = 1;
-          else
-            ref_cnts[output_name] = 1;
        }
      }
    }
+  };
+  for (auto op_desc : block.AllOps()) {
+    update_ref_cnts(op_desc, op_desc->Inputs());
+    update_ref_cnts(op_desc, op_desc->Outputs());
  }
  return ref_cnts;
 }

--- a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc
@@ -44,89 +44,6 @@ namespace ir {
  GET_IR_NODE_FROM_SUBGRAPH(bn_saved_mean, bn_saved_mean, pattern_name);     \
  GET_IR_NODE_FROM_SUBGRAPH(bn_saved_variance, bn_saved_variance, pattern_name)
-template <typename UnaryOperation>
-LoDTensor tensor_apply(const LoDTensor& vec, UnaryOperation f) {
-  LoDTensor vec_y;
-  vec_y.Resize(vec.dims());
-  const float* x = vec.data<float>();
-  float* y = vec_y.mutable_data<float>(platform::CPUPlace());
-  for (int64_t i = 0; i < vec.numel(); i++) {
-    y[i] = f(x[i]);
-  }
-  return vec_y;
-}
-void tensor_apply_inplace(LoDTensor* vec, float (*f)(float)) {
-  float* data = vec->mutable_data<float>(platform::CPUPlace());
-  for (int64_t i = 0; i < vec->numel(); i++) {
-    data[i] = f(data[i]);
-  }
-}
-template <typename BinaryOperation>
-LoDTensor tensor_apply_eltwise(const LoDTensor& vec_a, const LoDTensor& vec_b,
-                               BinaryOperation f) {
-  PADDLE_ENFORCE_EQ(vec_a.dims(), vec_b.dims());
-  LoDTensor vec_y;
-  vec_y.Resize(vec_a.dims());
-  const float* a = vec_a.data<float>();
-  const float* b = vec_b.data<float>();
-  float* y = vec_y.mutable_data<float>(platform::CPUPlace());
-  for (int64_t i = 0; i < vec_a.numel(); i++) {
-    y[i] = f(a[i], b[i]);
-  }
-  return vec_y;
-}
-template <typename BinaryOperation>
-LoDTensor tensor_apply_eltwise_broadcast(const LoDTensor& vec_a,
-                                         const LoDTensor& vec_b,
-                                         BinaryOperation f) {
-  PADDLE_ENFORCE_EQ(vec_a.dims().size(), 2);
-  PADDLE_ENFORCE_EQ(vec_b.dims().size(), 2);
-  PADDLE_ENFORCE_EQ(vec_a.dims()[0], vec_b.dims()[0]);
-  PADDLE_ENFORCE_EQ(vec_b.dims()[1], 1);
-  LoDTensor vec_y;
-  vec_y.Resize(vec_a.dims());
-  const float* a = vec_a.data<float>();
-  const float* b = vec_b.data<float>();
-  float* y = vec_y.mutable_data<float>(platform::CPUPlace());
-  size_t a_height = vec_a.dims()[0];
-  size_t a_width = vec_a.dims()[1];
-  for (size_t h = 0; h < a_height; h++) {
-    for (size_t w = 0; w < a_width; ++w) {
-      *(y++) = f(*(a++), b[h]);
-    }
-  }
-  return vec_y;
-}
-// reshape to two dimensions {A, B * C * ...}
-void make_tensor_2d(LoDTensor* tensor_to_reshape) {
-  auto dims_count = tensor_to_reshape->dims().size();
-  PADDLE_ENFORCE_GT(dims_count, 0);
-  int size2 = 1;
-  for (int i = 1; i < dims_count; i++) {
-    size2 *= tensor_to_reshape->dims()[i];
-  }
-  tensor_to_reshape->Resize(make_ddim({tensor_to_reshape->dims()[0], size2}));
-}
-void recompute_conv_weights(LoDTensor* weights, LoDTensor* tmp) {
-  // remember the weights tensor shape {A, B, C, ...}
-  auto weights_shape = weights->dims();
-  // reduce the weights to 2d {A, B * C * ...}
-  make_tensor_2d(weights);
-  // make tmp tensor 2d by adding 1 as second dim {A, 1}
-  make_tensor_2d(tmp);
-  *weights =
-      tensor_apply_eltwise_broadcast(*weights, *tmp, std::multiplies<float>());
-  // reshape weights to the original dims {A, B, C, ...}
-  weights->Resize(weights_shape);
-}
 void recompute_bias_and_weights(const Scope* scope,
                                ir::Node* conv_weight,            //
                                const ir::Node& bn_scale,         //
@@ -135,6 +52,13 @@ void recompute_bias_and_weights(const Scope* scope,
                                const ir::Node& bn_variance,      //
                                LoDTensor* eltwise_y_in_tensor,   //
                                float epsilon) {
+  using EigenVectorArrayMap =
+      Eigen::Map<Eigen::Array<float, Eigen::Dynamic, 1>>;
+  using ConstEigenVectorArrayMap =
+      Eigen::Map<const Eigen::Array<float, Eigen::Dynamic, 1>>;
+  using EigenMatrixArrayMap = Eigen::Map<
+      Eigen::Array<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>;
  // Re-compute bias of conv2d from BN
  PADDLE_ENFORCE_EQ(eltwise_y_in_tensor->dims(), bn_bias_tensor.dims());
@@ -143,31 +67,38 @@ void recompute_bias_and_weights(const Scope* scope,
      scope->FindVar(bn_variance.Name())->GetMutable<LoDTensor>();
  auto* mean_tensor = scope->FindVar(bn_mean.Name())->GetMutable<LoDTensor>();
-  auto std_tensor = LoDTensor();
+  ConstEigenVectorArrayMap scale_array(scale_tensor->data<float>(),
-  std_tensor.Resize(bn_bias_tensor.dims());
+                                       scale_tensor->numel(), 1);
-  std_tensor =
+  EigenVectorArrayMap variance_array(
-      tensor_apply(*variance_tensor, [&](float x) { return x + epsilon; });
+      variance_tensor->mutable_data<float>(platform::CPUPlace()),
+      variance_tensor->numel(), 1);
+  ConstEigenVectorArrayMap mean_array(mean_tensor->data<float>(),
+                                      mean_tensor->numel(), 1);
+  ConstEigenVectorArrayMap bn_bias_array(bn_bias_tensor.data<float>(),
+                                         bn_bias_tensor.numel(), 1);
-  using EigenVectorArrayMap =
+  // variance will not be used anymore, so make it std_array and then tmp_array
-      Eigen::Map<Eigen::Array<float, Eigen::Dynamic, 1>>;
+  variance_array += epsilon;
+  variance_array = variance_array.sqrt();
+  variance_array = scale_array / variance_array;
+  EigenVectorArrayMap eltwise_y_in_array(
+      eltwise_y_in_tensor->mutable_data<float>(platform::CPUPlace()),
+      eltwise_y_in_tensor->numel(), 1);
-  EigenVectorArrayMap std_vec(
+  eltwise_y_in_array =
-      std_tensor.mutable_data<float>(platform::CPUPlace()), std_tensor.numel(),
+      ((eltwise_y_in_array - mean_array) * variance_array) + bn_bias_array;
-      1);
-  std_vec = std_vec.sqrt();
-  auto tmp_tensor =
-      tensor_apply_eltwise(*scale_tensor, std_tensor, std::divides<float>());
-  auto tensor_minus = tensor_apply_eltwise(*eltwise_y_in_tensor, *mean_tensor,
-                                           std::minus<float>());
-  auto tensor_mul =
-      tensor_apply_eltwise(tensor_minus, tmp_tensor, std::multiplies<float>());
-  *eltwise_y_in_tensor =
-      tensor_apply_eltwise(tensor_mul, bn_bias_tensor, std::plus<float>());
  // Re-compute weight of conv2d from BN
-  auto* current_param =
+  auto* weights = scope->FindVar(conv_weight->Name())->GetMutable<LoDTensor>();
-      scope->FindVar(conv_weight->Name())->GetMutable<LoDTensor>();
+  auto weights_shape = weights->dims();
-  recompute_conv_weights(current_param, &tmp_tensor);
+  auto weights_shape_2d = flatten_to_2d(weights_shape, 1);
+  EigenMatrixArrayMap weights_array_2d(
+      weights->mutable_data<float>(platform::CPUPlace()), weights_shape_2d[0],
+      weights_shape_2d[1]);
+  weights_array_2d.colwise() *= variance_array;
 }
 std::unique_ptr<ir::Graph> ConvBNFusePass::ApplyImpl(

--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -307,6 +307,10 @@ ParallelExecutor::~ParallelExecutor() {
      }
    }
  }
+  // member_ must be destructed before gcs_ since the destructor of
+  // ReferenceCountOpHandle use raw pointers of gcs_ inside.
+  member_.reset();
 }
 }  // namespace framework

--- a/paddle/fluid/framework/parallel_executor.h
+++ b/paddle/fluid/framework/parallel_executor.h
@@ -75,7 +75,7 @@ class ParallelExecutor {
 private:
  void BCastParamsToDevices(const std::unordered_set<std::string> &vars) const;
-  ParallelExecutorPrivate *member_;
+  std::unique_ptr<ParallelExecutorPrivate> member_;
 #ifdef PADDLE_WITH_CUDA
  // ref_cnts_ is only initialized when ParallelExecutor constructs, and then

--- a/paddle/fluid/framework/scope.cc
+++ b/paddle/fluid/framework/scope.cc
@@ -49,18 +49,18 @@ int64_t GetEagerDeletionThreshold() {
 Scope::~Scope() { DropKids(); }
 Scope& Scope::NewScope() const {
-  std::unique_lock<std::mutex> lock(mutex_);
+  std::lock_guard<std::mutex> lock(mutex_);
  kids_.push_back(new Scope(this));
  return *kids_.back();
 }
 Variable* Scope::Var(const std::string& name) {
-  std::unique_lock<std::mutex> lock(mutex_);
+  std::lock_guard<std::mutex> lock(mutex_);
  return VarInternal(name);
 }
 Variable* Scope::Var(std::string* name) {
-  std::unique_lock<std::mutex> lock(mutex_);
+  std::lock_guard<std::mutex> lock(mutex_);
  auto new_name = string::Sprintf("%p.%d", this, vars_.size());
  if (name != nullptr) {
    *name = new_name;
@@ -69,29 +69,34 @@ Variable* Scope::Var(std::string* name) {
 }
 Variable* Scope::FindVar(const std::string& name) const {
-  std::unique_lock<std::mutex> lock(mutex_);
+  std::lock_guard<std::mutex> lock(mutex_);
  return FindVarInternal(name);
 }
+Variable* Scope::FindLocalVar(const std::string& name) const {
+  std::lock_guard<std::mutex> lock(mutex_);
+  return FindVarLocally(name);
+}
 const Scope* Scope::FindScope(const Variable* var) const {
-  std::unique_lock<std::mutex> lock(mutex_);
+  std::lock_guard<std::mutex> lock(mutex_);
  return FindScopeInternal(var);
 }
 void Scope::DropKids() {
-  std::unique_lock<std::mutex> lock(mutex_);
+  std::lock_guard<std::mutex> lock(mutex_);
  for (Scope* s : kids_) delete s;
  kids_.clear();
 }
 bool Scope::HasKid(const Scope* scope) const {
-  std::unique_lock<std::mutex> lock(mutex_);
+  std::lock_guard<std::mutex> lock(mutex_);
  auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
  return it != this->kids_.end();
 }
 std::vector<std::string> Scope::LocalVarNames() const {
-  std::unique_lock<std::mutex> lock(mutex_);
+  std::lock_guard<std::mutex> lock(mutex_);
  std::vector<std::string> known_vars;
  known_vars.reserve(this->vars_.size());
  for (auto& p : vars_) {
@@ -101,7 +106,7 @@ std::vector<std::string> Scope::LocalVarNames() const {
 }
 void Scope::DeleteScope(Scope* scope) const {
-  std::unique_lock<std::mutex> lock(mutex_);
+  std::lock_guard<std::mutex> lock(mutex_);
  auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
  PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope);
  this->kids_.erase(it);
@@ -114,7 +119,7 @@ void Scope::DeleteScope(Scope* scope) const {
 }
 void Scope::EraseVars(const std::vector<std::string>& var_names) {
-  std::unique_lock<std::mutex> lock(mutex_);
+  std::lock_guard<std::mutex> lock(mutex_);
  std::set<std::string> var_set(var_names.begin(), var_names.end());
  for (auto it = vars_.begin(); it != vars_.end();) {
    if (var_set.find(it->first) != var_set.end()) {
@@ -127,12 +132,12 @@ void Scope::EraseVars(const std::vector<std::string>& var_names) {
 void Scope::Rename(const std::string& origin_name,
                   const std::string& new_name) const {
-  std::unique_lock<std::mutex> lock(mutex_);
+  std::lock_guard<std::mutex> lock(mutex_);
  RenameInternal(origin_name, new_name);
 }
 std::string Scope::Rename(const std::string& origin_name) const {
-  std::unique_lock<std::mutex> lock(mutex_);
+  std::lock_guard<std::mutex> lock(mutex_);
  auto new_name = string::Sprintf("%p.%d", this, vars_.size());
  RenameInternal(origin_name, new_name);
  return new_name;

--- a/paddle/fluid/framework/scope.h
+++ b/paddle/fluid/framework/scope.h
@@ -63,6 +63,11 @@ class Scope {
  /// Caller doesn't own the returned Variable.
  Variable* FindVar(const std::string& name) const;
+  /// Find a variable in the current scope.
+  /// Return nullptr if cannot find.
+  /// Caller doesn't own the returned Variable.
+  Variable* FindLocalVar(const std::string& name) const;
  const Scope* parent() const { return parent_; }
  /// Find the scope or an ancestor scope that contains the given variable.

--- a/paddle/fluid/operators/adadelta_op.cc
+++ b/paddle/fluid/operators/adadelta_op.cc
@@ -18,6 +18,7 @@ namespace paddle {
 namespace operators {
 using Tensor = framework::Tensor;
 class AdadeltaOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
@@ -31,6 +32,16 @@ class AdadeltaOp : public framework::OperatorWithKernel {
                   "Input(AvgSquaredGrad) of AdadeltaOp should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("AvgSquaredUpdate"),
                   "Input(AvgSquaredUpdate) of AdadeltaOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->GetInputsVarType("Param").front() ==
+            framework::proto::VarType::LOD_TENSOR,
+        "The input var's type should be LoDTensor, but the received is %s",
+        ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front());
+    PADDLE_ENFORCE(
+        ctx->GetInputsVarType("Grad").front() ==
+            framework::proto::VarType::LOD_TENSOR,
+        "The input var's type should be LoDTensor, but the received is %s",
+        ctx->Inputs("Grad").front(), ctx->GetInputsVarType("Grad").front());
    PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
                   "Output(ParamOut) of AdadeltaOp should not be null.");
@@ -56,6 +67,7 @@ class AdadeltaOp : public framework::OperatorWithKernel {
    ctx->SetOutputDim("AvgSquaredGradOut", param_dim);
    ctx->SetOutputDim("AvgSquaredUpdateOut", param_dim);
  }
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext &ctx) const override {
    auto input_data_type =

--- a/paddle/fluid/operators/adadelta_op.h
+++ b/paddle/fluid/operators/adadelta_op.h
@@ -23,6 +23,17 @@ template <typename DeviceContext, typename T>
 class AdadeltaOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
+    const auto* param_var = ctx.InputVar("Param");
+    PADDLE_ENFORCE(param_var->IsType<framework::LoDTensor>(),
+                   "The Var(%s)'s type should be LoDTensor, "
+                   "but the received is %s",
+                   ctx.Inputs("Param").front(), param_var->Type().name());
+    const auto* grad_var = ctx.InputVar("Grad");
+    PADDLE_ENFORCE(grad_var->IsType<framework::LoDTensor>(),
+                   "The Var(%s)'s type should be LoDTensor, "
+                   "but the received is %s",
+                   ctx.Inputs("Grad").front(), grad_var->Type().name());
    auto param_out_tensor = ctx.Output<framework::Tensor>("ParamOut");
    auto avg_squared_grad_out_tensor =
        ctx.Output<framework::Tensor>("AvgSquaredGradOut");

--- a/paddle/fluid/operators/adagrad_op.h
+++ b/paddle/fluid/operators/adagrad_op.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
@@ -21,25 +22,31 @@ namespace operators {
 template <typename DeviceContext, typename T>
 struct SparseAdagradFunctor {
-  void operator()(const DeviceContext& context,
+  void operator()(const DeviceContext &context,
-                  const framework::SelectedRows& grad,
+                  const framework::SelectedRows &grad,
-                  const framework::Tensor& learning_rate, T epsilon,
+                  const framework::Tensor &learning_rate, T epsilon,
-                  framework::Tensor* moment, framework::Tensor* param);
+                  framework::Tensor *moment, framework::Tensor *param);
 };
 template <typename DeviceContext, typename T>
 class AdagradOpKernel : public framework::OpKernel<T> {
 public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
+  void Compute(const framework::ExecutionContext &ctx) const override {
-    auto* param_out_tensor = ctx.Output<framework::Tensor>("ParamOut");
+    const auto *param_var = ctx.InputVar("Param");
-    auto* moment_out_tensor = ctx.Output<framework::Tensor>("MomentOut");
+    PADDLE_ENFORCE(param_var->IsType<framework::LoDTensor>(),
+                   "The Var(%s)'s type should be LoDTensor, "
+                   "but the received is %s",
+                   ctx.Inputs("Param").front(), param_var->Type().name());
+    auto *param_out_tensor = ctx.Output<framework::Tensor>("ParamOut");
+    auto *moment_out_tensor = ctx.Output<framework::Tensor>("MomentOut");
    param_out_tensor->mutable_data<T>(ctx.GetPlace());
    moment_out_tensor->mutable_data<T>(ctx.GetPlace());
    T epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
-    auto* grad_var = ctx.InputVar("Grad");
+    auto *grad_var = ctx.InputVar("Grad");
    if (grad_var->IsType<framework::LoDTensor>()) {
      auto param = framework::EigenVector<T>::Flatten(
          *ctx.Input<framework::Tensor>("Param"));
@@ -47,16 +54,16 @@ class AdagradOpKernel : public framework::OpKernel<T> {
          *ctx.Input<framework::Tensor>("Grad"));
      auto moment = framework::EigenVector<T>::Flatten(
          *ctx.Input<framework::Tensor>("Moment"));
-      auto* learning_rate = ctx.Input<framework::Tensor>("LearningRate");
+      auto *learning_rate = ctx.Input<framework::Tensor>("LearningRate");
      auto param_out = framework::EigenVector<T>::Flatten(*param_out_tensor);
      auto moment_out = framework::EigenVector<T>::Flatten(*moment_out_tensor);
-      auto* place = ctx.template device_context<DeviceContext>().eigen_device();
+      auto *place = ctx.template device_context<DeviceContext>().eigen_device();
      moment_out.device(*place) = moment + grad * grad;
      Eigen::DSizes<int, 1> m_dsize(moment_out_tensor->numel());
      if (platform::is_cpu_place(ctx.GetPlace())) {
-        auto* lr = learning_rate->data<T>();
+        auto *lr = learning_rate->data<T>();
        param_out.device(*place) =
            param - lr[0] * grad / (moment_out.sqrt() + epsilon);
      } else {
@@ -66,10 +73,10 @@ class AdagradOpKernel : public framework::OpKernel<T> {
            lr.broadcast(m_dsize) * grad / (moment_out.sqrt() + epsilon);
      }
    } else if (grad_var->IsType<framework::SelectedRows>()) {
-      auto* param_tensor = ctx.Input<framework::Tensor>("Param");
+      auto *param_tensor = ctx.Input<framework::Tensor>("Param");
      PADDLE_ENFORCE_EQ(param_tensor, param_out_tensor);
-      auto* moment_tensor = ctx.Input<framework::Tensor>("Moment");
+      auto *moment_tensor = ctx.Input<framework::Tensor>("Moment");
      PADDLE_ENFORCE_EQ(moment_tensor, moment_out_tensor);
      SparseAdagradFunctor<DeviceContext, T> functor;

--- a/paddle/fluid/operators/adam_op.h
+++ b/paddle/fluid/operators/adam_op.h
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/detail/safe_ref.h"
+#include "paddle/fluid/operators/math/algorithm.h"
 #include "paddle/fluid/operators/math/selected_rows_functor.h"
 #include "paddle/fluid/platform/for_range.h"
@@ -199,23 +200,9 @@ struct SparseAdamFunctor {
        row_numel_(row_numel),
        row_count_(row_count) {}
-  inline HOSTDEVICE int64_t BinarySearchInRows(int64_t row) const {
-    int64_t beg = 0, end = row_count_ - 1;
-    while (beg <= end) {
-      auto mid = ((beg + end) >> 1);
-      if (rows_[mid] == row)
-        return mid;
-      else if (rows_[mid] < row)
-        beg = mid + 1;
-      else
-        end = mid - 1;
-    }
-    return -1;
-  }
  inline HOSTDEVICE void operator()(size_t i) const {
-    int64_t row = i / row_numel_;
+    auto row_idx =
-    auto row_idx = BinarySearchInRows(row);
+        math::BinarySearch<int64_t>(rows_, row_count_, i / row_numel_);
    T g = row_idx >= 0 ? grad_[row_idx * row_numel_ + i % row_numel_] : 0;
    // The following code is the same as dense
@@ -244,6 +231,12 @@ template <typename DeviceContext, typename T>
 class AdamOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
+    const auto* param_var = ctx.InputVar("Param");
+    PADDLE_ENFORCE(param_var->IsType<framework::LoDTensor>(),
+                   "The Var(%s)'s type should be LoDTensor, "
+                   "but the received is %s",
+                   ctx.Inputs("Param").front(), param_var->Type().name());
    using paddle::framework::LoDTensor;
    using paddle::operators::detail::Ref;

--- a/paddle/fluid/operators/adamax_op.cc
+++ b/paddle/fluid/operators/adamax_op.cc
@@ -35,6 +35,16 @@ class AdamaxOp : public framework::OperatorWithKernel {
                   "Input(LearningRate) of AdamaxOp should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("Beta1Pow"),
                   "Input(Beta1Pow) of AdamaxOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->GetInputsVarType("Param").front() ==
+            framework::proto::VarType::LOD_TENSOR,
+        "The input var's type should be LoDTensor, but the received is %s",
+        ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front());
+    PADDLE_ENFORCE(
+        ctx->GetInputsVarType("Grad").front() ==
+            framework::proto::VarType::LOD_TENSOR,
+        "The input var's type should be LoDTensor, but the received is %s",
+        ctx->Inputs("Grad").front(), ctx->GetInputsVarType("Grad").front());
    PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
                   "Output(ParamOut) of AdamaxOp should not be null.");

--- a/paddle/fluid/operators/adamax_op.h
+++ b/paddle/fluid/operators/adamax_op.h
@@ -23,6 +23,17 @@ template <typename DeviceContext, typename T>
 class AdamaxOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
+    const auto* param_var = ctx.InputVar("Param");
+    PADDLE_ENFORCE(param_var->IsType<framework::LoDTensor>(),
+                   "The Var(%s)'s type should be LoDTensor, "
+                   "but the received is %s",
+                   ctx.Inputs("Param").front(), param_var->Type().name());
+    const auto* grad_var = ctx.InputVar("Grad");
+    PADDLE_ENFORCE(grad_var->IsType<framework::LoDTensor>(),
+                   "The Var(%s)'s type should be LoDTensor, "
+                   "but the received is %s",
+                   ctx.Inputs("Grad").front(), grad_var->Type().name());
    auto param_out_tensor = ctx.Output<framework::Tensor>("ParamOut");
    auto moment_out_tensor = ctx.Output<framework::Tensor>("MomentOut");
    auto inf_norm_out_tensor = ctx.Output<framework::Tensor>("InfNormOut");

--- a/paddle/fluid/operators/decayed_adagrad_op.cc
+++ b/paddle/fluid/operators/decayed_adagrad_op.cc
@@ -32,6 +32,16 @@ class DecayedAdagradOp : public framework::OperatorWithKernel {
    PADDLE_ENFORCE(
        ctx->HasInput("LearningRate"),
        "Input(LearningRate) of DecayedAdagradOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->GetInputsVarType("Param").front() ==
+            framework::proto::VarType::LOD_TENSOR,
+        "The input var's type should be LoDTensor, but the received is %s",
+        ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front());
+    PADDLE_ENFORCE(
+        ctx->GetInputsVarType("Grad").front() ==
+            framework::proto::VarType::LOD_TENSOR,
+        "The input var's type should be LoDTensor, but the received is %s",
+        ctx->Inputs("Grad").front(), ctx->GetInputsVarType("Grad").front());
    PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
                   "Output(ParamOut) of DecayedAdagradOp should not be null.");

--- a/paddle/fluid/operators/decayed_adagrad_op.h
+++ b/paddle/fluid/operators/decayed_adagrad_op.h
@@ -23,6 +23,17 @@ template <typename DeviceContext, typename T>
 class DecayedAdagradOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
+    const auto* param_var = ctx.InputVar("Param");
+    PADDLE_ENFORCE(param_var->IsType<framework::LoDTensor>(),
+                   "The Var(%s)'s type should be LoDTensor, "
+                   "but the received is %s",
+                   ctx.Inputs("Param").front(), param_var->Type().name());
+    const auto* grad_var = ctx.InputVar("Grad");
+    PADDLE_ENFORCE(grad_var->IsType<framework::LoDTensor>(),
+                   "The Var(%s)'s type should be LoDTensor, "
+                   "but the received is %s",
+                   ctx.Inputs("Grad").front(), grad_var->Type().name());
    auto param_out_tensor = ctx.Output<framework::Tensor>("ParamOut");
    auto moment_out_tensor = ctx.Output<framework::Tensor>("MomentOut");

--- a/paddle/fluid/operators/ftrl_op.cc
+++ b/paddle/fluid/operators/ftrl_op.cc
@@ -34,6 +34,16 @@ class FTRLOp : public framework::OperatorWithKernel {
                   "Input(Grad) of FTRL should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("LearningRate"),
                   "Input(LearningRate) of FTRL should not be null.");
+    PADDLE_ENFORCE(
+        ctx->GetInputsVarType("Param").front() ==
+            framework::proto::VarType::LOD_TENSOR,
+        "The input var's type should be LoDTensor, but the received is %s",
+        ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front());
+    PADDLE_ENFORCE(
+        ctx->GetInputsVarType("Grad").front() ==
+            framework::proto::VarType::LOD_TENSOR,
+        "The input var's type should be LoDTensor, but the received is %s",
+        ctx->Inputs("Grad").front(), ctx->GetInputsVarType("Grad").front());
    PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
                   "Output(ParamOut) of FTRL should not be null.");

--- a/paddle/fluid/operators/ftrl_op.h
+++ b/paddle/fluid/operators/ftrl_op.h
@@ -28,6 +28,17 @@ template <typename DeviceContext, typename T>
 class FTRLOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
+    const auto* param_var = ctx.InputVar("Param");
+    PADDLE_ENFORCE(param_var->IsType<framework::LoDTensor>(),
+                   "The Var(%s)'s type should be LoDTensor, "
+                   "but the received is %s",
+                   ctx.Inputs("Param").front(), param_var->Type().name());
+    const auto* grad_var = ctx.InputVar("Grad");
+    PADDLE_ENFORCE(grad_var->IsType<framework::LoDTensor>(),
+                   "The Var(%s)'s type should be LoDTensor, "
+                   "but the received is %s",
+                   ctx.Inputs("Grad").front(), grad_var->Type().name());
    auto* param_out = ctx.Output<Tensor>("ParamOut");
    auto* sq_accum_out = ctx.Output<Tensor>("SquaredAccumOut");
    auto* lin_accum_out = ctx.Output<Tensor>("LinearAccumOut");

--- a/paddle/fluid/operators/math/algorithm.h
+++ b/paddle/fluid/operators/math/algorithm.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <algorithm>
+#include <cstdint>  // for int64_t
+#include <numeric>
+#include "paddle/fluid/platform/hostdevice.h"
+namespace paddle {
+namespace operators {
+namespace math {
+template <typename T>
+HOSTDEVICE inline int64_t BinarySearch(const T *x, int64_t num, const T &val) {
+  int64_t beg = 0, end = num - 1;
+  while (beg <= end) {
+    auto mid = ((beg + end) >> 1);
+    if (x[mid] == val)
+      return mid;
+    else if (x[mid] < val)
+      beg = mid + 1;
+    else
+      end = mid - 1;
+  }
+  return -1;
+}
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/math/selected_rows_functor.cc
+++ b/paddle/fluid/operators/math/selected_rows_functor.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include <set>
 #include <unordered_map>
+#include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/selected_rows_functor.h"
 namespace paddle {

--- a/paddle/fluid/operators/math/selected_rows_functor.h
+++ b/paddle/fluid/operators/math/selected_rows_functor.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
+#include <map>
 #include <vector>
 #include "paddle/fluid/framework/eigen.h"

--- a/paddle/fluid/operators/math/sequence_pooling.cc
+++ b/paddle/fluid/operators/math/sequence_pooling.cc
@@ -12,9 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/fluid/operators/math/sequence_pooling.h"
 #include <string>
+#include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/operators/math/sequence_pooling.h"
 namespace paddle {
 namespace operators {
@@ -180,6 +182,7 @@ class SequencePoolFunctor<platform::CPUDeviceContext, T> {
    }
    auto lod = input.lod()[0];
    auto& place = *context.eigen_device();
+    auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
    for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) {
      Tensor in_t =
          input.Slice(static_cast<int>(lod[i]), static_cast<int>(lod[i + 1]));
@@ -191,7 +194,14 @@ class SequencePoolFunctor<platform::CPUDeviceContext, T> {
      if (pooltype == "AVERAGE") {
        out_e.device(place) = in_e.mean(Eigen::array<int, 1>({{0}}));
      } else if (pooltype == "SUM") {
-        out_e.device(place) = in_e.sum(Eigen::array<int, 1>({{0}}));
+        if (h > 0) {
+          const T* in_data = in_t.data<T>();
+          T* out_data = out_t.mutable_data<T>(context.GetPlace());
+          blas.VCOPY(w, in_data, out_data);
+          for (int64_t r = 1; r != h; ++r) {
+            blas.AXPY(w, 1., in_data + r * w, out_data);
+          }
+        }
      } else if (pooltype == "SQRT") {
        out_e.device(place) = in_e.sum(Eigen::array<int, 1>({{0}})) /
                              std::sqrt(static_cast<T>(h));
@@ -223,6 +233,7 @@ class SequencePoolGradFunctor<platform::CPUDeviceContext, T> {
    }
    auto lod = in_grad->lod()[0];
    auto& place = *context.eigen_device();
+    auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
    for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) {
      auto in_g_t = in_grad->Slice(static_cast<int>(lod[i]),
                                   static_cast<int>(lod[i + 1]));
@@ -237,7 +248,11 @@ class SequencePoolGradFunctor<platform::CPUDeviceContext, T> {
      if (pooltype == "AVERAGE") {
        in_g_e.device(place) = (out_g_e / static_cast<T>(h)).broadcast(bcast);
      } else if (pooltype == "SUM") {
-        in_g_e.device(place) = (out_g_e).broadcast(bcast);
+        const T* out_g_data = out_g_t.data<T>();
+        T* in_g_data = in_g_t.mutable_data<T>(context.GetPlace());
+        for (int r = 0; r != h; ++r) {
+          blas.VCOPY(w, out_g_data, in_g_data + r * w);
+        }
      } else if (pooltype == "SQRT") {
        in_g_e.device(place) =
            (out_g_e / std::sqrt(static_cast<T>(h))).broadcast(bcast);

--- a/paddle/fluid/operators/momentum_op.cc
+++ b/paddle/fluid/operators/momentum_op.cc
@@ -33,6 +33,11 @@ class MomentumOp : public framework::OperatorWithKernel {
                   "Input(velocity) of Momentum should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("LearningRate"),
                   "Input(LearningRate) of Momentum should not be null.");
+    PADDLE_ENFORCE(
+        ctx->GetInputsVarType("Param").front() ==
+            framework::proto::VarType::LOD_TENSOR,
+        "The input var's type should be LoDTensor, but the received is %s",
+        ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front());
    PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
                   "Output(ParamOut) of Momentum should not be null.");

--- a/paddle/fluid/operators/momentum_op.cu
+++ b/paddle/fluid/operators/momentum_op.cu
@@ -46,6 +46,17 @@ template <typename T>
 class MomentumOpCUDAKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
+    const auto* param_var = ctx.InputVar("Param");
+    PADDLE_ENFORCE(param_var->IsType<framework::LoDTensor>(),
+                   "The Var(%s)'s type should be LoDTensor, "
+                   "but the received is %s",
+                   ctx.Inputs("Param").front(), param_var->Type().name());
+    const auto* grad_var = ctx.InputVar("Grad");
+    PADDLE_ENFORCE(grad_var->IsType<framework::LoDTensor>(),
+                   "The Var(%s)'s type should be LoDTensor, "
+                   "but the received is %s",
+                   ctx.Inputs("Grad").front(), grad_var->Type().name());
    auto param_out = ctx.Output<framework::Tensor>("ParamOut");
    auto velocity_out = ctx.Output<framework::Tensor>("VelocityOut");
    auto param = ctx.Input<framework::Tensor>("Param");

--- a/paddle/fluid/operators/momentum_op.h
+++ b/paddle/fluid/operators/momentum_op.h
@@ -23,6 +23,12 @@ template <typename T>
 class MomentumOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
+    const auto* param_var = ctx.InputVar("Param");
+    PADDLE_ENFORCE(param_var->IsType<framework::LoDTensor>(),
+                   "The Var(%s)'s type should be LoDTensor, "
+                   "but the received is %s",
+                   ctx.Inputs("Param").front(), param_var->Type().name());
    auto param_out = ctx.Output<framework::Tensor>("ParamOut");
    auto velocity_out = ctx.Output<framework::Tensor>("VelocityOut");
    auto param = ctx.Input<framework::Tensor>("Param");

--- a/paddle/fluid/operators/reader/blocking_queue.h
+++ b/paddle/fluid/operators/reader/blocking_queue.h
@@ -31,8 +31,8 @@ class BlockingQueue {
  // is a workaround and a simplified version of framework::Channel as it
  // doesn't support GPU and it implements on buffered blocking queue.
 public:
-  explicit BlockingQueue(size_t capacity)
+  explicit BlockingQueue(size_t capacity, bool speed_test_mode = false)
-      : capacity_(capacity), closed_(false) {
+      : capacity_(capacity), speed_test_mode_(speed_test_mode), closed_(false) {
    PADDLE_ENFORCE_GT(
        capacity_, 0,
        "The capacity of a reader::BlockingQueue must be greater than 0.");
@@ -72,7 +72,9 @@ class BlockingQueue {
    if (!queue_.empty()) {
      PADDLE_ENFORCE_NOT_NULL(elem);
      *elem = queue_.front();
-      queue_.pop_front();
+      if (LIKELY(!speed_test_mode_)) {
+        queue_.pop_front();
+      }
      send_cv_.notify_one();
      return true;
    } else {
@@ -114,6 +116,7 @@ class BlockingQueue {
 private:
  size_t capacity_;
+  bool speed_test_mode_;
  bool closed_;
  std::deque<T> queue_;

--- a/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h
+++ b/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h
@@ -33,8 +33,9 @@ class LoDTensorBlockingQueue {
 private:
  LoDTensorBlockingQueue(size_t capacity,
-                         const std::vector<framework::DDim>& dims)
+                         const std::vector<framework::DDim>& dims,
-      : queue_(capacity), dims_(dims) {}
+                         bool speed_test_mode = false)
+      : queue_(capacity, speed_test_mode), dims_(dims) {}
 public:
  bool Push(const std::vector<framework::LoDTensor>& lod_tensor_vec) {
@@ -69,11 +70,12 @@ class LoDTensorBlockingQueue {
 class LoDTensorBlockingQueueHolder {
 public:
-  void InitOnce(size_t capacity, const std::vector<framework::DDim>& dims) {
+  void InitOnce(size_t capacity, const std::vector<framework::DDim>& dims,
+                bool speed_test_mode = false) {
    PADDLE_ENFORCE(
        queue_ == nullptr,
        "LoDTensorBlockingQueueHolder::InitOnce() can only be called once");
-    queue_.reset(new LoDTensorBlockingQueue(capacity, dims));
+    queue_.reset(new LoDTensorBlockingQueue(capacity, dims, speed_test_mode));
  }
  inline const std::shared_ptr<LoDTensorBlockingQueue>& GetQueue() const {

--- a/paddle/fluid/operators/reader/reader_blocking_queue_test.cc
+++ b/paddle/fluid/operators/reader/reader_blocking_queue_test.cc
@@ -217,3 +217,27 @@ TEST(BlockingQueue, MyClassTest) {
  q.Receive(&b);
  EXPECT_EQ(a.val_, b.val_);
 }
+TEST(BlockingQueue, speed_test_mode) {
+  size_t queue_size = 10;
+  BlockingQueue<size_t> q1(queue_size, false);
+  for (size_t i = 0; i < queue_size; ++i) {
+    q1.Send(i);
+  }
+  size_t b;
+  for (size_t i = 0; i < queue_size; ++i) {
+    q1.Receive(&b);
+    EXPECT_EQ(b, i);
+  }
+  EXPECT_EQ(q1.Size(), 0);
+  BlockingQueue<size_t> q2(queue_size, true);
+  for (size_t i = 0; i < queue_size; ++i) {
+    q2.Send(i);
+  }
+  for (size_t i = 0; i < queue_size; ++i) {
+    q2.Receive(&b);
+    EXPECT_EQ(b, 0);
+  }
+  EXPECT_EQ(q2.Size(), queue_size);
+}
--- a/paddle/fluid/operators/rmsprop_op.cc
+++ b/paddle/fluid/operators/rmsprop_op.cc
@@ -32,6 +32,11 @@ class RmspropOp : public framework::OperatorWithKernel {
                   "Input(Grad) of RmspropOp should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("Moment"),
                   "Input(Moment) of RmspropOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->GetInputsVarType("Param").front() ==
+            framework::proto::VarType::LOD_TENSOR,
+        "The input var's type should be LoDTensor, but the received is %s",
+        ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front());
    PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
                   "Output(param_out) of RmspropOp should not be null.");

--- a/paddle/fluid/operators/rmsprop_op.h
+++ b/paddle/fluid/operators/rmsprop_op.h
@@ -13,66 +13,254 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
+#include <math.h>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/algorithm.h"
+#include "paddle/fluid/operators/math/selected_rows_functor.h"
+#include "paddle/fluid/platform/for_range.h"
 namespace paddle {
 namespace operators {
-using Tensor = framework::Tensor;
 template <typename T, int MajorType = Eigen::RowMajor,
          typename IndexType = Eigen::DenseIndex>
 using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+template <typename T>
+struct DenseRmspropGradFunctor {
+  inline explicit DenseRmspropGradFunctor(const T *grad) : grad_(grad) {}
+  HOSTDEVICE inline T operator()(int64_t idx) const { return grad_[idx]; }
+  const T *grad_;
+};
+template <typename T>
+struct SparseRmspropGradFunctor {
+  inline SparseRmspropGradFunctor(const T *grad, const int64_t *rows,
+                                  int64_t row_numel, int64_t row_count)
+      : grad_(grad),
+        rows_(rows),
+        row_numel_(row_numel),
+        row_count_(row_count) {}
+  HOSTDEVICE inline T operator()(int64_t idx) const {
+    auto row_idx = math::BinarySearch(rows_, row_count_, idx / row_numel_);
+    return row_idx >= 0 ? grad_[row_idx * row_numel_ + idx % row_numel_] : 0;
+  }
+  const T *grad_;
+  const int64_t *rows_;
+  int64_t row_numel_;
+  int64_t row_count_;
+};
+template <typename T, typename GradFunctor>
+struct UncenteredRmspropFunctor {
+  UncenteredRmspropFunctor(T *param, T *ms, T *mom, const T *lr, T rho,
+                           T epsilon, T momentum,
+                           const GradFunctor &grad_functor)
+      : param_(param),
+        ms_(ms),
+        mom_(mom),
+        lr_(lr),
+        rho_(rho),
+        epsilon_(epsilon),
+        momentum_(momentum),
+        grad_functor_(grad_functor) {}
+  HOSTDEVICE inline void operator()(int64_t idx) const {
+    T g = grad_functor_(idx);
+    T ms_out = rho_ * ms_[idx] + (1 - rho_) * g * g;
+    T mom_out = momentum_ * mom_[idx] + lr_[0] * g / sqrt(ms_out + epsilon_);
+    param_[idx] -= mom_out;
+    ms_[idx] = ms_out;
+    mom_[idx] = mom_out;
+  }
+  T *param_;
+  T *ms_;
+  T *mom_;
+  const T *lr_;
+  T rho_;
+  T epsilon_;
+  T momentum_;
+  GradFunctor grad_functor_;
+};
+template <typename T, typename GradFunctor>
+struct CenteredRmspropFunctor {
+  CenteredRmspropFunctor(T *param, T *ms, T *mom, T *mean_grad, const T *lr,
+                         T rho, T epsilon, T momentum,
+                         const GradFunctor &grad_functor)
+      : param_(param),
+        ms_(ms),
+        mom_(mom),
+        mean_grad_(mean_grad),
+        lr_(lr),
+        rho_(rho),
+        epsilon_(epsilon),
+        momentum_(momentum),
+        grad_functor_(grad_functor) {}
+  HOSTDEVICE inline void operator()(int64_t idx) const {
+    T g = grad_functor_(idx);
+    T ms_out = rho_ * ms_[idx] + (1 - rho_) * g * g;
+    T mg_out = rho_ * mean_grad_[idx] + (1 - rho_) * g;
+    T mom_out = momentum_ * mom_[idx] +
+                lr_[0] * g / sqrt(ms_out - mg_out * mg_out + epsilon_);
+    param_[idx] -= mom_out;
+    ms_[idx] = ms_out;
+    mom_[idx] = mom_out;
+    mean_grad_[idx] = mg_out;
+  }
+  T *param_;
+  T *ms_;
+  T *mom_;
+  T *mean_grad_;
+  const T *lr_;
+  T rho_;
+  T epsilon_;
+  T momentum_;
+  GradFunctor grad_functor_;
+};
 template <typename DeviceContext, typename T>
 class RmspropOpKernel : public framework::OpKernel<T> {
 public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
+  void Compute(const framework::ExecutionContext &ctx) const override {
-    auto* param_out = ctx.Output<Tensor>("ParamOut");
+    using LoDTensor = framework::LoDTensor;
-    auto* moment_out = ctx.Output<Tensor>("MomentOut");
+    auto *grad_var = ctx.InputVar("Grad");
-    auto* mean_square_out = ctx.Output<Tensor>("MeanSquareOut");
+    auto *param_out = ctx.Output<LoDTensor>("ParamOut");
+    auto *moment_out = ctx.Output<LoDTensor>("MomentOut");
+    auto *mean_square_out = ctx.Output<LoDTensor>("MeanSquareOut");
-    auto grad = ctx.Input<Tensor>("Grad");
+    auto epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
+    auto rho = static_cast<T>(ctx.Attr<float>("decay"));
+    auto momentum = static_cast<T>(ctx.Attr<float>("momentum"));
+    bool centered = ctx.Attr<bool>("centered");
-    param_out->mutable_data<T>(ctx.GetPlace());
+    auto &p_tensor = *ctx.Input<LoDTensor>("Param");
-    moment_out->mutable_data<T>(ctx.GetPlace());
+    auto &ms_tensor = *ctx.Input<LoDTensor>("MeanSquare");
-    mean_square_out->mutable_data<T>(ctx.GetPlace());
+    auto &lr_tensor = *ctx.Input<LoDTensor>("LearningRate");
+    auto &mom_tensor = *ctx.Input<LoDTensor>("Moment");
-    float epsilon = ctx.Attr<float>("epsilon");
+    PADDLE_ENFORCE_EQ(&p_tensor, param_out,
-    float rho = ctx.Attr<float>("decay");
+                      "Param and ParamOut must be the same Tensor");
-    float momentum = ctx.Attr<float>("momentum");
+    PADDLE_ENFORCE_EQ(&mom_tensor, moment_out,
-    bool centered = ctx.Attr<bool>("centered");
+                      "Moment and MomentOut must be the same Tensor");
+    PADDLE_ENFORCE_EQ(&ms_tensor, mean_square_out,
+                      "MeanSquare and MeanSquareOut must be the same Tensor");
+    auto &dev_ctx = ctx.template device_context<DeviceContext>();
+    size_t limit = static_cast<size_t>(ms_tensor.numel());
+    if (grad_var->IsType<LoDTensor>()) {
+      auto &grad_tensor = grad_var->Get<LoDTensor>();
+      if (std::is_same<DeviceContext, platform::CPUDeviceContext>::value) {
+        auto &place =
+            *ctx.template device_context<DeviceContext>().eigen_device();
+        auto lr_value = lr_tensor.data<T>()[0];
+        auto p = EigenVector<T>::Flatten(p_tensor);
+        auto ms = EigenVector<T>::Flatten(ms_tensor);
+        auto g = EigenVector<T>::Flatten(grad_tensor);
+        auto mom = EigenVector<T>::Flatten(mom_tensor);
+        auto p_out = EigenVector<T>::Flatten(*param_out);
+        auto mom_out = EigenVector<T>::Flatten(*moment_out);
+        auto ms_out = EigenVector<T>::Flatten(*mean_square_out);
+        ms_out.device(place) = rho * ms + (1 - rho) * g * g;
+        if (centered) {
+          auto &mg_tensor = *ctx.Input<LoDTensor>("MeanGrad");
+          auto mg = EigenVector<T>::Flatten(mg_tensor);
+          auto *mean_grad_out = ctx.Output<LoDTensor>("MeanGradOut");
+          PADDLE_ENFORCE(&mg_tensor, mean_grad_out,
+                         "MeanGrad and MeanGradOut must be the same Tensor");
+          auto mg_out = EigenVector<T>::Flatten(*mean_grad_out);
+          mg_out.device(place) = rho * mg + (1 - rho) * g;
+          mom_out.device(place) =
+              momentum * mom +
+              lr_value * g / (ms_out - mg_out.square() + epsilon).sqrt();
+        } else {
+          mom_out.device(place) =
+              momentum * mom + lr_value * g / (ms_out + epsilon).sqrt();
+        }
+        p_out.device(place) = p - mom_out;
+      } else {
+        DenseRmspropGradFunctor<T> grad_func(grad_tensor.data<T>());
+        platform::ForRange<DeviceContext> for_range(dev_ctx, limit);
+        if (centered) {
+          auto &mg_tensor = *ctx.Input<LoDTensor>("MeanGrad");
+          auto *mean_grad_out = ctx.Output<LoDTensor>("MeanGradOut");
+          PADDLE_ENFORCE(&mg_tensor, mean_grad_out,
+                         "MeanGrad and MeanGradOut must be the same Tensor");
+          for_range(CenteredRmspropFunctor<T, DenseRmspropGradFunctor<T>>(
+              param_out->mutable_data<T>(ctx.GetPlace()),
+              mean_square_out->mutable_data<T>(ctx.GetPlace()),
+              moment_out->mutable_data<T>(ctx.GetPlace()),
+              mean_grad_out->mutable_data<T>(ctx.GetPlace()),
+              lr_tensor.data<T>(), rho, epsilon, momentum, grad_func));
+        } else {
+          for_range(UncenteredRmspropFunctor<T, DenseRmspropGradFunctor<T>>(
+              param_out->mutable_data<T>(ctx.GetPlace()),
+              mean_square_out->mutable_data<T>(ctx.GetPlace()),
+              moment_out->mutable_data<T>(ctx.GetPlace()), lr_tensor.data<T>(),
+              rho, epsilon, momentum, grad_func));
+        }
+      }
+    } else if (grad_var->IsType<framework::SelectedRows>()) {
+      auto &grad = grad_var->Get<framework::SelectedRows>();
+      auto *merged_grad = const_cast<framework::Scope &>(ctx.scope())
+                              .Var()
+                              ->GetMutable<framework::SelectedRows>();
+      math::scatter::MergeAdd<DeviceContext, T> merge_func;
+      merge_func(dev_ctx, grad, merged_grad);
+      platform::ForRange<DeviceContext> for_range(dev_ctx, limit);
+      const int64_t *rows;
+#ifdef PADDLE_WITH_CUDA
+      if (platform::is_gpu_place(ctx.GetPlace())) {
+        rows = merged_grad->rows().CUDAData(ctx.GetPlace());
+      } else {
+#endif
+        rows = merged_grad->rows().data();
+#ifdef PADDLE_WITH_CUDA
+      }
+#endif
+      auto &merged_tensor = merged_grad->value();
+      int64_t row_count = merged_grad->rows().size();
+      int64_t row_numel = merged_tensor.numel() / row_count;
+      SparseRmspropGradFunctor<T> grad_func(merged_tensor.data<T>(), rows,
+                                            row_numel, row_count);
-    auto p = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Param"));
+      if (centered) {
-    auto ms = EigenVector<T>::Flatten(*ctx.Input<Tensor>("MeanSquare"));
+        auto &mg_tensor = *ctx.Input<LoDTensor>("MeanGrad");
-    auto lr = EigenVector<T>::Flatten(*ctx.Input<Tensor>("LearningRate"));
+        auto *mean_grad_out = ctx.Output<LoDTensor>("MeanGradOut");
-    auto g = EigenVector<T>::Flatten(*grad);
+        PADDLE_ENFORCE(&mg_tensor, mean_grad_out,
-    auto mom = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Moment"));
+                       "MeanGrad and MeanGradOut must be the same Tensor");
+        for_range(CenteredRmspropFunctor<T, SparseRmspropGradFunctor<T>>(
-    auto p_out = EigenVector<T>::Flatten(*param_out);
+            param_out->mutable_data<T>(ctx.GetPlace()),
-    auto mom_out = EigenVector<T>::Flatten(*moment_out);
+            mean_square_out->mutable_data<T>(ctx.GetPlace()),
-    auto ms_out = EigenVector<T>::Flatten(*mean_square_out);
+            moment_out->mutable_data<T>(ctx.GetPlace()),
-    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
+            mean_grad_out->mutable_data<T>(ctx.GetPlace()), lr_tensor.data<T>(),
+            rho, epsilon, momentum, grad_func));
-    Eigen::DSizes<int, 1> grad_dsize(static_cast<int>(grad->numel()));
+      } else {
+        for_range(UncenteredRmspropFunctor<T, SparseRmspropGradFunctor<T>>(
-    ms_out.device(place) = rho * ms + (1 - rho) * g * g;
+            param_out->mutable_data<T>(ctx.GetPlace()),
-    if (centered) {
+            mean_square_out->mutable_data<T>(ctx.GetPlace()),
-      auto mg = EigenVector<T>::Flatten(*ctx.Input<Tensor>("MeanGrad"));
+            moment_out->mutable_data<T>(ctx.GetPlace()), lr_tensor.data<T>(),
-      auto* mean_grad_out = ctx.Output<Tensor>("MeanGradOut");
+            rho, epsilon, momentum, grad_func));
-      mean_grad_out->mutable_data<T>(ctx.GetPlace());
+      }
-      auto mg_out = EigenVector<T>::Flatten(*mean_grad_out);
-      mg_out.device(place) = rho * mg + (1 - rho) * g;
-      mom_out.device(place) = momentum * mom +
-                              lr.broadcast(grad_dsize) * g /
-                                  (ms_out - mg_out.square() + epsilon).sqrt();
    } else {
-      mom_out.device(place) =
+      PADDLE_THROW("RMSProp only supports LoDTensor or SelectedRows gradient");
-          momentum * mom +
-          lr.broadcast(grad_dsize) * g / (ms_out + epsilon).sqrt();
    }
-    p_out.device(place) = p - mom_out;
  }
 };

--- a/paddle/fluid/operators/sgd_op.cc
+++ b/paddle/fluid/operators/sgd_op.cc
@@ -21,7 +21,7 @@ class SGDOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext* ctx) const override {
+  void InferShape(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("Param"),
                   "Input(Param) of SGDOp should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("Grad"),
@@ -42,7 +42,7 @@ class SGDOp : public framework::OperatorWithKernel {
 protected:
  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
+      const framework::ExecutionContext &ctx) const override {
    auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("Param"));
    return framework::OpKernelType(data_type, ctx.device_context());
  }
@@ -50,17 +50,20 @@ class SGDOp : public framework::OperatorWithKernel {
 class SGDOpInferVarType : public framework::VarTypeInference {
 public:
-  void operator()(const framework::OpDesc& op_desc,
+  void operator()(const framework::OpDesc &op_desc,
-                  framework::BlockDesc* block) const override {
+                  framework::BlockDesc *block) const override {
-    auto input_var = op_desc.Input("Param")[0];
+    auto input_var_n = op_desc.Input("Param")[0];
-    for (auto& out_var : op_desc.Output("ParamOut")) {
+    auto in_var_type = block->FindRecursiveOrCreateVar(input_var_n).GetType();
-      if (block->FindRecursiveOrCreateVar(input_var).GetType() ==
+    PADDLE_ENFORCE(in_var_type == framework::proto::VarType::SELECTED_ROWS ||
-          framework::proto::VarType::SELECTED_ROWS) {
+                       in_var_type == framework::proto::VarType::LOD_TENSOR,
-        block->FindRecursiveOrCreateVar(out_var).SetType(
+                   "The input Var's type should be LoDtensor or SelectedRows,"
-            framework::proto::VarType::SELECTED_ROWS);
+                   " but the received var(%s)'s type is %s",
-      } else {
+                   input_var_n, in_var_type);
-        block->FindRecursiveOrCreateVar(out_var).SetType(
-            framework::proto::VarType::LOD_TENSOR);
+    for (auto &out_var_n : op_desc.Output("ParamOut")) {
+      auto &out_var = block->FindRecursiveOrCreateVar(out_var_n);
+      if (out_var.GetType() != in_var_type) {
+        out_var.SetType(in_var_type);
      }
    }
  }

--- a/paddle/fluid/operators/sgd_op.cu
+++ b/paddle/fluid/operators/sgd_op.cu
@@ -56,6 +56,12 @@ template <typename T>
 class SGDOpCUDAKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
+    const auto* param_var = ctx.InputVar("Param");
+    PADDLE_ENFORCE(param_var->IsType<framework::LoDTensor>(),
+                   "The Var(%s)'s type should be LoDTensor, "
+                   "but the received is %s",
+                   ctx.Inputs("Param").front(), param_var->Type().name());
    auto* param = ctx.Input<framework::Tensor>("Param");
    auto* param_out = ctx.Output<framework::Tensor>("ParamOut");
    auto* learning_rate = ctx.Input<framework::Tensor>("LearningRate");

--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -198,9 +198,9 @@ class CudnnHolder {
 CUDADeviceContext::CUDADeviceContext(CUDAPlace place)
    : place_(place), cudnn_holder_(nullptr) {
  SetDeviceId(place_.device);
-  compute_capability = GetCUDAComputeCapability(place_.device);
+  compute_capability_ = GetCUDAComputeCapability(place_.device);
-  multi_process = GetCUDAMultiProcessors(place_.device);
+  multi_process_ = GetCUDAMultiProcessors(place_.device);
-  max_threads_per_mp = GetCUDAMaxThreadsPerMultiProcessor(place_.device);
+  max_threads_per_mp_ = GetCUDAMaxThreadsPerMultiProcessor(place_.device);
  PADDLE_ENFORCE(cudaStreamCreate(&stream_));
  eigen_stream_.reset(new EigenCudaStreamDevice());
  eigen_stream_->Reinitialize(&stream_, place);
@@ -211,6 +211,16 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place)
    cudnn_holder_.reset(new CudnnHolder(&stream_, place));
  }
+  driver_version_ = GetCUDADriverVersion(place_.device);
+  runtime_version_ = GetCUDARuntimeVersion(place_.device);
+  LOG(INFO) << "device: " << place_.device
+            << ", CUDA Capability: " << compute_capability_
+            << ", Driver Version: " << driver_version_ / 1000 << "."
+            << (driver_version_ % 100) / 10
+            << ", Runtime Version: " << runtime_version_ / 1000 << "."
+            << (runtime_version_ % 100) / 10;
  callback_manager_.reset(new StreamCallbackManager(stream_));
 }
@@ -232,11 +242,11 @@ void CUDADeviceContext::Wait() const {
 }
 int CUDADeviceContext::GetComputeCapability() const {
-  return compute_capability;
+  return compute_capability_;
 }
 int CUDADeviceContext::GetMaxPhysicalThreadCount() const {
-  return multi_process * max_threads_per_mp;
+  return multi_process_ * max_threads_per_mp_;
 }
 Eigen::GpuDevice* CUDADeviceContext::eigen_device() const {

--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -135,9 +135,11 @@ class CUDADeviceContext : public DeviceContext {
  cudaStream_t stream_;
  cublasHandle_t cublas_handle_;
-  int compute_capability;
+  int compute_capability_;
-  int multi_process;
+  int runtime_version_;
-  int max_threads_per_mp;
+  int driver_version_;
+  int multi_process_;
+  int max_threads_per_mp_;
  mutable std::mutex mtx_;

--- a/paddle/fluid/platform/enforce.h
+++ b/paddle/fluid/platform/enforce.h
@@ -130,6 +130,13 @@ struct EOFException : public std::exception {
 #define UNLIKELY(condition) (condition == 0)
 #endif
+#if !defined(_WIN32)
+#define LIKELY(condition) __builtin_expect(static_cast<bool>(condition), 1)
+#else
+// there is no equivalent intrinsics in msvc.
+#define LIKELY(condition) (condition != 0)
+#endif
 template <typename... Args>
 inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
    bool stat, const Args&... args) {

--- a/paddle/fluid/platform/gpu_info.cc
+++ b/paddle/fluid/platform/gpu_info.cc
@@ -46,6 +46,24 @@ int GetCUDAComputeCapability(int id) {
  return device_prop.major * 10 + device_prop.minor;
 }
+int GetCUDARuntimeVersion(int id) {
+  PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
+  int runtime_version = 0;
+  PADDLE_ENFORCE(cudaRuntimeGetVersion(&runtime_version),
+                 "cudaRuntimeGetVersion failed in "
+                 "paddle::platform::cudaRuntimeGetVersion");
+  return runtime_version;
+}
+int GetCUDADriverVersion(int id) {
+  PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
+  int driver_version = 0;
+  PADDLE_ENFORCE(cudaDriverGetVersion(&driver_version),
+                 "cudaDriverGetVersion failed in "
+                 "paddle::platform::GetCUDADriverVersion");
+  return driver_version;
+}
 int GetCUDAMultiProcessors(int id) {
  PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
  int count;

--- a/paddle/fluid/platform/gpu_info.h
+++ b/paddle/fluid/platform/gpu_info.h
@@ -29,6 +29,12 @@ int GetCUDADeviceCount();
 //! Get the compute capability of the ith GPU (format: major * 10 + minor)
 int GetCUDAComputeCapability(int i);
+//! Get the runtime version of the ith GPU
+int GetCUDARuntimeVersion(int id);
+//! Get the driver version of the ith GPU
+int GetCUDADriverVersion(int id);
 //! Get the MultiProcessors of the ith GPU.
 int GetCUDAMultiProcessors(int i);

--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -57,6 +57,10 @@ limitations under the License. */
 #include "pybind11/stl.h"
+DEFINE_bool(reader_queue_speed_test_mode, false,
+            "If set true, the queue.pop will only get data from queue but not "
+            "remove the data from queue for speed testing");
 // disable auto conversion to list in Python
 PYBIND11_MAKE_OPAQUE(paddle::framework::LoDTensorArray);
@@ -170,14 +174,14 @@ PYBIND11_PLUGIN(core) {
     A LoDTensor X can look like the example below. It contains 2 sequences.
     The first has length 2 and the second has length 3, as described by x.lod.
-     The first tensor dimension 6=2+3 is calculated from LoD if it's available.
+     The first tensor dimension 5=2+3 is calculated from LoD if it's available.
     It means the total number of sequence element. In X, each element has 2
-     columns, hence [6, 2].
+     columns, hence [5, 2].
      x.lod  = [[2, 3]]
      x.data = [[1, 2], [3, 4],
-                [5, 6], [7, 8], [9, 10], [11, 12]]
+                [5, 6], [7, 8], [9, 10]]
-      x.shape = [6, 2]
+      x.shape = [5, 2]
      LoD can have multiple levels (for example, a paragraph can have multiple
      sentences and a sentence can have multiple words). In the following
@@ -380,7 +384,8 @@ All parameter, weight, gradient are variables in Paddle.
                               return make_ddim(shape);
                             });
              auto *holder = var.GetMutable<LoDTensorBlockingQueueHolder>();
-              holder->InitOnce(capacity, dims);
+              holder->InitOnce(capacity, dims,
+                               FLAGS_reader_queue_speed_test_mode);
              return holder->GetQueue();
            },
        py::return_value_policy::copy);

--- a/paddle/fluid/train/demo/README.md
+++ b/paddle/fluid/train/demo/README.md
@@ -15,7 +15,7 @@ cmake .. -DFLUID_INSTALL_DIR=$PADDLE_LIB \
         -DWITH_MKL=OFF \
         -DWITH_MKLDNN=OFF
 make -j8
-make -j8 inference_lib_dist
+make -j8 fluid_lib_dist
 ```
 ### step 2. generate program desc

--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -648,25 +648,25 @@ function gen_capi_package() {
    fi
 }
-function gen_fluid_inference_lib() {
+function gen_fluid_lib() {
    mkdir -p ${PADDLE_ROOT}/build
    cd ${PADDLE_ROOT}/build
    if [[ ${WITH_C_API:-OFF} == "OFF" && ${WITH_INFERENCE:-ON} == "ON" ]] ; then
        cat <<EOF
    ========================================
-    Generating fluid inference library ...
+    Generating fluid library for train and inference ...
    ========================================
 EOF
        cmake .. -DWITH_DISTRIBUTE=OFF
-        make -j `nproc` inference_lib_dist
+        make -j `nproc` fluid_lib_dist
      fi
 }
-function tar_fluid_inference_lib() {
+function tar_fluid_lib() {
    if [[ ${WITH_C_API:-OFF} == "OFF" && ${WITH_INFERENCE:-ON} == "ON" ]] ; then
        cat <<EOF
    ========================================
-    Taring fluid inference library ...
+    Taring fluid library for train and inference ...
    ========================================
 EOF
        cd ${PADDLE_ROOT}/build
@@ -675,11 +675,11 @@ EOF
      fi
 }
-function test_fluid_inference_lib() {
+function test_fluid_lib() {
    if [[ ${WITH_C_API:-OFF} == "OFF" && ${WITH_INFERENCE:-ON} == "ON" ]] ; then
        cat <<EOF
    ========================================
-    Testing fluid inference library ...
+    Testing fluid library for inference ...
    ========================================
 EOF
        cd ${PADDLE_ROOT}/paddle/fluid/inference/api/demo_ci
@@ -731,9 +731,9 @@ function main() {
        ;;
      fluid_inference_lib)
        cmake_gen ${PYTHON_ABI:-""}
-        gen_fluid_inference_lib
+        gen_fluid_lib
-        tar_fluid_inference_lib
+        tar_fluid_lib
-        test_fluid_inference_lib
+        test_fluid_lib
        ;;
      check_style)
        check_style
@@ -744,8 +744,8 @@ function main() {
        assert_api_not_changed ${PYTHON_ABI:-""}
        run_test
        gen_capi_package
-        gen_fluid_inference_lib
+        gen_fluid_lib
-        test_fluid_inference_lib
+        test_fluid_lib
        assert_api_spec_approvals
        ;;
      maccheck)

--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -113,7 +113,8 @@ def __bootstrap__():
        'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir',
        'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb',
        'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads',
-        "dist_threadpool_size", 'cpu_deterministic', 'eager_delete_tensor_gb'
+        'dist_threadpool_size', 'cpu_deterministic', 'eager_delete_tensor_gb',
+        'reader_queue_speed_test_mode'
    ]
    if core.is_compiled_with_dist():
        read_env_flags.append('rpc_deadline')

--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -107,6 +107,7 @@ __all__ = [
    'log',
    'crop',
    'rank_loss',
+    'margin_rank_loss',
    'elu',
    'relu6',
    'pow',
@@ -5827,6 +5828,54 @@ def rank_loss(label, left, right, name=None):
    return out
+def margin_rank_loss(label, left, right, margin=0.1, name=None):
+    """
+    Margin Ranking Loss Layer for ranking problem,
+    which compares left score and right score passed in.
+    The ranking loss can be defined as following equation:
+    .. math::
+        rank\_loss &= max(0, -label * (left - right) + margin)
+    Args:
+       label (Variable): Indicates whether the left is ranked higher than the right or not.
+       left (Variable): Ranking score for left.
+       right (Variable): Ranking score for right.
+       margin (float): Indicates the given margin.
+       name (str|None): A name for this layer (optional). If set None, the layer
+                       will be named automatically.
+    Returns:
+       Variable: The ranking loss.
+    Raises:
+       ValueError: Any of label, left, and right is not a Variable.
+    Examples:
+        .. code-block:: python
+           label = fluid.layers.data(name="label", shape=[4, 1], dtype="float32")
+           left = fluid.layers.data(name="left", shape=[4, 1], dtype="float32")
+           right = fluid.layers.data(name="right", shape=[4, 1], dtype="float32")
+           out = fluid.layers.margin_rank_loss(label, left, right)
+    """
+    helper = LayerHelper('margin_rank_loss', **locals())
+    if not isinstance(label, Variable):
+        raise ValueError("The label should be a Variable.")
+    if not isinstance(left, Variable):
+        raise ValueError("The left should be a Variable.")
+    if not isinstance(right, Variable):
+        raise ValueError("The right should be a Variable.")
+    out = helper.create_tmp_variable(left.dtype)
+    act = helper.create_tmp_variable(left.dtype)
+    helper.append_op(
+        type='margin_rank_loss',
+        inputs={"Label": label,
+                "X1": left,
+                "X2": right},
+        outputs={'Out': out,
+                 'Activated': act},
+        attrs={'margin': margin})
+    return out
 def pad2d(input,
          paddings=[0, 0, 0, 0],
          mode='constant',
@@ -6290,6 +6339,7 @@ def sequence_enumerate(input, win_size, pad_value=0, name=None):
        outputs={'Out': out},
        attrs={'win_size': win_size,
               'pad_value': pad_value})
+    return out
 def sequence_mask(x, maxlen=None, dtype='int64', name=None):

--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -659,6 +659,9 @@ class AdamaxOptimizer(Optimizer):
            optimizer = fluid.optimizer.Adamax(learning_rate=0.2)
            optimizer.minimize(cost)
+    Notes:
+       Currently, AdamaxOptimizer doesn't support sparse parameter optimization.
    """
    _moment_acc_str = "moment"
    _inf_norm_acc_str = "inf_norm"
@@ -778,6 +781,9 @@ class DecayedAdagradOptimizer(Optimizer):
            optimizer = fluid.optimizer.DecayedAdagrad(learning_rate=0.2)
            optimizer.minimize(cost)
+    Notes:
+       Currently, DecayedAdagradOptimizer doesn't support sparse parameter optimization.
    """
    _moment_acc_str = "moment"
@@ -858,6 +864,9 @@ class AdadeltaOptimizer(Optimizer):
            optimizer = fluid.optimizer.Adadelta(
                learning_rate=0.0003, epsilon=1.0e-6, rho=0.95)
            _, params_grads = optimizer.minimize(cost)
+    Notes:
+       Currently, AdadeltaOptimizer doesn't support sparse parameter optimization.
    """
    _avg_squared_grad_acc_str = "_avg_squared_grad"
@@ -1126,6 +1135,9 @@ class FtrlOptimizer(Optimizer):
              optimizer = fluid.optimizer.Ftrl(0.0001)
              _, params_grads = optimizer.minimize(cost)
+    Notes:
+       Currently, FtrlOptimizer doesn't support sparse parameter optimization.
    """
    _squared_acc_str = "squared"

--- a/python/paddle/fluid/tests/unittests/test_rmsprop_op.py
+++ b/python/paddle/fluid/tests/unittests/test_rmsprop_op.py
@@ -19,33 +19,76 @@ import unittest
 import numpy as np
 import paddle.fluid.core as core
 from paddle.fluid.op import Operator
+import paddle.fluid as fluid
+def create_selected_rows_and_tensor(scope, place, height, row_num,
+                                    embedding_size):
+    sr = scope.var("@selected_rows@").get_selected_rows()
+    tensor = scope.var("grad").get_tensor()
+    rows = np.random.random_integers(
+        low=0, high=height - 1, size=[row_num, ]).astype('int64')
+    sr_val = np.random.random(size=[row_num, embedding_size]).astype('float32')
+    sr.set_height(height)
+    sr.set_rows(rows)
+    sr.get_tensor().set(sr_val, place)
+    tensor_val = np.zeros(shape=[height, embedding_size], dtype='float32')
+    for i in range(row_num):
+        row = rows[i]
+        tensor_val[row, :] = tensor_val[row, :] + sr_val[i, :]
+    tensor.set(tensor_val, place)
+    return tensor_val, sr_val
 class TestBase(unittest.TestCase):
-    def setup(self, centered, epsilon=1e-6):
+    def setup(self,
+              place,
+              is_sparse,
+              centered,
+              size,
+              row_num=None,
+              epsilon=1e-6):
        np.random.seed(5)  # fix seed
+        self.scope = fluid.global_scope()
+        self.place = place
        self.param_name = "param"
-        self.param = np.random.random((123, 321)).astype("float32")
+        self.param = np.random.random(size).astype("float32")
        self.mean_square_name = "mean_square"
-        self.mean_square = np.random.random((123, 321)).astype("float32")
+        self.mean_square = np.random.uniform(
+            low=1, high=2, size=size).astype("float32")
        self.mean_grad_name = "mean_grad"
-        self.mean_grad = np.random.random((123, 321)).astype("float32")
+        self.mean_grad = np.random.random(size).astype("float32")
        self.lr_name = "lr"
        self.learning_rate = np.array([0.01]).astype("float32")
        self.grad_name = "grad"
-        self.grad = np.random.random((123, 321)).astype("float32")
+        self.is_sparse = is_sparse
+        if self.is_sparse:
+            self.grad_sr_name = "@selected_rows@"
+            self.grad, self.grad_sr = create_selected_rows_and_tensor(
+                self.scope, place, size[0], row_num, size[1])
+        else:
+            self.grad = np.random.random(size).astype("float32")
+            grad_tensor = self.scope.var(self.grad_name).get_tensor()
+            grad_tensor.set(self.grad, place)
        self.moment_name = "moment"
-        self.moment = np.zeros((123, 321)).astype("float32")
+        self.moment = np.random.uniform(
+            low=0, high=1, size=size).astype("float32")
        self.epsilon = epsilon
        self.decay = 0.9
-        self.momentum = 0.0
+        self.momentum = 0.1
        self.centered = centered
        self.ms_out = self.decay * self.mean_square + (1 - self.decay
@@ -61,118 +104,122 @@ class TestBase(unittest.TestCase):
        self.param_out = self.param - self.moment_out
-    def check(self,
-              actual_t,
-              expect_t,
-              place,
-              out_name,
-              atol=1e-5,
-              equal_nan=False):
-        self.assertTrue(
-            np.allclose(
-                actual_t, expect_t, atol=atol, equal_nan=equal_nan),
-            "Output (" + out_name + ") has diff at " + str(place) + "\nExpect "
-            + str(expect_t) + "\n" + "But Got" + str(actual_t))
-class TestRmspropOp(TestBase):
-    def check_with_place(self, place, centered, epsilon):
-        self.setup(centered, epsilon)
-        scope = core.Scope()
        # create and initialize Param Variable
-        param = scope.var(self.param_name).get_tensor()
+        self.param_tensor = self.scope.var(self.param_name).get_tensor()
-        param.set(self.param, place)
+        self.param_tensor.set(self.param, place)
-        mean_square = scope.var(self.mean_square_name).get_tensor()
+        self.mean_square_tensor = self.scope.var(
-        mean_square.set(self.mean_square, place)
+            self.mean_square_name).get_tensor()
+        self.mean_square_tensor.set(self.mean_square, place)
-        lr = scope.var(self.lr_name).get_tensor()
+        lr = self.scope.var(self.lr_name).get_tensor()
        lr.set(self.learning_rate, place)
-        grad = scope.var(self.grad_name).get_tensor()
+        self.moment_tensor = self.scope.var(self.moment_name).get_tensor()
-        grad.set(self.grad, place)
+        self.moment_tensor.set(self.moment, place)
-        moment = scope.var(self.moment_name).get_tensor()
+        if self.centered:
-        moment.set(self.moment, place)
+            self.mean_grad_tensor = self.scope.var(
+                self.mean_grad_name).get_tensor()
+            self.mean_grad_tensor.set(self.mean_grad, place)
-        # create and run sgd operator
+    def check(self, actual_t, expect_t, place, out_name, atol=1e-5):
+        self.assertTrue(
+            np.allclose(
+                actual_t, expect_t, atol=atol),
+            "Output (" + out_name + ") has diff at " + str(place) + "\nExpect "
+            + str(expect_t) + "\n" + "But Got" + str(actual_t))
-        if self.centered:
-            mean_grad = scope.var(self.mean_grad_name).get_tensor()
+class TestRmspropOp(TestBase):
-            mean_grad.set(self.mean_grad, place)
+    def check_with_place(self,
+                         place,
-            rmsprop_op = Operator(
+                         is_sparse,
-                "rmsprop",
+                         centered,
-                Param=self.param_name,
+                         size,
-                Grad=self.grad_name,
+                         row_num=None,
-                MeanSquare=self.mean_square_name,
+                         epsilon=1e-6):
-                MeanGrad=self.mean_grad_name,
+        self.setup(place, is_sparse, centered, size, row_num, epsilon)
-                Moment=self.moment_name,
+        self.run_and_check()
-                LearningRate=self.lr_name,
-                ParamOut=self.param_name,
+    def run_and_check(self):
-                MeanSquareOut=self.mean_square_name,
+        grad_name = self.grad_sr_name if self.is_sparse else self.grad_name
-                MomentOut=self.moment_name,
-                MeanGradOut=self.mean_grad_name,
+        kwargs = {
-                epsilon=self.epsilon,
+            'Param': self.param_name,
-                decay=self.decay,
+            'Grad': grad_name,
-                momentum=self.momentum,
+            'MeanSquare': self.mean_square_name,
-                centered=True)
+            'Moment': self.moment_name,
-        else:
+            'LearningRate': self.lr_name,
-            rmsprop_op = Operator(
+            'ParamOut': self.param_name,
-                "rmsprop",
+            'MeanSquareOut': self.mean_square_name,
-                Param=self.param_name,
+            'MomentOut': self.moment_name,
-                Grad=self.grad_name,
+            'epsilon': self.epsilon,
-                MeanSquare=self.mean_square_name,
+            'decay': self.decay,
-                Moment=self.moment_name,
+            'momentum': self.momentum,
-                LearningRate=self.lr_name,
+            'centered': self.centered
-                ParamOut=self.param_name,
+        }
-                MeanSquareOut=self.mean_square_name,
-                MomentOut=self.moment_name,
-                epsilon=self.epsilon,
-                decay=self.decay,
-                momentum=self.momentum,
-                centered=False)
-        rmsprop_op.run(scope, place)
-        atol = 1e-5
-        equal_nan = False
        if self.centered:
-            atol = 1e-3
+            kwargs['MeanGrad'] = self.mean_grad_name
-            equal_nan = True
+            kwargs['MeanGradOut'] = self.mean_grad_name
+        rmsprop_op = Operator('rmsprop', **kwargs)
+        atol = 1e-6
+        rmsprop_op.run(self.scope, self.place)
        self.check(
-            np.array(mean_square), self.ms_out, place, self.mean_square_name)
+            np.array(self.mean_square_tensor),
+            self.ms_out,
+            self.place,
+            self.mean_square_name,
+            atol=atol)
        self.check(
-            np.array(moment),
+            np.array(self.moment_tensor),
            self.moment_out,
-            place,
+            self.place,
            self.moment_name,
-            atol=atol,
+            atol=atol)
-            equal_nan=equal_nan)
        self.check(
-            np.array(param),
+            np.array(self.param_tensor),
            self.param_out,
-            place,
+            self.place,
            self.param_name,
-            atol=atol,
+            atol=atol)
-            equal_nan=equal_nan)
        if self.centered:
            self.check(
-                np.array(mean_grad), self.mg_out, place, self.mean_grad_name)
+                np.array(self.mean_grad_tensor), self.mg_out, self.place,
+                self.mean_grad_name)
    def test_rmsprop(self):
        places = [core.CPUPlace()]
        if core.is_compiled_with_cuda():
            places.append(core.CUDAPlace(0))
+        size = (128, 320)
        for place in places:
-            self.check_with_place(place, False, 1e-6)
+            for centered in [False, True]:
-            self.check_with_place(place, False, 1e-10)
+                with fluid.scope_guard(core.Scope()):
-            self.check_with_place(place, True, 1e-6)
+                    self.check_with_place(
-            self.check_with_place(place, True, 1e-10)
+                        place, is_sparse=False, centered=centered, size=size)
+                with fluid.scope_guard(core.Scope()):
+                    self.check_with_place(
+                        place,
+                        is_sparse=True,
+                        centered=centered,
+                        row_num=512,
+                        size=size)
+                with fluid.scope_guard(core.Scope()):
+                    self.check_with_place(
+                        place,
+                        is_sparse=True,
+                        centered=centered,
+                        row_num=60,
+                        size=size)
 if __name__ == "__main__":