diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 212724a0c7fc06edf0599ddcaad33aba1fead879..6a37b5ca433a3baa1388dd4f720d782ca53e4e99 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -75,7 +75,8 @@ paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'outp paddle.fluid.layers.conv3d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)) paddle.fluid.layers.sequence_expand ArgSpec(args=['x', 'y', 'ref_level', 'name'], varargs=None, keywords=None, defaults=(-1, None)) paddle.fluid.layers.sequence_expand_as ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.sequence_pad ArgSpec(args=['x', 'pad_value', 'maxlen'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.sequence_pad ArgSpec(args=['x', 'pad_value', 'maxlen', 'name'], varargs=None, keywords=None, defaults=(None, None)) +paddle.fluid.layers.sequence_unpad ArgSpec(args=['x', 'length', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.lstm_unit ArgSpec(args=['x_t', 'hidden_t_prev', 'cell_t_prev', 'forget_bias', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(0.0, None, None, None)) paddle.fluid.layers.reduce_sum ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)) paddle.fluid.layers.reduce_mean ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)) @@ -84,6 +85,7 @@ paddle.fluid.layers.reduce_min ArgSpec(args=['input', 'dim', 'keep_dim', 'name'] paddle.fluid.layers.reduce_prod ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)) paddle.fluid.layers.sequence_first_step ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.sequence_last_step ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.sequence_slice ArgSpec(args=['input', 'offset', 'length', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.dropout ArgSpec(args=['x', 'dropout_prob', 'is_test', 'seed', 'name'], varargs=None, keywords=None, defaults=(False, None, None)) paddle.fluid.layers.split ArgSpec(args=['input', 'num_or_sections', 'dim', 'name'], varargs=None, keywords=None, defaults=(-1, None)) paddle.fluid.layers.ctc_greedy_decoder ArgSpec(args=['input', 'blank', 'name'], varargs=None, keywords=None, defaults=(None,)) diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h index 9fbefabc841e3f6940860f60d959fee97495e4c9..d09b94a3fd32952985a37cf4246c7640d2db4f56 100644 --- a/paddle/fluid/framework/details/op_handle_base.h +++ b/paddle/fluid/framework/details/op_handle_base.h @@ -64,7 +64,8 @@ class OpHandleBase { virtual bool IsMultiDeviceTransfer() { return false; } const platform::DeviceContext *DeviceContext(platform::Place place) { - return dev_ctxes_[place]; + auto it = dev_ctxes_.find(place); + return it != dev_ctxes_.end() ? 
it->second : nullptr; } void SetDeviceContext(platform::Place place, platform::DeviceContext *ctx_) { diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 70ec6e90a4d0106b7f838e51b8357798daa4b10d..b212666637a5289c9c6cd3585655deaeed8afd4b 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -46,6 +46,41 @@ ExecutorPrepareContext::~ExecutorPrepareContext() { VLOG(5) << "destroy ExecutorPrepareContext"; } +template <typename RefCntMap> +static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op, + GarbageCollector<Tensor>* gc, + RefCntMap* ref_cnts) { + std::unordered_set<Tensor*> erase_tensors; + + auto handler = [&](const VariableNameMap& name_map) { + for (auto& name_pair : name_map) { + for (auto& name : name_pair.second) { + auto it = ref_cnts->find(name); + if (it == ref_cnts->end()) continue; + if ((it->second)-- == 1) { + auto* var = scope.FindVar(name); + if (var != nullptr) { + VLOG(10) << "Erase tensor \'" << name << "\'"; + if (var->IsType<LoDTensor>()) { + erase_tensors.insert(var->GetMutable<LoDTensor>()); + } else if (var->IsType<SelectedRows>()) { + erase_tensors.insert( + var->GetMutable<SelectedRows>()->mutable_value()); + } + } + } + } + } + }; + + handler(op->Inputs()); + handler(op->Outputs()); + + if (!erase_tensors.empty()) { + gc->Add(erase_tensors); + } +} + Executor::Executor(const platform::Place& place) : place_(place) {} void Executor::Close() { @@ -66,7 +101,7 @@ void InitializeVariable(Variable* var, proto::VarType::Type var_type) { } else if (var_type == proto::VarType::FETCH_LIST) { var->GetMutable<FeedFetchList>(); } else if (var_type == proto::VarType::STEP_SCOPES) { - var->GetMutable<std::vector<framework::Scope>>(); + var->GetMutable<std::vector<framework::Scope *>>(); } else if (var_type == proto::VarType::LOD_RANK_TABLE) { var->GetMutable<LoDRankTable>(); } else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) { @@ -331,9 +366,13 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, } int64_t max_memory_size = GetEagerDeletionThreshold(); - std::unique_ptr<GarbageCollector<Tensor>> gc; - if (max_memory_size >= 0) { + // WhileOp would set keep_kids to false + // WhileGradOp would need the scopes created in WhileOp + // Perhaps, we should not perform eager deletion in WhileOp + // The scopes and variables created by WhileOp would be deleted + // in WhileGradOp.
+ if (max_memory_size >= 0 && !keep_kids) { ctx->ResetReferenceCount(); #ifdef PADDLE_WITH_CUDA if (platform::is_gpu_place(place_)) { @@ -352,45 +391,8 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, op->Run(*local_scope, place_); if (gc != nullptr) { - std::vector erase_vars; - for (auto& input : op->Inputs()) { - for (auto& input_name : input.second) { - auto it = ctx->cur_ref_cnts_.find(input_name); - if (it == ctx->cur_ref_cnts_.end()) continue; - if (it->second == 1) { // should delete it - erase_vars.emplace_back(input_name); - ctx->cur_ref_cnts_.erase(input_name); - } else { - --(it->second); - } - } - } - - for (auto& output : op->Outputs()) { - for (auto& output_name : output.second) { - auto it = ctx->cur_ref_cnts_.find(output_name); - if (it == ctx->cur_ref_cnts_.end()) continue; - if (it->second == 1) { - erase_vars.emplace_back(output_name); - ctx->cur_ref_cnts_.erase(output_name); - } else { - --(it->second); - } - } - } - - if (!erase_vars.empty()) { - std::vector erase_tensors; - for (auto& name : erase_vars) { - auto* var = local_scope->FindVar(name); - if (var == nullptr) continue; - if (var->IsType()) { - auto* tensor = var->GetMutable(); - erase_tensors.push_back(tensor); - } - } - if (!erase_tensors.empty()) gc->Add(erase_tensors); - } + DeleteUnusedTensors(*local_scope, op.get(), gc.get(), + &(ctx->cur_ref_cnts_)); } if (FLAGS_benchmark) { diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h index f0cc1338a8af50030a70a9797cbcd1b0567272b5..36b36d49c2728dbef93042158dffa26d8f56d529 100644 --- a/paddle/fluid/framework/executor.h +++ b/paddle/fluid/framework/executor.h @@ -32,38 +32,32 @@ template std::unordered_map GetNonPersistableReferenceCount( const ProgramDesc& prog, size_t block_id) { auto& block = prog.Block(block_id); - std::unordered_set ignored_vars; std::unordered_map ref_cnts; - for (auto var_desc : block.AllVars()) { - auto type = var_desc->Proto()->type().type(); - if (type != proto::VarType::LOD_TENSOR || var_desc->Persistable()) { - ignored_vars.insert(var_desc->Name()); // ignore persistable vars - } - } - - for (auto op_desc : block.AllOps()) { - for (auto& input : op_desc->Inputs()) { - for (auto& input_name : input.second) { - if (!ignored_vars.count(input_name)) { - if (ref_cnts.count(input_name)) - ++ref_cnts[input_name]; - else - ref_cnts[input_name] = 1; + auto update_ref_cnts = [&](OpDesc* op_desc, const VariableNameMap& name_map) { + for (auto& name_pair : name_map) { + for (auto& name : name_pair.second) { + auto* var_desc = block.FindVar(name); + if (var_desc == nullptr || var_desc->Persistable()) continue; + auto type = var_desc->Proto()->type().type(); + if (type != proto::VarType::LOD_TENSOR && + type != proto::VarType::SELECTED_ROWS) { + continue; } - } - } - for (auto& output : op_desc->Outputs()) { - for (auto output_name : output.second) { - if (!ignored_vars.count(output_name)) { - if (ref_cnts.count(output_name)) - ++ref_cnts[output_name]; - else - ref_cnts[output_name] = 1; + auto it = ref_cnts.find(name); + if (it != ref_cnts.end()) { + ++it->second; + } else { + ref_cnts[name] = 1; } } } + }; + + for (auto op_desc : block.AllOps()) { + update_ref_cnts(op_desc, op_desc->Inputs()); + update_ref_cnts(op_desc, op_desc->Outputs()); } return ref_cnts; } diff --git a/paddle/fluid/framework/feed_fetch_method.cc b/paddle/fluid/framework/feed_fetch_method.cc index 8e1f93c5ebd448903d70f9668539e077875836e4..3e9353f5cf67d8de62c5551f12ea786e49190549 100644 --- 
a/paddle/fluid/framework/feed_fetch_method.cc +++ b/paddle/fluid/framework/feed_fetch_method.cc @@ -27,8 +27,7 @@ void SetFeedVariable(Scope* scope, const LoDTensor& input, // be created. VLOG(3) << "SetFeedVariable name=" << var_name << " index=" << index; Variable* g_feed_value = scope->Var(var_name); - auto& feed_inputs = - *(g_feed_value->GetMutable<std::vector<paddle::framework::LoDTensor>>()); + auto& feed_inputs = *(g_feed_value->GetMutable<FeedFetchList>()); if (index >= feed_inputs.size()) { feed_inputs.resize(index + 1); } diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc index ba10687d65cfbbac89cfc76879c8b202ebd03229..2840d503f1454271afb309efdd435225ab077dc0 100644 --- a/paddle/fluid/framework/naive_executor.cc +++ b/paddle/fluid/framework/naive_executor.cc @@ -37,7 +37,7 @@ static void InitializeVariable(Variable *var, proto::VarType::Type var_type) { } else if (var_type == proto::VarType::FETCH_LIST) { var->GetMutable<FeedFetchList>(); } else if (var_type == proto::VarType::STEP_SCOPES) { - var->GetMutable<std::vector<framework::Scope>>(); + var->GetMutable<std::vector<framework::Scope *>>(); } else if (var_type == proto::VarType::LOD_RANK_TABLE) { var->GetMutable<LoDRankTable>(); } else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) { diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 9f930065324f13f5aa79c214e820fb6fc2f3a166..14fcde2fe3b1c3acfc0994e9cd37a784c57826d7 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -149,9 +149,17 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { platform::SetDeviceId(dev_id); #endif } - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - platform::RecordEvent record_event(Type(), pool.Get(place)); - RunImpl(scope, place); + + // The profiler has a process-wide mutex, which causes a serious performance issue + // in concurrent scenarios. Here we use an `if` to fix this issue. + // Please do not remove the `if`; ask @Superjomn if there is any concern. + if (platform::IsProfileEnabled()) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + platform::RecordEvent record_event(Type(), pool.Get(place)); + RunImpl(scope, place); + } else { + RunImpl(scope, place); + } VLOG(3) << place << " " << DebugStringEx(&scope); } diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index f06bad6c78c05804e583f859906b88fb7b500372..e8adabd26540754d5b9206294eeeed79757220bf 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -307,6 +307,10 @@ ParallelExecutor::~ParallelExecutor() { } } } + + // member_ must be destructed before gcs_ since the destructor of + // ReferenceCountOpHandle uses raw pointers of gcs_ inside.
+ member_.reset(); } } // namespace framework diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index fd386a5987f11ff64964e95eb7e9b83572dc790c..ef09b98b2aa91a9d729b94d15dbb676dde4092b6 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -75,7 +75,7 @@ class ParallelExecutor { private: void BCastParamsToDevices(const std::unordered_set &vars) const; - ParallelExecutorPrivate *member_; + std::unique_ptr member_; #ifdef PADDLE_WITH_CUDA // ref_cnts_ is only initialized when ParallelExecutor constructs, and then diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index 1a727a2c8c759d010606d5b605823b7252b35c69..a4abd1b1283f08fb8431fbeea0cea17c8439fdd7 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -49,18 +49,18 @@ int64_t GetEagerDeletionThreshold() { Scope::~Scope() { DropKids(); } Scope& Scope::NewScope() const { - std::unique_lock lock(mutex_); + std::lock_guard lock(mutex_); kids_.push_back(new Scope(this)); return *kids_.back(); } Variable* Scope::Var(const std::string& name) { - std::unique_lock lock(mutex_); + std::lock_guard lock(mutex_); return VarInternal(name); } Variable* Scope::Var(std::string* name) { - std::unique_lock lock(mutex_); + std::lock_guard lock(mutex_); auto new_name = string::Sprintf("%p.%d", this, vars_.size()); if (name != nullptr) { *name = new_name; @@ -69,29 +69,34 @@ Variable* Scope::Var(std::string* name) { } Variable* Scope::FindVar(const std::string& name) const { - std::unique_lock lock(mutex_); + std::lock_guard lock(mutex_); return FindVarInternal(name); } +Variable* Scope::FindLocalVar(const std::string& name) const { + std::lock_guard lock(mutex_); + return FindVarLocally(name); +} + const Scope* Scope::FindScope(const Variable* var) const { - std::unique_lock lock(mutex_); + std::lock_guard lock(mutex_); return FindScopeInternal(var); } void Scope::DropKids() { - std::unique_lock lock(mutex_); + std::lock_guard lock(mutex_); for (Scope* s : kids_) delete s; kids_.clear(); } bool Scope::HasKid(const Scope* scope) const { - std::unique_lock lock(mutex_); + std::lock_guard lock(mutex_); auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); return it != this->kids_.end(); } std::vector Scope::LocalVarNames() const { - std::unique_lock lock(mutex_); + std::lock_guard lock(mutex_); std::vector known_vars; known_vars.reserve(this->vars_.size()); for (auto& p : vars_) { @@ -101,7 +106,7 @@ std::vector Scope::LocalVarNames() const { } void Scope::DeleteScope(Scope* scope) const { - std::unique_lock lock(mutex_); + std::lock_guard lock(mutex_); auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope); this->kids_.erase(it); @@ -114,7 +119,7 @@ void Scope::DeleteScope(Scope* scope) const { } void Scope::EraseVars(const std::vector& var_names) { - std::unique_lock lock(mutex_); + std::lock_guard lock(mutex_); std::set var_set(var_names.begin(), var_names.end()); for (auto it = vars_.begin(); it != vars_.end();) { if (var_set.find(it->first) != var_set.end()) { @@ -127,12 +132,12 @@ void Scope::EraseVars(const std::vector& var_names) { void Scope::Rename(const std::string& origin_name, const std::string& new_name) const { - std::unique_lock lock(mutex_); + std::lock_guard lock(mutex_); RenameInternal(origin_name, new_name); } std::string Scope::Rename(const std::string& origin_name) const { - 
std::unique_lock lock(mutex_); + std::lock_guard lock(mutex_); auto new_name = string::Sprintf("%p.%d", this, vars_.size()); RenameInternal(origin_name, new_name); return new_name; diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h index e42fff1d79d92fb7ed61768a614d8cd98f6775a0..14f9f36812d690fc4a7440f2e7e6a85e9993a535 100644 --- a/paddle/fluid/framework/scope.h +++ b/paddle/fluid/framework/scope.h @@ -63,6 +63,11 @@ class Scope { /// Caller doesn't own the returned Variable. Variable* FindVar(const std::string& name) const; + /// Find a variable in the current scope. + /// Return nullptr if cannot find. + /// Caller doesn't own the returned Variable. + Variable* FindLocalVar(const std::string& name) const; + const Scope* parent() const { return parent_; } /// Find the scope or an ancestor scope that contains the given variable. diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 1d7a2eb5b38255531880fe3d2e5321024caf0c6b..69bcbc0e5891f95af4de8dfd49a25648ca920ab1 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -36,6 +36,11 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, auto size = src.numel() * SizeOfType(src.type()); if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) { + if (src_ptr == dst_ptr) { + VLOG(3) << "Skip copy the same data async from " << src_place << " to " + << dst_place; + return; + } memory::Copy(boost::get(dst_place), dst_ptr, boost::get(src_place), src_ptr, size); } @@ -71,6 +76,11 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, auto stream = reinterpret_cast(ctx).stream(); if (platform::is_same_place(src_place, dst_place)) { + if (src_ptr == dst_ptr) { + VLOG(3) << "Skip copy the same data async from " << src_place << " to " + << dst_place; + return; + } memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream); } else { @@ -114,6 +124,11 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, auto dst_ptr = dst->mutable_data(dst_place, src.type()); auto size = src.numel() * SizeOfType(src.type()); if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) { + if (src_ptr == dst_ptr) { + VLOG(3) << "Skip copy the same data from " << src_place << " to " + << dst_place; + return; + } memory::Copy(boost::get(dst_place), dst_ptr, boost::get(src_place), src_ptr, size); } @@ -130,6 +145,11 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, nullptr); } else if (platform::is_gpu_place(src_place) && platform::is_gpu_place(dst_place)) { + if (src_ptr == dst_ptr && platform::is_same_place(src_place, dst_place)) { + VLOG(3) << "Skip copy the same data from " << src_place << " to " + << dst_place; + return; + } auto src_gpu_place = boost::get(src_place); auto dst_gpu_place = boost::get(dst_place); memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr); diff --git a/paddle/fluid/framework/tensor_util_test.cc b/paddle/fluid/framework/tensor_util_test.cc index a1e5b967a86d10f3439db662af54bb82888027b9..793ccfc79fe56707f226477b9d50b1d972ab6a59 100644 --- a/paddle/fluid/framework/tensor_util_test.cc +++ b/paddle/fluid/framework/tensor_util_test.cc @@ -41,6 +41,11 @@ TEST(TensorCopy, Tensor) { EXPECT_EQ(src_ptr[i], dst_ptr[i]); } + TensorCopy(dst_tensor, *cpu_place, &dst_tensor); + for (size_t i = 0; i < 9; ++i) { + EXPECT_EQ(src_ptr[i], 
dst_ptr[i]); + } + EXPECT_TRUE(dst_tensor.layout() == src_tensor.layout()); Tensor slice_tensor = src_tensor.Slice(1, 2); @@ -82,6 +87,15 @@ TEST(TensorCopy, Tensor) { EXPECT_EQ(src_ptr[i], dst_ptr[i]); } + // Copy the same tensor + TensorCopy(gpu_tensor, *gpu_place, gpu_ctx, &gpu_tensor); + gpu_ctx.Wait(); + const int* dst_ptr_tmp = dst_tensor.data(); + EXPECT_NE(src_ptr, dst_ptr_tmp); + for (size_t i = 0; i < 9; ++i) { + EXPECT_EQ(src_ptr[i], dst_ptr_tmp[i]); + } + Tensor slice_tensor = src_tensor.Slice(1, 2); // CPU Slice Tensor to GPU Tensor diff --git a/paddle/fluid/framework/var_desc.h b/paddle/fluid/framework/var_desc.h index e33849ef502fb10b913e7e28cbd0abdb8b8ff9bb..9d3fb811191c207c75845ef8f8486e8beac7525a 100644 --- a/paddle/fluid/framework/var_desc.h +++ b/paddle/fluid/framework/var_desc.h @@ -59,6 +59,7 @@ class VarDesc { public: explicit VarDesc(const std::string &name) { desc_.set_name(name); + // TODO(paddle-dev): Why default to lodtensor. desc_.mutable_type()->set_type(proto::VarType::LOD_TENSOR); } diff --git a/paddle/fluid/framework/variable.h b/paddle/fluid/framework/variable.h index 067e0c2b8389f88639fd9b95bd680702517efee1..873e1b20a584df3ba90cf5c1a62a3879bf98ce5c 100644 --- a/paddle/fluid/framework/variable.h +++ b/paddle/fluid/framework/variable.h @@ -38,8 +38,12 @@ class Variable { template T* GetMutable() { - if (!IsType()) { + if (!holder_) { holder_.reset(new PlaceholderImpl(new T())); + } else { + PADDLE_ENFORCE(IsType(), + "Variable must be type %s, the holding type is %s", + typeid(T).name(), holder_->Type().name()); } return static_cast(holder_->Ptr()); } diff --git a/paddle/fluid/framework/variable_test.cc b/paddle/fluid/framework/variable_test.cc index c5c1d215f4a6affae0a3bdafacec40a2aee2ca19..003dcfd3dfe5ecfd563a686bb72b061aff602f73 100644 --- a/paddle/fluid/framework/variable_test.cc +++ b/paddle/fluid/framework/variable_test.cc @@ -33,9 +33,10 @@ TEST(Variable, GetMutable) { const Tensor& tt = v->Get(); EXPECT_EQ(1234, tt.content_); - std::string* s = v->GetMutable(); - *s = "hello"; - - const std::string& ss = v->Get(); - EXPECT_EQ("hello", ss); + try { + v->GetMutable(); + } catch (std::exception& e) { + return; + } + EXPECT_TRUE(false); } diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 3bc6af5241c41bd805699121d614d431d46d863f..3095dee0f0106b2408663cd32bb4fb310111eda4 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -25,9 +25,11 @@ #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_pass.h" #include "paddle/fluid/inference/utils/singleton.h" +#include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/platform/profiler.h" DECLARE_bool(profile); +DECLARE_int32(paddle_num_threads); namespace paddle { @@ -47,6 +49,9 @@ bool AnalysisPredictor::Init( } #endif + // no matter with or without MKLDNN + paddle::platform::SetNumThreads(FLAGS_paddle_num_threads); + if (config_.use_gpu) { place_ = paddle::platform::CUDAPlace(config_.device); LOG(WARNING) << "ir optimize only supports CPU currently, enable_ir_optim " @@ -335,6 +340,19 @@ bool AnalysisPredictor::LoadProgramDesc() { } return true; } + +AnalysisPredictor::~AnalysisPredictor() { +#if !defined(_WIN32) + if (FLAGS_profile) { + platform::DisableProfiler(platform::EventSortingKey::kTotal, + "./profile.log"); + } +#endif + if (sub_scope_) { + scope_->DeleteScope(sub_scope_); + } +} + std::unique_ptr 
AnalysisPredictor::Clone() { auto *x = new AnalysisPredictor(config_); x->Init(scope_, inference_program_); diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index 0d01d7ac2b29ea6364b07af9bb3bdeb5ced6bd00..5a9f4d36959d4ee7ca16dec769d9d1283b8787cb 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -72,6 +72,7 @@ class AnalysisPredictor : public PaddlePredictor { template void GetFetchOne(const framework::LoDTensor &fetchs, PaddleTensor *output_data); + ~AnalysisPredictor(); private: contrib::AnalysisConfig config_; diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index 6682e0a81b20c82aa668a249d37986386d769c83..7cda9c5d8a8366bd097491f37f5352a10e4fb16c 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -23,9 +23,11 @@ limitations under the License. */ #include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/inference/api/api_impl.h" #include "paddle/fluid/inference/api/helper.h" +#include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/platform/profiler.h" DEFINE_bool(profile, false, "Turn on profiler for fluid"); +DECLARE_int32(paddle_num_threads); namespace paddle { namespace { @@ -72,6 +74,9 @@ bool NativePaddlePredictor::Init( } #endif + // no matter with or without MKLDNN + paddle::platform::SetNumThreads(FLAGS_paddle_num_threads); + if (config_.use_gpu) { place_ = paddle::platform::CUDAPlace(config_.device); } else { diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 031109398d8e21ad95f19f65fdd814e1782889e6..df3e3fcd9c75f03f4d9b0a7c12788f06bfdefd7f 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -300,7 +300,7 @@ op_library(flatten_op DEPS reshape_op) op_library(sequence_pad_op DEPS sequence_padding) op_library(unstack_op DEPS stack_op) op_library(fake_quantize_op DEPS memory) -op_library(fusion_lstm_op DEPS cpu_lstm_compute) +op_library(fusion_lstm_op DEPS jit_kernel) if (WITH_GPU) op_library(conv_op DEPS vol2col depthwise_conv im2col) op_library(layer_norm_op DEPS cub) diff --git a/paddle/fluid/operators/adam_op.h b/paddle/fluid/operators/adam_op.h index 3e724f52e4fcb2818f961181a2830dfd653d2733..3455d1ee54e8e6e498d0b0e6932ec099af9c0b30 100644 --- a/paddle/fluid/operators/adam_op.h +++ b/paddle/fluid/operators/adam_op.h @@ -18,6 +18,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/detail/safe_ref.h" +#include "paddle/fluid/operators/math/algorithm.h" #include "paddle/fluid/operators/math/selected_rows_functor.h" #include "paddle/fluid/platform/for_range.h" @@ -199,23 +200,9 @@ struct SparseAdamFunctor { row_numel_(row_numel), row_count_(row_count) {} - inline HOSTDEVICE int64_t BinarySearchInRows(int64_t row) const { - int64_t beg = 0, end = row_count_ - 1; - while (beg <= end) { - auto mid = ((beg + end) >> 1); - if (rows_[mid] == row) - return mid; - else if (rows_[mid] < row) - beg = mid + 1; - else - end = mid - 1; - } - return -1; - } - inline HOSTDEVICE void operator()(size_t i) const { - int64_t row = i / row_numel_; - auto row_idx = BinarySearchInRows(row); + auto row_idx = + math::BinarySearch(rows_, row_count_, i / row_numel_); T g = row_idx >= 0 ? 
grad_[row_idx * row_numel_ + i % row_numel_] : 0; // The following code is the same as dense diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc index 2826b82117db113d4d8c10095e89f610ca895775..e04a68717b351ddb0be5a7e70aa9297e5eb0125f 100644 --- a/paddle/fluid/operators/fill_constant_op.cc +++ b/paddle/fluid/operators/fill_constant_op.cc @@ -70,6 +70,12 @@ class FillConstantOp : public framework::OperatorBase { } }; +class FillConstantOpVarTypeInference : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override {} +}; + class FillConstantOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -102,4 +108,5 @@ Fill up a variable with specified constant value. namespace ops = paddle::operators; REGISTER_OPERATOR(fill_constant, ops::FillConstantOp, ops::FillConstantInferShape, ops::FillConstantOpMaker, - paddle::framework::EmptyGradOpMaker); + paddle::framework::EmptyGradOpMaker, + ops::FillConstantOpVarTypeInference); diff --git a/paddle/fluid/operators/fusion_lstm_op.cc b/paddle/fluid/operators/fusion_lstm_op.cc index ae1f6d8e489039667d861a69acabf2c632ef2061..067e6a3e7cccc1f15ebdd984f3a2441339a989ab 100644 --- a/paddle/fluid/operators/fusion_lstm_op.cc +++ b/paddle/fluid/operators/fusion_lstm_op.cc @@ -15,11 +15,9 @@ limitations under the License. */ #include "paddle/fluid/operators/fusion_lstm_op.h" #include #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/cpu_lstm_compute.h" -#include "paddle/fluid/operators/math/cpu_vec.h" #include "paddle/fluid/operators/math/fc_compute.h" +#include "paddle/fluid/operators/math/jit_kernel.h" #include "paddle/fluid/operators/math/sequence2batch.h" -#include "paddle/fluid/platform/cpu_info.h" namespace paddle { namespace operators { @@ -219,121 +217,55 @@ This operator fuse the X into LSTM, more details can refer to LSTM op. 
template class FuisonLSTMKernel : public framework::OpKernel { public: -#define INIT_VEC_FUNC \ - std::function act_gate, act_cell, act_cand; \ - auto& act_gate_str = ctx.Attr("gate_activation"); \ - auto& act_cell_str = ctx.Attr("cell_activation"); \ - auto& act_cand_str = ctx.Attr("candidate_activation"); \ - if (platform::jit::MayIUse(platform::jit::avx)) { \ - math::VecActivations act_functor; \ - act_gate = act_functor(act_gate_str); \ - act_cell = act_functor(act_cell_str); \ - act_cand = act_functor(act_cand_str); \ - } else { \ - math::VecActivations act_functor; \ - act_gate = act_functor(act_gate_str); \ - act_cell = act_functor(act_cell_str); \ - act_cand = act_functor(act_cand_str); \ - } - -#define INIT_BASE_INPUT_OUTPUT \ - auto* x = ctx.Input("X"); \ - auto* h0 = ctx.Input("H0"); \ - auto* c0 = ctx.Input("C0"); \ - auto* wx = ctx.Input("WeightX"); \ - auto* wh = ctx.Input("WeightH"); \ - auto* bias = ctx.Input("Bias"); \ - auto* xx = ctx.Output("XX"); \ - auto* hidden_out = ctx.Output("Hidden"); \ - auto* cell_out = ctx.Output("Cell"); \ - bool is_reverse = ctx.Attr("is_reverse"); \ - bool use_peepholes = ctx.Attr("use_peepholes"); - -#define INIT_BASE_SIZES \ - auto x_dims = x->dims(); /* T x M*/ \ - auto wh_dims = wh->dims(); /* D x 4D*/ \ - const int M = x_dims[1]; \ - const int D = wh_dims[0]; \ - const int D2 = D * 2; \ - const int D3 = D * 3; \ - const int D4 = wh_dims[1]; - -#define INIT_BASE_INPUT_DATAS \ - const T* x_data = x->data(); \ - const T* wx_data = wx->data(); \ - const T* wh_data = wh->data(); \ - /* diagonal weight*/ \ - const T* wc_data = bias->data() + D4; \ - /* for peephole only*/ \ - T* checked_cell_data = nullptr; \ - auto place = ctx.GetPlace(); \ - if (use_peepholes) { \ - /* w_ic * Ct-1, w_fc * Ct-1 ; w_oc * Ct => ih*/ \ - auto* checked_cell = ctx.Output("CheckedCell"); \ - checked_cell_data = checked_cell->mutable_data(place); \ - } - -/// Compute LSTM +#define INIT_BASE_DEFINES \ + using DeviceContext = paddle::platform::CPUDeviceContext; \ + auto* x = ctx.Input("X"); \ + auto* h0 = ctx.Input("H0"); \ + auto* c0 = ctx.Input("C0"); \ + auto* wx = ctx.Input("WeightX"); \ + auto* wh = ctx.Input("WeightH"); \ + auto* bias = ctx.Input("Bias"); \ + auto* xx = ctx.Output("XX"); \ + auto* hidden_out = ctx.Output("Hidden"); \ + auto* cell_out = ctx.Output("Cell"); \ + bool is_reverse = ctx.Attr("is_reverse"); \ + bool use_peepholes = ctx.Attr("use_peepholes"); \ + auto x_dims = x->dims(); /* T x M*/ \ + auto wh_dims = wh->dims(); /* D x 4D*/ \ + const int M = x_dims[1]; \ + const int D = wh_dims[0]; \ + const int D4 = wh_dims[1] + +#define INIT_OTHER_DEFINES \ + const T* x_data = x->data(); \ + const T* wx_data = wx->data(); \ + const T* wh_data = wh->data(); \ + /* diagonal weight*/ \ + const T* wp_data = bias->data() + D4; \ + /* for peephole only*/ \ + T* checked_cell_data = nullptr; \ + auto place = ctx.GetPlace(); \ + if (use_peepholes) { \ + /* w_ic * Ct-1, w_fc * Ct-1 ; w_oc * Ct => ih*/ \ + auto* checked_cell = ctx.Output("CheckedCell"); \ + checked_cell_data = checked_cell->mutable_data(place); \ + } \ + const auto& ker = \ + math::jitkernel::KernelPool::Instance() \ + .template Get, const std::string&, \ + const std::string&, const std::string&>( \ + ctx.Attr("gate_activation"), \ + ctx.Attr("candidate_activation"), \ + ctx.Attr("cell_activation"), D, use_peepholes) + +// Wh GEMM #define GEMM_WH_ADDON(bs, prev, out) \ blas.GEMM(CblasNoTrans, CblasNoTrans, bs, D4, D, static_cast(1), prev, D, \ wh_data, D4, static_cast(1), out, D4) -#define 
GET_Ct(ct_1, gates, ct) \ - /* C_t = C_t-1 * fgated + cand_gated * igated*/ \ - act_cand(D, gates, gates); \ - blas.VMUL(D, gates, gates + D, gates + D); \ - blas.VMUL(D, ct_1, gates + D2, gates + D2); \ - blas.VADD(D, gates + D, gates + D2, ct) - -#define GET_Ht(ct, gates, ht) \ - /* H_t = act_cell(C_t) * ogated */ \ - act_cell(D, ct, gates + D2); \ - blas.VMUL(D, gates + D2, gates + D3, ht) - -#define GET_Ct_NOH0C0(gates, ct) \ - /* C_t = igated * cgated*/ \ - act_gate(D, gates + D, gates + D); \ - act_cand(D, gates, gates); \ - blas.VMUL(D, gates, gates + D, ct) - -#define COMPUTE_CtHt_NOH0C0(gates, ct, ht) \ - GET_Ct_NOH0C0(gates, ct); \ - act_gate(D, gates + D3, gates + D3); \ - GET_Ht(ct, gates, ht) - -#define COMPUTE_CtHt_PEEPHOLE_NOH0C0(gates, ct, ht) \ - GET_Ct_NOH0C0(gates, ct); \ - /* get outgated, put W_oc * C_t on igated */ \ - blas.VMUL(D, wc_data + D2, ct, gates + D); \ - blas.VADD(D, gates + D, gates + D3, gates + D3); \ - act_gate(D, gates + D3, gates + D3); \ - GET_Ht(ct, gates, ht) - -#define COMPUTE_CtHt(gates, ct_1, ct, ht) \ - act_gate(D3, gates + D, gates + D); \ - GET_Ct(ct_1, gates, ct); \ - GET_Ht(ct, gates, ht) - -#define COMPUTE_CtHt_PEEPHOLE(gates, ct_1, ct, ht) \ - /* get fgated and igated*/ \ - blas.VMUL(D, wc_data, ct_1, checked_cell_data); \ - blas.VMUL(D, wc_data + D, ct_1, checked_cell_data + D); \ - blas.VADD(D2, checked_cell_data, gates + D, gates + D); \ - act_gate(D2, gates + D, gates + D); \ - GET_Ct(ct_1, gates, ct); \ - /* get ogated*/ \ - blas.VMUL(D, wc_data + D2, ct, gates + D); \ - blas.VADD(D, gates + D, gates + D3, gates + D3); \ - act_gate(D, gates + D3, gates + D3); \ - GET_Ht(ct, gates, ht) - void SeqCompute(const framework::ExecutionContext& ctx) const { - using DeviceContext = paddle::platform::CPUDeviceContext; - INIT_BASE_INPUT_OUTPUT - INIT_BASE_SIZES - INIT_VEC_FUNC - INIT_BASE_INPUT_DATAS - + INIT_BASE_DEFINES; + INIT_OTHER_DEFINES; auto x_lod = x->lod(); const int total_T = x_dims[0]; const int N = x_lod[0].size() - 1; @@ -357,89 +289,47 @@ class FuisonLSTMKernel : public framework::OpKernel { gate_offset = -D; } -#define MOVE_ONE_STEP \ - prev_h_data = h_out_data; \ - prev_c_data = c_out_data; \ - xx_data = xx_data + xx_offset; \ - h_out_data = h_out_data + gate_offset; \ - c_out_data = c_out_data + gate_offset - -#define PROCESS_H0C0_DEFINES \ - int bid = is_reverse ? 
N - 1 - i : i; \ - int seq_len = x_lod[0][bid + 1] - x_lod[0][bid]; \ - const T* prev_c_data = nullptr; \ - const T* prev_h_data = nullptr; \ - int tstart = 0 - -#define PROCESS_H0C0_PEEPHOLE \ - PROCESS_H0C0_DEFINES; \ - if (h0_data) { \ - prev_h_data = h0_data + bid * D; \ - prev_c_data = c0_data + bid * D; \ - } else { \ - COMPUTE_CtHt_PEEPHOLE_NOH0C0(xx_data, c_out_data, h_out_data); \ - MOVE_ONE_STEP; \ - tstart = 1; \ - } - -#define PROCESS_H0C0 \ - PROCESS_H0C0_DEFINES; \ - if (h0_data) { \ - prev_h_data = h0_data + bid * D; \ - prev_c_data = c0_data + bid * D; \ - } else { \ - COMPUTE_CtHt_NOH0C0(xx_data, c_out_data, h_out_data); \ - MOVE_ONE_STEP; \ - tstart = 1; \ - } - - if (use_peepholes) { - for (int i = 0; i < N; ++i) { - PROCESS_H0C0_PEEPHOLE - for (int step = tstart; step < seq_len; ++step) { - GEMM_WH_ADDON(1, prev_h_data, xx_data); - COMPUTE_CtHt_PEEPHOLE(xx_data, prev_c_data, c_out_data, h_out_data); - MOVE_ONE_STEP; - } - } - } else { - // TODO(TJ): unly workaround, clean me - std::function compute_ctht; - if (platform::jit::MayIUse(platform::jit::avx) && - act_gate_str == "sigmoid" && act_cand_str == "tanh" && - act_cell_str == "tanh" && D == 8) { - compute_ctht = math::lstm_compute_ctht; + for (int i = 0; i < N; ++i) { + int bid = is_reverse ? N - 1 - i : i; + int seq_len = x_lod[0][bid + 1] - x_lod[0][bid]; + const T* prev_c_data = nullptr; + const T* prev_h_data = nullptr; + int tstart = 0; + if (h0_data) { + prev_h_data = h0_data + bid * D; + prev_c_data = c0_data + bid * D; } else { - compute_ctht = [&](T* gates, const T* ct_1, T* ct, T* ht) { - COMPUTE_CtHt(gates, ct_1, ct, ht); - }; + ker->ComputeC1H1(xx_data, c_out_data, h_out_data, wp_data); + tstart = 1; + // move one step + prev_h_data = h_out_data; + prev_c_data = c_out_data; + xx_data = xx_data + xx_offset; + h_out_data = h_out_data + gate_offset; + c_out_data = c_out_data + gate_offset; } - for (int i = 0; i < N; ++i) { - PROCESS_H0C0 - for (int step = tstart; step < seq_len; ++step) { - GEMM_WH_ADDON(1, prev_h_data, xx_data); - compute_ctht(xx_data, prev_c_data, c_out_data, h_out_data); - MOVE_ONE_STEP; - } + for (int step = tstart; step < seq_len; ++step) { + GEMM_WH_ADDON(1, prev_h_data, xx_data); + ker->ComputeCtHt(xx_data, prev_c_data, c_out_data, h_out_data, wp_data, + checked_cell_data); + // move one step + prev_h_data = h_out_data; + prev_c_data = c_out_data; + xx_data = xx_data + xx_offset; + h_out_data = h_out_data + gate_offset; + c_out_data = c_out_data + gate_offset; } } -#undef PROCESS_H0C0_DEFINES -#undef PROCESS_H0C0_PEEPHOLE -#undef PROCESS_H0C0 -#undef MOVE_ONE_STEP } void BatchCompute(const framework::ExecutionContext& ctx) const { - using DeviceContext = platform::CPUDeviceContext; - INIT_BASE_INPUT_OUTPUT - INIT_BASE_SIZES + INIT_BASE_DEFINES; if (x->lod()[0].size() == 2) { xx->Resize({x_dims[0], D4}); SeqCompute(ctx); return; } - INIT_VEC_FUNC - INIT_BASE_INPUT_DATAS + INIT_OTHER_DEFINES; auto* reordered_h0 = ctx.Output("ReorderedH0"); auto* reordered_c0 = ctx.Output("ReorderedC0"); @@ -487,8 +377,8 @@ class FuisonLSTMKernel : public framework::OpKernel { prev_c_data = reordered_c0_data; size_t sz = sizeof(T) * D; for (int i = 0; i < max_bs; ++i) { - std::memcpy(reordered_h0_data, h0_data + seq_order[i] * D, sz); - std::memcpy(reordered_c0_data, c0_data + seq_order[i] * D, sz); + blas.VCOPY(sz, h0_data + seq_order[i] * D, reordered_h0_data); + blas.VCOPY(sz, c0_data + seq_order[i] * D, reordered_c0_data); reordered_h0_data += D; reordered_c0_data += D; } @@ -498,13 +388,7 @@ class 
FuisonLSTMKernel : public framework::OpKernel { T* cur_h_out_data = batched_h_out_data; T* cur_c_out_data = batched_c_out_data; for (int i = 0; i < max_bs; ++i) { - GET_Ct_NOH0C0(cur_in_data, cur_c_out_data); - if (use_peepholes) { - blas.VMUL(D, wc_data + D2, cur_c_out_data, cur_in_data + D); - blas.VADD(D, cur_in_data + D, cur_in_data + D3, cur_in_data + D3); - } - act_gate(D, cur_in_data + D3, cur_in_data + D3); - GET_Ht(cur_c_out_data, cur_in_data, cur_h_out_data); + ker->ComputeC1H1(cur_in_data, cur_c_out_data, cur_h_out_data, wp_data); cur_in_data += D4; cur_c_out_data += D; cur_h_out_data += D; @@ -513,71 +397,37 @@ class FuisonLSTMKernel : public framework::OpKernel { prev_h_data = batched_h_out_data; prev_c_data = batched_c_out_data; } + + // compute kernel part const auto& batch_starts = batched_lod[0]; const int max_seq_len = batch_starts.size() - 1; const int offset = tstart * max_bs * D; batched_input_data = batched_input_data + offset * 4; batched_h_out_data = batched_h_out_data + offset; batched_c_out_data = batched_c_out_data + offset; - -#define DEFINE_CUR \ - T* cur_in_data = batched_input_data; \ - T* cur_prev_c_data = prev_c_data; \ - T* cur_c_out_data = batched_c_out_data; \ - T* cur_h_out_data = batched_h_out_data - -#define MOVE_ONE_BATCH \ - cur_in_data += D4; \ - cur_prev_c_data += D; \ - cur_c_out_data += D; \ - cur_h_out_data += D - -#define MOVE_ONE_STEP \ - prev_c_data = batched_c_out_data; \ - prev_h_data = batched_h_out_data; \ - batched_c_out_data = cur_c_out_data; \ - batched_h_out_data = cur_h_out_data; \ - batched_input_data = cur_in_data - - if (use_peepholes) { - for (int step = tstart; step < max_seq_len; ++step) { - const int cur_bs = batch_starts[step + 1] - batch_starts[step]; - GEMM_WH_ADDON(cur_bs, prev_h_data, batched_input_data); - DEFINE_CUR; - for (int i = 0; i < cur_bs; ++i) { - COMPUTE_CtHt_PEEPHOLE(cur_in_data, cur_prev_c_data, cur_c_out_data, - cur_h_out_data); - MOVE_ONE_BATCH; - } - MOVE_ONE_STEP; - } - } else { - // TODO(TJ): unly workaround, clean me - std::function compute_ctht; - if (platform::jit::MayIUse(platform::jit::avx) && - act_gate_str == "sigmoid" && act_cand_str == "tanh" && - act_cell_str == "tanh" && D == 8) { - compute_ctht = math::lstm_compute_ctht; - } else { - compute_ctht = [&](T* gates, const T* ct_1, T* ct, T* ht) { - COMPUTE_CtHt(gates, ct_1, ct, ht); - }; - } - for (int step = tstart; step < max_seq_len; ++step) { - const int cur_bs = batch_starts[step + 1] - batch_starts[step]; - GEMM_WH_ADDON(cur_bs, prev_h_data, batched_input_data); - DEFINE_CUR; - for (int i = 0; i < cur_bs; ++i) { - compute_ctht(cur_in_data, cur_prev_c_data, cur_c_out_data, - cur_h_out_data); - MOVE_ONE_BATCH; - } - MOVE_ONE_STEP; + for (int step = tstart; step < max_seq_len; ++step) { + const int cur_bs = batch_starts[step + 1] - batch_starts[step]; + GEMM_WH_ADDON(cur_bs, prev_h_data, batched_input_data); + T* cur_in_data = batched_input_data; + T* cur_prev_c_data = prev_c_data; + T* cur_c_out_data = batched_c_out_data; + T* cur_h_out_data = batched_h_out_data; + for (int i = 0; i < cur_bs; ++i) { + ker->ComputeCtHt(cur_in_data, cur_prev_c_data, cur_c_out_data, + cur_h_out_data, wp_data, checked_cell_data); + // move one batch + cur_in_data += D4; + cur_prev_c_data += D; + cur_c_out_data += D; + cur_h_out_data += D; } + // move one step + prev_c_data = batched_c_out_data; + prev_h_data = batched_h_out_data; + batched_c_out_data = cur_c_out_data; + batched_h_out_data = cur_h_out_data; + batched_input_data = cur_in_data; } -#undef 
MOVE_ONE_STEP -#undef MOVE_ONE_BATCH -#undef DEFINE_CUR math::Batch2LoDTensorFunctor to_seq; batched_h_out->set_lod(batched_lod); @@ -594,18 +444,9 @@ class FuisonLSTMKernel : public framework::OpKernel { } } -#undef COMPUTE_CtHt_PEEPHOLE -#undef COMPUTE_CtHt -#undef GET_Ct_NOH0C0 -#undef COMPUTE_CtHt_NOH0C0 -#undef COMPUTE_CtHt_PEEPHOLE_NOH0C0 -#undef GET_Ht -#undef GET_Ct #undef GEMM_WH_ADDON -#undef INIT_BASE_INPUT_DATAS -#undef INIT_BASE_SIZES -#undef INIT_BASE_INPUT_OUTPUT -#undef INIT_VEC_FUNC +#undef INIT_OTHER_DEFINES +#undef INIT_BASE_DEFINES }; } // namespace operators diff --git a/paddle/fluid/operators/isfinite_op.cc b/paddle/fluid/operators/isfinite_op.cc index 248c7793560db99c0af06421bf74808422016061..7b42efd623b31a703bf51d2d157130b3120b42a4 100644 --- a/paddle/fluid/operators/isfinite_op.cc +++ b/paddle/fluid/operators/isfinite_op.cc @@ -60,7 +60,7 @@ class OverflowOpMaker : public framework::OpProtoAndCheckerMaker { "(Tensor) 1-dim tensor, contains a bool scalar. The output " "tensor of overflow operator."); AddComment(string::Sprintf(R"DOC( -Overflow operator. +Overflow %s operator. $$Out = any(X)$$ @@ -69,6 +69,8 @@ Out = Inf if any X contains Inf, Out = Nan if any X contains Nan, Out = 0 if no Inf/Nan detected. If X contains both Inf/Nan, it will return the first indicator it meeted. + +%s )DOC", GetName(), GetComments())); } diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index b0276f4080b3989047f9d181e9d37a18dcfad5fa..7365bfeeb8edf09a8ad5e1cb2c61300e86bdf518 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -45,8 +45,6 @@ math_library(im2col) if (NOT WIN32) # windows do not support avx functions yet. math_library(gru_compute DEPS activation_functions math_function) math_library(lstm_compute DEPS activation_functions) -# TODO(TJ): ugly workaround, clean me -cc_library(cpu_lstm_compute SRCS cpu_lstm_compute.cc DEPS activation_functions cblas cpu_info) endif (NOT WIN32) cc_library(blas SRCS blas.cc DEPS cblas framework_proto device_context) @@ -76,3 +74,7 @@ if(WITH_GPU) endif() cc_test(concat_test SRCS concat_test.cc DEPS concat) cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info) +cc_library(jit_kernel + SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_lstm.cc + DEPS cpu_info cblas activation_functions) +cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel) diff --git a/paddle/fluid/operators/math/algorithm.h b/paddle/fluid/operators/math/algorithm.h new file mode 100644 index 0000000000000000000000000000000000000000..262469beea7449eb5820b86de1ac4f790a833e79 --- /dev/null +++ b/paddle/fluid/operators/math/algorithm.h @@ -0,0 +1,44 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include // for int64_t +#include + +#include "paddle/fluid/platform/hostdevice.h" + +namespace paddle { +namespace operators { +namespace math { + +template +HOSTDEVICE inline int64_t BinarySearch(const T *x, int64_t num, const T &val) { + int64_t beg = 0, end = num - 1; + while (beg <= end) { + auto mid = ((beg + end) >> 1); + if (x[mid] == val) + return mid; + else if (x[mid] < val) + beg = mid + 1; + else + end = mid - 1; + } + return -1; +} + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/cpu_lstm_compute.cc b/paddle/fluid/operators/math/cpu_lstm_compute.cc deleted file mode 100644 index e96d1879331974e0873e13f171414bcfa8c45953..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/math/cpu_lstm_compute.cc +++ /dev/null @@ -1,43 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at -http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/math/cpu_lstm_compute.h" - -namespace paddle { -namespace operators { -namespace math { -#ifdef __AVX__ -template <> -void lstm_compute_ctht(float* gates, const float* ct_1, float* ct, - float* ht) { - namespace act = detail::forward::avx; - // gates: W_ch, W_ih, W_fh, W_oh - __m256 c, i, f, o; - c = _mm256_loadu_ps(gates); - i = _mm256_loadu_ps(gates + 8); - f = _mm256_loadu_ps(gates + 16); - o = _mm256_loadu_ps(gates + 24); - - /* C_t = C_t-1 * fgated + cand_gated * igated*/ - c = _mm256_mul_ps(act::Tanh(c), act::Sigmoid(i)); - i = _mm256_loadu_ps(ct_1); - f = _mm256_mul_ps(i, act::Sigmoid(f)); - f = _mm256_add_ps(c, f); - _mm256_storeu_ps(ct, f); - - /* H_t = act_cell(C_t) * ogated */ - o = _mm256_mul_ps(act::Tanh(f), act::Sigmoid(o)); - _mm256_storeu_ps(ht, o); -} -#endif -} // namespace math -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/math/cpu_lstm_compute.h b/paddle/fluid/operators/math/cpu_lstm_compute.h deleted file mode 100644 index 169a9e4b47f54851ad436428416eca879b78e186..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/math/cpu_lstm_compute.h +++ /dev/null @@ -1,64 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at -http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include -#include "paddle/fluid/operators/math/cpu_vec.h" -#include "paddle/fluid/platform/cpu_info.h" -#ifdef __AVX__ -#include -#endif - -namespace paddle { -namespace operators { -namespace math { - -// TODO(TJ): ugly workaround, clean me -template -void lstm_compute_ctht(T* gates, const T* ct_1, T* ct, T* ht) { - // gates: W_ch, W_ih, W_fh, W_oh - vec_sigmoid(24, gates + 8, gates + 8); - vec_tanh(8, gates, gates); - const T *i = gates + 8, *f = gates + 16, *o = gates + 24; - const T min = SIGMOID_THRESHOLD_MIN; - const T max = SIGMOID_THRESHOLD_MAX; - for (int d = 0; d < 8; ++d) { - // C_t = C_t-1 * fgated + cand_gated * igated - ct[d] = ct_1[d] * f[d] + gates[d] * i[d]; - // H_t = act_cell(C_t) * ogated - T tmp = ct[d] * 2; - tmp = static_cast(0) - ((tmp < min) ? min : ((tmp > max) ? max : tmp)); - vec_exp(1, &tmp, &tmp); - tmp = static_cast(2) / (static_cast(1) + tmp) - static_cast(1); - ht[d] = tmp * o[d]; - } -} - -#ifdef __AVX__ -namespace detail { -namespace forward { -namespace avx { -__m256 Sigmoid(const __m256 a); -__m256 Tanh(const __m256 a); - -} // namespace avx -} // namespace forward -} // namespace detail - -template <> -void lstm_compute_ctht(float* gates, const float* ct_1, float* ct, - float* ht); - -#endif - -} // namespace math -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/math/cpu_vec.h b/paddle/fluid/operators/math/cpu_vec.h index 6a059968b79189458349e466079cc7a663a8e5ff..0aed253c80fc28560716cbcfa70f74ef9c84f9b6 100644 --- a/paddle/fluid/operators/math/cpu_vec.h +++ b/paddle/fluid/operators/math/cpu_vec.h @@ -125,10 +125,8 @@ inline void vec_scal(const int n, const float a, } template <> -inline void vec_scal(const int n, - const float a, - const float* x, - float* y) { +inline void vec_scal(const int n, const float a, + const float* x, float* y) { // TODO(TJ): enable me vec_scal(n, a, x, y); } @@ -181,10 +179,10 @@ inline void vec_bias_sub(const int n, const float a, } template <> -inline void vec_bias_sub(const int n, - const float a, - const float* x, - float* y) { +inline void vec_bias_sub(const int n, + const float a, + const float* x, + float* y) { // TODO(TJ): enable me vec_bias_sub(n, a, x, y); } @@ -242,7 +240,7 @@ inline void vec_cross(const int n, const float* x, } template <> -inline void vec_cross( +inline void vec_cross( const int n, const float* x, const float* y, const float* z, float* out) { // TODO(TJ): enable me vec_cross(n, x, y, z, out); @@ -296,10 +294,10 @@ inline void vec_add_bias(const int n, const float a, } template <> -inline void vec_add_bias(const int n, - const float a, - const float* x, - float* y) { +inline void vec_add_bias(const int n, + const float a, + const float* x, + float* y) { // TODO(TJ): enable me vec_add_bias(n, a, x, y); } @@ -390,9 +388,9 @@ inline void vec_sigmoid(const int n, const float* x, } template <> -inline void vec_sigmoid(const int n, - const float* x, - float* y) { +inline void vec_sigmoid(const int n, + const float* x, + float* y) { // TODO(TJ): enable me vec_sigmoid(n, x, y); } @@ -454,9 +452,8 @@ inline void vec_relu(const int n, const float* x, } template <> -inline void vec_relu(const int n, - const float* x, - float* y) { +inline void vec_relu(const int n, const float* x, + float* y) { // TODO(TJ): enable me vec_relu(n, x, y); } diff --git a/paddle/fluid/operators/math/cpu_vec_test.cc b/paddle/fluid/operators/math/cpu_vec_test.cc index 3ce66f49ed8354c49e8af26ca6eb48fef654a40b..cd40f1b2f984126663a5711efac24fdf6d680b32 100644 --- 
a/paddle/fluid/operators/math/cpu_vec_test.cc +++ b/paddle/fluid/operators/math/cpu_vec_test.cc @@ -110,7 +110,7 @@ TEST(CpuVecTest, sigmoid) { TestAndBench(sz, vec_sigmoid, ref_sigmoid); TestAndBench(sz, vec_sigmoid, ref_sigmoid); TestAndBench(sz, vec_sigmoid, ref_sigmoid); - TestAndBench(sz, vec_sigmoid, + TestAndBench(sz, vec_sigmoid, ref_sigmoid); } TestAndBench(30, vec_sigmoid, ref_sigmoid); @@ -123,8 +123,7 @@ TEST(CpuVecTest, tanh) { TestAndBench(sz, vec_tanh, ref_tanh); TestAndBench(sz, vec_tanh, ref_tanh); TestAndBench(sz, vec_tanh, ref_tanh); - TestAndBench(sz, vec_tanh, - ref_tanh); + TestAndBench(sz, vec_tanh, ref_tanh); } TestAndBench(30, vec_tanh, ref_tanh); } @@ -136,8 +135,7 @@ TEST(CpuVecTest, relu) { TestAndBench(sz, vec_relu, ref_relu); TestAndBench(sz, vec_relu, ref_relu); TestAndBench(sz, vec_relu, ref_relu); - TestAndBench(sz, vec_relu, - ref_relu); + TestAndBench(sz, vec_relu, ref_relu); } TestAndBench(30, vec_relu, ref_relu); } @@ -170,7 +168,7 @@ TEST(CpuVecTest, inplace_sigmoid) { TestInplace(sz, vec_sigmoid, ref_sigmoid); TestInplace(sz, vec_sigmoid, ref_sigmoid); TestInplace(sz, vec_sigmoid, ref_sigmoid); - TestInplace(sz, vec_sigmoid, + TestInplace(sz, vec_sigmoid, ref_sigmoid); } TestInplace(30, vec_sigmoid, ref_sigmoid); @@ -183,8 +181,7 @@ TEST(CpuVecTest, inplace_tanh) { TestInplace(sz, vec_tanh, ref_tanh); TestInplace(sz, vec_tanh, ref_tanh); TestInplace(sz, vec_tanh, ref_tanh); - TestInplace(sz, vec_tanh, - ref_tanh); + TestInplace(sz, vec_tanh, ref_tanh); } TestInplace(30, vec_tanh, ref_tanh); } @@ -196,8 +193,7 @@ TEST(CpuVecTest, inplace_relu) { TestInplace(sz, vec_relu, ref_relu); TestInplace(sz, vec_relu, ref_relu); TestInplace(sz, vec_relu, ref_relu); - TestInplace(sz, vec_relu, - ref_relu); + TestInplace(sz, vec_relu, ref_relu); } TestInplace(30, vec_relu, ref_relu); } diff --git a/paddle/fluid/operators/math/jit_kernel.cc b/paddle/fluid/operators/math/jit_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..68b708b345334bc63b5e2e88c308d20ca6378e6b --- /dev/null +++ b/paddle/fluid/operators/math/jit_kernel.cc @@ -0,0 +1,41 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/math/jit_kernel.h" +#include +#include + +namespace paddle { +namespace operators { +namespace math { +namespace jitkernel { + +namespace jit = platform::jit; + +KernelPool& KernelPool::Instance() { + static thread_local KernelPool g_jit_kernels; + return g_jit_kernels; +} + +std::shared_ptr KernelPool::Get(const std::string& key) const { + if (kers_.find(key) == kers_.end()) { + return nullptr; + } + return kers_.at(key); +} + +} // namespace jitkernel +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/jit_kernel.h b/paddle/fluid/operators/math/jit_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..b4dfda6db76fd4231be0acd1f90c98a2d62134b8 --- /dev/null +++ b/paddle/fluid/operators/math/jit_kernel.h @@ -0,0 +1,142 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include // for shared_ptr +#include +#include +#include "paddle/fluid/platform/cpu_info.h" +#include "paddle/fluid/platform/macros.h" + +// Note: Only support on CPU yet. +namespace paddle { +namespace operators { +namespace math { +namespace jitkernel { + +#define SIGMOID_THRESHOLD_MIN -40.0 +#define SIGMOID_THRESHOLD_MAX 13.0 +#define EXP_MAX_INPUT 40.0 +#define AVX_FLOAT_BLOCK 8 +#define AVX2_FLOAT_BLOCK 8 +#define AVX512_FLOAT_BLOCK 16 + +typedef enum { kLT8, kEQ8, kGT8LT16, kEQ16, kGT16 } jit_block; + +class Kernel { + public: + Kernel() = default; + virtual ~Kernel() = default; + int num_{0}; + int end_{0}; + int rest_{0}; + DISABLE_COPY_AND_ASSIGN(Kernel); +}; + +class KernelPool { + public: + static KernelPool &Instance(); + + template + std::shared_ptr Get(ARGS... 
args); + + std::shared_ptr Get(const std::string &key) const; + + private: + KernelPool() = default; + std::unordered_map> kers_; + + DISABLE_COPY_AND_ASSIGN(KernelPool); +}; + +template +class VMulKernel : public Kernel { + public: + virtual void Compute(const T *x, const T *y, T *z) const = 0; +}; + +template +class VAddKernel : public Kernel { + public: + virtual void Compute(const T *x, const T *y, T *z) const = 0; +}; + +template +class VScalKernel : public Kernel { + public: + virtual void Compute(const T a, const T *x, T *y) const = 0; + virtual void Compute(const T a, T *x) const = 0; +}; + +template +class VAddBiasKernel : public Kernel { + public: + virtual void Compute(const T a, const T *x, T *y) const = 0; +}; + +template +class VActKernel : public Kernel { + public: + virtual void Compute(const T *x, T *y) const = 0; +}; + +template +class VReluKernel : public VActKernel { + public: + virtual void Compute(const T *x, T *y) const = 0; +}; + +template +class VIdentityKernel : public VActKernel { + public: + virtual void Compute(const T *x, T *y) const = 0; +}; + +template +class VExpKernel : public VActKernel { + public: + virtual void Compute(const T *x, T *y) const = 0; +}; + +template +class VSigmoidKernel : public VActKernel { + public: + virtual void Compute(const T *x, T *y) const = 0; +}; + +template +class VTanhKernel : public VActKernel { + public: + virtual void Compute(const T *x, T *y) const = 0; +}; + +template +class LSTMKernel : public Kernel { + public: + virtual void ComputeCtHt(T *gates, const T *ct_1, T *ct, T *ht, + /* below only used in peephole*/ + const T *wp_data = nullptr, + T *checked = nullptr) const = 0; + + // compute c1 and h1 without c0 or h0 + virtual void ComputeC1H1(T *gates, T *ct, T *ht, + /* below only used in peephole*/ + const T *wp_data = nullptr) const = 0; +}; + +} // namespace jitkernel +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/jit_kernel_blas.cc b/paddle/fluid/operators/math/jit_kernel_blas.cc new file mode 100644 index 0000000000000000000000000000000000000000..0f9ea533fccdd34a5ccf061d89ffe92687d65933 --- /dev/null +++ b/paddle/fluid/operators/math/jit_kernel_blas.cc @@ -0,0 +1,391 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/math/jit_kernel.h" +#include +#include "paddle/fluid/operators/math/jit_kernel_macro.h" +#ifdef PADDLE_WITH_MKLML +#include "paddle/fluid/platform/dynload/mklml.h" +#endif + +#ifdef __AVX__ +#include +#endif + +namespace paddle { +namespace operators { +namespace math { +namespace jitkernel { + +namespace jit = platform::jit; + +/* VMUL JitKernel */ +template +class VMulKernelImpl : public VMulKernel { + public: + explicit VMulKernelImpl(int d) : VMulKernel() { this->num_ = d; } + void Compute(const T* x, const T* y, T* z) const override { + for (int i = 0; i < this->num_; ++i) { + z[i] = x[i] * y[i]; + } + } +}; + +#ifdef PADDLE_WITH_MKLML +#define MKL_FLOAT(isa, block) \ + template <> \ + void VMulKernelImpl::Compute( \ + const float* x, const float* y, float* z) const { \ + platform::dynload::vsMul(this->num_, x, y, z); \ + } + +#define MKL_DOUBLE(isa, block) \ + template <> \ + void VMulKernelImpl::Compute( \ + const double* x, const double* y, double* z) const { \ + platform::dynload::vdMul(this->num_, x, y, z); \ + } + +FOR_EACH_ISA(MKL_FLOAT, kGT16); +FOR_EACH_ISA_BLOCK(MKL_DOUBLE); +#endif + +#define INTRI8_FLOAT(isa) \ + template <> \ + void VMulKernelImpl::Compute( \ + const float* x, const float* y, float* z) const { \ + __m256 tmpx, tmpy; \ + tmpx = _mm256_loadu_ps(x); \ + tmpy = _mm256_loadu_ps(y); \ + tmpx = _mm256_mul_ps(tmpx, tmpy); \ + _mm256_storeu_ps(z, tmpx); \ + } + +// avx > for > mkl +#ifdef __AVX__ +INTRI8_FLOAT(jit::avx); +#endif +#ifdef __AVX2__ +INTRI8_FLOAT(jit::avx2); +#endif +#ifdef __AVX512F__ +INTRI8_FLOAT(jit::avx512f); +#endif +// TODO(TJ): eq16 test and complete avx512 +#undef INTRI8_FLOAT +#undef MKL_FLOAT +#undef MKL_DOUBLE + +/* VADD JitKernel */ +template +class VAddKernelImpl : public VAddKernel { + public: + explicit VAddKernelImpl(int d) : VAddKernel() { this->num_ = d; } + void Compute(const T* x, const T* y, T* z) const override { + for (int i = 0; i < this->num_; ++i) { + z[i] = x[i] + y[i]; + } + } +}; + +#ifdef PADDLE_WITH_MKLML +#define MKL_FLOAT(isa, block) \ + template <> \ + void VAddKernelImpl::Compute( \ + const float* x, const float* y, float* z) const { \ + platform::dynload::vsAdd(this->num_, x, y, z); \ + } + +#define MKL_DOUBLE(isa, block) \ + template <> \ + void VAddKernelImpl::Compute( \ + const double* x, const double* y, double* z) const { \ + platform::dynload::vdAdd(this->num_, x, y, z); \ + } + +FOR_EACH_ISA(MKL_FLOAT, kGT16); +FOR_EACH_ISA_BLOCK(MKL_DOUBLE); +#endif + +#define INTRI8_FLOAT(isa) \ + template <> \ + void VAddKernelImpl::Compute( \ + const float* x, const float* y, float* z) const { \ + __m256 tmpx, tmpy; \ + tmpx = _mm256_loadu_ps(x); \ + tmpy = _mm256_loadu_ps(y); \ + tmpx = _mm256_add_ps(tmpx, tmpy); \ + _mm256_storeu_ps(z, tmpx); \ + } +#ifdef __AVX__ +INTRI8_FLOAT(jit::avx); +#endif +#ifdef __AVX2__ +INTRI8_FLOAT(jit::avx2); +#endif +#ifdef __AVX512F__ +INTRI8_FLOAT(jit::avx512f); +#endif +// TODO(TJ): eq16 test and complete avx512 + +#undef INTRI8_FLOAT +#undef MKL_FLOAT +#undef MKL_DOUBLE + +/* VSCAL JitKernel */ +template +class VScalKernelImpl : public VScalKernel { + public: + explicit VScalKernelImpl(int d) : VScalKernel() { this->num_ = d; } + void Compute(const T a, const T* x, T* y) const override { + for (int i = 0; i < this->num_; ++i) { + y[i] = a * x[i]; + } + } + void Compute(const T a, T* x) const override { + for (int i = 0; i < this->num_; ++i) { + x[i] = a * x[i]; + } + } +}; + +#ifdef PADDLE_WITH_MKLML +#define MKL_FLOAT(isa, block) \ + template <> 
\ + void VScalKernelImpl::Compute(const float a, float* x) \ + const { \ + platform::dynload::cblas_sscal(this->num_, a, x, 1); \ + } + +#define MKL_DOUBLE(isa, block) \ + template <> \ + void VScalKernelImpl::Compute(const double a, double* x) \ + const { \ + platform::dynload::cblas_dscal(this->num_, a, x, 1); \ + } + +FOR_EACH_ISA(MKL_FLOAT, kGT16); +FOR_EACH_ISA_BLOCK(MKL_DOUBLE); +#endif + +#define INTRI8_FLOAT(isa) \ + template <> \ + void VScalKernelImpl::Compute( \ + const float a, const float* x, float* y) const { \ + __m256 tmp; \ + __m256 scalar = _mm256_set1_ps(a); \ + tmp = _mm256_loadu_ps(x); \ + tmp = _mm256_mul_ps(tmp, scalar); \ + _mm256_storeu_ps(y, tmp); \ + } +#define INTRI8_INPLACE_FLOAT(isa) \ + template <> \ + void VScalKernelImpl::Compute(const float a, float* x) \ + const { \ + __m256 tmp; \ + __m256 scalar = _mm256_set1_ps(a); \ + tmp = _mm256_loadu_ps(x); \ + tmp = _mm256_mul_ps(tmp, scalar); \ + _mm256_storeu_ps(x, tmp); \ + } + +#ifdef __AVX__ +INTRI8_FLOAT(jit::avx); +INTRI8_INPLACE_FLOAT(jit::avx); +#endif +#ifdef __AVX2__ +INTRI8_FLOAT(jit::avx2); +INTRI8_INPLACE_FLOAT(jit::avx2); +#endif +#ifdef __AVX512F__ +INTRI8_FLOAT(jit::avx512f); +INTRI8_INPLACE_FLOAT(jit::avx512f); +#endif +// TODO(TJ): eq16 test and complete avx512 + +#undef INTRI8_FLOAT +#undef INTRI8_INPLACE_FLOAT +#undef MKL_FLOAT +#undef MKL_DOUBLE + +/* VAddBias JitKernel */ +template +class VAddBiasKernelImpl : public VAddBiasKernel { + public: + explicit VAddBiasKernelImpl(int d) : VAddBiasKernel() { this->num_ = d; } + void Compute(const T a, const T* x, T* y) const override { + for (int i = 0; i < this->num_; ++i) { + y[i] = x[i] + a; + } + } +}; + +#define INTRI8_FLOAT(isa) \ + template <> \ + void VAddBiasKernelImpl::Compute( \ + const float a, const float* x, float* y) const { \ + __m256 tmp = _mm256_loadu_ps(x); \ + tmp = _mm256_add_ps(tmp, _mm256_set1_ps(a)); \ + _mm256_storeu_ps(y, tmp); \ + } + +#define INTRI16_FLOAT(isa) \ + template <> \ + void VAddBiasKernelImpl::Compute( \ + const float a, const float* x, float* y) const { \ + __m256 tmp0 = _mm256_loadu_ps(x); \ + __m256 tmp1 = _mm256_loadu_ps(x + 8); \ + tmp0 = _mm256_add_ps(tmp0, _mm256_set1_ps(a)); \ + tmp1 = _mm256_add_ps(tmp1, _mm256_set1_ps(a)); \ + _mm256_storeu_ps(y, tmp0); \ + _mm256_storeu_ps(y + 8, tmp1); \ + } + +#ifdef __AVX__ +INTRI8_FLOAT(jit::avx); +INTRI16_FLOAT(jit::avx); +#endif +#ifdef __AVX2__ +INTRI8_FLOAT(jit::avx2); +INTRI16_FLOAT(jit::avx2); +#endif +#ifdef __AVX512F__ +INTRI8_FLOAT(jit::avx512f); +INTRI16_FLOAT(jit::avx512f); +#endif +// TODO(TJ): eq16 test and complete avx512 + +#undef INTRI8_FLOAT +#undef INTRI16_FLOAT + +/* VRelu JitKernel */ +template +class VReluKernelImpl : public VReluKernel { + public: + explicit VReluKernelImpl(int d) : VReluKernel() { this->num_ = d; } + void Compute(const T* x, T* y) const override { + for (int i = 0; i < this->num_; ++i) { + y[i] = x[i] > 0 ? 
x[i] : 0; + } + } +}; + +#define INTRI8_FLOAT(isa) \ + template <> \ + void VReluKernelImpl::Compute(const float* x, float* y) \ + const { \ + __m256 tmp = _mm256_loadu_ps(x); \ + tmp = _mm256_max_ps(tmp, _mm256_setzero_ps()); \ + _mm256_storeu_ps(y, tmp); \ + } + +#define INTRI16_FLOAT(isa) \ + template <> \ + void VReluKernelImpl::Compute(const float* x, float* y) \ + const { \ + __m256 zeros = _mm256_setzero_ps(); \ + __m256 tmp0 = _mm256_loadu_ps(x); \ + __m256 tmp1 = _mm256_loadu_ps(x + 8); \ + tmp0 = _mm256_max_ps(tmp0, zeros); \ + tmp1 = _mm256_max_ps(tmp1, zeros); \ + _mm256_storeu_ps(y, tmp0); \ + _mm256_storeu_ps(y + 8, tmp1); \ + } + +#define INTRI_GT8LT16_FLOAT(isa) \ + template <> \ + VReluKernelImpl::VReluKernelImpl(int d) \ + : VReluKernel() { \ + this->num_ = d; \ + this->end_ = AVX_FLOAT_BLOCK; \ + this->rest_ = d - AVX_FLOAT_BLOCK; \ + } \ + template <> \ + void VReluKernelImpl::Compute(const float* x, \ + float* y) const { \ + __m256 zeros = _mm256_setzero_ps(); \ + __m256 tmp0 = _mm256_loadu_ps(x); \ + __m256 tmp1 = _mm256_loadu_ps(x + this->rest_); \ + tmp0 = _mm256_max_ps(tmp0, zeros); \ + tmp1 = _mm256_max_ps(tmp1, zeros); \ + _mm256_storeu_ps(y, tmp0); \ + _mm256_storeu_ps(y + this->rest_, tmp1); \ + } + +#define INTRI_GT16_FLOAT(isa) \ + template <> \ + VReluKernelImpl::VReluKernelImpl(int d) \ + : VReluKernel() { \ + this->num_ = d; \ + this->end_ = d - d % AVX_FLOAT_BLOCK; \ + this->rest_ = d - AVX_FLOAT_BLOCK; \ + } \ + template <> \ + void VReluKernelImpl::Compute(const float* x, float* y) \ + const { \ + __m256 zeros = _mm256_setzero_ps(); \ + for (int i = 0; i < this->end_; i += AVX_FLOAT_BLOCK) { \ + __m256 tmp = _mm256_loadu_ps(x + i); \ + tmp = _mm256_max_ps(tmp, zeros); \ + _mm256_storeu_ps(y + i, tmp); \ + } \ + __m256 tmp = _mm256_loadu_ps(x + this->rest_); \ + tmp = _mm256_max_ps(tmp, zeros); \ + _mm256_storeu_ps(y + this->rest_, tmp); \ + } + +#ifdef __AVX__ +INTRI8_FLOAT(jit::avx); +INTRI16_FLOAT(jit::avx); +INTRI_GT8LT16_FLOAT(jit::avx); +INTRI_GT16_FLOAT(jit::avx); +#endif +#ifdef __AVX2__ +INTRI8_FLOAT(jit::avx2); +INTRI16_FLOAT(jit::avx2); +INTRI_GT8LT16_FLOAT(jit::avx2); +INTRI_GT16_FLOAT(jit::avx2); +#endif +#ifdef __AVX512F__ +// TODO(TJ): refine avx512 +INTRI8_FLOAT(jit::avx512f); +INTRI16_FLOAT(jit::avx512f); +INTRI_GT8LT16_FLOAT(jit::avx512f); +INTRI_GT16_FLOAT(jit::avx512f); +#endif + +#undef INTRI8_FLOAT +#undef INTRI16_FLOAT +#undef INTRI_GT8LT16_FLOAT +#undef INTRI_GT16_FLOAT + +/* An empty JitKernel */ +template +class VIdentityKernelImpl : public VIdentityKernel { + public: + explicit VIdentityKernelImpl(int d) : VIdentityKernel() { this->num_ = d; } + void Compute(const T* x, T* y) const override {} +}; + +REGISTER_JITKERNEL(vmul, VMulKernel); +REGISTER_JITKERNEL(vadd, VAddKernel); +REGISTER_JITKERNEL(vscal, VScalKernel); +REGISTER_JITKERNEL(vaddb, VAddBiasKernel); +REGISTER_JITKERNEL(vrelu, VReluKernel); +REGISTER_JITKERNEL(videntity, VIdentityKernel); + +} // namespace jitkernel +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/jit_kernel_exp.cc b/paddle/fluid/operators/math/jit_kernel_exp.cc new file mode 100644 index 0000000000000000000000000000000000000000..b62e130c43743f542e2074868fc01598047d6b19 --- /dev/null +++ b/paddle/fluid/operators/math/jit_kernel_exp.cc @@ -0,0 +1,400 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/jit_kernel.h" +#include // for exp +#include +#include "paddle/fluid/operators/math/jit_kernel_macro.h" +#ifdef PADDLE_WITH_MKLML +#include "paddle/fluid/platform/dynload/mklml.h" +#endif + +#ifdef __AVX__ +#include +#endif + +namespace paddle { +namespace operators { +namespace math { + +#ifdef __AVX__ +namespace detail { +__m256 Exp(__m256 a); +} // namespace detail +#endif + +namespace jitkernel { +namespace jit = platform::jit; + +/* VExp JitKernel */ +template +class VExpKernelImpl : public VExpKernel { + public: + explicit VExpKernelImpl(int d) : VExpKernel() { this->num_ = d; } + void Compute(const T* x, T* y) const override { + for (int i = 0; i < this->num_; ++i) { + y[i] = std::exp(x[i]); + } + } +}; + +#ifdef PADDLE_WITH_MKLML +#define MKL_FLOAT(isa, block) \ + template <> \ + void VExpKernelImpl::Compute(const float* x, float* y) \ + const { \ + platform::dynload::vsExp(this->num_, x, y); \ + } + +#define MKL_DOUBLE(isa, block) \ + template <> \ + void VExpKernelImpl::Compute(const double* x, double* y) \ + const { \ + platform::dynload::vdExp(this->num_, x, y); \ + } +FOR_EACH_ISA(MKL_FLOAT, kLT8); +FOR_EACH_ISA(MKL_FLOAT, kGT8LT16); +FOR_EACH_ISA(MKL_FLOAT, kGT16); +FOR_EACH_ISA_BLOCK(MKL_DOUBLE); +#endif + +#define INTRI8_FLOAT(isa) \ + template <> \ + void VExpKernelImpl::Compute(const float* x, float* y) \ + const { \ + __m256 tmp = _mm256_loadu_ps(x); \ + _mm256_storeu_ps(y, detail::Exp(tmp)); \ + } + +#define INTRI16_FLOAT(isa) \ + template <> \ + void VExpKernelImpl::Compute(const float* x, float* y) \ + const { \ + __m256 tmp0 = _mm256_loadu_ps(x); \ + __m256 tmp1 = _mm256_loadu_ps(x + 8); \ + tmp0 = detail::Exp(tmp0); \ + tmp1 = detail::Exp(tmp1); \ + _mm256_storeu_ps(y, tmp0); \ + _mm256_storeu_ps(y + 8, tmp1); \ + } + +#ifdef __AVX__ +INTRI8_FLOAT(jit::avx); +INTRI16_FLOAT(jit::avx); +#endif +#ifdef __AVX2__ +INTRI8_FLOAT(jit::avx2); +INTRI16_FLOAT(jit::avx2); +#endif +#ifdef __AVX512F__ +INTRI8_FLOAT(jit::avx512f); +INTRI16_FLOAT(jit::avx512f); +#endif +// TODO(TJ): eq16 test and complete avx512 + +#undef INTRI8_FLOAT +#undef INTRI16_FLOAT +#undef MKL_FLOAT +#undef MKL_DOUBLE + +REGISTER_JITKERNEL(vexp, VExpKernel); + +/* VSigmoid JitKernel */ +template +class VSigmoidKernelImpl : public VSigmoidKernel { + public: + explicit VSigmoidKernelImpl(int d) : VSigmoidKernel() { + this->num_ = d; + vexp_ = KernelPool::Instance().template Get>(d); + } + void Compute(const T* x, T* y) const override { + const T min = SIGMOID_THRESHOLD_MIN; + const T max = SIGMOID_THRESHOLD_MAX; + for (int i = 0; i < this->num_; ++i) { + y[i] = (x[i] < min) ? min : ((x[i] > max) ? 
max : x[i]); + y[i] = static_cast(0) - y[i]; + } + vexp_->Compute(y, y); + for (int i = 0; i < this->num_; ++i) { + y[i] = static_cast(1) / (static_cast(1) + y[i]); + } + } + + private: + std::shared_ptr> vexp_; +}; + +#define INTRI_SIGMOID(tmp, min, max) \ + tmp = _mm256_max_ps(tmp, min); \ + tmp = _mm256_min_ps(tmp, max); \ + tmp = _mm256_sub_ps(_mm256_set1_ps(0.0f), tmp); \ + tmp = detail::Exp(tmp); \ + tmp = _mm256_add_ps(_mm256_set1_ps(1.0f), tmp); \ + tmp = _mm256_div_ps(_mm256_set1_ps(1.0f), tmp) + +#define INTRI8_FLOAT(isa) \ + template <> \ + void VSigmoidKernelImpl::Compute(const float* x, float* y) \ + const { \ + __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \ + __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \ + __m256 tmp = _mm256_loadu_ps(x); \ + INTRI_SIGMOID(tmp, min, max); \ + _mm256_storeu_ps(y, tmp); \ + } + +#define INTRI16_FLOAT(isa) \ + template <> \ + void VSigmoidKernelImpl::Compute(const float* x, \ + float* y) const { \ + __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \ + __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \ + __m256 tmp0 = _mm256_loadu_ps(x); \ + __m256 tmp1 = _mm256_loadu_ps(x + 8); \ + INTRI_SIGMOID(tmp0, min, max); \ + INTRI_SIGMOID(tmp1, min, max); \ + _mm256_storeu_ps(y, tmp0); \ + _mm256_storeu_ps(y + 8, tmp1); \ + } + +#define INTRI_GT8LT16_FLOAT(isa) \ + template <> \ + VSigmoidKernelImpl::VSigmoidKernelImpl(int d) \ + : VSigmoidKernel() { \ + this->num_ = d; \ + this->end_ = AVX_FLOAT_BLOCK; \ + this->rest_ = d - this->end_; \ + vexp_ = \ + KernelPool::Instance().template Get>(this->rest_); \ + } \ + template <> \ + void VSigmoidKernelImpl::Compute(const float* x, \ + float* y) const { \ + __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \ + __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \ + __m256 tmp = _mm256_loadu_ps(x); \ + INTRI_SIGMOID(tmp, min, max); \ + _mm256_storeu_ps(y, tmp); \ + const float min_ = SIGMOID_THRESHOLD_MIN; \ + const float max_ = SIGMOID_THRESHOLD_MAX; \ + for (int i = this->end_; i < this->num_; ++i) { \ + y[i] = (x[i] < min_) ? min_ : ((x[i] > max_) ? max_ : x[i]); \ + y[i] = 0.f - y[i]; \ + } \ + vexp_->Compute(y + this->end_, y + this->end_); \ + for (int i = this->end_; i < this->num_; ++i) { \ + y[i] = 1.f / (1.f + y[i]); \ + } \ + } + +#define INTRI_GT16_FLOAT(isa) \ + template <> \ + VSigmoidKernelImpl::VSigmoidKernelImpl(int d) \ + : VSigmoidKernel() { \ + this->num_ = d; \ + this->rest_ = d % AVX_FLOAT_BLOCK; \ + this->end_ = d - this->rest_; \ + vexp_ = \ + KernelPool::Instance().template Get>(this->rest_); \ + } \ + template <> \ + void VSigmoidKernelImpl::Compute(const float* x, \ + float* y) const { \ + __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \ + __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \ + for (int i = 0; i < this->end_; i += AVX_FLOAT_BLOCK) { \ + __m256 tmp = _mm256_loadu_ps(x + i); \ + INTRI_SIGMOID(tmp, min, max); \ + _mm256_storeu_ps(y + i, tmp); \ + } \ + const float min_ = SIGMOID_THRESHOLD_MIN; \ + const float max_ = SIGMOID_THRESHOLD_MAX; \ + for (int i = this->end_; i < this->num_; ++i) { \ + y[i] = (x[i] < min_) ? min_ : ((x[i] > max_) ? 
max_ : x[i]); \ + y[i] = 0.f - y[i]; \ + } \ + vexp_->Compute(y + this->end_, y + this->end_); \ + for (int i = this->end_; i < this->num_; ++i) { \ + y[i] = 1.f / (1.f + y[i]); \ + } \ + } + +#ifdef __AVX__ +INTRI8_FLOAT(jit::avx); +INTRI16_FLOAT(jit::avx); +INTRI_GT8LT16_FLOAT(jit::avx); +INTRI_GT16_FLOAT(jit::avx); +#endif +#ifdef __AVX2__ +INTRI8_FLOAT(jit::avx2); +INTRI16_FLOAT(jit::avx2); +// INTRI_GT8LT16_FLOAT(jit::avx2); +// INTRI_GT16_FLOAT(jit::avx2); +#endif +#ifdef __AVX512F__ +INTRI8_FLOAT(jit::avx512f); +INTRI16_FLOAT(jit::avx512f); +// INTRI_GT8LT16_FLOAT(jit::avx512f); +// INTRI_GT16_FLOAT(jit::avx512f); +#endif + +#undef INTRI8_FLOAT +#undef INTRI16_FLOAT +#undef INTRI_GT8LT16_FLOAT +#undef INTRI_GT16_FLOAT +#undef INTRI_VSIGMOID + +REGISTER_JITKERNEL(vsigmoid, VSigmoidKernel); + +/* VTanh JitKernel */ +template +class VTanhKernelImpl : public VTanhKernel { + public: + explicit VTanhKernelImpl(int d) : VTanhKernel() { + this->num_ = d; + vscal_ = KernelPool::Instance().template Get>(d); + vsigmoid_ = KernelPool::Instance().template Get>(d); + vaddbias_ = KernelPool::Instance().template Get>(d); + } + void Compute(const T* x, T* y) const override { + vscal_->Compute(static_cast(2), x, y); + vsigmoid_->Compute(y, y); + vscal_->Compute(static_cast(2), y); + vaddbias_->Compute(static_cast(-1), y, y); + } + + private: + std::shared_ptr> vscal_; + std::shared_ptr> vsigmoid_; + std::shared_ptr> vaddbias_; +}; + +#define INTRI_VTANH(tmp) \ + tmp = _mm256_mul_ps(_mm256_set1_ps(-2.0f), tmp); \ + tmp = _mm256_min_ps(tmp, _mm256_set1_ps(EXP_MAX_INPUT)); \ + tmp = detail::Exp(tmp); \ + tmp = _mm256_add_ps(_mm256_set1_ps(1.0f), tmp); \ + tmp = _mm256_div_ps(_mm256_set1_ps(2.0f), tmp); \ + tmp = _mm256_sub_ps(tmp, _mm256_set1_ps(1.0f)) + +#define INTRI8_FLOAT(isa) \ + template <> \ + void VTanhKernelImpl::Compute(const float* x, float* y) \ + const { \ + __m256 tmp = _mm256_loadu_ps(x); \ + INTRI_VTANH(tmp); \ + _mm256_storeu_ps(y, tmp); \ + } + +#define INTRI16_FLOAT(isa) \ + template <> \ + void VTanhKernelImpl::Compute(const float* x, float* y) \ + const { \ + __m256 tmp0 = _mm256_loadu_ps(x); \ + __m256 tmp1 = _mm256_loadu_ps(x + 8); \ + INTRI_VTANH(tmp0); \ + INTRI_VTANH(tmp1); \ + _mm256_storeu_ps(y, tmp0); \ + _mm256_storeu_ps(y + 8, tmp1); \ + } + +#define INTRI_GT8LT16_FLOAT(isa) \ + template <> \ + VTanhKernelImpl::VTanhKernelImpl(int d) \ + : VTanhKernel() { \ + this->num_ = d; \ + this->end_ = AVX_FLOAT_BLOCK; \ + this->rest_ = d - this->end_; \ + vscal_ = \ + KernelPool::Instance().template Get>(this->rest_); \ + vsigmoid_ = KernelPool::Instance().template Get>( \ + this->rest_); \ + vaddbias_ = KernelPool::Instance().template Get>( \ + this->rest_); \ + } \ + template <> \ + void VTanhKernelImpl::Compute(const float* x, \ + float* y) const { \ + __m256 tmp = _mm256_loadu_ps(x); \ + INTRI_VTANH(tmp); \ + _mm256_storeu_ps(y, tmp); \ + x += AVX_FLOAT_BLOCK; \ + y += AVX_FLOAT_BLOCK; \ + vscal_->Compute(2.f, x, y); \ + vsigmoid_->Compute(y, y); \ + vscal_->Compute(2.f, y); \ + vaddbias_->Compute(-1.f, y, y); \ + } + +#define INTRI_GT16_FLOAT(isa) \ + template <> \ + VTanhKernelImpl::VTanhKernelImpl(int d) \ + : VTanhKernel() { \ + this->num_ = d; \ + this->rest_ = d % AVX_FLOAT_BLOCK; \ + this->end_ = d - this->rest_; \ + vscal_ = \ + KernelPool::Instance().template Get>(this->rest_); \ + vsigmoid_ = KernelPool::Instance().template Get>( \ + this->rest_); \ + vaddbias_ = KernelPool::Instance().template Get>( \ + this->rest_); \ + } \ + template <> \ + void 
VTanhKernelImpl::Compute(const float* x, float* y) \ + const { \ + for (int i = 0; i < this->end_; i += AVX_FLOAT_BLOCK) { \ + __m256 tmp = _mm256_loadu_ps(x + i); \ + INTRI_VTANH(tmp); \ + _mm256_storeu_ps(y + i, tmp); \ + } \ + x += this->end_; \ + y += this->end_; \ + vscal_->Compute(2.f, x, y); \ + vsigmoid_->Compute(y, y); \ + vscal_->Compute(2.f, y); \ + vaddbias_->Compute(-1.f, y, y); \ + } + +#ifdef __AVX__ +INTRI8_FLOAT(jit::avx); +INTRI16_FLOAT(jit::avx); +INTRI_GT8LT16_FLOAT(jit::avx); +INTRI_GT16_FLOAT(jit::avx); +#endif +#ifdef __AVX2__ +INTRI8_FLOAT(jit::avx2); +INTRI16_FLOAT(jit::avx2); +// maybe use avx at gt8lt16 and gt16 +#endif +#ifdef __AVX512F__ +INTRI8_FLOAT(jit::avx512f); +INTRI16_FLOAT(jit::avx512f); +// maybe use avx at gt8lt16 and gt16 +#endif + +#undef INTRI8_FLOAT +#undef INTRI16_FLOAT +#undef INTRI_GT8LT16_FLOAT +#undef INTRI_GT16_FLOAT +#undef INTRI_VTANH + +REGISTER_JITKERNEL(vtanh, VTanhKernel); + +#undef JITKERNEL_NEW_ACT_IMPL + +} // namespace jitkernel +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/jit_kernel_lstm.cc b/paddle/fluid/operators/math/jit_kernel_lstm.cc new file mode 100644 index 0000000000000000000000000000000000000000..42a2b96fd945c516f8c26ca51ecb452345a9a86f --- /dev/null +++ b/paddle/fluid/operators/math/jit_kernel_lstm.cc @@ -0,0 +1,308 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/math/jit_kernel.h" +#include +#include "paddle/fluid/operators/math/jit_kernel_macro.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/macros.h" + +#ifdef __AVX__ +#include +#endif + +namespace paddle { +namespace operators { +namespace math { +#ifdef __AVX__ +namespace detail { +__m256 Exp(__m256 a); +} // namespace detail +#endif + +namespace jitkernel { +namespace jit = platform::jit; + +#ifdef __AVX__ +typedef enum { kSigmoid, kRelu, kTanh, kIdentity } act_type; + +class AVXAct { + public: + virtual ~AVXAct() = default; + virtual __m256 Compute(__m256 x) const = 0; +}; + +template +class AVXActImpl : public AVXAct { + public: + __m256 Compute(__m256 x) const override { PADDLE_THROW("Unkown type!"); } +}; + +template <> +__m256 AVXActImpl::Compute(__m256 x) const { + __m256 ones = _mm256_set1_ps(1.0f); + x = _mm256_max_ps(x, _mm256_set1_ps(SIGMOID_THRESHOLD_MIN)); + x = _mm256_min_ps(x, _mm256_set1_ps(SIGMOID_THRESHOLD_MAX)); + x = _mm256_sub_ps(_mm256_set1_ps(0.0f), x); + x = detail::Exp(x); + x = _mm256_add_ps(ones, x); + return _mm256_div_ps(ones, x); +} + +template <> +__m256 AVXActImpl::Compute(__m256 x) const { + __m256 ones = _mm256_set1_ps(1.0f); + x = _mm256_mul_ps(_mm256_set1_ps(-2.0f), x); + x = _mm256_min_ps(x, _mm256_set1_ps(EXP_MAX_INPUT)); + x = detail::Exp(x); + x = _mm256_add_ps(ones, x); + x = _mm256_div_ps(_mm256_set1_ps(2.0f), x); + return _mm256_sub_ps(x, ones); +} + +template <> +__m256 AVXActImpl::Compute(__m256 x) const { + return _mm256_max_ps(x, _mm256_setzero_ps()); +} + +template <> +__m256 AVXActImpl::Compute(__m256 x) const { + return x; +} +#endif + +template +static std::shared_ptr> GetActKernel( + const std::string& type, int n) { + if (type == "sigmoid") { + return std::dynamic_pointer_cast>( + KernelPool::Instance().template Get>(n)); + } else if (type == "relu") { + return std::dynamic_pointer_cast>( + KernelPool::Instance().template Get>(n)); + } else if (type == "tanh") { + return std::dynamic_pointer_cast>( + KernelPool::Instance().template Get>(n)); + } else if (type == "identity" || type == "") { + return std::dynamic_pointer_cast>( + KernelPool::Instance().template Get>(n)); + } + PADDLE_THROW("Not support type: %s", type); + return nullptr; +} + +/* LSTM JitKernel */ +template +class LSTMKernelImpl : public LSTMKernel { + public: + explicit LSTMKernelImpl(const std::string& act_gate, + const std::string& act_cand, + const std::string& act_cell, int d) + : LSTMKernel() { + d_ = d; + d2_ = d * 2; + d3_ = d * 3; + act_gate_d3_ = GetActKernel(act_gate, d3_); + act_gate_d_ = GetActKernel(act_gate, d); + act_cand_d_ = GetActKernel(act_cand, d); + act_cell_d_ = GetActKernel(act_cell, d); + vmul_d_ = KernelPool::Instance().template Get>(d); + vadd_d_ = KernelPool::Instance().template Get>(d); +#ifdef __AVX__ + auto GetAVXAct = [&](const std::string& type) -> std::unique_ptr { + if (type == "sigmoid") { + return std::unique_ptr(new AVXActImpl()); + } else if (type == "relu") { + return std::unique_ptr(new AVXActImpl()); + } else if (type == "tanh") { + return std::unique_ptr(new AVXActImpl()); + } else if (type == "identity" || type == "") { + return std::unique_ptr(new AVXActImpl()); + } + PADDLE_THROW("Not support type: %s", type); + }; + avx_act_gate_ = GetAVXAct(act_gate); + avx_act_cand_ = GetAVXAct(act_cand); + avx_act_cell_ = GetAVXAct(act_cell); +#endif + } + + void ComputeCtHt(T* gates, const T* ct_1, T* ct, T* ht, const T* wp_data, + T* checked) const override { + // gates: 
W_ch, W_ih, W_fh, W_oh + act_gate_d3_->Compute(gates + d_, gates + d_); + + /* C_t = C_t-1 * fgated + cand_gated * igated */ + act_cand_d_->Compute(gates, gates); + vmul_d_->Compute(gates, gates + d_, gates + d_); + vmul_d_->Compute(ct_1, gates + d2_, gates + d2_); + vadd_d_->Compute(gates + d_, gates + d2_, ct); + + /* H_t = act_cell(C_t) * ogated */ + act_cell_d_->Compute(ct, gates + d2_); + vmul_d_->Compute(gates + d2_, gates + d3_, ht); + } + void ComputeC1H1(T* gates, T* ct, T* ht, const T* wp_data) const override { + /* C_t = igated * cgated*/ + act_gate_d_->Compute(gates + d_, gates + d_); + act_cand_d_->Compute(gates, gates); + vmul_d_->Compute(gates, gates + d_, ct); + /* H_t = act_cell(C_t) * ogated */ + act_gate_d_->Compute(gates + d3_, gates + d3_); + act_cell_d_->Compute(ct, gates + d2_); + vmul_d_->Compute(gates + d2_, gates + d3_, ht); + } + + private: + int d_, d2_, d3_; + std::shared_ptr> act_gate_d3_, act_gate_d_, act_cand_d_, + act_cell_d_; + std::shared_ptr> vmul_d_; + std::shared_ptr> vadd_d_; +#ifdef __AVX__ + std::unique_ptr avx_act_gate_, avx_act_cand_, avx_act_cell_; +#endif +}; + +#define INTRI8_FLOAT(isa) \ + template <> \ + void LSTMKernelImpl::ComputeCtHt( \ + float* gates, const float* ct_1, float* ct, float* ht, \ + const float* wp_data, float* checked) const { \ + /* gates: W_ch, W_ih, W_fh, W_oh */ \ + __m256 c, i, f, o; \ + c = _mm256_loadu_ps(gates); \ + i = _mm256_loadu_ps(gates + 8); \ + f = _mm256_loadu_ps(gates + 16); \ + o = _mm256_loadu_ps(gates + 24); \ + /* C_t = C_t-1 * fgated + cand_gated * igated*/ \ + c = _mm256_mul_ps(avx_act_cand_->Compute(c), avx_act_gate_->Compute(i)); \ + i = _mm256_loadu_ps(ct_1); \ + f = _mm256_mul_ps(i, avx_act_gate_->Compute(f)); \ + f = _mm256_add_ps(c, f); \ + _mm256_storeu_ps(ct, f); \ + /* H_t = act_cell(C_t) * ogated */ \ + o = _mm256_mul_ps(avx_act_cell_->Compute(f), avx_act_gate_->Compute(o)); \ + _mm256_storeu_ps(ht, o); \ + } + +// TODO(TJ): optimize keq16 + +#ifdef __AVX__ +INTRI8_FLOAT(jit::avx); +#endif +#ifdef __AVX2__ +INTRI8_FLOAT(jit::avx2); +#endif +#ifdef __AVX512F__ +INTRI8_FLOAT(jit::avx512f); +#endif + +/* Peephole JitKernel */ +template +class PeepholeKernelImpl : public LSTMKernel { + public: + explicit PeepholeKernelImpl(const std::string& act_gate, + const std::string& act_cand, + const std::string& act_cell, int d) + : LSTMKernel() { + d_ = d; + d2_ = d * 2; + d3_ = d * 3; + act_gate_d_ = GetActKernel(act_gate, d); + act_cand_d_ = GetActKernel(act_cand, d); + act_cell_d_ = GetActKernel(act_cell, d); + vmul_d_ = KernelPool::Instance().template Get>(d); + vadd_d_ = KernelPool::Instance().template Get>(d); + vadd_d2_ = KernelPool::Instance().template Get>(d2_); + act_gate_d2_ = GetActKernel(act_gate, d2_); + } + + void ComputeCtHt(T* gates, const T* ct_1, T* ct, T* ht, const T* wp_data, + T* checked) const override { + /* get fgated and igated*/ + vmul_d_->Compute(wp_data, ct_1, checked); + vmul_d_->Compute(wp_data + d_, ct_1, checked + d_); + vadd_d2_->Compute(checked, gates + d_, gates + d_); + act_gate_d2_->Compute(gates + d_, gates + d_); + /* C_t = C_t-1 * fgated + cand_gated * igated*/ + act_cand_d_->Compute(gates, gates); + vmul_d_->Compute(gates, gates + d_, gates + d_); + vmul_d_->Compute(ct_1, gates + d2_, gates + d2_); + vadd_d_->Compute(gates + d_, gates + d2_, ct); + /* get ogated*/ + vmul_d_->Compute(wp_data + d2_, ct, gates + d_); + vadd_d_->Compute(gates + d_, gates + d3_, gates + d3_); + act_gate_d_->Compute(gates + d3_, gates + d3_); + /* H_t = act_cell(C_t) * ogated */ + 
act_cell_d_->Compute(ct, gates + d2_); + vmul_d_->Compute(gates + d2_, gates + d3_, ht); + } + + void ComputeC1H1(T* gates, T* ct, T* ht, const T* wp_data) const override { + /* C_t = igated * cgated*/ + act_gate_d_->Compute(gates + d_, gates + d_); + act_cand_d_->Compute(gates, gates); + vmul_d_->Compute(gates, gates + d_, ct); + /* get outgated, put W_oc * C_t on igated */ + vmul_d_->Compute(wp_data + d2_, ct, gates + d_); + vadd_d_->Compute(gates + d_, gates + d3_, gates + d3_); + /* H_t = act_cell(C_t) * ogated */ + act_gate_d_->Compute(gates + d3_, gates + d3_); + act_cell_d_->Compute(ct, gates + d2_); + vmul_d_->Compute(gates + d2_, gates + d3_, ht); + } + + private: + int d_, d2_, d3_; + std::shared_ptr> act_gate_d2_, act_gate_d_, act_cand_d_, + act_cell_d_; + std::shared_ptr> vmul_d_; + std::shared_ptr> vadd_d_, vadd_d2_; +}; + +#define JITKERNEL_DECLARE_LSTM(ker_class, ker_dtype) \ + template <> \ + std::shared_ptr> \ + KernelPool::Get, const std::string&, \ + const std::string&, const std::string&, int, bool>( \ + const std::string& act_gate, const std::string& act_cand, \ + const std::string& act_cell, int d, bool use_peephole) + +#define JITKERNEL_KEY_LSTM(ker_key, dtype_key) \ + #ker_key #dtype_key + std::to_string(d) + act_gate + act_cand + act_cell + \ + (use_peephole ? "p" : "n") + +#define JITKERNEL_NEW_LSTM_IMPL(ker, dtype, isa, k) \ + if (use_peephole) { \ + p = std::dynamic_pointer_cast>( \ + std::make_shared>( \ + act_gate, act_cand, act_cell, d)); \ + } else { \ + p = std::dynamic_pointer_cast>( \ + std::make_shared>(act_gate, act_cand, \ + act_cell, d)); \ + } + +REGISTER_JITKERNEL_ARGS(lstm, LSTMKernel, JITKERNEL_DECLARE_LSTM, + JITKERNEL_KEY_LSTM, JITKERNEL_NEW_LSTM_IMPL); + +#undef INTRI8_FLOAT +#undef JITKERNEL_DECLARE_LSTM +#undef JITKERNEL_KEY_LSTM +#undef JITKERNEL_NEW_LSTM_IMPL +} // namespace jitkernel +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/jit_kernel_macro.h b/paddle/fluid/operators/math/jit_kernel_macro.h new file mode 100644 index 0000000000000000000000000000000000000000..d8e55f2673560ff6afa34376b73275b57a8ceea1 --- /dev/null +++ b/paddle/fluid/operators/math/jit_kernel_macro.h @@ -0,0 +1,111 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#pragma once
+#include <string>
+#include "paddle/fluid/platform/cpu_info.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+namespace jitkernel {
+
+namespace jit = platform::jit;
+
+#define SEARCH_BLOCK(macro_, ker, dtype, isa)                \
+  if (d < AVX_FLOAT_BLOCK) {                                 \
+    macro_(ker, dtype, isa, kLT8);                           \
+  } else if (d == AVX_FLOAT_BLOCK) {                         \
+    macro_(ker, dtype, isa, kEQ8);                           \
+  } else if (d > AVX_FLOAT_BLOCK && d < AVX512_FLOAT_BLOCK) { \
+    macro_(ker, dtype, isa, kGT8LT16);                       \
+  } else if (d == AVX512_FLOAT_BLOCK) {                      \
+    macro_(ker, dtype, isa, kEQ16);                          \
+  } else {                                                   \
+    macro_(ker, dtype, isa, kGT16);                          \
+  }
+
+#define SEARCH_ISA_BLOCK(macro_, ker, dtype)          \
+  if (jit::MayIUse(jit::avx512f)) {                   \
+    SEARCH_BLOCK(macro_, ker, dtype, jit::avx512f);   \
+  } else if (jit::MayIUse(jit::avx2)) {               \
+    SEARCH_BLOCK(macro_, ker, dtype, jit::avx2);      \
+  } else if (jit::MayIUse(jit::avx)) {                \
+    SEARCH_BLOCK(macro_, ker, dtype, jit::avx);       \
+  } else {                                            \
+    SEARCH_BLOCK(macro_, ker, dtype, jit::isa_any);   \
+  }
+
+#define JITKERNEL_DECLARE(ker_class, ker_dtype)              \
+  template <>                                                \
+  std::shared_ptr<const ker_class<ker_dtype>>                \
+  KernelPool::Get<const ker_class<ker_dtype>, int>(int d)
+
+#define JITKERNEL_KEY(ker_key, dtype_key) \
+  #ker_key #dtype_key + std::to_string(d)
+
+#define JITKERNEL_NEW_IMPL(ker, dtype, isa, k)      \
+  p = std::dynamic_pointer_cast<ker<dtype>>(        \
+      std::make_shared<ker##Impl<dtype, isa, k>>(d))
+
+#define JITKERNEL_WITH_DTYPE(ker_key, ker_class, ker_dtype, dtype_key,  \
+                             marco_declare, macro_key, macro_impl)      \
+  marco_declare(ker_class, ker_dtype) {                                 \
+    std::string key = macro_key(ker_key, dtype_key);                    \
+    if (kers_.find(key) == kers_.end()) {                               \
+      std::shared_ptr<const ker_class<ker_dtype>> p;                    \
+      SEARCH_ISA_BLOCK(macro_impl, ker_class, ker_dtype);               \
+      kers_.insert({key, std::dynamic_pointer_cast<const Kernel>(p)});  \
+      return p;                                                         \
+    }                                                                   \
+    return std::dynamic_pointer_cast<const ker_class<ker_dtype>>(       \
+        kers_.at(key));                                                 \
+  }
+
+#define REGISTER_JITKERNEL(ker_key, ker_class)                            \
+  JITKERNEL_WITH_DTYPE(ker_key, ker_class, float, f, JITKERNEL_DECLARE,   \
+                       JITKERNEL_KEY, JITKERNEL_NEW_IMPL);                \
+  JITKERNEL_WITH_DTYPE(ker_key, ker_class, double, d, JITKERNEL_DECLARE,  \
+                       JITKERNEL_KEY, JITKERNEL_NEW_IMPL)
+
+#define REGISTER_JITKERNEL_ARGS(ker_key, ker_class, marco_declare, macro_key, \
+                                macro_impl)                                   \
+  JITKERNEL_WITH_DTYPE(ker_key, ker_class, float, f, marco_declare, macro_key, \
+                       macro_impl);                                           \
+  JITKERNEL_WITH_DTYPE(ker_key, ker_class, double, d, marco_declare,          \
+                       macro_key, macro_impl)
+
+#define FOR_EACH_ISA(macro_, block)  \
+  macro_(jit::avx512f, block);       \
+  macro_(jit::avx2, block);          \
+  macro_(jit::avx, block);           \
+  macro_(jit::isa_any, block)
+
+#define FOR_EACH_BLOCK(macro_, isa)  \
+  macro_(isa, kLT8);                 \
+  macro_(isa, kEQ8);                 \
+  macro_(isa, kGT8LT16);             \
+  macro_(isa, kEQ16);                \
+  macro_(isa, kGT16)
+
+#define FOR_EACH_ISA_BLOCK(macro_)       \
+  FOR_EACH_BLOCK(macro_, jit::avx512f);  \
+  FOR_EACH_BLOCK(macro_, jit::avx2);     \
+  FOR_EACH_BLOCK(macro_, jit::avx);      \
+  FOR_EACH_BLOCK(macro_, jit::isa_any)
+
+}  // namespace jitkernel
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/math/jit_kernel_test.cc b/paddle/fluid/operators/math/jit_kernel_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..26590171bbeaa385ac09b04e5faf483924176598
--- /dev/null
+++ b/paddle/fluid/operators/math/jit_kernel_test.cc
@@ -0,0 +1,749 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/jit_kernel.h" +#include +#include // for exp +#include // for memcpy +#include +#include +#include "gflags/gflags.h" +#include "glog/logging.h" +#include "gtest/gtest.h" + +#ifdef PADDLE_WITH_MKLML +#include "paddle/fluid/platform/dynload/mklml.h" +#endif + +#ifdef __AVX__ +#include +#endif + +constexpr int repeat = 20000; + +inline double GetCurrentUS() { + struct timeval time; + gettimeofday(&time, NULL); + return 1e+6 * time.tv_sec + time.tv_usec; +} + +template +void RandomVec(const int n, T* a, const T lower = static_cast(-20.f), + const T upper = static_cast(20.f)) { + static unsigned int seed = 100; + std::mt19937 rng(seed++); + std::uniform_real_distribution uniform_dist(0, 1); + for (int i = 0; i < n; ++i) { + a[i] = static_cast(uniform_dist(rng) * (upper - lower) + lower); + } +} + +void vrelu_ref(const int n, const float* x, float* y) { + for (int i = 0; i < n; ++i) { + y[i] = x[i] > 0.f ? x[i] : 0.f; + } +} + +#if defined __AVX__ || defined __AVX2__ +void vrelu_intri8(const int n, const float* x, float* y) { + __m256 tmp = _mm256_loadu_ps(x); + tmp = _mm256_max_ps(tmp, _mm256_setzero_ps()); + _mm256_storeu_ps(y, tmp); +} +#endif + +TEST(JitKernel, vrelu) { + namespace jit = paddle::operators::math::jitkernel; + for (int d : {7, 8, 15, 16, 30, 256, 512}) { + std::vector x(d); + std::vector zref(d), ztgt(d); + RandomVec(d, x.data(), -10.f, 1.f); + const auto& ker = + jit::KernelPool::Instance().template Get>(d); + const float* x_data = x.data(); + float* ztgt_data = ztgt.data(); + float* zref_data = zref.data(); + auto trefs = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vrelu_ref(d, x_data, zref_data); + } + auto trefe = GetCurrentUS(); +#if defined __AVX__ || defined __AVX2__ + if (d == 8) { + auto si0 = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vrelu_intri8(d, x_data, zref_data); + } + auto si1 = GetCurrentUS(); + VLOG(3) << "Vec size 8 intr takes: " << (si1 - si0) / repeat; + } +#endif + auto ttgts = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + ker->Compute(x_data, ztgt_data); + } + auto ttgte = GetCurrentUS(); + VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat + << " us, tgt takes: " << (ttgte - ttgts) / repeat; + for (int i = 0; i < d; ++i) { + EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3); + } + } +} + +void vaddbias_ref(const int n, const float a, const float* x, float* y) { + for (int i = 0; i < n; ++i) { + y[i] = x[i] + a; + } +} + +TEST(JitKernel, vaddbias) { + namespace jit = paddle::operators::math::jitkernel; + for (int d : {7, 8, 15, 16, 30, 64, 100, 128, 256}) { + std::vector x(d); + std::vector zref(d), ztgt(d); + RandomVec(d, x.data(), -2.f, 2.f); + const auto& ker = + jit::KernelPool::Instance().template Get>(d); + const float a = 2.f; + const float* x_data = x.data(); + float* ztgt_data = ztgt.data(); + float* zref_data = zref.data(); + auto trefs = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vaddbias_ref(d, a, x_data, zref_data); + } + auto trefe = GetCurrentUS(); + auto ttgts = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + ker->Compute(a, 
x_data, ztgt_data); + } + auto ttgte = GetCurrentUS(); + + VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat + << " us, tgt takes: " << (ttgte - ttgts) / repeat; + for (int i = 0; i < d; ++i) { + EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3); + } + } +} + +void vexp_ref(const int n, const float* x, float* y) { + for (int i = 0; i < n; ++i) { + y[i] = std::exp(x[i]); + } +} + +#ifdef PADDLE_WITH_MKLML +void vexp_mkl(const int n, const float* x, float* y) { + paddle::platform::dynload::vsExp(n, x, y); +} +#endif + +TEST(JitKernel, vexp) { + namespace jit = paddle::operators::math::jitkernel; + for (int d : {7, 8, 15, 16, 30, 128, 256}) { + std::vector x(d); + std::vector zref(d), ztgt(d); + RandomVec(d, x.data(), -2.f, 2.f); + const auto& ker = + jit::KernelPool::Instance().template Get>(d); + const float* x_data = x.data(); + float* ztgt_data = ztgt.data(); + float* zref_data = zref.data(); + auto trefs = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vexp_ref(d, x_data, zref_data); + } + auto trefe = GetCurrentUS(); + +#ifdef PADDLE_WITH_MKLML + auto tmkls = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vexp_mkl(d, x_data, zref_data); + } + auto tmkle = GetCurrentUS(); +#endif + + auto ttgts = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + ker->Compute(x_data, ztgt_data); + } + auto ttgte = GetCurrentUS(); + + VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat +#ifdef PADDLE_WITH_MKLML + << " us, mkl takes: " << (tmkle - tmkls) / repeat << " us, " +#else + << " us, " +#endif + << "tgt takes: " << (ttgte - ttgts) / repeat; + for (int i = 0; i < d; ++i) { + EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3); + } + } +} + +inline float _sigmoid(float x) { + const float min = SIGMOID_THRESHOLD_MIN; + const float max = SIGMOID_THRESHOLD_MAX; + float tmp = (x < min) ? min : ((x > max) ? max : x); + return 1.f / (1.f + std::exp(-tmp)); +} + +void vsigmoid_ref(const int n, const float* x, float* y) { + for (int i = 0; i < n; ++i) { + y[i] = _sigmoid(x[i]); + } +} + +void vsigmoid_better( + const std::shared_ptr< + const paddle::operators::math::jitkernel::VExpKernel>& vexp, + const int n, const float* x, float* y) { + const float min = SIGMOID_THRESHOLD_MIN; + const float max = SIGMOID_THRESHOLD_MAX; + for (int i = 0; i < n; ++i) { + y[i] = (x[i] < min) ? min : ((x[i] > max) ? 
max : x[i]); + y[i] = 0.f - y[i]; + } + vexp->Compute(y, y); + for (int i = 0; i < n; ++i) { + y[i] = 1.f / (1.f + y[i]); + } +} + +TEST(JitKernel, vsigmoid) { + namespace jit = paddle::operators::math::jitkernel; + for (int d : {7, 8, 15, 16, 30, 32, 64, 100, 128, 256}) { + std::vector x(d); + std::vector zref(d), ztgt(d); + RandomVec(d, x.data(), -2.f, 2.f); + const auto& ker = + jit::KernelPool::Instance().template Get>(d); + const auto& vexp = + jit::KernelPool::Instance().template Get>(d); + const float* x_data = x.data(); + float* ztgt_data = ztgt.data(); + float* zref_data = zref.data(); + auto tmkls = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vsigmoid_better(vexp, d, x_data, zref_data); + } + auto tmkle = GetCurrentUS(); + auto trefs = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vsigmoid_ref(d, x_data, zref_data); + } + auto trefe = GetCurrentUS(); + auto ttgts = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + ker->Compute(x_data, ztgt_data); + } + auto ttgte = GetCurrentUS(); + + VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat + << " us, better(jit exp) takes: " << (tmkle - tmkls) / repeat + << " us, tgt takes: " << (ttgte - ttgts) / repeat; + for (int i = 0; i < d; ++i) { + EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3); + } + } +} + +inline float _tanh(float x) { return 2.f * _sigmoid(2.f * x) - 1.f; } + +void vtanh_ref(const int n, const float* x, float* y) { + for (int i = 0; i < n; ++i) { + y[i] = _tanh(x[i]); + } +} + +void vtanh_better( + const std::shared_ptr< + const paddle::operators::math::jitkernel::VScalKernel>& vscal, + const std::shared_ptr< + const paddle::operators::math::jitkernel::VSigmoidKernel>& + vsigmoid, + const std::shared_ptr< + const paddle::operators::math::jitkernel::VAddBiasKernel>& + vaddbias, + const int n, const float* x, float* y) { + vscal->Compute(2.f, x, y); + vsigmoid->Compute(y, y); + vscal->Compute(2.f, y); + vaddbias->Compute(-1.f, y, y); +} + +TEST(JitKernel, vtanh) { + namespace jit = paddle::operators::math::jitkernel; + for (int d : {7, 8, 15, 16, 30, 32, 64, 100, 128, 256}) { + std::vector x(d); + std::vector zref(d), ztgt(d); + RandomVec(d, x.data(), -2.f, 2.f); + const auto& ker = + jit::KernelPool::Instance().template Get>(d); + const auto& vscal = + jit::KernelPool::Instance().template Get>(d); + const auto& vsigmoid = + jit::KernelPool::Instance().template Get>(d); + const auto& vaddbias = + jit::KernelPool::Instance().template Get>(d); + const float* x_data = x.data(); + float* ztgt_data = ztgt.data(); + float* zref_data = zref.data(); + auto tmkls = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vtanh_better(vscal, vsigmoid, vaddbias, d, x_data, zref_data); + } + auto tmkle = GetCurrentUS(); + auto trefs = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vtanh_ref(d, x_data, zref_data); + } + auto trefe = GetCurrentUS(); + auto ttgts = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + ker->Compute(x_data, ztgt_data); + } + auto ttgte = GetCurrentUS(); + + VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat + << " us, better(jit exp) takes: " << (tmkle - tmkls) / repeat + << " us, tgt takes: " << (ttgte - ttgts) / repeat; + for (int i = 0; i < d; ++i) { + EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3); + } + } +} + +void lstm_ctht_ref( + const std::shared_ptr< + const paddle::operators::math::jitkernel::VSigmoidKernel>& + vsigmoid_3d, + const std::shared_ptr< + const paddle::operators::math::jitkernel::VTanhKernel>& vtanh_d, + 
const std::shared_ptr< + const paddle::operators::math::jitkernel::VExpKernel>& vexp_1, + const int d, float* gates, const float* ct_1, float* ct, float* ht) { + vsigmoid_3d->Compute(gates + d, gates + d); + vtanh_d->Compute(gates, gates); + const float *i = gates + d, *f = gates + d * 2, *o = gates + d * 3; + const float min = SIGMOID_THRESHOLD_MIN; + const float max = SIGMOID_THRESHOLD_MAX; + for (int k = 0; k < d; ++k) { + // C_t = C_t-1 * fgated + cand_gated * igated + ct[k] = ct_1[k] * f[k] + gates[k] * i[k]; + // H_t = act_cell(C_t) * ogated + float tmp = ct[k] * 2; + tmp = 0.f - ((tmp < min) ? min : ((tmp > max) ? max : tmp)); + vexp_1->Compute(&tmp, &tmp); + tmp = 2.f / (1.f + tmp) - 1.f; + ht[k] = tmp * o[k]; + } +} + +void lstm_ctht_better( + const std::shared_ptr< + const paddle::operators::math::jitkernel::VSigmoidKernel>& + vsigmoid_3d, + const std::shared_ptr< + const paddle::operators::math::jitkernel::VTanhKernel>& vtanh_d, + const std::shared_ptr< + const paddle::operators::math::jitkernel::VMulKernel>& vmul_d, + const std::shared_ptr< + const paddle::operators::math::jitkernel::VAddKernel>& vadd_d, + const int d, float* gates, const float* ct_1, float* ct, float* ht) { + int d2 = d * 2; + vsigmoid_3d->Compute(gates + d, gates + d); + vtanh_d->Compute(gates, gates); + vmul_d->Compute(gates, gates + d, gates + d); + vmul_d->Compute(ct_1, gates + d2, gates + d2); + vadd_d->Compute(gates + d, gates + d2, ct); + /* H_t = act_cell(C_t) * ogated */ + vtanh_d->Compute(ct, gates + d2); + vmul_d->Compute(gates + d2, gates + d * 3, ht); +} + +TEST(JitKernel, lstm) { + namespace jit = paddle::operators::math::jitkernel; + for (int d : {7, 8, 15, 16, 30, 32, 64, 100}) { + int d4 = d * 4; + int d3 = d * 3; + std::vector x(d4), xref(d4); + std::vector ct_1(d), ct_tgt(d), ht_tgt(d); + std::vector ct_ref(d), ht_ref(d); + RandomVec(d4, x.data(), -2.f, 2.f); + RandomVec(d, ct_1.data(), -2.f, 2.f); + memcpy(xref.data(), x.data(), sizeof(float) * d4); + std::string act_gate = "sigmoid", act_cand = "tanh", act_cell = "tanh"; + const auto& ker = + jit::KernelPool::Instance() + .template Get, const std::string&, + const std::string&, const std::string&>( + act_gate, act_cand, act_cell, d, false); + // below kernels are used to compute refer + const auto& vsigmoid_3d = + jit::KernelPool::Instance().template Get>( + d3); + const auto& vtanh_d = + jit::KernelPool::Instance().template Get>(d); + const auto& vexp_1 = + jit::KernelPool::Instance().template Get>(1); + const auto& vmul_d = + jit::KernelPool::Instance().template Get>(d); + const auto& vadd_d = + jit::KernelPool::Instance().template Get>(d); + + float* x_data = x.data(); + float* xref_data = xref.data(); + const float* ct_1_data = ct_1.data(); + float* ct_tgt_data = ct_tgt.data(); + float* ht_tgt_data = ht_tgt.data(); + float* ct_ref_data = ct_ref.data(); + float* ht_ref_data = ht_ref.data(); + // compute once to check correctness + lstm_ctht_ref(vsigmoid_3d, vtanh_d, vexp_1, d, xref_data, ct_1_data, + ct_ref_data, ht_ref_data); + ker->ComputeCtHt(x_data, ct_1_data, ct_tgt_data, ht_tgt_data); + for (int i = 0; i < d; ++i) { + EXPECT_NEAR(ct_tgt_data[i], ct_ref_data[i], 1e-3); + EXPECT_NEAR(ht_tgt_data[i], ht_ref_data[i], 1e-3); + } + + auto tmkls = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + lstm_ctht_better(vsigmoid_3d, vtanh_d, vmul_d, vadd_d, d, xref_data, + ct_1_data, ct_ref_data, ht_ref_data); + } + auto tmkle = GetCurrentUS(); + auto trefs = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + lstm_ctht_ref(vsigmoid_3d, 
vtanh_d, vexp_1, d, xref_data, ct_1_data, + ct_ref_data, ht_ref_data); + } + auto trefe = GetCurrentUS(); + auto ttgts = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + ker->ComputeCtHt(x_data, ct_1_data, ct_tgt_data, ht_tgt_data); + } + auto ttgte = GetCurrentUS(); + VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat + << " us, better(jit) takes: " << (tmkle - tmkls) / repeat + << " us, tgt takes: " << (ttgte - ttgts) / repeat; + } +} + +void vscal_ref(const int n, const float a, const float* x, float* y) { + for (int i = 0; i < n; ++i) { + y[i] = a * x[i]; + } +} +void vscal_inp_ref(const int n, const float a, float* x) { + for (int i = 0; i < n; ++i) { + x[i] = a * x[i]; + } +} +#if defined __AVX__ || defined __AVX2__ +void vscal_intri8(const int n, const float a, const float* x, float* y) { + __m256 tmp; + __m256 scalar = _mm256_set1_ps(a); + tmp = _mm256_loadu_ps(x); + tmp = _mm256_mul_ps(tmp, scalar); + _mm256_storeu_ps(y, tmp); +} +void vscal_inp_intri8(const int n, const float a, float* x) { + __m256 tmp; + __m256 scalar = _mm256_set1_ps(a); + tmp = _mm256_loadu_ps(x); + tmp = _mm256_mul_ps(tmp, scalar); + _mm256_storeu_ps(x, tmp); +} +#endif + +#ifdef PADDLE_WITH_MKLML +void vscal_inp_mkl(const int n, const float a, float* x) { + paddle::platform::dynload::cblas_sscal(n, a, x, 1); +} +#endif + +TEST(JitKernel, vscal) { + namespace jit = paddle::operators::math::jitkernel; + for (int d : {7, 8, 15, 16, 30, 256, 512}) { + std::vector x(d), y(d); + std::vector zref(d), ztgt(d); + RandomVec(d, x.data()); + std::memcpy(y.data(), x.data(), sizeof(float) * d); + float a = 2.f; + const auto& ker = + jit::KernelPool::Instance().template Get>(d); + const float* x_data = x.data(); + float* y_data = y.data(); + float* ztgt_data = ztgt.data(); + float* zref_data = zref.data(); + auto trefs = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vscal_ref(d, a, x_data, zref_data); + } + auto trefe = GetCurrentUS(); + auto trefs1 = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vscal_inp_ref(d, a, y_data); + } + auto trefe1 = GetCurrentUS(); + +#ifdef PADDLE_WITH_MKLML + auto tmkls = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vscal_inp_mkl(d, a, y_data); + } + auto tmkle = GetCurrentUS(); +#endif + +#if defined __AVX__ || defined __AVX2__ + if (d == 8) { + auto si0 = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vscal_intri8(d, a, x_data, zref_data); + } + auto si1 = GetCurrentUS(); + auto si2 = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vscal_inp_intri8(d, a, y_data); + } + auto si3 = GetCurrentUS(); + VLOG(3) << "Vec size 8 intr takes: " << (si1 - si0) / repeat + << " us, inplace: " << (si3 - si2) / repeat; + } +#endif + + auto ttgts = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + ker->Compute(a, x_data, ztgt_data); + } + auto ttgte = GetCurrentUS(); + auto ttgts1 = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + ker->Compute(a, y_data); + } + auto ttgte1 = GetCurrentUS(); + VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat + << " us, inplace takes: " << (trefe1 - trefs1) / repeat +#ifdef PADDLE_WITH_MKLML + << " us, mkl inplace takes: " << (tmkle - tmkls) / repeat << " us, " +#else + << " us, " +#endif + << "tgt takes: " << (ttgte - ttgts) / repeat + << "us, tgt inplace takes: " << (ttgte1 - ttgts1) / repeat; + for (int i = 0; i < d; ++i) { + EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3); + } + } +} + +void vmul_ref(const int n, const float* x, const float* y, float* z) { + 
for (int i = 0; i < n; ++i) { + z[i] = x[i] * y[i]; + } +} + +#if defined __AVX__ || defined __AVX2__ +void vmul_intri8(const int n, const float* x, const float* y, float* z) { + __m256 tmpx, tmpy; + tmpx = _mm256_loadu_ps(x); + tmpy = _mm256_loadu_ps(y); + tmpx = _mm256_mul_ps(tmpx, tmpy); + _mm256_storeu_ps(z, tmpx); +} +#endif + +#ifdef PADDLE_WITH_MKLML +void vmul_mkl(const int n, const float* x, const float* y, float* z) { + paddle::platform::dynload::vsMul(n, x, y, z); +} +#endif + +TEST(JitKernel, vmul) { + namespace jit = paddle::operators::math::jitkernel; + for (int d : {7, 8, 15, 16, 30, 256, 512}) { + std::vector x(d), y(d); + std::vector zref(d), ztgt(d); + RandomVec(d, x.data()); + RandomVec(d, y.data()); + const auto& ker = + jit::KernelPool::Instance().template Get>(d); + const float* x_data = x.data(); + const float* y_data = y.data(); + float* ztgt_data = ztgt.data(); + float* zref_data = zref.data(); + auto trefs = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vmul_ref(d, x_data, y_data, zref_data); + } + auto trefe = GetCurrentUS(); + +#ifdef PADDLE_WITH_MKLML + auto tmkls = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vmul_mkl(d, x_data, y_data, zref_data); + } + auto tmkle = GetCurrentUS(); +#endif + +#if defined __AVX__ || defined __AVX2__ + if (d == 8) { + auto si0 = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vmul_intri8(d, x_data, y_data, zref_data); + } + auto si1 = GetCurrentUS(); + VLOG(3) << "Vec size 8 intr takes: " << (si1 - si0) / repeat; + } +#endif + + auto ttgts = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + ker->Compute(x_data, y_data, ztgt_data); + } + auto ttgte = GetCurrentUS(); + + VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat +#ifdef PADDLE_WITH_MKLML + << " us, mkl takes: " << (tmkle - tmkls) / repeat << " us, " +#else + << " us, " +#endif + << "tgt takes: " << (ttgte - ttgts) / repeat; + for (int i = 0; i < d; ++i) { + EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3); + } + } +} + +void vadd_ref(const int n, const float* x, const float* y, float* z) { + for (int i = 0; i < n; ++i) { + z[i] = x[i] + y[i]; + } +} + +#if defined __AVX__ || defined __AVX2__ +void vadd_intri8(const int n, const float* x, const float* y, float* z) { + __m256 tmpx, tmpy; + tmpx = _mm256_loadu_ps(x); + tmpy = _mm256_loadu_ps(y); + tmpx = _mm256_add_ps(tmpx, tmpy); + _mm256_storeu_ps(z, tmpx); +} +#endif + +#ifdef PADDLE_WITH_MKLML +void vadd_mkl(const int n, const float* x, const float* y, float* z) { + paddle::platform::dynload::vsAdd(n, x, y, z); +} +#endif + +TEST(JitKernel, vadd) { + namespace jit = paddle::operators::math::jitkernel; + for (int d : {7, 8, 15, 16, 30, 256, 512}) { + std::vector x(d), y(d); + std::vector zref(d), ztgt(d); + RandomVec(d, x.data()); + RandomVec(d, y.data()); + const auto& ker = + jit::KernelPool::Instance().template Get>(d); + const float* x_data = x.data(); + const float* y_data = y.data(); + float* ztgt_data = ztgt.data(); + float* zref_data = zref.data(); + auto trefs = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vadd_ref(d, x_data, y_data, zref_data); + } + auto trefe = GetCurrentUS(); + +#ifdef PADDLE_WITH_MKLML + auto tmkls = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vadd_mkl(d, x_data, y_data, zref_data); + } + auto tmkle = GetCurrentUS(); +#endif + +#if defined __AVX__ || defined __AVX2__ + if (d == 8) { + auto si0 = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vadd_intri8(d, x_data, y_data, zref_data); + } + auto si1 = 
GetCurrentUS(); + VLOG(3) << "Vec size 8 intr takes: " << (si1 - si0) / repeat; + } +#endif + + auto ttgts = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + ker->Compute(x_data, y_data, ztgt_data); + } + auto ttgte = GetCurrentUS(); + + VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat +#ifdef PADDLE_WITH_MKLML + << " us, mkl takes: " << (tmkle - tmkls) / repeat << " us, " +#else + << " us, " +#endif + << "tgt takes: " << (ttgte - ttgts) / repeat; + for (int i = 0; i < d; ++i) { + EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3); + } + } +} + +TEST(JitKernel, pool) { + namespace jit = paddle::operators::math::jitkernel; + const int frame_size = 4; + std::string act_gate = "sigmoid", act_cand = "tanh", act_cell = "tanh"; + const auto& plstm1 = + jit::KernelPool::Instance() + .template Get, const std::string&, + const std::string&, const std::string&>( + act_gate, act_cand, act_cell, frame_size, false); + const auto& plstm2 = + jit::KernelPool::Instance() + .template Get, const std::string&, + const std::string&, const std::string&>( + act_gate, act_cand, act_cell, frame_size, false); + const auto& peephole = + jit::KernelPool::Instance() + .template Get, const std::string&, + const std::string&, const std::string&>( + act_gate, act_cand, act_cell, frame_size, true); + EXPECT_TRUE(plstm1 != peephole); + + const auto& pvmul_f = + jit::KernelPool::Instance().template Get>(4); + EXPECT_TRUE(std::dynamic_pointer_cast(plstm2) != + std::dynamic_pointer_cast(pvmul_f)); + + const auto& pvmul_d = + jit::KernelPool::Instance().template Get>(4); + EXPECT_TRUE(std::dynamic_pointer_cast(pvmul_f) != + std::dynamic_pointer_cast(pvmul_d)); + + const auto& pvmul_from_key = jit::KernelPool::Instance().Get("vmulf4"); + EXPECT_EQ(pvmul_f, pvmul_from_key); + const auto& pvmul_from_key2 = jit::KernelPool::Instance().Get("vmulf5"); + EXPECT_TRUE(pvmul_from_key2 == nullptr); +} diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc index 6810c24227ed074d4ba747399348aa12f55467a0..08f57dd45ad76946cbcafb98a3414003ed9d67a9 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cc +++ b/paddle/fluid/operators/math/selected_rows_functor.cc @@ -12,9 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include #include #include +#include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/selected_rows_functor.h" namespace paddle { @@ -245,40 +247,42 @@ struct MergeAdd { const framework::SelectedRows& input, framework::SelectedRows* output) { framework::SelectedRows& out = *output; - auto input_rows = input.rows(); - std::vector merge_rows; - merge_rows.reserve(input_rows.size()); - std::unordered_map rows_pos_map; - rows_pos_map.reserve(input_rows.size()); - size_t idx = 0u; - for (std::vector::iterator iter = input_rows.begin(); - iter != input_rows.end(); ++iter) { - if (rows_pos_map.find(*iter) == rows_pos_map.end()) { - rows_pos_map[*iter] = idx++; - merge_rows.emplace_back(*iter); - } + std::vector input_rows(input.rows()); + + std::map> merge_row_map; + for (size_t i = 0; i < input_rows.size(); ++i) { + merge_row_map[input_rows[i]].push_back(i); } - auto input_width = input.value().dims()[1]; - out.set_rows(merge_rows); + std::vector merge_rows(merge_row_map.size()); + size_t idx = 0; + int64_t input_width = input.value().dims()[1]; out.set_height(input.height()); - out.mutable_value()->mutable_data( + + T* out_data = out.mutable_value()->mutable_data( framework::make_ddim( {static_cast(merge_rows.size()), input_width}), context.GetPlace()); - - math::SetConstant constant_functor; - constant_functor(context, out.mutable_value(), 0.0); - - auto* out_data = out.mutable_value()->data(); - auto* input_data = input.value().data(); - - for (size_t i = 0; i < input_rows.size(); i++) { - size_t out_i = rows_pos_map[input_rows[i]]; - for (int64_t j = 0; j < input_width; j++) { - out_data[out_i * input_width + j] += input_data[i * input_width + j]; + const T* in_data = input.value().data(); + + for (auto& row_pair : merge_row_map) { + auto* out_ptr = out_data + idx * input_width; + auto& rows = row_pair.second; + merge_rows[idx] = row_pair.first; + ++idx; + // rows.size() is always larger than 0 + std::memcpy(out_ptr, in_data + rows[0] * input_width, + sizeof(T) * input_width); + + for (size_t i = 1; i < rows.size(); ++i) { + auto* in_ptr = in_data + rows[i] * input_width; + for (int64_t j = 0; j < input_width; ++j) { + out_ptr[j] += in_ptr[j]; + } } } + + out.set_rows(merge_rows); } }; diff --git a/paddle/fluid/operators/math/selected_rows_functor.h b/paddle/fluid/operators/math/selected_rows_functor.h index 3d99c9b3f2303c941a150f77516703b8103c6c25..900be86f91c6658a5265189a6745316c6471209e 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.h +++ b/paddle/fluid/operators/math/selected_rows_functor.h @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once +#include #include #include "paddle/fluid/framework/eigen.h" @@ -97,41 +98,39 @@ struct MergeAdd { const framework::SelectedRows& input, framework::SelectedRows* output) { framework::SelectedRows& out = *output; - auto input_rows = input.rows(); - std::vector merge_rows; - merge_rows.reserve(input_rows.size()); - std::unordered_map rows_pos_map; - rows_pos_map.reserve(input_rows.size()); - size_t idx = 0u; - for (std::vector::iterator iter = input_rows.begin(); - iter != input_rows.end(); ++iter) { - if (rows_pos_map.find(*iter) == rows_pos_map.end()) { - rows_pos_map[*iter] = idx++; - merge_rows.emplace_back(*iter); - } + std::vector input_rows(input.rows()); + + std::map> merge_row_map; + for (size_t i = 0; i < input_rows.size(); ++i) { + merge_row_map[input_rows[i]].push_back(i); } - auto input_width = input.value().dims()[1]; - out.set_rows(merge_rows); + std::vector merge_rows(merge_row_map.size()); + size_t idx = 0; + int64_t input_width = input.value().dims()[1]; out.set_height(input.height()); - out.mutable_value()->mutable_data( + + auto* out_data = out.mutable_value()->mutable_data( framework::make_ddim( {static_cast(merge_rows.size()), input_width}), context.GetPlace()); - - math::SetConstant constant_functor; - constant_functor(context, out.mutable_value(), 0.0); - - auto* out_data = out.mutable_value()->data(); - auto* input_data = input.value().data(); + auto* in_data = input.value().data(); auto blas = GetBlas(context); - for (size_t i = 0; i < input_rows.size(); i++) { - size_t out_i = rows_pos_map[input_rows[i]]; - float* y = out_data + out_i * input_width; - const float* x = input_data + i * input_width; - blas.AXPY(input_width, 1., x, y); + for (auto& row_pair : merge_row_map) { + auto* out_ptr = out_data + idx * input_width; + auto& rows = row_pair.second; + merge_rows[idx] = row_pair.first; + ++idx; + // rows.size() is always larger than 0 + blas.VCOPY(input_width, in_data + rows[0] * input_width, out_ptr); + + for (size_t i = 1; i < rows.size(); ++i) { + blas.AXPY(input_width, 1., in_data + rows[i] * input_width, out_ptr); + } } + + out.set_rows(merge_rows); } }; @@ -148,41 +147,39 @@ struct MergeAdd { const framework::SelectedRows& input, framework::SelectedRows* output) { framework::SelectedRows& out = *output; - auto input_rows = input.rows(); - std::vector merge_rows; - merge_rows.reserve(input_rows.size()); - std::unordered_map rows_pos_map; - rows_pos_map.reserve(input_rows.size()); - size_t idx = 0u; - for (std::vector::iterator iter = input_rows.begin(); - iter != input_rows.end(); ++iter) { - if (rows_pos_map.find(*iter) == rows_pos_map.end()) { - rows_pos_map[*iter] = idx++; - merge_rows.emplace_back(*iter); - } + std::vector input_rows(input.rows()); + + std::map> merge_row_map; + for (size_t i = 0; i < input_rows.size(); ++i) { + merge_row_map[input_rows[i]].push_back(i); } - auto input_width = input.value().dims()[1]; - out.set_rows(merge_rows); + std::vector merge_rows(merge_row_map.size()); + size_t idx = 0; + int64_t input_width = input.value().dims()[1]; out.set_height(input.height()); - out.mutable_value()->mutable_data( + + auto* out_data = out.mutable_value()->mutable_data( framework::make_ddim( {static_cast(merge_rows.size()), input_width}), context.GetPlace()); - - math::SetConstant constant_functor; - constant_functor(context, out.mutable_value(), 0.0); - - auto* out_data = out.mutable_value()->data(); - auto* input_data = input.value().data(); + auto* in_data = input.value().data(); auto blas = GetBlas(context); - for (size_t i 
= 0; i < input_rows.size(); i++) { - size_t out_i = rows_pos_map[input_rows[i]]; - double* y = out_data + out_i * input_width; - const double* x = input_data + i * input_width; - blas.AXPY(input_width, 1., x, y); + for (auto& row_pair : merge_row_map) { + auto* out_ptr = out_data + idx * input_width; + auto& rows = row_pair.second; + merge_rows[idx] = row_pair.first; + ++idx; + // rows.size() is always larger than 0 + blas.VCOPY(input_width, in_data + rows[0] * input_width, out_ptr); + + for (size_t i = 1; i < rows.size(); ++i) { + blas.AXPY(input_width, 1., in_data + rows[i] * input_width, out_ptr); + } } + + out.set_rows(merge_rows); } }; diff --git a/paddle/fluid/operators/parallel_do_op.cc b/paddle/fluid/operators/parallel_do_op.cc index 97c36a83fc5eff421725d05f66fca05f5169d1bb..ab25628d45699dbcfc1fc5792958bae9e42e72a3 100644 --- a/paddle/fluid/operators/parallel_do_op.cc +++ b/paddle/fluid/operators/parallel_do_op.cc @@ -397,6 +397,24 @@ class ParallelDoGradOpShapeInference : public framework::InferShapeBase { } }; +class ParallelDoGradOpVarTypeInference : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { + framework::BlockDesc *sub_block = + boost::get(op_desc.GetAttr(kParallelBlock)); + for (auto &out_vars : op_desc.Outputs()) { + for (auto &out_var : out_vars.second) { + auto &var = block->FindRecursiveOrCreateVar(out_var); + auto sub_var = sub_block->FindRecursiveOrCreateVar(out_var); + if (sub_var.GetType() != var.GetType()) { + var.SetType(sub_var.GetType()); + } + } + } + } +}; + } // namespace operators } // namespace paddle @@ -404,4 +422,5 @@ REGISTER_OPERATOR(parallel_do, paddle::operators::ParallelDoOp, paddle::operators::ParallelDoOpProtoMaker, paddle::operators::ParallelDoGradOpDescMaker); REGISTER_OPERATOR(parallel_do_grad, paddle::operators::ParallelDoGradOp, - paddle::operators::ParallelDoGradOpShapeInference); + paddle::operators::ParallelDoGradOpShapeInference, + paddle::operators::ParallelDoGradOpVarTypeInference); diff --git a/paddle/fluid/operators/reader/blocking_queue.h b/paddle/fluid/operators/reader/blocking_queue.h index 28cc91a5ed5d74994e5b960a0a4dd3c6a5e6cdcc..51b980acb5a08d431d96a3a92479dec09119c27e 100644 --- a/paddle/fluid/operators/reader/blocking_queue.h +++ b/paddle/fluid/operators/reader/blocking_queue.h @@ -31,8 +31,8 @@ class BlockingQueue { // is a workaround and a simplified version of framework::Channel as it // doesn't support GPU and it implements on buffered blocking queue. 
public: - explicit BlockingQueue(size_t capacity) - : capacity_(capacity), closed_(false) { + explicit BlockingQueue(size_t capacity, bool speed_test_mode = false) + : capacity_(capacity), speed_test_mode_(speed_test_mode), closed_(false) { PADDLE_ENFORCE_GT( capacity_, 0, "The capacity of a reader::BlockingQueue must be greater than 0."); @@ -72,7 +72,9 @@ class BlockingQueue { if (!queue_.empty()) { PADDLE_ENFORCE_NOT_NULL(elem); *elem = queue_.front(); - queue_.pop_front(); + if (LIKELY(!speed_test_mode_)) { + queue_.pop_front(); + } send_cv_.notify_one(); return true; } else { @@ -114,6 +116,7 @@ class BlockingQueue { private: size_t capacity_; + bool speed_test_mode_; bool closed_; std::deque queue_; diff --git a/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h b/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h index 4f7cfc24ec035349f3c85e84d876ad9b5b5493a6..3f041ff7e4e32b407729a22aab25d3aab199fee0 100644 --- a/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h +++ b/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h @@ -33,8 +33,9 @@ class LoDTensorBlockingQueue { private: LoDTensorBlockingQueue(size_t capacity, - const std::vector& dims) - : queue_(capacity), dims_(dims) {} + const std::vector& dims, + bool speed_test_mode = false) + : queue_(capacity, speed_test_mode), dims_(dims) {} public: bool Push(const std::vector& lod_tensor_vec) { @@ -69,11 +70,12 @@ class LoDTensorBlockingQueue { class LoDTensorBlockingQueueHolder { public: - void InitOnce(size_t capacity, const std::vector& dims) { + void InitOnce(size_t capacity, const std::vector& dims, + bool speed_test_mode = false) { PADDLE_ENFORCE( queue_ == nullptr, "LoDTensorBlockingQueueHolder::InitOnce() can only be called once"); - queue_.reset(new LoDTensorBlockingQueue(capacity, dims)); + queue_.reset(new LoDTensorBlockingQueue(capacity, dims, speed_test_mode)); } inline const std::shared_ptr& GetQueue() const { diff --git a/paddle/fluid/operators/reader/reader_blocking_queue_test.cc b/paddle/fluid/operators/reader/reader_blocking_queue_test.cc index 7d1b381d56c8cdc1e79e594b18c1a1ed59ab5284..bd7ac64b2fce2452744e4756b149ee7f291d38aa 100644 --- a/paddle/fluid/operators/reader/reader_blocking_queue_test.cc +++ b/paddle/fluid/operators/reader/reader_blocking_queue_test.cc @@ -217,3 +217,27 @@ TEST(BlockingQueue, MyClassTest) { q.Receive(&b); EXPECT_EQ(a.val_, b.val_); } + +TEST(BlockingQueue, speed_test_mode) { + size_t queue_size = 10; + BlockingQueue q1(queue_size, false); + for (size_t i = 0; i < queue_size; ++i) { + q1.Send(i); + } + size_t b; + for (size_t i = 0; i < queue_size; ++i) { + q1.Receive(&b); + EXPECT_EQ(b, i); + } + EXPECT_EQ(q1.Size(), 0); + + BlockingQueue q2(queue_size, true); + for (size_t i = 0; i < queue_size; ++i) { + q2.Send(i); + } + for (size_t i = 0; i < queue_size; ++i) { + q2.Receive(&b); + EXPECT_EQ(b, 0); + } + EXPECT_EQ(q2.Size(), queue_size); +} diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index d72f85f2c44db2fa887732cfc05e1376a6a79e4a..500d86fec33830fc2cfb0412f1f2c7780d08eb02 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -164,7 +164,7 @@ dimension value will be copied from Input(X) at runtime. Note that the index of [2, 3, 4], Attr(shape) = [2, 3, 2, 0] is an invalid input. 3. 
Input(Shape) has a higher priority than Attr(shape) if it is provided, while -Attr(shape) still should be set correctly to gurantee shape inference in +Attr(shape) still should be set correctly to guarantee shape inference in compile-time. )DOC"); @@ -259,7 +259,6 @@ class Reshape2Op : public ReshapeOp { : ReshapeOp(type, inputs, outputs, attrs) {} void InferShape(framework::InferShapeContext *ctx) const override { - ReshapeOp::InferShape(ctx); PADDLE_ENFORCE(ctx->HasOutput("XShape"), "Output(XShape) of ReshapeOp should not be null."); const auto &x_dims = ctx->GetInputDim("X"); @@ -270,6 +269,8 @@ class Reshape2Op : public ReshapeOp { } ctx->SetOutputDim("XShape", framework::make_ddim(xshape_dims)); ctx->ShareLoD("X", /*->*/ "XShape"); + + ReshapeOp::InferShape(ctx); } }; diff --git a/paddle/fluid/operators/rmsprop_op.h b/paddle/fluid/operators/rmsprop_op.h index a04d1bd2ca5128714d10155a9679d921519cfe07..797cd45fdcdbd5c3567d1676f37e148304ee6e2d 100644 --- a/paddle/fluid/operators/rmsprop_op.h +++ b/paddle/fluid/operators/rmsprop_op.h @@ -13,72 +13,254 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/algorithm.h" +#include "paddle/fluid/operators/math/selected_rows_functor.h" +#include "paddle/fluid/platform/for_range.h" namespace paddle { namespace operators { -using Tensor = framework::Tensor; template using EigenVector = framework::EigenVector; +template +struct DenseRmspropGradFunctor { + inline explicit DenseRmspropGradFunctor(const T *grad) : grad_(grad) {} + + HOSTDEVICE inline T operator()(int64_t idx) const { return grad_[idx]; } + + const T *grad_; +}; + +template +struct SparseRmspropGradFunctor { + inline SparseRmspropGradFunctor(const T *grad, const int64_t *rows, + int64_t row_numel, int64_t row_count) + : grad_(grad), + rows_(rows), + row_numel_(row_numel), + row_count_(row_count) {} + + HOSTDEVICE inline T operator()(int64_t idx) const { + auto row_idx = math::BinarySearch(rows_, row_count_, idx / row_numel_); + return row_idx >= 0 ?
grad_[row_idx * row_numel_ + idx % row_numel_] : 0; + } + + const T *grad_; + const int64_t *rows_; + int64_t row_numel_; + int64_t row_count_; +}; + +template +struct UncenteredRmspropFunctor { + UncenteredRmspropFunctor(T *param, T *ms, T *mom, const T *lr, T rho, + T epsilon, T momentum, + const GradFunctor &grad_functor) + : param_(param), + ms_(ms), + mom_(mom), + lr_(lr), + rho_(rho), + epsilon_(epsilon), + momentum_(momentum), + grad_functor_(grad_functor) {} + + HOSTDEVICE inline void operator()(int64_t idx) const { + T g = grad_functor_(idx); + T ms_out = rho_ * ms_[idx] + (1 - rho_) * g * g; + T mom_out = momentum_ * mom_[idx] + lr_[0] * g / sqrt(ms_out + epsilon_); + param_[idx] -= mom_out; + ms_[idx] = ms_out; + mom_[idx] = mom_out; + } + + T *param_; + T *ms_; + T *mom_; + const T *lr_; + T rho_; + T epsilon_; + T momentum_; + GradFunctor grad_functor_; +}; + +template +struct CenteredRmspropFunctor { + CenteredRmspropFunctor(T *param, T *ms, T *mom, T *mean_grad, const T *lr, + T rho, T epsilon, T momentum, + const GradFunctor &grad_functor) + : param_(param), + ms_(ms), + mom_(mom), + mean_grad_(mean_grad), + lr_(lr), + rho_(rho), + epsilon_(epsilon), + momentum_(momentum), + grad_functor_(grad_functor) {} + + HOSTDEVICE inline void operator()(int64_t idx) const { + T g = grad_functor_(idx); + T ms_out = rho_ * ms_[idx] + (1 - rho_) * g * g; + T mg_out = rho_ * mean_grad_[idx] + (1 - rho_) * g; + T mom_out = momentum_ * mom_[idx] + + lr_[0] * g / sqrt(ms_out - mg_out * mg_out + epsilon_); + param_[idx] -= mom_out; + ms_[idx] = ms_out; + mom_[idx] = mom_out; + mean_grad_[idx] = mg_out; + } + + T *param_; + T *ms_; + T *mom_; + T *mean_grad_; + const T *lr_; + T rho_; + T epsilon_; + T momentum_; + GradFunctor grad_functor_; +}; + template class RmspropOpKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& ctx) const override { - const auto* param_var = ctx.InputVar("Param"); - PADDLE_ENFORCE(param_var->IsType(), - "The Var(%s)'s type should be LoDTensor, " - "but the received is %s", - ctx.Inputs("Param").front(), param_var->Type().name()); - - auto* param_out = ctx.Output("ParamOut"); - auto* moment_out = ctx.Output("MomentOut"); - auto* mean_square_out = ctx.Output("MeanSquareOut"); - - auto grad = ctx.Input("Grad"); - - param_out->mutable_data(ctx.GetPlace()); - moment_out->mutable_data(ctx.GetPlace()); - mean_square_out->mutable_data(ctx.GetPlace()); - - float epsilon = ctx.Attr("epsilon"); - float rho = ctx.Attr("decay"); - float momentum = ctx.Attr("momentum"); + void Compute(const framework::ExecutionContext &ctx) const override { + using LoDTensor = framework::LoDTensor; + auto *grad_var = ctx.InputVar("Grad"); + auto *param_out = ctx.Output("ParamOut"); + auto *moment_out = ctx.Output("MomentOut"); + auto *mean_square_out = ctx.Output("MeanSquareOut"); + + auto epsilon = static_cast(ctx.Attr("epsilon")); + auto rho = static_cast(ctx.Attr("decay")); + auto momentum = static_cast(ctx.Attr("momentum")); bool centered = ctx.Attr("centered"); - auto p = EigenVector::Flatten(*ctx.Input("Param")); - auto ms = EigenVector::Flatten(*ctx.Input("MeanSquare")); - auto lr = EigenVector::Flatten(*ctx.Input("LearningRate")); - auto g = EigenVector::Flatten(*grad); - auto mom = EigenVector::Flatten(*ctx.Input("Moment")); - - auto p_out = EigenVector::Flatten(*param_out); - auto mom_out = EigenVector::Flatten(*moment_out); - auto ms_out = EigenVector::Flatten(*mean_square_out); - auto& place = *ctx.template device_context().eigen_device(); - 
- Eigen::DSizes grad_dsize(static_cast(grad->numel())); - - ms_out.device(place) = rho * ms + (1 - rho) * g * g; - if (centered) { - auto mg = EigenVector::Flatten(*ctx.Input("MeanGrad")); - auto* mean_grad_out = ctx.Output("MeanGradOut"); - mean_grad_out->mutable_data(ctx.GetPlace()); - auto mg_out = EigenVector::Flatten(*mean_grad_out); - - mg_out.device(place) = rho * mg + (1 - rho) * g; - mom_out.device(place) = momentum * mom + - lr.broadcast(grad_dsize) * g / - (ms_out - mg_out.square() + epsilon).sqrt(); + auto &p_tensor = *ctx.Input("Param"); + auto &ms_tensor = *ctx.Input("MeanSquare"); + auto &lr_tensor = *ctx.Input("LearningRate"); + auto &mom_tensor = *ctx.Input("Moment"); + + PADDLE_ENFORCE_EQ(&p_tensor, param_out, + "Param and ParamOut must be the same Tensor"); + PADDLE_ENFORCE_EQ(&mom_tensor, moment_out, + "Moment and MomentOut must be the same Tensor"); + PADDLE_ENFORCE_EQ(&ms_tensor, mean_square_out, + "MeanSquare and MeanSquareOut must be the same Tensor"); + + auto &dev_ctx = ctx.template device_context(); + size_t limit = static_cast(ms_tensor.numel()); + + if (grad_var->IsType()) { + auto &grad_tensor = grad_var->Get(); + + if (std::is_same::value) { + auto &place = + *ctx.template device_context().eigen_device(); + auto lr_value = lr_tensor.data()[0]; + + auto p = EigenVector::Flatten(p_tensor); + auto ms = EigenVector::Flatten(ms_tensor); + auto g = EigenVector::Flatten(grad_tensor); + auto mom = EigenVector::Flatten(mom_tensor); + + auto p_out = EigenVector::Flatten(*param_out); + auto mom_out = EigenVector::Flatten(*moment_out); + auto ms_out = EigenVector::Flatten(*mean_square_out); + + ms_out.device(place) = rho * ms + (1 - rho) * g * g; + if (centered) { + auto &mg_tensor = *ctx.Input("MeanGrad"); + auto mg = EigenVector::Flatten(mg_tensor); + auto *mean_grad_out = ctx.Output("MeanGradOut"); + PADDLE_ENFORCE(&mg_tensor, mean_grad_out, + "MeanGrad and MeanGradOut must be the same Tensor"); + auto mg_out = EigenVector::Flatten(*mean_grad_out); + + mg_out.device(place) = rho * mg + (1 - rho) * g; + mom_out.device(place) = + momentum * mom + + lr_value * g / (ms_out - mg_out.square() + epsilon).sqrt(); + } else { + mom_out.device(place) = + momentum * mom + lr_value * g / (ms_out + epsilon).sqrt(); + } + p_out.device(place) = p - mom_out; + } else { + DenseRmspropGradFunctor grad_func(grad_tensor.data()); + platform::ForRange for_range(dev_ctx, limit); + if (centered) { + auto &mg_tensor = *ctx.Input("MeanGrad"); + auto *mean_grad_out = ctx.Output("MeanGradOut"); + PADDLE_ENFORCE(&mg_tensor, mean_grad_out, + "MeanGrad and MeanGradOut must be the same Tensor"); + for_range(CenteredRmspropFunctor>( + param_out->mutable_data(ctx.GetPlace()), + mean_square_out->mutable_data(ctx.GetPlace()), + moment_out->mutable_data(ctx.GetPlace()), + mean_grad_out->mutable_data(ctx.GetPlace()), + lr_tensor.data(), rho, epsilon, momentum, grad_func)); + } else { + for_range(UncenteredRmspropFunctor>( + param_out->mutable_data(ctx.GetPlace()), + mean_square_out->mutable_data(ctx.GetPlace()), + moment_out->mutable_data(ctx.GetPlace()), lr_tensor.data(), + rho, epsilon, momentum, grad_func)); + } + } + } else if (grad_var->IsType()) { + auto &grad = grad_var->Get(); + auto *merged_grad = const_cast(ctx.scope()) + .Var() + ->GetMutable(); + + math::scatter::MergeAdd merge_func; + merge_func(dev_ctx, grad, merged_grad); + + platform::ForRange for_range(dev_ctx, limit); + const int64_t *rows; +#ifdef PADDLE_WITH_CUDA + if (platform::is_gpu_place(ctx.GetPlace())) { + rows = 
merged_grad->rows().CUDAData(ctx.GetPlace()); + } else { +#endif + rows = merged_grad->rows().data(); +#ifdef PADDLE_WITH_CUDA + } +#endif + auto &merged_tensor = merged_grad->value(); + int64_t row_count = merged_grad->rows().size(); + int64_t row_numel = merged_tensor.numel() / row_count; + SparseRmspropGradFunctor grad_func(merged_tensor.data(), rows, + row_numel, row_count); + + if (centered) { + auto &mg_tensor = *ctx.Input("MeanGrad"); + auto *mean_grad_out = ctx.Output("MeanGradOut"); + PADDLE_ENFORCE(&mg_tensor, mean_grad_out, + "MeanGrad and MeanGradOut must be the same Tensor"); + for_range(CenteredRmspropFunctor>( + param_out->mutable_data(ctx.GetPlace()), + mean_square_out->mutable_data(ctx.GetPlace()), + moment_out->mutable_data(ctx.GetPlace()), + mean_grad_out->mutable_data(ctx.GetPlace()), lr_tensor.data(), + rho, epsilon, momentum, grad_func)); + } else { + for_range(UncenteredRmspropFunctor>( + param_out->mutable_data(ctx.GetPlace()), + mean_square_out->mutable_data(ctx.GetPlace()), + moment_out->mutable_data(ctx.GetPlace()), lr_tensor.data(), + rho, epsilon, momentum, grad_func)); + } } else { - mom_out.device(place) = - momentum * mom + - lr.broadcast(grad_dsize) * g / (ms_out + epsilon).sqrt(); + PADDLE_THROW("RMSProp only supports LoDTensor or SelectedRows gradient"); } - p_out.device(place) = p - mom_out; } }; diff --git a/paddle/fluid/operators/sequence_concat_op.cc b/paddle/fluid/operators/sequence_concat_op.cc index 397a3182953e3f1afaeadeff6d53a4f22fb95d26..3234b60861da3d0c6a8434eb11fd0488a95e171f 100644 --- a/paddle/fluid/operators/sequence_concat_op.cc +++ b/paddle/fluid/operators/sequence_concat_op.cc @@ -90,11 +90,13 @@ REGISTER_OPERATOR(sequence_concat, paddle::framework::OperatorWithKernel, paddle::framework::DefaultGradOpDescMaker); template using Kernel = op::SeqConcatKernel; -REGISTER_OP_CPU_KERNEL(sequence_concat, Kernel, Kernel); +REGISTER_OP_CPU_KERNEL(sequence_concat, Kernel, Kernel, + Kernel); + REGISTER_OPERATOR(sequence_concat_grad, paddle::framework::OperatorWithKernel, op::SeqConcatGradShapeInferer); template using GradKernel = op::SeqConcatGradKernel; REGISTER_OP_CPU_KERNEL(sequence_concat_grad, GradKernel, - GradKernel); + GradKernel, GradKernel); diff --git a/paddle/fluid/operators/sequence_unpad_op.cc b/paddle/fluid/operators/sequence_unpad_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..f3a0762b9a4d3e080d5d6d10b249e0bd81980b95 --- /dev/null +++ b/paddle/fluid/operators/sequence_unpad_op.cc @@ -0,0 +1,153 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/sequence_unpad_op.h" + +namespace paddle { +namespace operators { + +class SequenceUnpadOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of SequenceUnpadOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Length"), + "Input(Length) of SequenceUnpadOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of SequenceUnpadOp should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE_GE(x_dims.size(), 2, + "The rank of Input(X) can't be less than 2."); + + auto len_dims = ctx->GetInputDim("Length"); + PADDLE_ENFORCE(len_dims.size() == 2 && len_dims[1] == 1, + "The shape of Input(Length) should be [batch_size, 1]."); + PADDLE_ENFORCE( + len_dims[0] == x_dims[0], + "Input(X) and Input(Length) should have the same first dimension."); + + int64_t out_dim_0 = -1; + if (ctx->IsRuntime()) { + out_dim_0 = x_dims[0] * x_dims[1]; + } + + std::vector out_dims_vec{out_dim_0}; + if (x_dims.size() == 2) { + out_dims_vec.push_back(1); + } else { + for (size_t i = 2; i < x_dims.size(); ++i) { + out_dims_vec.push_back(x_dims[i]); + } + } + ctx->SetOutputDim("Out", framework::make_ddim(out_dims_vec)); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("X")); + return framework::OpKernelType(data_type, ctx.device_context()); + } +}; + +class SequenceUnpadOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(LoDTensor, default LoDTensor) Input tensor which " + "contains the padded sequences with equal length."); + AddInput("Length", + "(LoDTensor) The input tensor which specifies the actual length of " + "sequences after unpadding."); + AddOutput( + "Out", + "(LoDTensor) The output tensor which contains unpadded sequences."); + AddComment(R"DOC( + Sequence Unpad Operator + + This operator removes the padding data in the input sequences and converts + them into sequences with actual length as output, identified by lod + information.
+ + Example: + + Given input tensor Input(X): + X.data = [[ 1.0, 2.0, 3.0, 4.0, 5.0], + [ 6.0, 7.0, 8.0, 9.0, 10.0], + [11.0, 12.0, 13.0, 14.0, 15.0]], + + in which there are 3 sequences padded to length 5, and the actual length + specified by Input(Length): + + Length.data = [[2], [3], [4]], + + after unpadding, Output(Out) will be: + + Out.data = [[1.0, 2.0, 6.0, 7.0, 8.0, 11.0, 12.0, 13.0, 14.0]] + Out.lod = [[0, 2, 5, 9]] + + )DOC"); + } +}; + +class SequenceUnpadGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of SequenceUnpadGradOp should not be null."); + PADDLE_ENFORCE( + ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) of SequenceUnpadGradOp should not be null."); + + if (ctx->HasOutput(framework::GradVarName("X"))) { + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + ctx->ShareLoD("X", /*->*/ framework::GradVarName("X")); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("X")); + return framework::OpKernelType(data_type, ctx.device_context()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(sequence_unpad, ops::SequenceUnpadOp, + ops::SequenceUnpadOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(sequence_unpad_grad, ops::SequenceUnpadGradOp); +REGISTER_OP_CPU_KERNEL( + sequence_unpad, + ops::SequenceUnpadOpKernel, + ops::SequenceUnpadOpKernel, + ops::SequenceUnpadOpKernel, + ops::SequenceUnpadOpKernel); +REGISTER_OP_CPU_KERNEL( + sequence_unpad_grad, + ops::SequenceUnpadGradOpKernel, + ops::SequenceUnpadGradOpKernel, + ops::SequenceUnpadGradOpKernel, + ops::SequenceUnpadGradOpKernel); diff --git a/paddle/fluid/operators/sequence_unpad_op.cu b/paddle/fluid/operators/sequence_unpad_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..75248372237ec2cb23122f6b16e64f6ce750ebf9 --- /dev/null +++ b/paddle/fluid/operators/sequence_unpad_op.cu @@ -0,0 +1,30 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
*/ + +#include "paddle/fluid/operators/sequence_unpad_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + sequence_unpad, + ops::SequenceUnpadOpKernel, + ops::SequenceUnpadOpKernel, + ops::SequenceUnpadOpKernel, + ops::SequenceUnpadOpKernel); +REGISTER_OP_CUDA_KERNEL( + sequence_unpad_grad, + ops::SequenceUnpadGradOpKernel, + ops::SequenceUnpadGradOpKernel, + ops::SequenceUnpadGradOpKernel, + ops::SequenceUnpadGradOpKernel); diff --git a/paddle/fluid/operators/sequence_unpad_op.h b/paddle/fluid/operators/sequence_unpad_op.h new file mode 100644 index 0000000000000000000000000000000000000000..ebe3118b985bdfd41ca55e8c572047aa87502ff4 --- /dev/null +++ b/paddle/fluid/operators/sequence_unpad_op.h @@ -0,0 +1,104 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/sequence_padding.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; +using LoD = framework::LoD; + +template +class SequenceUnpadOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x_t = ctx.Input("X"); + auto* len_t = ctx.Input("Length"); + auto* out_t = ctx.Output("Out"); + out_t->mutable_data(ctx.GetPlace()); + + const int64_t* seq_len_ptr = nullptr; + if (platform::is_gpu_place(ctx.GetPlace())) { + LoDTensor seq_len_cpu; + seq_len_cpu.Resize(len_t->dims()); + seq_len_ptr = seq_len_cpu.mutable_data(platform::CPUPlace()); + framework::TensorCopy(*len_t, platform::CPUPlace(), + ctx.template device_context(), + &seq_len_cpu); + } else { + seq_len_ptr = len_t->data(); + } + + size_t batch_size = x_t->dims()[0]; + std::vector out_lod0(batch_size + 1, 0); + for (size_t i = 0; i < batch_size; ++i) { + out_lod0[i + 1] = out_lod0[i] + seq_len_ptr[i]; + } + + framework::LoD out_lod; + out_lod.push_back(out_lod0); + out_t->set_lod(out_lod); + + std::vector out_dims_vec{static_cast(out_lod0.back())}; + if (x_t->dims().size() == 2) { + out_dims_vec.push_back(1); + } else { + for (size_t i = 2; i < x_t->dims().size(); ++i) { + out_dims_vec.push_back(x_t->dims()[i]); + } + } + out_t->Resize(framework::make_ddim(out_dims_vec)); + + int64_t padded_length = x_t->dims()[1]; + math::UnpaddingLoDTensorFunctor()( + ctx.template device_context(), *x_t, out_t, + padded_length, 0, false, math::kBatchLengthWidth); + } +}; + +template +class SequenceUnpadGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* d_x = ctx.Output(framework::GradVarName("X")); + if (d_x) { + const auto* d_out = ctx.Input(framework::GradVarName("Out")); + const auto* x_t = ctx.Input("X"); + d_x->mutable_data(ctx.GetPlace()); + + int padded_length = x_t->dims()[1]; + + LoDTensor zero_pads; + zero_pads.Resize({1, 1}); 
+ zero_pads.mutable_data(ctx.GetPlace()); + math::SetConstant set_zero; + auto& dev_ctx = ctx.template device_context(); + set_zero(dev_ctx, &zero_pads, static_cast(0)); + + math::PaddingLoDTensorFunctor()( + ctx.template device_context(), *d_out, d_x, zero_pads, + padded_length, 0, false, math::kBatchLengthWidth); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/platform/cpu_info.cc b/paddle/fluid/platform/cpu_info.cc index 2880c09263f10e9c624e11b77188171f48d9db28..b5f472d20f40fa182a4aa55ff384b0954e4ba9e3 100644 --- a/paddle/fluid/platform/cpu_info.cc +++ b/paddle/fluid/platform/cpu_info.cc @@ -128,7 +128,7 @@ bool MayIUse(const cpu_isa_t cpu_isa) { return cpu.has(Cpu::tAVX); case avx2: return cpu.has(Cpu::tAVX2); - case avx512_common: + case avx512f: return cpu.has(Cpu::tAVX512F); case avx512_core: return true && cpu.has(Cpu::tAVX512F) && cpu.has(Cpu::tAVX512BW) && diff --git a/paddle/fluid/platform/cpu_info.h b/paddle/fluid/platform/cpu_info.h index 30c8fbcfce92a8b06a175ddf198cde572f72b2a4..6810a1651a14cdb2080af846b21cad242b70bf35 100644 --- a/paddle/fluid/platform/cpu_info.h +++ b/paddle/fluid/platform/cpu_info.h @@ -43,7 +43,7 @@ typedef enum { sse42, avx, avx2, - avx512_common, + avx512f, avx512_core, avx512_core_vnni, avx512_mic, diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index f04395a8ac00f33501008aa12f22773ddda9b138..a251bfcd9914422cb6300adbbcdef3dfa79f441c 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -130,6 +130,13 @@ struct EOFException : public std::exception { #define UNLIKELY(condition) (condition == 0) #endif +#if !defined(_WIN32) +#define LIKELY(condition) __builtin_expect(static_cast(condition), 1) +#else +// there is no equivalent intrinsics in msvc. +#define LIKELY(condition) (condition != 0) +#endif + template inline typename std::enable_if::type throw_on_error( bool stat, const Args&... args) { diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index 4c99f4be321160caf0ee2f89a655bdfb933408e3..ab91ca5345047f3053eb8771e6a265d2a3011f85 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -116,7 +116,7 @@ void InitDevices(bool init_p2p, const std::vector devices) { platform::SetNumThreads(FLAGS_paddle_num_threads); #endif - if (platform::jit::MayIUse(platform::jit::avx512_common)) { + if (platform::jit::MayIUse(platform::jit::avx512f)) { #ifndef __AVX512F__ LOG(WARNING) << "AVX512F is available, Please re-compile on local machine"; #endif diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index 612f3bc0e7940663f84a55b2c4395a7b5119d5bb..a35147da90e87af85308431fd7dbe965bb1fd1d7 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -370,8 +370,8 @@ void ParseEvents(const std::vector>& events, std::vector> merged_events_list; if (merge_thread) { std::vector merged_events; - for (int i = 0; i < events.size(); ++i) { - for (int j = 0; j < events[i].size(); ++j) { + for (size_t i = 0; i < events.size(); ++i) { + for (size_t j = 0; j < events[i].size(); ++j) { merged_events.push_back(events[i][j]); } } diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 1709546fae288e84ce2b6fef4bfd1bd68da84b1d..339a7c98c6a2bba2cd46790cecc169ef447c63ce 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -57,6 +57,10 @@ limitations under the License. 
*/ #include "pybind11/stl.h" +DEFINE_bool(reader_queue_speed_test_mode, false, + "If set true, the queue.pop will only get data from queue but not " + "remove the data from queue for speed testing"); + // disable auto conversion to list in Python PYBIND11_MAKE_OPAQUE(paddle::framework::LoDTensorArray); @@ -380,7 +384,8 @@ All parameter, weight, gradient are variables in Paddle. return make_ddim(shape); }); auto *holder = var.GetMutable(); - holder->InitOnce(capacity, dims); + holder->InitOnce(capacity, dims, + FLAGS_reader_queue_speed_test_mode); return holder->GetQueue(); }, py::return_value_policy::copy); diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 7bbdf7de89cc932e0023952e3c8e102f92b06855..41678918b8bb54078091f892ce7a519dfc8a0014 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -113,7 +113,8 @@ def __bootstrap__(): 'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir', 'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb', 'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads', - "dist_threadpool_size", 'cpu_deterministic', 'eager_delete_tensor_gb' + 'dist_threadpool_size', 'cpu_deterministic', 'eager_delete_tensor_gb', + 'reader_queue_speed_test_mode' ] if core.is_compiled_with_dist(): read_env_flags.append('rpc_deadline') diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 43aa4a9e7c4ffe96a8f1346cd1bc3e4f309ba309..224781e6596e2b2bc6d3084f4780a33ed2903118 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -56,6 +56,7 @@ __all__ = [ 'sequence_expand', 'sequence_expand_as', 'sequence_pad', + 'sequence_unpad', 'lstm_unit', 'reduce_sum', 'reduce_mean', @@ -64,6 +65,7 @@ __all__ = [ 'reduce_prod', 'sequence_first_step', 'sequence_last_step', + 'sequence_slice', 'dropout', 'split', 'ctc_greedy_decoder', @@ -1902,6 +1904,76 @@ def sequence_last_step(input): return sequence_pool(input=input, pool_type="last") +def sequence_slice(input, offset, length, name=None): + """ + **Sequence Slice Layer** + + The layer crops a subsequence from given sequence with given start + offset and subsequence length. + + It only supports sequence data (LoDTensor with lod_level equal to 1). + + .. code-block:: text + + - Case: + + Given the input Variable **input**: + + input.data = [[a1, a2], [b1, b2], [c1, c2], [d1, d2], [e1, e2]], + input.lod = [[3, 2]], + input.dims = (5, 2), + + with offset.data = [[0], [1]] and length.data = [[2], [1]], + + the output Variable will be + + out.data = [[a1, a2], [b1, b2], [e1, e2]], + out.lod = [[2, 1]], + out.dims = (3, 2). + + NOTE: The first dimension size of **input**, **offset** and **length** + should be equal. The **offset** should start from 0. + + Args: + input(Variable): The input Variable which consists of the complete + sequences. + offset(Variable): The offset to slice each sequence. + length(Variable): The length of each subsequence. + name(str|None): A name for this layer(optional). If set None, the + layer will be named automatically. + + Returns: + Variable: The output subsequences. + + Examples: + + .. 
code-block:: python + + import numpy as np + seqs = fluid.layers.data(name='x', shape=[10, 5], + dtype='float32', lod_level=1) + offset = fluid.layers.assign(input=np.array([[0, 1]]).astype("int32")) + length = fluid.layers.assign(input=np.array([[2, 1]]).astype("int32")) + subseqs = fluid.layers.sequence_slice(input=seqs, offset=offset, + length=length) + """ + helper = LayerHelper("sequence_slice", **locals()) + dtype = helper.input_dtype() + out = helper.create_tmp_variable(dtype) + + offset.stop_gradient = True + length.stop_gradient = True + + helper.append_op( + type="sequence_slice", + inputs={"X": input, + "Offset": offset, + "Length": length}, + outputs={"Out": out}) + + return out + + @templatedoc() def pool2d(input, pool_size=-1, @@ -2793,7 +2865,7 @@ def sequence_expand_as(x, y, name=None): @templatedoc() -def sequence_pad(x, pad_value, maxlen=None): +def sequence_pad(x, pad_value, maxlen=None, name=None): """ ${comment} @@ -2807,7 +2879,9 @@ def sequence_pad(x, pad_value, maxlen=None): None or any positive int. When it is None, all sequences will be padded up to the length of the longest one among them; when it a certain positive value, it must be greater than the length of the - longest original sequence." + longest original sequence. + name(str|None): A name for this layer(optional). If set None, the layer + will be named automatically. Returns: Variable: The padded sequence batch and the original lengths before @@ -2844,6 +2918,66 @@ def sequence_pad(x, pad_value, maxlen=None): return out, length + +def sequence_unpad(x, length, name=None): + """ + **Sequence Unpad Layer** + + This layer removes the padding data in the input sequences and converts + them into sequences with actual length as output, identified by lod + information. + + .. code-block:: text + + Example: + + Given input Variable **x**: + x.data = [[ 1.0, 2.0, 3.0, 4.0, 5.0], + [ 6.0, 7.0, 8.0, 9.0, 10.0], + [11.0, 12.0, 13.0, 14.0, 15.0]], + + in which there are 3 sequences padded to length 5, and the actual length + specified by input Variable **length**: + + length.data = [[2], [3], [4]], + + after unpadding, the output Variable will be: + + out.data = [[1.0, 2.0, 6.0, 7.0, 8.0, 11.0, 12.0, 13.0, 14.0]] + out.lod = [[2, 3, 4]] + + Args: + x(Variable): Input Variable which contains the padded sequences with + equal length. + length(Variable): The Variable that specifies the actual length of + sequences after unpadding. + name(str|None): A name for this layer(optional). If set None, the layer + will be named automatically. + + Returns: + Variable: The Variable which contains the unpadded sequences. + + Examples: + ..
code-block:: python + + x = fluid.layers.data(name='x', shape=[10, 5], dtype='float32') + len = fluid.layers.data(name='length', shape=[1], dtype='int64') + out = fluid.layers.sequence_unpad(x=x, length=len) + """ + + helper = LayerHelper('sequence_unpad', input=x, **locals()) + dtype = helper.input_dtype() + out = helper.create_tmp_variable(dtype) + + length.stop_gradient = True + + helper.append_op( + type='sequence_unpad', + inputs={'X': x, + 'Length': length}, + outputs={'Out': out}) + return out + + def beam_search(pre_ids, pre_scores, ids, diff --git a/python/paddle/fluid/tests/unittests/dist_simnet_bow.py b/python/paddle/fluid/tests/unittests/dist_simnet_bow.py index 6456d1b53a129db04ace7ff4413a3d76e922ccde..fac5e037a46715d146e354825f09ee8ccc4f3d70 100644 --- a/python/paddle/fluid/tests/unittests/dist_simnet_bow.py +++ b/python/paddle/fluid/tests/unittests/dist_simnet_bow.py @@ -81,7 +81,10 @@ def get_optimizer(): return optimizer -def train_network(batch_size, is_distributed=False, is_sparse=False): +def train_network(batch_size, + is_distributed=False, + is_sparse=False, + is_self_contained_lr=False): # query q = fluid.layers.data( name="query_ids", shape=[1], dtype="int64", lod_level=1) @@ -93,7 +96,9 @@ def train_network(batch_size, is_distributed=False, is_sparse=False): param_attr=fluid.ParamAttr( initializer=fluid.initializer.Constant(value=0.01), name="__emb__", - learning_rate=emb_lr), + learning_rate=emb_lr) if is_self_contained_lr else fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01), + name="__emb__"), is_sparse=is_sparse) ## vsum q_sum = fluid.layers.sequence_pool(input=q_emb, pool_type='sum') @@ -119,7 +124,9 @@ def train_network(batch_size, is_distributed=False, is_sparse=False): param_attr=fluid.ParamAttr( initializer=fluid.initializer.Constant(value=0.01), name="__emb__", - learning_rate=emb_lr), + learning_rate=emb_lr) if is_self_contained_lr else fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01), + name="__emb__"), is_sparse=is_sparse) ## vsum pt_sum = fluid.layers.sequence_pool(input=pt_emb, pool_type='sum') @@ -144,7 +151,9 @@ def train_network(batch_size, is_distributed=False, is_sparse=False): param_attr=fluid.ParamAttr( initializer=fluid.initializer.Constant(value=0.01), name="__emb__", - learning_rate=emb_lr), + learning_rate=emb_lr) if is_self_contained_lr else fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01), + name="__emb__"), is_sparse=is_sparse) ## vsum nt_sum = fluid.layers.sequence_pool(input=nt_emb, pool_type='sum') @@ -220,7 +229,10 @@ class TestDistSimnetBow2x2(TestDistRunnerBase): def get_model(self, batch_size=2): # Train program avg_cost, acc, predict = \ - train_network(batch_size, bool(int(os.environ["IS_DISTRIBUTED"])), bool(int(os.environ["IS_SPARSE"]))) + train_network(batch_size, + bool(int(os.environ["IS_DISTRIBUTED"])), + bool(int(os.environ["IS_SPARSE"])), + bool(int(os.environ["IS_SELF_CONTAINED_LR"]))) inference_program = fluid.default_main_program().clone() diff --git a/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py b/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py index e971f29db42a7c1a2394505a8ece3d2fd6b347e9..11095f23591edc41a82962149a52096fa17cfb93 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py +++ b/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py @@ -25,7 +25,11 @@ class TestDistSimnetBowDense2x2(TestDistBase): self._enforce_place = "CPU" def test_simnet_bow(self): - need_envs = {"IS_DISTRIBUTED": '0', 
"IS_SPARSE": '0'} + need_envs = { + "IS_DISTRIBUTED": '0', + "IS_SPARSE": '0', + 'IS_SELF_CONTAINED_LR': '1' + } self.check_with_place( "dist_simnet_bow.py", delta=1e-5, @@ -39,7 +43,11 @@ class TestDistSimnetBow2x2DenseAsync(TestDistBase): self._enforce_place = "CPU" def test_simnet_bow(self): - need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '0'} + need_envs = { + "IS_DISTRIBUTED": '0', + "IS_SPARSE": '0', + 'IS_SELF_CONTAINED_LR': '1' + } self.check_with_place( "dist_simnet_bow.py", delta=100, @@ -53,7 +61,11 @@ class TestDistSimnetBowSparse2x2(TestDistBase): self._enforce_place = "CPU" def test_simnet_bow(self): - need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '1'} + need_envs = { + "IS_DISTRIBUTED": '0', + "IS_SPARSE": '1', + 'IS_SELF_CONTAINED_LR': '1' + } self.check_with_place( "dist_simnet_bow.py", delta=1e-5, @@ -67,7 +79,11 @@ class TestDistSimnetBow2x2SparseAsync(TestDistBase): self._enforce_place = "CPU" def test_simnet_bow(self): - need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '1'} + need_envs = { + "IS_DISTRIBUTED": '0', + "IS_SPARSE": '1', + 'IS_SELF_CONTAINED_LR': '1' + } self.check_with_place( "dist_simnet_bow.py", delta=100, @@ -75,5 +91,59 @@ class TestDistSimnetBow2x2SparseAsync(TestDistBase): need_envs=need_envs) +class TestDistSimnetBow2x2LookupTableSync(TestDistBase): + def _setup_config(self): + self._sync_mode = True + self._enforce_place = "CPU" + + def test_simnet_bow(self): + need_envs = { + "IS_DISTRIBUTED": '1', + "IS_SPARSE": '1', + 'IS_SELF_CONTAINED_LR': '1' + } + self.check_with_place( + "dist_simnet_bow.py", + delta=1e-5, + check_error_log=False, + need_envs=need_envs) + + +class TestDistSimnetBow2x2LookupTableAsync(TestDistBase): + def _setup_config(self): + self._sync_mode = False + self._enforce_place = "CPU" + + def test_simnet_bow(self): + need_envs = { + "IS_DISTRIBUTED": '1', + "IS_SPARSE": '1', + 'IS_SELF_CONTAINED_LR': '1' + } + self.check_with_place( + "dist_simnet_bow.py", + delta=100, + check_error_log=False, + need_envs=need_envs) + + +class TestDistSimnetBow2x2LookupTableNotContainLRSync(TestDistBase): + def _setup_config(self): + self._sync_mode = True + self._enforce_place = "CPU" + + def test_simnet_bow(self): + need_envs = { + "IS_DISTRIBUTED": '1', + "IS_SPARSE": '1', + 'IS_SELF_CONTAINED_LR': '0' + } + self.check_with_place( + "dist_simnet_bow.py", + delta=1e-5, + check_error_log=False, + need_envs=need_envs) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 1d8d0b55f0c5d7cffa01a100847bdf48b6d7023d..dc70477ebe1cfbffd207ebb4bbf9d9f39893d79e 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -194,6 +194,14 @@ class TestBook(unittest.TestCase): self.assertIsNotNone(layers.sequence_expand(x=x, y=y, ref_level=1)) print(str(program)) + def test_sequence_unpad(self): + program = Program() + with program_guard(program): + x = layers.data(name='x', shape=[10, 5], dtype='float32') + length = layers.data(name='length', shape=[1], dtype='int64') + self.assertIsNotNone(layers.sequence_unpad(x=x, length=length)) + print(str(program)) + def test_lstm_unit(self): program = Program() with program_guard(program): @@ -406,6 +414,19 @@ class TestBook(unittest.TestCase): self.assertIsNotNone(out) print(str(program)) + def test_sequence_slice(self): + program = Program() + with program_guard(program): + import numpy as np + seqs = layers.data( + name='x', 
shape=[10, 5], dtype='float32', lod_level=1) + offset = layers.assign(input=np.array([[0, 1]]).astype('int32')) + length = layers.assign(input=np.array([[2, 1]]).astype('int32')) + out = layers.sequence_slice( + input=seqs, offset=offset, length=length) + self.assertIsNotNone(out) + print(str(program)) + def test_lod_reset(self): program = Program() with program_guard(program): diff --git a/python/paddle/fluid/tests/unittests/test_rmsprop_op.py b/python/paddle/fluid/tests/unittests/test_rmsprop_op.py index 70848e4e2239e2be160bb0c1a28a5aecd01a87dc..eb12bc741767340a3e7e3580a8b95065d4267693 100644 --- a/python/paddle/fluid/tests/unittests/test_rmsprop_op.py +++ b/python/paddle/fluid/tests/unittests/test_rmsprop_op.py @@ -19,33 +19,76 @@ import unittest import numpy as np import paddle.fluid.core as core from paddle.fluid.op import Operator +import paddle.fluid as fluid + + +def create_selected_rows_and_tensor(scope, place, height, row_num, + embedding_size): + sr = scope.var("@selected_rows@").get_selected_rows() + tensor = scope.var("grad").get_tensor() + + rows = np.random.random_integers( + low=0, high=height - 1, size=[row_num, ]).astype('int64') + sr_val = np.random.random(size=[row_num, embedding_size]).astype('float32') + + sr.set_height(height) + sr.set_rows(rows) + sr.get_tensor().set(sr_val, place) + + tensor_val = np.zeros(shape=[height, embedding_size], dtype='float32') + for i in range(row_num): + row = rows[i] + tensor_val[row, :] = tensor_val[row, :] + sr_val[i, :] + + tensor.set(tensor_val, place) + return tensor_val, sr_val class TestBase(unittest.TestCase): - def setup(self, centered, epsilon=1e-6): + def setup(self, + place, + is_sparse, + centered, + size, + row_num=None, + epsilon=1e-6): np.random.seed(5) # fix seed + self.scope = fluid.global_scope() + self.place = place + self.param_name = "param" - self.param = np.random.random((123, 321)).astype("float32") + self.param = np.random.random(size).astype("float32") self.mean_square_name = "mean_square" - self.mean_square = np.random.random((123, 321)).astype("float32") + self.mean_square = np.random.uniform( + low=1, high=2, size=size).astype("float32") self.mean_grad_name = "mean_grad" - self.mean_grad = np.random.random((123, 321)).astype("float32") + self.mean_grad = np.random.random(size).astype("float32") self.lr_name = "lr" self.learning_rate = np.array([0.01]).astype("float32") self.grad_name = "grad" - self.grad = np.random.random((123, 321)).astype("float32") + + self.is_sparse = is_sparse + if self.is_sparse: + self.grad_sr_name = "@selected_rows@" + self.grad, self.grad_sr = create_selected_rows_and_tensor( + self.scope, place, size[0], row_num, size[1]) + else: + self.grad = np.random.random(size).astype("float32") + grad_tensor = self.scope.var(self.grad_name).get_tensor() + grad_tensor.set(self.grad, place) self.moment_name = "moment" - self.moment = np.zeros((123, 321)).astype("float32") + self.moment = np.random.uniform( + low=0, high=1, size=size).astype("float32") self.epsilon = epsilon self.decay = 0.9 - self.momentum = 0.0 + self.momentum = 0.1 self.centered = centered self.ms_out = self.decay * self.mean_square + (1 - self.decay @@ -61,118 +104,122 @@ class TestBase(unittest.TestCase): self.param_out = self.param - self.moment_out - def check(self, - actual_t, - expect_t, - place, - out_name, - atol=1e-5, - equal_nan=False): - self.assertTrue( - np.allclose( - actual_t, expect_t, atol=atol, equal_nan=equal_nan), - "Output (" + out_name + ") has diff at " + str(place) + "\nExpect " - + str(expect_t) + 
"\n" + "But Got" + str(actual_t)) - - -class TestRmspropOp(TestBase): - def check_with_place(self, place, centered, epsilon): - self.setup(centered, epsilon) - scope = core.Scope() - # create and initialize Param Variable - param = scope.var(self.param_name).get_tensor() - param.set(self.param, place) + self.param_tensor = self.scope.var(self.param_name).get_tensor() + self.param_tensor.set(self.param, place) - mean_square = scope.var(self.mean_square_name).get_tensor() - mean_square.set(self.mean_square, place) + self.mean_square_tensor = self.scope.var( + self.mean_square_name).get_tensor() + self.mean_square_tensor.set(self.mean_square, place) - lr = scope.var(self.lr_name).get_tensor() + lr = self.scope.var(self.lr_name).get_tensor() lr.set(self.learning_rate, place) - grad = scope.var(self.grad_name).get_tensor() - grad.set(self.grad, place) + self.moment_tensor = self.scope.var(self.moment_name).get_tensor() + self.moment_tensor.set(self.moment, place) - moment = scope.var(self.moment_name).get_tensor() - moment.set(self.moment, place) + if self.centered: + self.mean_grad_tensor = self.scope.var( + self.mean_grad_name).get_tensor() + self.mean_grad_tensor.set(self.mean_grad, place) - # create and run sgd operator + def check(self, actual_t, expect_t, place, out_name, atol=1e-5): + self.assertTrue( + np.allclose( + actual_t, expect_t, atol=atol), + "Output (" + out_name + ") has diff at " + str(place) + "\nExpect " + + str(expect_t) + "\n" + "But Got" + str(actual_t)) - if self.centered: - mean_grad = scope.var(self.mean_grad_name).get_tensor() - mean_grad.set(self.mean_grad, place) - - rmsprop_op = Operator( - "rmsprop", - Param=self.param_name, - Grad=self.grad_name, - MeanSquare=self.mean_square_name, - MeanGrad=self.mean_grad_name, - Moment=self.moment_name, - LearningRate=self.lr_name, - ParamOut=self.param_name, - MeanSquareOut=self.mean_square_name, - MomentOut=self.moment_name, - MeanGradOut=self.mean_grad_name, - epsilon=self.epsilon, - decay=self.decay, - momentum=self.momentum, - centered=True) - else: - rmsprop_op = Operator( - "rmsprop", - Param=self.param_name, - Grad=self.grad_name, - MeanSquare=self.mean_square_name, - Moment=self.moment_name, - LearningRate=self.lr_name, - ParamOut=self.param_name, - MeanSquareOut=self.mean_square_name, - MomentOut=self.moment_name, - epsilon=self.epsilon, - decay=self.decay, - momentum=self.momentum, - centered=False) - - rmsprop_op.run(scope, place) - - atol = 1e-5 - equal_nan = False + +class TestRmspropOp(TestBase): + def check_with_place(self, + place, + is_sparse, + centered, + size, + row_num=None, + epsilon=1e-6): + self.setup(place, is_sparse, centered, size, row_num, epsilon) + self.run_and_check() + + def run_and_check(self): + grad_name = self.grad_sr_name if self.is_sparse else self.grad_name + + kwargs = { + 'Param': self.param_name, + 'Grad': grad_name, + 'MeanSquare': self.mean_square_name, + 'Moment': self.moment_name, + 'LearningRate': self.lr_name, + 'ParamOut': self.param_name, + 'MeanSquareOut': self.mean_square_name, + 'MomentOut': self.moment_name, + 'epsilon': self.epsilon, + 'decay': self.decay, + 'momentum': self.momentum, + 'centered': self.centered + } if self.centered: - atol = 1e-3 - equal_nan = True + kwargs['MeanGrad'] = self.mean_grad_name + kwargs['MeanGradOut'] = self.mean_grad_name + + rmsprop_op = Operator('rmsprop', **kwargs) + atol = 1e-6 + + rmsprop_op.run(self.scope, self.place) self.check( - np.array(mean_square), self.ms_out, place, self.mean_square_name) + np.array(self.mean_square_tensor), 
+ self.ms_out, + self.place, + self.mean_square_name, + atol=atol) self.check( - np.array(moment), + np.array(self.moment_tensor), self.moment_out, - place, + self.place, self.moment_name, - atol=atol, - equal_nan=equal_nan) + atol=atol) self.check( - np.array(param), + np.array(self.param_tensor), self.param_out, - place, + self.place, self.param_name, - atol=atol, - equal_nan=equal_nan) + atol=atol) if self.centered: self.check( - np.array(mean_grad), self.mg_out, place, self.mean_grad_name) + np.array(self.mean_grad_tensor), self.mg_out, self.place, + self.mean_grad_name) def test_rmsprop(self): places = [core.CPUPlace()] if core.is_compiled_with_cuda(): places.append(core.CUDAPlace(0)) + + size = (128, 320) for place in places: - self.check_with_place(place, False, 1e-6) - self.check_with_place(place, False, 1e-10) - self.check_with_place(place, True, 1e-6) - self.check_with_place(place, True, 1e-10) + for centered in [False, True]: + with fluid.scope_guard(core.Scope()): + self.check_with_place( + place, is_sparse=False, centered=centered, size=size) + + with fluid.scope_guard(core.Scope()): + self.check_with_place( + place, + is_sparse=True, + centered=centered, + row_num=512, + size=size) + + with fluid.scope_guard(core.Scope()): + self.check_with_place( + place, + is_sparse=True, + centered=centered, + row_num=60, + size=size) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_sequence_unpad_op.py b/python/paddle/fluid/tests/unittests/test_sequence_unpad_op.py new file mode 100644 index 0000000000000000000000000000000000000000..673b0ea180464b8b8f6f5c6e76d5c5c80f347d25 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_sequence_unpad_op.py @@ -0,0 +1,75 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+import unittest
+import six
+import numpy as np
+from op_test import OpTest
+
+
+class TestSequenceUnpadOp(OpTest):
+    def init(self):
+        self.length = [2, 3, 4]
+        self.x_shape = (3, 5)
+        self.dtype = "float32"
+
+    def compute(self):
+        assert len(self.length) == self.x_shape[0]
+        x = np.random.random(self.x_shape).astype(self.dtype)
+        out_lod = [self.length]
+
+        out = x[0, 0:self.length[0]]
+        for i in six.moves.xrange(1, x.shape[0]):
+            out = np.append(out, x[i, 0:self.length[i]], axis=0)
+
+        out_shape = (sum(self.length), )
+        if len(self.x_shape) == 2:
+            out_shape = out_shape + (1, )
+        else:
+            out_shape = out_shape + self.x_shape[2:]
+
+        self.inputs = {
+            'X': x,
+            'Length': np.array(self.length).astype('int64').reshape(-1, 1)
+        }
+        self.outputs = {'Out': (out.reshape(out_shape), out_lod)}
+
+    def setUp(self):
+        self.op_type = 'sequence_unpad'
+        self.init()
+        self.compute()
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out")
+
+
+class TestSequenceUnpadOp2(TestSequenceUnpadOp):
+    def init(self):
+        self.length = [2, 3, 4]
+        self.x_shape = (3, 5, 4, 3)
+        self.dtype = "float32"
+
+
+class TestSequenceUnpadOp3(TestSequenceUnpadOp):
+    def init(self):
+        self.length = [5, 2, 3, 4]
+        self.x_shape = (4, 5, 3, 3, 6)
+        self.dtype = "float64"
+
+
+if __name__ == '__main__':
+    unittest.main()
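The compute() helper above mirrors what sequence_unpad is expected to do: keep only the first length[i] time steps of each padded row and concatenate them into one LoD tensor. A small self-contained NumPy illustration (the values are made up for this example):

import numpy as np

# Three sequences padded to a common length of 5.
x = np.arange(15, dtype="float32").reshape(3, 5)
length = np.array([2, 3, 4], dtype="int64")

# Trim each row to its real length and concatenate, as compute() does.
out = np.concatenate([x[i, :length[i]] for i in range(x.shape[0])])

print(out)        # [ 0.  1.  5.  6.  7. 10. 11. 12. 13.]
print(out.shape)  # (9,) == (sum(length),); the 2-D case is reshaped to (9, 1) in the test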
diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py
index 91db85b8ec6a32fee3b7aa8ab76429a4a197fcc3..2192139f8d5950286691a77333dd8ec35505b033 100644
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -1119,6 +1119,7 @@ to transpile() call.")
     def _split_table_grad_and_add_send_vars(self, program, pserver_endpoints):
         # 2. add split_ids_op and send_op to send gradient to pservers
+        # there should only be one table_name
         all_ops = program.global_block().ops
         table_grad_name = grad_var_name(self.table_name)
@@ -1143,7 +1144,7 @@ to transpile() call.")
                 if self.sync_mode else []
             },
             attrs={
-                "sync_mode": self.sync_mode,
+                "sync_mode": not self.sync_mode,
                 "epmap": pserver_endpoints,
                 RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE,
                 OP_ROLE_VAR_ATTR_NAME: [
@@ -1189,7 +1190,15 @@ to transpile() call.")
     def _create_table_optimize_block(self, pserver_index, pserver_program,
                                      pre_block_idx, grad_to_block_id):
         # STEP: create table optimize block
+        table_opt_block = pserver_program._create_block(pre_block_idx)
         # create table param and grad var in pserver program
+        # create table optimize block in pserver program
+        table_opt_op = [
+            op for op in self.optimize_ops
+            if 'Param' in op.input_names and op.input("Param")[0] ==
+            self.table_name
+        ][0]
+
         origin_param_var = self.origin_program.global_block().vars[
             self.table_name]
@@ -1205,19 +1214,16 @@ to transpile() call.")
             dtype=origin_param_var.dtype,
             type=core.VarDesc.VarType.SELECTED_ROWS,
             persistable=True)
+        # parameter must be selected rows
         param_var.desc.set_type(core.VarDesc.VarType.SELECTED_ROWS)
         grad_var = pserver_program.global_block()._clone_variable(
             self.origin_program.global_block().vars[grad_var_name(
                 self.table_name)])
-        # create table optimize block in pserver program
-        table_opt_op = [
-            op for op in self.optimize_ops
-            if 'Param' in op.input_names and op.input("Param")[0] ==
-            self.table_name
-        ][0]
-        table_opt_block = pserver_program._create_block(pre_block_idx)
+        lr_var = pserver_program.global_block()._clone_variable(
+            self.origin_program.global_block().vars[table_opt_op.input(
+                "LearningRate")[0]])
         if self.sync_mode:
             # create grad vars in pserver program
@@ -1249,8 +1255,6 @@ to transpile() call.")
             grad_var = pserver_program.global_block()._rename_var(
                 origin_grad_name, splited_grad_name)
-        lr_var = pserver_program.global_block().vars[table_opt_op.input(
-            "LearningRate")[0]]
         inputs = {
             "Param": [param_var],
             "Grad": [grad_var],