Unverified commit 5998d3cc, authored by lujun, committed by GitHub

Merge pull request #8 from PaddlePaddle/develop

merge to local
......@@ -203,7 +203,7 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
ENDIF()
SET(PROTOBUF_REPO "https://github.com/google/protobuf.git")
SET(PROTOBUF_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546")
SET(PROTOBUF_TAG "v3.6.1")
ExternalProject_Add(
${TARGET_NAME}
......@@ -231,7 +231,7 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
)
ENDFUNCTION()
SET(PROTOBUF_VERSION 3.1)
SET(PROTOBUF_VERSION 3.6.1)
IF(NOT PROTOBUF_FOUND)
build_protobuf(extern_protobuf FALSE)
......
......@@ -74,8 +74,8 @@ IF(PYTHONINTERP_FOUND)
find_python_module(wheel REQUIRED)
find_python_module(google.protobuf REQUIRED)
FIND_PACKAGE(NumPy REQUIRED)
IF(${PY_GOOGLE.PROTOBUF_VERSION} AND ${PY_GOOGLE.PROTOBUF_VERSION} VERSION_LESS "3.0.0")
MESSAGE(FATAL_ERROR "Found Python Protobuf ${PY_GOOGLE.PROTOBUF_VERSION} < 3.0.0, "
IF(${PY_GOOGLE.PROTOBUF_VERSION} AND ${PY_GOOGLE.PROTOBUF_VERSION} VERSION_LESS "3.6.1")
MESSAGE(FATAL_ERROR "Found Python Protobuf ${PY_GOOGLE.PROTOBUF_VERSION} < 3.6.1, "
"please use pip to upgrade protobuf. pip install -U protobuf")
ENDIF()
ENDIF(PYTHONINTERP_FOUND)
......
......@@ -261,7 +261,7 @@ paddle.fluid.layers.increment ArgSpec(args=['x', 'value', 'in_place'], varargs=N
paddle.fluid.layers.array_write ArgSpec(args=['x', 'i', 'array'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.create_array ArgSpec(args=['dtype'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.less_than ArgSpec(args=['x', 'y', 'force_cpu', 'cond'], varargs=None, keywords='ignored', defaults=(None, None))
paddle.fluid.layers.equal ArgSpec(args=['x', 'y', 'cond'], varargs=None, keywords='ignored', defaults=(None,))
paddle.fluid.layers.equal ArgSpec(args=['x', 'y', 'cond'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.array_read ArgSpec(args=['array', 'i'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.array_length ArgSpec(args=['array'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.IfElse.__init__ ArgSpec(args=['self', 'cond', 'name'], varargs=None, keywords=None, defaults=(None,))
......@@ -473,11 +473,11 @@ paddle.fluid.LoDTensor.has_valid_recursive_sequence_lengths has_valid_recursive_
paddle.fluid.LoDTensor.lod lod(self: paddle.fluid.core.LoDTensor) -> List[List[int]]
paddle.fluid.LoDTensor.recursive_sequence_lengths recursive_sequence_lengths(self: paddle.fluid.core.LoDTensor) -> List[List[int]]
paddle.fluid.LoDTensor.set 1. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float32], arg1: paddle::platform::CPUPlace) -> None 2. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int32], arg1: paddle::platform::CPUPlace) -> None 3. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float64], arg1: paddle::platform::CPUPlace) -> None 4. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int64], arg1: paddle::platform::CPUPlace) -> None 5. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[bool], arg1: paddle::platform::CPUPlace) -> None 6. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint16], arg1: paddle::platform::CPUPlace) -> None 7. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint8], arg1: paddle::platform::CPUPlace) -> None 8. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int8], arg1: paddle::platform::CPUPlace) -> None 9. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float32], arg1: paddle::platform::CUDAPlace) -> None 10. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int32], arg1: paddle::platform::CUDAPlace) -> None 11. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float64], arg1: paddle::platform::CUDAPlace) -> None 12. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int64], arg1: paddle::platform::CUDAPlace) -> None 13. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[bool], arg1: paddle::platform::CUDAPlace) -> None 14. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint16], arg1: paddle::platform::CUDAPlace) -> None 15. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint8], arg1: paddle::platform::CUDAPlace) -> None 16. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int8], arg1: paddle::platform::CUDAPlace) -> None 17. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float32], arg1: paddle::platform::CUDAPinnedPlace) -> None 18. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int32], arg1: paddle::platform::CUDAPinnedPlace) -> None 19. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float64], arg1: paddle::platform::CUDAPinnedPlace) -> None 20. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int64], arg1: paddle::platform::CUDAPinnedPlace) -> None 21. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[bool], arg1: paddle::platform::CUDAPinnedPlace) -> None 22. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint16], arg1: paddle::platform::CUDAPinnedPlace) -> None 23. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint8], arg1: paddle::platform::CUDAPinnedPlace) -> None 24. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int8], arg1: paddle::platform::CUDAPinnedPlace) -> None
paddle.fluid.LoDTensor.set_lod set_lod(self: paddle.fluid.core.LoDTensor, arg0: List[List[int]]) -> None
paddle.fluid.LoDTensor.set_recursive_sequence_lengths set_recursive_sequence_lengths(self: paddle.fluid.core.LoDTensor, arg0: List[List[int]]) -> None
paddle.fluid.LoDTensor.set_lod set_lod(self: paddle.fluid.core.LoDTensor, lod: List[List[int]]) -> None
paddle.fluid.LoDTensor.set_recursive_sequence_lengths set_recursive_sequence_lengths(self: paddle.fluid.core.LoDTensor, recursive_sequence_lengths: List[List[int]]) -> None
paddle.fluid.LoDTensor.shape shape(self: paddle.fluid.core.Tensor) -> List[int]
paddle.fluid.LoDTensorArray.__init__ __init__(self: paddle.fluid.core.LoDTensorArray) -> None
paddle.fluid.LoDTensorArray.append append(self: paddle.fluid.core.LoDTensorArray, arg0: paddle.fluid.core.LoDTensor) -> None
paddle.fluid.LoDTensorArray.append append(self: paddle.fluid.core.LoDTensorArray, tensor: paddle.fluid.core.LoDTensor) -> None
paddle.fluid.CPUPlace.__init__ __init__(self: paddle.fluid.core.CPUPlace) -> None
paddle.fluid.CUDAPlace.__init__ __init__(self: paddle.fluid.core.CUDAPlace, arg0: int) -> None
paddle.fluid.CUDAPinnedPlace.__init__ __init__(self: paddle.fluid.core.CUDAPinnedPlace) -> None
......
......@@ -50,7 +50,12 @@ cc_library(data_balance_op_handle SRCS data_balance_op_handle.cc DEPS op_handle_
cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope)
cc_library(memory_optimize_helper SRCS memory_optimize_helper.cc DEPS graph graph_helper)
if(WITH_GPU)
cc_library(memory_optimize_helper SRCS memory_optimize_helper.cc DEPS graph graph_helper gpu_info)
else()
cc_library(memory_optimize_helper SRCS memory_optimize_helper.cc DEPS graph graph_helper cpu_info)
endif()
cc_library(memory_optimize_pass SRCS memory_optimize_pass.cc DEPS memory_optimize_helper pass)
cc_library(inplace_op_pass SRCS inplace_op_pass.cc DEPS memory_optimize_pass op_info)
cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper)
......
......@@ -240,7 +240,9 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
continue;
}
}
VLOG(3) << "Start Apply Pass " << pass->Type();
graph = pass->Apply(std::move(graph));
VLOG(3) << "Finish Apply Pass " << pass->Type();
}
return graph;
}
......
......@@ -49,7 +49,7 @@ DEFINE_bool(
"If this option turns on, only these op in whitelist can be inplaced."
"If it turns off, all of the running op can be candidate of inplaced op."
"Such as scale, elementwise_add"
"By default, it's turned on");
"By default, it's turned off");
DECLARE_string(memory_optimize_debug);
......
......@@ -13,13 +13,19 @@
// limitations under the License.
#include "paddle/fluid/framework/details/memory_optimize_helper.h"
#include <algorithm>
#include <deque>
#include <functional>
#include <iostream>
#include <iterator>
#include <numeric>
#include <sstream>
#include <string>
#include "paddle/fluid/framework/var_desc.h"
#include "paddle/fluid/platform/cpu_info.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/gpu_info.h"
#endif // PADDLE_WITH_CUDA
namespace paddle {
namespace framework {
......@@ -166,6 +172,11 @@ struct NodeComparator {
bool operator()(ir::Node* lhs, ir::Node* rhs) const {
auto* lhs_desc = FindVarDescInBlock(lhs);
auto* rhs_desc = FindVarDescInBlock(rhs);
// match data type
if (lhs_desc->GetDataType() != rhs_desc->GetDataType()) {
return false;
}
// match shape
auto lhs_shape = lhs_desc->GetShape();
auto rhs_shape = rhs_desc->GetShape();
if ((lhs_shape[0] == -1 && rhs_shape[0] == -1) ||
......@@ -230,6 +241,27 @@ ir::Node* OrderedSet::FindBestFitNode(ir::Node* var) const {
return found_node;
}
ir::Node* OrderedSet::FindNextBestFitNode(ir::Node* var, ir::Node* prev) const {
ir::Node* found_node = nullptr;
NodeComparator functor;
auto it =
std::find_if(nodes_.begin(), nodes_.end(), [&](const NodeVector& v) {
if (v.front() == prev)
return true;
else
return false;
});
PADDLE_ENFORCE(it != nodes_.end(), "Not found previous in node list!");
for (it = std::next(it); it != nodes_.end(); ++it) {
auto& candidate = it->front();
if (functor(var, candidate)) {
found_node = candidate;
break;
}
}
return found_node;
}
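FindNextBestFitNode exists so that the memory-optimize pass can keep scanning the pool when the best fit turns out to be the variable itself (the cascade-reuse case). A minimal sketch of that lookup loop, assuming an OrderedSet member pool_ and mirroring the memory_optimize_pass.cc hunk later in this diff:
// Sketch only: skip candidates whose name equals var's own name, then take
// the next best fit from the ordered pool.
ir::Node* cache = pool_.FindBestFitNode(var);
while (cache != nullptr && var->Name() == cache->Name()) {
  cache = pool_.FindNextBestFitNode(var, cache);
}
if (cache != nullptr) {
  // safe to let var reuse cache's memory
}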
bool OrderedSet::Has(ir::Node* var) const {
if (mark_table_.count(var->Name())) {
auto& node_in_samename = mark_table_.at(var->Name());
......@@ -241,10 +273,15 @@ bool OrderedSet::Has(ir::Node* var) const {
return false;
}
void OrderedSet::Erase(const std::string& var) {
PADDLE_ENFORCE(mark_table_.count(var));
nodes_.erase(mark_table_[var]);
mark_table_.erase(var);
}
void OrderedSet::Erase(ir::Node* var) {
PADDLE_ENFORCE(mark_table_.count(var->Name()));
nodes_.erase(mark_table_[var->Name()]);
mark_table_.erase(var->Name());
PADDLE_ENFORCE(var != nullptr);
Erase(var->Name());
}
std::string OrderedSet::ToString() const {
......@@ -274,14 +311,35 @@ bool NodeCanReused(ir::Node* node) {
return flag;
}
int MinChunkSize() {
int size{0};
#ifdef PADDLE_WITH_CUDA
size = platform::GpuMinChunkSize();
#else
size = platform::CpuMinChunkSize();
#endif // PADDLE_WITH_CUDA
return size;
}
bool NodeCanReused(const VarDesc& node) {
auto type = node.GetType();
// only these types hold the bulk of gpu memory
if (!(type == proto::VarType::LOD_TENSOR ||
type == proto::VarType::SELECTED_ROWS ||
type == proto::VarType::LOD_TENSOR_ARRAY)) {
return false;
}
if (node.Persistable() || node.GetShape().empty()) {
// persistable variable is parameter
if (node.Persistable()) {
return false;
}
// shape < min_chunk_size is meaningless.
// furthermore, fetched loss always has size = 1
// which should not be reused.
auto shape = node.GetShape();
int size = std::abs(
std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>()));
if (shape.empty() || size < MinChunkSize()) {
return false;
}
// vars can be @EMPTY@, @LR_DECAY_REUSE_ID@. For example, while_grad
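As a standalone illustration of the size check above (not part of the diff): the dimension product is wrapped in std::abs so that a -1 (unknown batch) dimension does not make the count negative, and anything below the minimum chunk size, such as a fetched loss of shape {1}, is rejected. The 256-element threshold below is a made-up placeholder for MinChunkSize().
#include <cstdlib>
#include <functional>
#include <numeric>
#include <vector>

// Hypothetical stand-in for the check in NodeCanReused; 256 is a placeholder,
// not the real MinChunkSize() value.
bool BigEnoughToReuse(const std::vector<int>& shape, int min_chunk = 256) {
  if (shape.empty()) return false;
  int size = std::abs(
      std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>()));
  return size >= min_chunk;  // {-1, 32, 64} -> 2048 (kept), {1} -> 1 (rejected)
}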
......@@ -461,7 +519,9 @@ ir::Node* ControlFlowGraph::GetNodeByName(const std::string& name,
for (auto* node : ops_) {
if (node == op) break;
for (auto& output : node->outputs) {
if (output->Name() == name) {
PADDLE_ENFORCE((output != nullptr && output->IsVar()),
"Output is empty!");
if (output->Var() && output->Name() == name) {
found_node = output;
}
}
......
......@@ -55,6 +55,7 @@ class OrderedSet {
void Insert(ir::Node* var);
void Erase(ir::Node* var);
void Erase(const std::string& var);
bool Has(ir::Node* var) const;
void Clear() {
mark_table_.clear();
......@@ -62,6 +63,7 @@ class OrderedSet {
}
// find the bestfit shape node block with var.
ir::Node* FindBestFitNode(ir::Node* var) const;
ir::Node* FindNextBestFitNode(ir::Node* var, ir::Node* prev) const;
// map store non-const iterator, can not promise const
int GetNodeIndexInPool(ir::Node* var);
// pool all node to string
......
......@@ -107,6 +107,52 @@ TEST(OrderedSet, Normal) {
ASSERT_EQ(pool.GetNodeIndexInPool(cache), 5); // match 4:[5,2]
}
}
TEST(OrderedSet, FindBestFitNode) {
OrderedSet pool;
std::vector<std::unique_ptr<ir::Node>> nodes;
ProgramDesc prog;
BlockDesc* block_desc = prog.MutableBlock(0);
auto* op_desc = block_desc->AppendOp();
op_desc->SetType("dummy");
std::unique_ptr<ir::Node> op = ir::CreateNodeForTest(op_desc);
{
auto desc = block_desc->Var("a");
desc->SetShape({128, 128});
std::unique_ptr<ir::Node> node = ir::CreateNodeForTest(desc);
node->inputs.emplace_back(op.get());
nodes.emplace_back(std::move(node));
}
{
auto desc = block_desc->Var("b");
desc->SetShape({128, 129});
std::unique_ptr<ir::Node> node = ir::CreateNodeForTest(desc);
node->inputs.emplace_back(op.get());
nodes.emplace_back(std::move(node));
}
{
auto desc = block_desc->Var("c");
desc->SetShape({128, 128});
std::unique_ptr<ir::Node> node = ir::CreateNodeForTest(desc);
node->inputs.emplace_back(op.get());
nodes.emplace_back(std::move(node));
}
for (auto& node : nodes) {
pool.Insert(node.get());
}
// FindNextBestFitNode
auto* n = nodes[0].get();
auto* cache = pool.FindBestFitNode(n);
PADDLE_ENFORCE(cache->Name() == "a");
cache = pool.FindNextBestFitNode(n, cache);
PADDLE_ENFORCE(cache->Name() == "c");
cache = pool.FindNextBestFitNode(n, cache);
PADDLE_ENFORCE(cache->Name() == "b");
}
} // namespace details
} // namespace framework
} // namespace paddle
......
......@@ -69,55 +69,59 @@ std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl(
}
for (auto& var : op->outputs) {
if (!NodeCanReused(var) || cfg_->Use(op).count(var->Name()) == 0 ||
skip_set_.count(var->Name()))
if (var->IsVar() && !var->IsCtrlVar() && skip_set_.count(var->Name())) {
VLOG(3) << "Skip set contains variable of " << var->Name()
<< "disable reuse on it. skipped";
continue;
ir::Node* cache = pool_.FindBestFitNode(var);
if (var->Name() == FLAGS_memory_optimize_debug) {
VLOG(3) << "start match var " << DebugString(var) << " of op "
<< op->Name();
VLOG(3) << pool_.ToString();
VLOG(3) << "matched in pool : "
<< ((cache == nullptr) ? "False" : "True");
}
if (NodeCanReused(var) && cfg_->Use(op).count(var->Name()) == 0) {
ir::Node* cache = pool_.FindBestFitNode(var);
while (cache != nullptr && var->Name() == cache->Name()) {
VLOG(3) << "The same cache variable is cascade reused. "
<< cache->Name() << " is re-filled to the pool after "
<< "the reused op is finished. Current op can not "
<< "replace it again. Skip this candidate.";
cache = pool_.FindNextBestFitNode(var, cache);
}
if (var->Name() == FLAGS_memory_optimize_debug) {
VLOG(3) << "start match var " << DebugString(var) << " of op "
<< op->Name();
VLOG(3) << pool_.ToString();
VLOG(3) << "matched in pool : "
<< ((cache == nullptr) ? "False" : "True");
}
if (cache == nullptr) continue;
if (var->Name() == cache->Name()) {
VLOG(3) << "The same cache variable is cascade reused." << var->Name()
<< " is re-filled to the pool after"
<< "the reused op is finished. Current op can not "
<< "replace it again. Skip this candidate.";
continue;
int node_idx_in_pool = pool_.GetNodeIndexInPool(cache);
VLOG(3) << string::Sprintf(
"!!! %s, %s => %s, cache idx %d, pool size %d",
std::to_string(reuse_id++), DebugString(var), DebugString(cache),
node_idx_in_pool, static_cast<int>(pool_.size()));
// update CFG Graph on the fly.
// reused var maybe re-fill into the pool
cfg_->RenameVarInCFGGraph(var->Name(), cache->Name(), idx);
// NOTE(dzhwinter): we need to both update the ProgramDesc
// and IR Graph. because op_desc/var_desc is used in CreateOp,
// CreateVar when running happens. But IR Graph
// define the dependence relationship between nodes.
RenameVarInGraphDesc(var->Name(), cache->Name(), idx);
RenameVarInGraphNode(var->Name(), cache->Name(), idx, graph.get());
pool_.Erase(cache);
}
if (cache != nullptr) {
int node_idx_in_pool = pool_.GetNodeIndexInPool(cache);
VLOG(3) << string::Sprintf(
"!!! %s, %s => %s, cache idx %d, pool size %d",
std::to_string(reuse_id++), DebugString(var), DebugString(cache),
node_idx_in_pool, static_cast<int>(pool_.size()));
// NOTE(dzhwinter): update the ProgramDesc/IR Graph
// and the CFG Graph on the fly.
//
// IR Graph define the dependence relationship between nodes.
//
// ProgramDesc defines the input/output vars. Its used in
// CreateOp, CreateVar when running happens.
//
// CFG Graph store the liveness information, when reuse happens
// we also need to update the variable liveness.
const std::string var_name = var->Name();
const std::string cache_name = cache->Name();
// fill the pool
std::unordered_set<std::string> unlived_vars;
for (auto var : cfg_->LiveIn(op)) {
if (cfg_->LiveOut(op).count(var) == 0) {
unlived_vars.emplace(var);
cfg_->RenameVarInCFGGraph(var_name, cache_name, idx);
RenameVarInGraphDesc(var_name, cache_name, idx);
RenameVarInGraphNode(var_name, cache_name, idx, graph.get());
pool_.Erase(cache_name);
}
}
for (auto var : unlived_vars) {
}
// fill the pool
for (auto var : cfg_->LiveIn(op)) {
if (cfg_->LiveOut(op).count(var) == 0) {
ir::Node* var_node = cfg_->GetNodeByName(var, op);
if (var_node == nullptr || var_node->IsCtrlVar()) continue;
if (NodeCanReused(var_node) && !pool_.Has(var_node)) {
pool_.Insert(var_node);
}
......@@ -273,8 +277,7 @@ void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var,
// redirect the input to the latest version of cache_var
for (auto* node : op->inputs) {
if (node->Name() == var) {
ir::Node* cache_node = graph->CreateVarNode(var_desc.get());
var_nodes_[cache_var].emplace_back(cache_node);
ir::Node* cache_node = var_nodes_[cache_var].back();
// swap node to cache_node
cache_node->outputs.insert(cache_node->outputs.end(),
......@@ -283,11 +286,15 @@ void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var,
auto* prev_op = node->inputs[0];
std::replace(prev_op->outputs.begin(), prev_op->outputs.end(), node,
cache_node);
cache_node->inputs.emplace_back(prev_op);
for (auto* next_op : node->outputs) {
std::replace(next_op->inputs.begin(), next_op->inputs.end(), node,
cache_node);
}
// erase unused node
auto& nodes = var_nodes_.at(var);
nodes.erase(std::remove(nodes.begin(), nodes.end(), node), nodes.end());
graph->RemoveNode(node);
}
}
......@@ -307,15 +314,14 @@ void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var,
std::replace(next_op->inputs.begin(), next_op->inputs.end(), node,
cache_node);
}
// erase unused node
auto& nodes = var_nodes_.at(var);
nodes.erase(std::remove(nodes.begin(), nodes.end(), node), nodes.end());
graph->RemoveNode(node);
}
}
}
// release node of unused var in graph
for (auto* node : var_nodes_[var]) {
graph->RemoveNode(node);
}
var_nodes_.at(var).clear();
}
} // namespace details
......
......@@ -179,11 +179,11 @@ TEST(InferInplace, SingleOpInplaceInToOut) {
op->SetOutput("Out", {"test2_out"});
prog.MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 64});
prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 64, 128, 128});
prog.MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("test2_out");
prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16});
prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16, 128, 128});
auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
auto in_to_outs = infer_inplace(*op, op->Block());
......@@ -201,11 +201,11 @@ TEST(InferInplace, SingleGradOpInplaceInToOut) {
op->SetOutput(GradVarName("X"), {"test2_a", "test2_b", "test2_c"});
prog.MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 16});
prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("test2_out");
prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16});
prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16, 1024, 1024});
auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
auto in_to_outs = infer_inplace(*op, op->Block());
......@@ -233,12 +233,12 @@ TEST(InferInplace, MultiOutInplaceInToOut) {
prog.MutableBlock(0)->Var("o0");
prog.MutableBlock(0)->Var("y0");
prog.MutableBlock(0)->Var("z0");
prog.MutableBlock(0)->Var("a0")->SetShape({32, 16});
prog.MutableBlock(0)->Var("b0")->SetShape({32, 16});
prog.MutableBlock(0)->Var("c0")->SetShape({32, 16});
prog.MutableBlock(0)->Var("o0")->SetShape({32, 16});
prog.MutableBlock(0)->Var("y0")->SetShape({32, 16});
prog.MutableBlock(0)->Var("z0")->SetShape({32, 16});
prog.MutableBlock(0)->Var("a0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("b0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("c0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("o0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("y0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("z0")->SetShape({32, 16, 1024, 1024});
auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
auto in_to_outs = infer_inplace(*op, op->Block());
......@@ -267,12 +267,12 @@ TEST(InferInplace, MultiGradInplaceInToOut) {
prog.MutableBlock(0)->Var("o0");
prog.MutableBlock(0)->Var("y0");
prog.MutableBlock(0)->Var("z0");
prog.MutableBlock(0)->Var("a0")->SetShape({32, 16});
prog.MutableBlock(0)->Var("b0")->SetShape({32, 16});
prog.MutableBlock(0)->Var("c0")->SetShape({32, 16});
prog.MutableBlock(0)->Var("o0")->SetShape({32, 16});
prog.MutableBlock(0)->Var("y0")->SetShape({32, 16});
prog.MutableBlock(0)->Var("z0")->SetShape({32, 16});
prog.MutableBlock(0)->Var("a0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("b0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("c0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("o0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("y0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("z0")->SetShape({32, 16, 1024, 1024});
auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
auto in_to_outs = infer_inplace(*op, op->Block());
......
......@@ -38,9 +38,13 @@ std::unique_ptr<ir::Graph> IdentityScaleOpCleanPass::ApplyImpl(
->assert_is_op("scale")
->assert_op_attr<float>("scale", 1.)
->assert_op_attr<float>("bias", 0.);
auto scale_out = detector.mutable_pattern()
->NewNode("scale_out")
->assert_is_op_output("scale");
auto scale_out =
detector.mutable_pattern()
->NewNode("scale_out")
->assert_is_op_output("scale")
// scale's output var should have only one consumer, or it can't be
// removed.
->assert_more([](Node* x) { return x->outputs.size() == 1UL; });
pre_op->LinksTo({scale_in});
scale_op->LinksFrom({scale_in}).LinksTo({scale_out});
......
......@@ -207,7 +207,7 @@ framework::LoDTensor& VarBase::GradValue() {
std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() {
if (grad_op_descs_.empty() && backward_id_ <= 0) {
LOG(WARNING) << "op with no grad: " << op_desc_->Type();
VLOG(3) << "op with no grad: " << op_desc_->Type();
return {};
}
......
......@@ -460,77 +460,6 @@ inline bool CheckNodeIndegreeEquals(const Node &node, size_t n) {
return node.inputs.size() == n;
}
NodesTSIterator::NodesTSIterator(const std::vector<Node *> &source) {
PADDLE_ENFORCE(!source.empty(),
"Start points of topological sorting should not be empty!");
// CHECK all the inputs' in-degree is 0
for (auto *node : source) {
PADDLE_ENFORCE(CheckNodeIndegreeEquals(*node, 0));
}
std::unordered_set<Node *> visited;
std::unordered_set<Node *> to_visit{source.begin(), source.end()};
std::vector<Node *> inlink_visited;
while (!to_visit.empty()) {
std::vector<Node *> queue(to_visit.begin(), to_visit.end());
for (auto *p : queue) {
if (Agent(p).deleted()) {
visited.insert(p);
to_visit.erase(p);
}
inlink_visited.clear();
std::copy_if(p->inputs.begin(), p->inputs.end(),
std::back_inserter(inlink_visited),
[&](Node *x) -> bool { return visited.count(x) != 0; });
if (inlink_visited.size() == p->inputs.size()) {
sorted_.push_back(p);
for (auto *_ : p->outputs) {
if (!visited.count(_)) {
to_visit.insert(_);
}
}
to_visit.erase(p);
visited.insert(p);
}
}
}
}
NodesTSIterator::NodesTSIterator(const NodesTSIterator &other)
: sorted_(other.sorted_), cursor_(other.cursor_) {}
Node &NodesTSIterator::operator*() {
PADDLE_ENFORCE_LT(cursor_, sorted_.size());
return *sorted_[cursor_];
}
NodesTSIterator &NodesTSIterator::operator++() {
if (++cursor_ >= sorted_.size()) {
sorted_.clear();
cursor_ = 0;
}
return *this;
}
NodesTSIterator &NodesTSIterator::operator=(const NodesTSIterator &other) {
cursor_ = other.cursor_;
sorted_ = other.sorted_;
return *this;
}
bool NodesTSIterator::operator==(const NodesTSIterator &other) {
return sorted_ == other.sorted_ && cursor_ == other.cursor_;
}
Node *NodesTSIterator::operator->() {
PADDLE_ENFORCE_LT(cursor_, sorted_.size());
return sorted_[cursor_];
}
} // namespace analysis
} // namespace inference
} // namespace paddle
......@@ -30,6 +30,7 @@ namespace inference {
namespace analysis {
using framework::ir::Graph;
using framework::ir::NodesTSIterator;
const char kIsFunctionNode[] = "__is_function_node__";
const char kFunctionNodeSubGraph[] = "__function_node_sub_graph__";
......@@ -132,32 +133,6 @@ struct Agent {
framework::ir::Node *x_;
};
// Topological sorting iterator on nodes.
struct NodesTSIterator
: public std::iterator<std::forward_iterator_tag, framework::ir::Node *> {
NodesTSIterator() = default;
explicit NodesTSIterator(const std::vector<framework::ir::Node *> &source);
NodesTSIterator(NodesTSIterator &&other)
: sorted_(std::move(other.sorted_)), cursor_(other.cursor_) {
other.cursor_ = 0;
}
NodesTSIterator(const NodesTSIterator &other);
framework::ir::Node &operator*();
NodesTSIterator &operator++();
// TODO(Superjomn) current implementation just compare the first
// element, need to compare the graph and all the elements in the queue and
// set.
NodesTSIterator &operator=(const NodesTSIterator &other);
bool operator==(const NodesTSIterator &other);
bool operator!=(const NodesTSIterator &other) { return !(*this == other); }
framework::ir::Node *operator->();
private:
std::vector<framework::ir::Node *> sorted_;
size_t cursor_{0};
};
// The nodes those have no input will be treated as start points.
static std::vector<framework::ir::Node *> ExtractStartPoints(const Graph &g) {
std::vector<framework::ir::Node *> result;
......
......@@ -51,6 +51,11 @@ class CompareOpProtoMaker : public framework::OpProtoAndCheckerMaker {
comment.type));
AddInput("Y", string::Sprintf("the right hand operand of %s operator",
comment.type));
AddAttr<int>(
"axis",
"The start dimension index for broadcasting Y onto X. [default -1]")
.SetDefault(-1)
.EqualGreaterThan(-1);
AddAttr<bool>("force_cpu",
"Force fill output variable to cpu "
"memory. Otherwise, fill output variable to the running "
......@@ -64,11 +69,6 @@ N-dim tensor. X and Y could be any type. The each element of the Out tensor is
calculated by $%s$
)DOC",
comment.equation));
AddAttr<int>(
"axis",
"The start dimension index for broadcasting Y onto X. [default -1]")
.SetDefault(-1)
.EqualGreaterThan(-1);
}
};
......
......@@ -72,7 +72,7 @@ class DensityPriorBoxOpKernel : public framework::OpKernel<T> {
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for
#endif
for (int i = 0; i < fixed_ratios.size(); i++) {
for (size_t i = 0; i < fixed_ratios.size(); i++) {
sqrt_fixed_ratios.push_back(sqrt(fixed_ratios[i]));
}
......@@ -115,11 +115,10 @@ class DensityPriorBoxOpKernel : public framework::OpKernel<T> {
}
}
if (clip) {
platform::Transform<platform::CPUDeviceContext> trans;
ClipFunctor<T> clip_func;
trans(ctx.template device_context<platform::CPUDeviceContext>(),
boxes->data<T>(), boxes->data<T>() + boxes->numel(),
boxes->data<T>(), clip_func);
T* dt = boxes->data<T>();
std::transform(dt, dt + boxes->numel(), dt, [](T v) -> T {
return std::min<T>(std::max<T>(v, 0.), 1.);
});
}
framework::Tensor var_t;
var_t.mutable_data<T>(
......@@ -141,7 +140,7 @@ class DensityPriorBoxOpKernel : public framework::OpKernel<T> {
#pragma omp parallel for collapse(2)
#endif
for (int i = 0; i < box_num; ++i) {
for (int j = 0; j < variances.size(); ++j) {
for (size_t j = 0; j < variances.size(); ++j) {
e_vars(i, j) = variances[j];
}
}
......
......@@ -46,13 +46,6 @@ inline void ExpandAspectRatios(const std::vector<float>& input_aspect_ratior,
}
}
template <typename T>
struct ClipFunctor {
HOSTDEVICE inline T operator()(T in) const {
return std::min<T>(std::max<T>(in, 0.), 1.);
}
};
template <typename T>
class PriorBoxOpKernel : public framework::OpKernel<T> {
public:
......@@ -101,31 +94,30 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
boxes->mutable_data<T>(ctx.GetPlace());
vars->mutable_data<T>(ctx.GetPlace());
auto e_boxes = framework::EigenTensor<T, 4>::From(*boxes);
T* b_t = boxes->data<T>();
for (int h = 0; h < feature_height; ++h) {
for (int w = 0; w < feature_width; ++w) {
T center_x = (w + offset) * step_width;
T center_y = (h + offset) * step_height;
T box_width, box_height;
int idx = 0;
for (size_t s = 0; s < min_sizes.size(); ++s) {
auto min_size = min_sizes[s];
if (min_max_aspect_ratios_order) {
box_width = box_height = min_size / 2.;
e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width;
e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height;
e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width;
e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height;
idx++;
b_t[0] = (center_x - box_width) / img_width;
b_t[1] = (center_y - box_height) / img_height;
b_t[2] = (center_x + box_width) / img_width;
b_t[3] = (center_y + box_height) / img_height;
b_t += 4;
if (max_sizes.size() > 0) {
auto max_size = max_sizes[s];
// square prior with size sqrt(minSize * maxSize)
box_width = box_height = sqrt(min_size * max_size) / 2.;
e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width;
e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height;
e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width;
e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height;
idx++;
b_t[0] = (center_x - box_width) / img_width;
b_t[1] = (center_y - box_height) / img_height;
b_t[2] = (center_x + box_width) / img_width;
b_t[3] = (center_y + box_height) / img_height;
b_t += 4;
}
// priors with different aspect ratios
for (size_t r = 0; r < aspect_ratios.size(); ++r) {
......@@ -135,11 +127,11 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
}
box_width = min_size * sqrt(ar) / 2.;
box_height = min_size / sqrt(ar) / 2.;
e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width;
e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height;
e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width;
e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height;
idx++;
b_t[0] = (center_x - box_width) / img_width;
b_t[1] = (center_y - box_height) / img_height;
b_t[2] = (center_x + box_width) / img_width;
b_t[3] = (center_y + box_height) / img_height;
b_t += 4;
}
} else {
// priors with different aspect ratios
......@@ -147,21 +139,21 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
float ar = aspect_ratios[r];
box_width = min_size * sqrt(ar) / 2.;
box_height = min_size / sqrt(ar) / 2.;
e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width;
e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height;
e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width;
e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height;
idx++;
b_t[0] = (center_x - box_width) / img_width;
b_t[1] = (center_y - box_height) / img_height;
b_t[2] = (center_x + box_width) / img_width;
b_t[3] = (center_y + box_height) / img_height;
b_t += 4;
}
if (max_sizes.size() > 0) {
auto max_size = max_sizes[s];
// square prior with size sqrt(minSize * maxSize)
box_width = box_height = sqrt(min_size * max_size) / 2.;
e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width;
e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height;
e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width;
e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height;
idx++;
b_t[0] = (center_x - box_width) / img_width;
b_t[1] = (center_y - box_height) / img_height;
b_t[2] = (center_x + box_width) / img_width;
b_t[3] = (center_y + box_height) / img_height;
b_t += 4;
}
}
}
......@@ -169,11 +161,10 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
}
if (clip) {
platform::Transform<platform::CPUDeviceContext> trans;
ClipFunctor<T> clip_func;
trans(ctx.template device_context<platform::CPUDeviceContext>(),
boxes->data<T>(), boxes->data<T>() + boxes->numel(),
boxes->data<T>(), clip_func);
T* dt = boxes->data<T>();
std::transform(dt, dt + boxes->numel(), dt, [](T v) -> T {
return std::min<T>(std::max<T>(v, 0.), 1.);
});
}
framework::Tensor var_t;
......
......@@ -21,6 +21,7 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/operators/jit/kernels.h"
#include "paddle/fluid/operators/math/blas.h"
namespace paddle {
......@@ -37,32 +38,24 @@ struct EmbeddingVSumFunctor {
const LoDTensor *table_t, const LoDTensor *ids_t,
LoDTensor *output_t) {
auto *table = table_t->data<T>();
int64_t row_number = table_t->dims()[0];
int64_t row_width = table_t->dims()[1];
int64_t last_dim = output_t->dims()[1];
int64_t table_height = table_t->dims()[0];
int64_t table_width = table_t->dims()[1];
int64_t out_width = output_t->dims()[1];
const int64_t *ids = ids_t->data<int64_t>();
auto ids_lod = ids_t->lod()[0];
int64_t ids_count = ids_t->numel() / ids_lod.back();
int64_t idx_width = ids_t->numel() / ids_lod.back();
auto *output = output_t->mutable_data<T>(context.GetPlace());
auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
for (int64_t i = 0; i != ids_lod.size() - 1; ++i) {
size_t begin = ids_lod[i] * ids_count;
for (int64_t j = 0; j != ids_count; ++j) {
PADDLE_ENFORCE_LT(ids[begin], row_number);
PADDLE_ENFORCE_GE(ids[begin], 0, "ids %d", i);
blas.VCOPY(row_width, table + ids[begin + j] * row_width,
output + i * last_dim + j * row_width);
}
PADDLE_ENFORCE_LE(table_width * idx_width, out_width);
for (int64_t r = (ids_lod[i] + 1) * ids_count;
r < ids_lod[i + 1] * ids_count; ++r) {
PADDLE_ENFORCE_LT(ids[r], row_number);
PADDLE_ENFORCE_GE(ids[r], 0, "ids %d", i);
blas.AXPY(row_width, 1., table + ids[r] * row_width,
output + i * last_dim + (r % ids_count) * row_width);
}
jit::emb_seq_pool_attr_t attr(table_height, table_width, 0, idx_width,
out_width, jit::SeqPoolType::kSum);
for (int64_t i = 0; i != ids_lod.size() - 1; ++i) {
attr.index_height = ids_lod[i + 1] - ids_lod[i];
auto emb_seqpool = jit::Get<jit::kEmbSeqPool, jit::EmbSeqPoolTuples<T>,
platform::CPUPlace>(attr);
emb_seqpool(table, ids + ids_lod[i] * idx_width, output + i * out_width,
&attr);
}
}
};
......
......@@ -170,13 +170,48 @@ class GroupNormGradMaker : public framework::SingleGradOpDescMaker {
}
};
class GroupNormInplaceInToOut : public framework::InplaceInToOut {
public:
using InplaceInToOut::InplaceInToOut;
protected:
std::unordered_map<std::string, std::string> Apply(
const framework::OpDesc &op_desc,
framework::BlockDesc *block) const override {
return {{"X", "Y"}};
}
};
class GroupNormGradInplaceInToOut : public framework::InplaceInToOut {
public:
using InplaceInToOut::InplaceInToOut;
protected:
std::unordered_map<std::string, std::string> Apply(
const framework::OpDesc &op_desc,
framework::BlockDesc *block) const override {
return {{framework::GradVarName("Y"), framework::GradVarName("X")}};
}
};
class GroupNormOpInferVarType
: public framework::PassInDtypeAndVarTypeToOutput {
protected:
std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
const override {
return {{"X", /*->*/ "Y"}};
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(group_norm, ops::GroupNormOp, ops::GroupNormOpMaker,
ops::GroupNormGradMaker);
REGISTER_OPERATOR(group_norm_grad, ops::GroupNormGradOp);
ops::GroupNormOpInferVarType, ops::GroupNormGradMaker,
ops::GroupNormInplaceInToOut);
REGISTER_OPERATOR(group_norm_grad, ops::GroupNormGradOp,
ops::GroupNormGradInplaceInToOut);
REGISTER_OP_CPU_KERNEL(
group_norm, ops::GroupNormKernel<paddle::platform::CPUDeviceContext, float>,
ops::GroupNormKernel<paddle::platform::CPUDeviceContext, double>);
......
......@@ -301,6 +301,37 @@ void BenchSeqPoolKernel() {
}
}
template <jit::KernelType KT, typename T, typename PlaceType>
void BenchEmbSeqPoolKernel() {
std::vector<jit::SeqPoolType> pool_types = {jit::SeqPoolType::kSum};
int64_t tbl_h = 1e4;
for (int tbl_w : {10, 16, 256}) {
Tensor table;
table.Resize({tbl_h, tbl_w});
RandomVec<T>(tbl_h * tbl_w, table.mutable_data<T>(PlaceType()), -2.f, 2.f);
const T* table_data = table.data<T>();
for (auto type : pool_types) {
for (int idx_w : {1, 2, 10, 16}) {
for (int idx_h : {1, 2, 9, 13, 16}) {
int64_t out_w = tbl_w * idx_w;
jit::emb_seq_pool_attr_t attr(tbl_h, tbl_w, idx_h, idx_w, out_w,
type);
Tensor idx, out;
idx.Resize({idx_h, idx_w});
out.Resize({out_w});
RandomVec<int64_t>(idx_h * idx_w,
idx.mutable_data<int64_t>(PlaceType()), 0,
tbl_h - 1);
const int64_t* idx_data = idx.data<int64_t>();
T* o_data = out.mutable_data<T>(PlaceType());
BenchAllImpls<KT, jit::EmbSeqPoolTuples<T>, PlaceType>(
attr, table_data, idx_data, o_data, &attr);
}
}
}
}
}
template <jit::KernelType KT, typename T, typename PlaceType>
void BenchMatMulKernel() {
for (int m : {1, 2, 3, 4}) {
......@@ -339,6 +370,71 @@ void BenchSoftmaxKernel() {
}
}
template <jit::KernelType KT, typename T, typename PlaceType>
void BenchLayerNormKernel() {
const T epsilon = 9.99999975e-06;
for (int n : {1, 2, 10}) {
for (int x_dim_0 : {1, 9, 17, 50}) {
int left = n * x_dim_0;
for (int x_dim_1 : TestSizes()) {
int right = x_dim_1;
int sz = left * right;
Tensor x, mean, var, scale, bias, out;
x.Resize({n, x_dim_0, x_dim_1});
out.Resize({n, x_dim_0, x_dim_1});
mean.Resize({n, x_dim_0});
var.Resize({n, x_dim_0});
scale.Resize({x_dim_1});
bias.Resize({x_dim_1});
RandomVec<T>(sz, x.mutable_data<T>(PlaceType()), -2.f, 2.f);
RandomVec<T>(left, mean.mutable_data<T>(PlaceType()), -2.f, 2.f);
RandomVec<T>(left, var.mutable_data<T>(PlaceType()), -2.f, 2.f);
RandomVec<T>(right, scale.mutable_data<T>(PlaceType()), -2.f, 2.f);
RandomVec<T>(right, bias.mutable_data<T>(PlaceType()), -2.f, 2.f);
const T* scale_data = scale.data<T>();
const T* bias_data = bias.data<T>();
T* x_data = x.data<T>();
T* mean_data = mean.data<T>();
T* var_data = var.data<T>();
T* out_data = out.mutable_data<T>(PlaceType());
BenchAllImpls<KT, jit::LayerNormTuples<T>, PlaceType>(
right, x_data, out_data, mean_data, var_data, scale_data, bias_data,
left, epsilon, right);
}
}
}
}
template <jit::KernelType KT, typename T, typename PlaceType>
void BenchCRFDecodingKernel() {
constexpr int state_trans_base_idx = 2;
for (int seq_len : {1, 11, 17, 50}) {
for (int tag_num : TestSizes()) {
int x_sz = seq_len * tag_num;
int w_sz = (tag_num + state_trans_base_idx) * tag_num;
Tensor x, w, alpha, track;
x.Resize({seq_len, tag_num});
w.Resize({tag_num + state_trans_base_idx, tag_num});
alpha.Resize({seq_len, tag_num});
track.Resize({seq_len, tag_num});
RandomVec<T>(x_sz, x.mutable_data<T>(PlaceType()), -2.f, 2.f);
RandomVec<T>(w_sz, w.mutable_data<T>(PlaceType()), -2.f, 2.f);
const T* x_data = x.data<T>();
const T* w_data = w.data<T>();
T* alpha_data = alpha.mutable_data<T>(PlaceType());
int* track_data = track.mutable_data<int>(PlaceType());
BenchAllImpls<KT, jit::CRFDecodingTuples<T>, PlaceType>(
tag_num, seq_len, x_data, w_data, alpha_data, track_data, tag_num);
}
}
}
using T = float;
using CPUPlace = paddle::platform::CPUPlace;
......@@ -376,12 +472,27 @@ BENCH_FP32_CPU(kGRUHtPart2) { BenchGRUKernel<jit::kGRUHtPart2, T, CPUPlace>(); }
// seq pool function
BENCH_FP32_CPU(kSeqPool) { BenchSeqPoolKernel<jit::kSeqPool, T, CPUPlace>(); }
// embedding seq pool function
BENCH_FP32_CPU(kEmbSeqPool) {
BenchEmbSeqPoolKernel<jit::kEmbSeqPool, T, CPUPlace>();
}
// matmul
BENCH_FP32_CPU(kMatMul) { BenchMatMulKernel<jit::kMatMul, T, CPUPlace>(); }
// softmax
BENCH_FP32_CPU(kSoftmax) { BenchSoftmaxKernel<jit::kSoftmax, T, CPUPlace>(); }
// layernorm
BENCH_FP32_CPU(kLayerNorm) {
BenchLayerNormKernel<jit::kLayerNorm, T, CPUPlace>();
}
// crfdecoding
BENCH_FP32_CPU(kCRFDecoding) {
BenchCRFDecodingKernel<jit::kCRFDecoding, T, CPUPlace>();
}
// Benchmark all jit kernels including jitcode, mkl and refer.
// To use this tool, run command: ./benchmark [options...]
// Options:
......
......@@ -31,3 +31,4 @@ USE_JITKERNEL_GEN(kNCHW16CMulNC)
USE_JITKERNEL_GEN(kSeqPool)
USE_JITKERNEL_GEN(kHMax)
USE_JITKERNEL_GEN(kHSum)
USE_JITKERNEL_GEN(kEmbSeqPool)
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License. */
#include "paddle/fluid/operators/jit/gen/embseqpool.h"
#include <stddef.h> // offsetof
#include <vector>
#include "paddle/fluid/operators/jit/gen/act.h" // for exp_float_consts ones
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h"
namespace paddle {
namespace operators {
namespace jit {
namespace gen {
void EmbSeqPoolJitCode::genCode() {
preCode();
constexpr int block = YMM_FLOAT_BLOCK;
constexpr int max_num_regs = 8;
const int num_block = tbl_w_ / block;
const int num_groups = num_block / max_num_regs;
const size_t block_size = sizeof(float) * block;
std::vector<int> groups(num_groups, max_num_regs);
int rest_num_regs = num_block % max_num_regs;
if (rest_num_regs > 0) {
groups.push_back(rest_num_regs);
}
// protect param_dst
mov(reg_ptr_param_dst, param_dst);
mov(reg_idx_width_in_byte,
qword[param_attr + offsetof(emb_seq_pool_attr_t, index_width)]);
mov(reg_idx_height,
qword[param_attr + offsetof(emb_seq_pool_attr_t, index_height)]);
mov(rax, sizeof(int64_t));
mul(reg_idx_width_in_byte);
mov(reg_idx_width_in_byte, rax);
const size_t tbl_width_in_byte = sizeof(float) * tbl_w_;
int acc_num_regs = 0;
for (int num_regs : groups) {
Label l_next_idx_w, l_next_idx_h, l_save_now;
xor_(reg_idx_w_i_in_byte, reg_idx_w_i_in_byte);
mov(reg_ptr_dst_i, reg_ptr_param_dst);
add(reg_ptr_dst_i, acc_num_regs * block_size);
L(l_next_idx_w);
{
// h == 0
mov(reg_ptr_idx_i, param_idx);
add(reg_ptr_idx_i, reg_idx_w_i_in_byte);
mov(reg_idx, qword[reg_ptr_idx_i]);
mov(rax, tbl_width_in_byte);
mul(reg_idx);
mov(reg_ptr_tbl_i, rax); // reg is offset now
add(reg_ptr_tbl_i, param_tbl); // reg is ptr_i now
size_t w_offset = 0;
for (int reg_i = 0; reg_i < num_regs; ++reg_i) {
vmovups(ymm_t(reg_i + num_regs), ptr[reg_ptr_tbl_i + w_offset]);
w_offset += block_size;
}
add(reg_ptr_idx_i, reg_idx_width_in_byte);
// end condition of idx h
mov(reg_idx_h_end, reg_idx_height);
mov(rax, reg_idx_width_in_byte);
mul(reg_idx_h_end);
mov(reg_idx_h_end, rax);
add(reg_idx_h_end, reg_idx_w_i_in_byte);
add(reg_idx_h_end, param_idx);
cmp(reg_ptr_idx_i, reg_idx_h_end);
jge(l_save_now, T_NEAR);
L(l_next_idx_h);
{
mov(reg_idx, qword[reg_ptr_idx_i]);
mov(reg_ptr_tbl_i, reg_idx);
mov(rax, tbl_width_in_byte);
mul(reg_idx);
mov(reg_ptr_tbl_i, rax);
add(reg_ptr_tbl_i, param_tbl);
size_t w_offset = 0;
for (int reg_i = 0; reg_i < num_regs; ++reg_i) {
vmovups(ymm_t(reg_i), ptr[reg_ptr_tbl_i + w_offset]);
vaddps(ymm_t(reg_i + num_regs), ymm_t(reg_i + num_regs),
ymm_t(reg_i));
w_offset += block_size;
}
add(reg_ptr_idx_i, reg_idx_width_in_byte);
cmp(reg_ptr_idx_i, reg_idx_h_end);
jl(l_next_idx_h, T_NEAR);
} // end of idx h
L(l_save_now);
// avg or sqrt here, if needed
w_offset = 0;
for (int reg_i = 0; reg_i < num_regs; ++reg_i) {
vmovups(ptr[reg_ptr_dst_i + w_offset], ymm_t(reg_i + num_regs));
w_offset += block_size;
}
add(reg_ptr_dst_i, tbl_width_in_byte);
add(reg_idx_w_i_in_byte, sizeof(int64_t));
cmp(reg_idx_w_i_in_byte, reg_idx_width_in_byte);
jl(l_next_idx_w, T_NEAR);
} // end of idx w
acc_num_regs += num_regs;
add(param_tbl, num_regs * block_size); // do not use acc_num_regs
} // end of groups
postCode();
}
class EmbSeqPoolCreator : public JitCodeCreator<emb_seq_pool_attr_t> {
public:
bool UseMe(const emb_seq_pool_attr_t& attr) const override {
return platform::MayIUse(platform::avx) &&
attr.table_width % YMM_FLOAT_BLOCK == 0;
}
size_t CodeSize(const emb_seq_pool_attr_t& attr) const override {
return 96 + (attr.table_width / YMM_FLOAT_BLOCK) * 96 * 8;
}
std::unique_ptr<GenBase> CreateJitCode(
const emb_seq_pool_attr_t& attr) const override {
PADDLE_ENFORCE_GT(attr.table_height, 0);
PADDLE_ENFORCE_GT(attr.table_width, 0);
PADDLE_ENFORCE_GT(attr.index_height, 0);
PADDLE_ENFORCE_GT(attr.index_width, 0);
PADDLE_ENFORCE_GT(attr.out_width, 0);
return make_unique<EmbSeqPoolJitCode>(attr, CodeSize(attr));
}
};
} // namespace gen
} // namespace jit
} // namespace operators
} // namespace paddle
namespace gen = paddle::operators::jit::gen;
REGISTER_JITKERNEL_GEN(kEmbSeqPool, gen::EmbSeqPoolCreator);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License. */
#pragma once
#include <string>
#include "glog/logging.h"
#include "paddle/fluid/operators/jit/gen/jitcode.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace operators {
namespace jit {
namespace gen {
class EmbSeqPoolJitCode : public JitCode {
public:
explicit EmbSeqPoolJitCode(const emb_seq_pool_attr_t& attr,
size_t code_size = 256 * 1024,
void* code_ptr = nullptr)
: JitCode(code_size, code_ptr),
tbl_w_(attr.table_width),
type_(attr.pool_type) {
if (type_ != SeqPoolType::kSum) {
LOG(FATAL) << "Only support sum pool yet ";
}
this->genCode();
}
std::string name() const override {
std::string base = "EmbSeqPoolJitCode";
if (type_ == SeqPoolType::kSum) {
base += "_Sum";
} else if (type_ == SeqPoolType::kAvg) {
base += "_Avg";
} else if (type_ == SeqPoolType::kSqrt) {
base += "_Sqrt";
}
base += ("_W" + std::to_string(tbl_w_));
return base;
}
void genCode() override;
private:
int tbl_w_;
SeqPoolType type_;
reg64_t param_tbl{abi_param1};
reg64_t param_idx{abi_param2};
reg64_t param_dst{abi_param3};
reg64_t param_attr{abi_param4};
reg64_t reg_tmp{rax};
reg64_t reg_idx_width_in_byte{r8};
reg64_t reg_idx_height{r9};
reg64_t reg_ptr_tbl_i{r10};
reg64_t reg_idx{r10}; // could use same of reg_ptr_tbl_i
reg64_t reg_ptr_idx_i{r11};
reg64_t reg_ptr_dst_i{r12};
reg64_t reg_ptr_param_dst{r13}; // rdx is used in mul so protect param_dst
reg64_t reg_idx_w_i_in_byte{r14};
reg64_t reg_idx_h_end{r15};
};
} // namespace gen
} // namespace jit
} // namespace operators
} // namespace paddle
......@@ -32,7 +32,7 @@ class SeqPoolJitCode : public JitCode {
: JitCode(code_size, code_ptr), w_(attr.w), type_(attr.type) {
if (!(type_ == SeqPoolType::kSum || type_ == SeqPoolType::kAvg ||
type_ == SeqPoolType::kSqrt)) {
LOG(FATAL) << "Only support sum pool yet ";
LOG(FATAL) << "Only supported pool type: sum, avg and sqrt.";
}
fp_h_[0] = 1.f;
this->genCode();
......
......@@ -54,6 +54,7 @@ const char* to_string(KernelType kt) {
ONE_CASE(kHMax);
ONE_CASE(kHSum);
ONE_CASE(kSoftmax);
ONE_CASE(kEmbSeqPool);
default:
PADDLE_THROW("Not support type: %d, or forget to add it.", kt);
return "NOT JITKernel";
......
......@@ -172,6 +172,15 @@ inline std::ostream& operator<<(std::ostream& os, const seq_pool_attr_t& attr) {
return os;
}
inline std::ostream& operator<<(std::ostream& os,
const emb_seq_pool_attr_t& attr) {
os << "table_height[" << attr.table_height << "],table_width["
<< attr.table_width << "],index_height[" << attr.index_height
<< "],index_width[" << attr.index_width << "],output_width["
<< attr.out_width << "],pool_type[" << to_string(attr.pool_type) << "]";
return os;
}
inline std::ostream& operator<<(std::ostream& os, const matmul_attr_t& attr) {
os << "M[" << attr.m << "],N[" << attr.n << "],K[" << attr.k << "]";
return os;
......
......@@ -13,6 +13,7 @@
* limitations under the License. */
#pragma once
#include <cstdint>
#include "paddle/fluid/operators/jit/macro.h"
#include "paddle/fluid/platform/macros.h"
......@@ -20,34 +21,35 @@ namespace paddle {
namespace operators {
namespace jit {
// TODO(TJ): reorder by alphabet
typedef enum {
kNone = 0,
kVMul = 1,
kVAdd = 2,
kVAddRelu,
kVSub,
kVScal,
kVAddBias,
kVRelu,
kVIdentity,
kVSquare,
kVExp,
kVSigmoid,
kVTanh,
kLSTMCtHt,
kLSTMC1H1,
// sort by alphabet
kCRFDecoding = 1,
kEmbSeqPool = 2,
kGRUH1,
kGRUHtPart1,
kGRUHtPart2,
kCRFDecoding,
kHSum, // horizontal sum
kHMax, // horizontal max
kLSTMCtHt,
kLSTMC1H1,
kLayerNorm,
kMatMul,
kNCHW16CMulNC,
kSeqPool,
kMatMul,
kHSum, // horizontal sum
kHMax, // horizontal max
kSoftmax,
kVAdd,
kVAddBias,
kVAddRelu,
kVExp,
kVIdentity,
kVMul,
kVRelu,
kVScal,
kVSigmoid,
kVSquare,
kVSub,
kVTanh,
} KernelType;
typedef enum {
......@@ -145,6 +147,32 @@ struct SeqPoolTuples {
typedef void (*func_type)(const T*, T*, const seq_pool_attr_t*);
};
typedef struct emb_seq_pool_attr_s {
int64_t table_height, table_width;
int64_t index_height, index_width;
int64_t out_width;
SeqPoolType pool_type;
emb_seq_pool_attr_s() = default;
explicit emb_seq_pool_attr_s(int64_t tbl_height, int64_t tbl_width,
int64_t idx_height, int64_t idx_width,
int64_t output_width,
SeqPoolType seqpool_type = SeqPoolType::kSum)
: table_height(tbl_height),
table_width(tbl_width),
index_height(idx_height),
index_width(idx_width),
out_width(output_width),
pool_type(seqpool_type) {}
} emb_seq_pool_attr_t;
template <typename T>
struct EmbSeqPoolTuples {
typedef T data_type;
typedef emb_seq_pool_attr_t attr_type;
typedef void (*func_type)(const T*, const int64_t*, T*,
const emb_seq_pool_attr_t*);
};
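A hedged usage sketch of these new types, following the fused_embedding_seq_pool_op.h change earlier in this diff: build an emb_seq_pool_attr_t whose out_width equals table_width * index_width (the invariant the kernels enforce), fetch an implementation with jit::Get, and call it through EmbSeqPoolTuples<T>::func_type. All sizes and values here are invented for illustration.
#include <cstdint>
#include <vector>
#include "paddle/fluid/operators/jit/kernels.h"

void EmbSeqPoolSketch() {
  namespace jit = paddle::operators::jit;
  const int64_t tbl_h = 100, tbl_w = 8, idx_h = 3, idx_w = 2;
  // out_width must equal table_width * index_width.
  jit::emb_seq_pool_attr_t attr(tbl_h, tbl_w, idx_h, idx_w, tbl_w * idx_w,
                                jit::SeqPoolType::kSum);
  std::vector<float> table(tbl_h * tbl_w, 1.f), out(tbl_w * idx_w);
  std::vector<int64_t> ids = {0, 1, 2, 3, 4, 5};  // idx_h * idx_w indices < tbl_h
  auto emb_seqpool = jit::Get<jit::kEmbSeqPool, jit::EmbSeqPoolTuples<float>,
                              paddle::platform::CPUPlace>(attr);
  emb_seqpool(table.data(), ids.data(), out.data(), &attr);
  // Segment w of out (tbl_w floats) is the element-wise sum of the idx_h table
  // rows picked by column w of ids.
}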
typedef struct matmul_attr_s {
int m, n, k;
void* packed_weight{nullptr};
......
......@@ -56,6 +56,11 @@ size_t JitCodeKey<matmul_attr_t>(const matmul_attr_t& attr) {
return (key << shift * 2) + ((static_cast<size_t>(attr.n)) << shift) + attr.k;
}
template <>
size_t JitCodeKey<emb_seq_pool_attr_t>(const emb_seq_pool_attr_t& attr) {
return attr.table_width;
}
} // namespace jit
} // namespace operators
} // namespace paddle
......@@ -13,3 +13,4 @@ USE_JITKERNEL_MORE(kVSigmoid, mkl)
USE_JITKERNEL_MORE(kVTanh, mkl)
USE_JITKERNEL_MORE(kSeqPool, mkl)
USE_JITKERNEL_MORE(kSoftmax, mkl)
USE_JITKERNEL_MORE(kEmbSeqPool, mkl)
......@@ -174,6 +174,16 @@ bool SeqPoolKernel<double>::UseMe(const seq_pool_attr_t& attr) const {
return true;
}
template <>
bool EmbSeqPoolKernel<float>::UseMe(const emb_seq_pool_attr_t& attr) const {
return true;
}
template <>
bool EmbSeqPoolKernel<double>::UseMe(const emb_seq_pool_attr_t& attr) const {
return true;
}
template <>
bool MatMulKernel<float>::UseMe(const matmul_attr_t& attr) const {
return platform::MayIUse(platform::avx);
......@@ -227,6 +237,7 @@ REGISTER_MKL_KERNEL(kVSquare, VSquare);
REGISTER_MKL_KERNEL(kVSigmoid, VSigmoid);
REGISTER_MKL_KERNEL(kVTanh, VTanh);
REGISTER_MKL_KERNEL(kSeqPool, SeqPool);
REGISTER_MKL_KERNEL(kEmbSeqPool, EmbSeqPool);
REGISTER_MKL_KERNEL(kSoftmax, Softmax);
#undef REGISTER_MKL_KERNEL
......@@ -18,6 +18,7 @@
#include <type_traits>
#include <vector>
#include "paddle/fluid/operators/jit/kernel_base.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace operators {
......@@ -91,6 +92,32 @@ void SeqPool(const T* x, T* y, const seq_pool_attr_t* attr) {
}
}
template <typename T>
void EmbSeqPool(const T* table, const int64_t* idx, T* out,
const emb_seq_pool_attr_t* attr) {
PADDLE_ENFORCE_EQ(attr->table_width * attr->index_width, attr->out_width);
auto check_idx_value_valid = [&](int64_t i) {
PADDLE_ENFORCE_LT(idx[i], attr->table_height, "idx value: %d, i: %d",
idx[i], i);
PADDLE_ENFORCE_GE(idx[i], 0, "idx value: %d, i: %d", idx[i], i);
};
for (int64_t w = 0; w != attr->index_width; ++w) {
check_idx_value_valid(w);
VCopy<T>(table + idx[w] * attr->table_width, out + w * attr->table_width,
attr->table_width);
}
for (int64_t h = 1; h < attr->index_height; ++h) {
for (int64_t w = 0; w < attr->index_width; ++w) {
int64_t i = h * attr->index_width + w;
check_idx_value_valid(i);
VAXPY<T>(static_cast<T>(1), table + idx[i] * attr->table_width,
out + w * attr->table_width, attr->table_width);
}
}
}
template <typename T>
void ASum(const T* x, T* res, int n);
......@@ -142,6 +169,8 @@ DECLARE_MKL_KERNEL(VSquare, XYNTuples);
DECLARE_MKL_KERNEL(SeqPool, SeqPoolTuples);
DECLARE_MKL_KERNEL(EmbSeqPool, EmbSeqPoolTuples);
DECLARE_MKL_KERNEL(Softmax, SoftmaxTuples);
#undef DECLARE_MKL_KERNEL
......
......@@ -32,3 +32,4 @@ USE_JITKERNEL_REFER(kVSquare)
USE_JITKERNEL_REFER(kHSum)
USE_JITKERNEL_REFER(kHMax)
USE_JITKERNEL_REFER(kSoftmax)
USE_JITKERNEL_REFER(kEmbSeqPool)
......@@ -57,4 +57,6 @@ REGISTER_REFER_KERNEL(kHSum, HSum);
REGISTER_REFER_KERNEL(kSoftmax, Softmax);
REGISTER_REFER_KERNEL(kEmbSeqPool, EmbSeqPool);
#undef REGISTER_REFER_KERNEL
......@@ -16,6 +16,7 @@
#include <cmath>
#include <limits>
#include <string>
#include "paddle/fluid/operators/jit/helper.h"
#include "paddle/fluid/operators/jit/kernel_base.h"
#include "paddle/fluid/platform/enforce.h"
......@@ -414,6 +415,37 @@ void Softmax(const T* x, T* y, int n, int bs = 1) {
}
}
// embedding seq pool
// table is a matrix with (tbl_h, tbl_w)
// idx is a matrix with (idx_h, idx_w)
// output is a vector with length tbl_w * idx_w
template <typename T>
void EmbSeqPool(const T* table, const int64_t* idx, T* out,
const emb_seq_pool_attr_t* attr) {
PADDLE_ENFORCE_EQ(attr->table_width * attr->index_width, attr->out_width);
auto check_idx_value_valid = [&](int64_t i) {
PADDLE_ENFORCE_LT(idx[i], attr->table_height, "idx value: %d, i: %d",
idx[i], i);
PADDLE_ENFORCE_GE(idx[i], 0, "idx value: %d, i: %d", idx[i], i);
};
for (int64_t w = 0; w != attr->index_width; ++w) {
check_idx_value_valid(w);
std::memcpy(out + w * attr->table_width, table + idx[w] * attr->table_width,
attr->table_width * sizeof(T));
}
for (int64_t h = 1; h < attr->index_height; ++h) {
for (int64_t w = 0; w < attr->index_width; ++w) {
int64_t i = h * attr->index_width + w;
check_idx_value_valid(i);
VAdd(table + idx[i] * attr->table_width, out + w * attr->table_width,
out + w * attr->table_width, attr->table_width);
}
}
}
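To make the comment above concrete, here is a tiny standalone re-implementation of the same sum pooling in plain C++ (not the refer kernel itself); all values are invented for illustration.
#include <cstdint>
#include <cstdio>
#include <vector>

// table is (tbl_h x tbl_w), idx is (idx_h x idx_w); out has idx_w segments of
// tbl_w values, and segment w is the element-wise sum of the table rows listed
// in column w of idx.
int main() {
  const int64_t tbl_w = 2, idx_h = 2, idx_w = 2;
  std::vector<float> table = {0, 1, 10, 11, 20, 21, 30, 31};  // 4 rows of width 2
  std::vector<int64_t> idx = {0, 2,   // first row of indices
                              3, 1};  // second row of indices
  std::vector<float> out(tbl_w * idx_w, 0.f);
  for (int64_t h = 0; h < idx_h; ++h) {
    for (int64_t w = 0; w < idx_w; ++w) {
      const int64_t row = idx[h * idx_w + w];
      for (int64_t c = 0; c < tbl_w; ++c) {
        out[w * tbl_w + c] += table[row * tbl_w + c];
      }
    }
  }
  // Column 0 picks rows 0 and 3 -> {30, 32}; column 1 picks rows 2 and 1 -> {30, 32}.
  for (float v : out) std::printf("%g ", v);  // prints: 30 32 30 32
  return 0;
}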
#define DECLARE_REFER_KERNEL(name, tuples) \
template <typename T> \
class name##Kernel : public ReferKernel<tuples<T>> { \
......@@ -462,6 +494,8 @@ DECLARE_REFER_KERNEL(HSum, XRNTuples);
DECLARE_REFER_KERNEL(Softmax, SoftmaxTuples);
DECLARE_REFER_KERNEL(EmbSeqPool, EmbSeqPoolTuples);
#undef DECLARE_REFER_KERNEL
} // namespace refer
......
......@@ -270,6 +270,32 @@ struct TestFuncWithRefer<jit::SeqPoolTuples<T>, std::vector<T>, std::vector<T>,
}
};
template <typename T>
struct TestFuncWithRefer<jit::EmbSeqPoolTuples<T>, std::vector<T>,
std::vector<int64_t>, std::vector<T>,
typename jit::EmbSeqPoolTuples<T>::attr_type> {
void operator()(const typename jit::EmbSeqPoolTuples<T>::func_type tgt,
const std::vector<T>& table, const std::vector<int64_t>& idx,
const std::vector<T>& oref,
const typename jit::EmbSeqPoolTuples<T>::attr_type& attr) {
EXPECT_TRUE(tgt != nullptr);
EXPECT_EQ(table.size(),
static_cast<size_t>(attr.table_height * attr.table_width));
EXPECT_EQ(idx.size(),
static_cast<size_t>(attr.index_height * attr.index_width));
EXPECT_EQ(oref.size(),
static_cast<size_t>(attr.table_width * attr.index_width));
const T* table_data = table.data();
const int64_t* idx_data = idx.data();
const T* oref_data = oref.data();
int o_w = oref.size();
std::vector<T> out(o_w);
T* o_data = out.data();
tgt(table_data, idx_data, o_data, &attr);
ExpectEQ<T>(o_data, oref_data, o_w);
}
};
template <typename T>
struct TestFuncWithRefer<jit::MatMulTuples<T>, std::vector<T>, std::vector<T>,
std::vector<T>,
......@@ -292,6 +318,63 @@ struct TestFuncWithRefer<jit::MatMulTuples<T>, std::vector<T>, std::vector<T>,
}
};
template <typename T>
struct TestFuncWithRefer<jit::LayerNormTuples<T>, std::vector<T>,
std::vector<T>, std::vector<T>, std::vector<T>,
std::vector<T>, std::vector<T>, int, float, int> {
void operator()(const typename jit::LayerNormTuples<T>::func_type tgt,
std::vector<T>& x, std::vector<T>& outref, // NOLINT
std::vector<T>& mean, std::vector<T>& var, // NOLINT
const std::vector<T>& scale, const std::vector<T>& bias,
int left, const float epsilon, int right) {
EXPECT_TRUE(tgt != nullptr);
EXPECT_EQ(x.size(), static_cast<size_t>(left * right));
EXPECT_EQ(outref.size(), static_cast<size_t>(left * right));
EXPECT_EQ(mean.size(), static_cast<size_t>(left));
EXPECT_EQ(var.size(), static_cast<size_t>(left));
EXPECT_EQ(scale.size(), static_cast<size_t>(right));
EXPECT_EQ(bias.size(), static_cast<size_t>(right));
std::vector<T> outtgt(outref.size());
const T* scale_data = scale.data();
const T* bias_data = bias.data();
T* x_data = x.data();
T* mean_data = mean.data();
T* var_data = var.data();
T* outref_data = outref.data();
T* outtgt_data = outtgt.data();
tgt(x_data, outtgt_data, mean_data, var_data, scale_data, bias_data, left,
epsilon, right);
ExpectEQ<T>(outtgt_data, outref_data, left * right);
}
};
template <typename T>
struct TestFuncWithRefer<jit::CRFDecodingTuples<T>, int, std::vector<T>,
std::vector<T>, std::vector<T>, std::vector<int>,
int> {
void operator()(const typename jit::CRFDecodingTuples<T>::func_type tgt,
const int seq_len, const std::vector<T>& x,
const std::vector<T>& w, std::vector<T>& alpharef, // NOLINT
std::vector<int>& trackref, int tag_num) { // NOLINT
constexpr int state_trans_base_idx = 2;
EXPECT_TRUE(tgt != nullptr);
EXPECT_EQ(x.size(), static_cast<size_t>(seq_len * tag_num));
EXPECT_EQ(w.size(),
static_cast<size_t>((tag_num + state_trans_base_idx) * tag_num));
EXPECT_EQ(alpharef.size(), static_cast<size_t>(seq_len * tag_num));
EXPECT_EQ(trackref.size(), static_cast<size_t>(seq_len * tag_num));
std::vector<T> alphatgt(alpharef.size());
std::vector<int> tracktgt(trackref.size());
memcpy(trackref.data(), tracktgt.data(), tag_num * sizeof(int));
tgt(seq_len, (const T*)x.data(), (const T*)w.data(), alphatgt.data(),
tracktgt.data(), tag_num);
ExpectEQ<T>(alpharef.data(), alphatgt.data(), seq_len * tag_num);
ExpectEQ<int>(trackref.data(), tracktgt.data(), seq_len * tag_num);
}
};
template <jit::KernelType KT, typename KernelTuples, typename PlaceType,
typename... Args>
void TestAllImpls(const typename KernelTuples::attr_type& attr, Args... args) {
......@@ -587,6 +670,40 @@ void TestSoftmaxKernel() {
}
}
template <jit::KernelType KT, typename T, typename PlaceType>
void TestEmbSeqPoolKernel() {
VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
int64_t tbl_h = 1e4;
std::vector<jit::SeqPoolType> pool_types = {
jit::SeqPoolType::kSum}; // only support sum yet
for (int tbl_w : TestSizes()) {
std::vector<T> table(tbl_h * tbl_w);
RandomVec<T>(tbl_h * tbl_w, table.data(), -2.f, 2.f);
const T* table_data = table.data();
for (auto type : pool_types) {
for (int idx_w : {1, 2, 10, 16}) {
for (int idx_h : {1, 2, 9, 13, 16}) {
auto ref = jit::GetRefer<KT, jit::EmbSeqPoolTuples<T>>();
EXPECT_TRUE(ref != nullptr);
std::vector<int64_t> idx(idx_h * idx_w);
RandomVec<int64_t>(idx_h * idx_w, idx.data(), 0, tbl_h - 1);
int64_t out_w = tbl_w * idx_w;
std::vector<T> oref(out_w);
const int64_t* idx_data = idx.data();
T* o_data = oref.data();
jit::emb_seq_pool_attr_t attr(tbl_h, tbl_w, idx_h, idx_w, out_w,
type);
ref(table_data, idx_data, o_data, &attr);
TestAllImpls<KT, jit::EmbSeqPoolTuples<T>, PlaceType, std::vector<T>,
std::vector<int64_t>, std::vector<T>>(attr, table, idx,
oref, attr);
}
}
}
}
}
template <jit::KernelType KT, typename T, typename PlaceType>
void TestNCHW16CMulNCKernel() {
VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
......@@ -640,6 +757,71 @@ void TestNCHW16CMulNCKernel() {
}
}
template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
void TestLayerNormKernel() {
VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
const T epsilon = 9.99999975e-06;
for (int n : {1, 2, 10}) {
for (int x_dim_0 : {1, 9, 17, 50}) {
int left = n * x_dim_0;
for (int x_dim_1 : TestSizes()) {
int right = x_dim_1;
auto ref = jit::GetRefer<KT, jit::LayerNormTuples<T>>();
EXPECT_TRUE(ref != nullptr);
int sz = left * right;
std::vector<T> x(sz), mean(left), var(left), scale(right), bias(right),
outref(sz);
RandomVec<T>(sz, x.data(), -2.f, 2.f);
RandomVec<T>(left, mean.data(), -2.f, 2.f);
RandomVec<T>(left, var.data(), -2.f, 2.f);
RandomVec<T>(right, scale.data(), -2.f, 2.f);
RandomVec<T>(right, bias.data(), -2.f, 2.f);
const T* scale_data = scale.data();
const T* bias_data = bias.data();
T* x_data = x.data();
T* mean_data = mean.data();
T* var_data = var.data();
T* outref_data = outref.data();
ref(x_data, outref_data, mean_data, var_data, scale_data, bias_data,
left, epsilon, right);
TestAllImpls<KT, jit::LayerNormTuples<T>, PlaceType, std::vector<T>,
std::vector<T>, std::vector<T>, std::vector<T>,
std::vector<T>, std::vector<T>, int, float>(
right, x, outref, mean, var, scale, bias, left, epsilon, right);
}
}
}
}
template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
void TestCRFDecodingKernel() {
VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
constexpr int state_trans_base_idx = 2;
for (int seq_len : {1, 11, 17, 50}) {
for (int tag_num : TestSizes()) {
auto ref = jit::GetRefer<KT, jit::CRFDecodingTuples<T>>();
EXPECT_TRUE(ref != nullptr);
int x_sz = seq_len * tag_num;
int w_sz = (tag_num + state_trans_base_idx) * tag_num;
std::vector<T> x(x_sz), w(w_sz), alpharef(x_sz);
std::vector<int> trackref(x_sz);
RandomVec<T>(x_sz, x.data(), -2.f, 2.f);
RandomVec<T>(w_sz, w.data(), -2.f, 2.f);
ref(seq_len, (const T*)x.data(), (const T*)w.data(), alpharef.data(),
trackref.data(), tag_num);
TestAllImpls<KT, jit::CRFDecodingTuples<T>, PlaceType, int,
std::vector<T>, std::vector<T>, std::vector<T>,
std::vector<int>, int>(tag_num, seq_len, x, w, alpharef,
trackref, tag_num);
}
}
}
// XYZNTuple
TEST(JITKernel, kVMul) {
TestXYZNKernel<jit::kVMul, float, CPUPlace>();
......@@ -756,12 +938,26 @@ TEST(JITKernel, kSoftmax) {
TestSoftmaxKernel<jit::kSoftmax, double, CPUPlace>();
}
TEST(JITKernel, kEmbSeqPool) {
TestEmbSeqPoolKernel<jit::kEmbSeqPool, float, CPUPlace>();
TestEmbSeqPoolKernel<jit::kEmbSeqPool, double, CPUPlace>();
}
TEST(JITKernel, kNCHW16CMulNC) {
TestNCHW16CMulNCKernel<jit::kNCHW16CMulNC, float, CPUPlace>();
TestNCHW16CMulNCKernel<jit::kNCHW16CMulNC, double, CPUPlace>();
}
// TODO(yihua/TJ): add crf decoding and layer norm unit tests
TEST(JITKernel, kLayerNorm) {
TestLayerNormKernel<jit::kLayerNorm, float, paddle::platform::CPUPlace>();
TestLayerNormKernel<jit::kLayerNorm, double, paddle::platform::CPUPlace>();
}
TEST(JITKernel, kCRFDecoding) {
TestCRFDecodingKernel<jit::kCRFDecoding, float, paddle::platform::CPUPlace>();
TestCRFDecodingKernel<jit::kCRFDecoding, double,
paddle::platform::CPUPlace>();
}
TEST(JITKernel, pool) {
// TODO(TJ): add some test
......
......@@ -64,7 +64,7 @@ class LoadCombineOp : public framework::OperatorBase {
auto *tensor = out_var->GetMutable<framework::LoDTensor>();
// Error checking
PADDLE_ENFORCE(static_cast<bool>(buffer), "Cannot read more");
PADDLE_ENFORCE(static_cast<bool>(*buffer), "Cannot read more");
// Get data from fin to tensor
DeserializeFromStream(*buffer, tensor, dev_ctx);
......@@ -90,6 +90,10 @@ class LoadCombineOp : public framework::OperatorBase {
tensor->ShareDataWith(fp16_tensor);
}
}
buffer->peek();
PADDLE_ENFORCE(buffer->eof(),
"You are not allowed to load partial data via "
"load_combine_op, use load_op instead.");
}
};
......
......@@ -311,6 +311,10 @@ class LSTMGradKernel : public framework::OpKernel<T> {
lstm_grad.prev_state_grad = c0_g ? ordered_c0_g.data<T>() : nullptr;
}
// lstm_value.output_value not used in bp, set to nullptr
// lstm_grad.state_active_grad not used in bp, set to nullptr
lstm_value.output_value = nullptr;
lstm_grad.state_active_grad = nullptr;
int cur_batch_size = bend - bstart;
math::LstmUnitGradFunctor<DeviceContext, T>::compute(
device_ctx, lstm_value, lstm_grad, frame_size, cur_batch_size,
......
......@@ -405,6 +405,11 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
}
int cur_batch_size = bend - bstart;
// lstmp_value.output_value not used in bp, set to null
// lstmp_grad.state_active_grad not used in bp, set to null
lstmp_value.output_value = nullptr;
lstmp_grad.state_active_grad = nullptr;
math::LstmUnitGradFunctor<DeviceContext, T>::compute(
device_ctx, lstmp_value, lstmp_grad, frame_size, cur_batch_size,
gate_act, cell_act, cand_act);
......
......@@ -36,6 +36,8 @@ std::map<std::string,
{"conv2d_grad", NG_OPS::BuildConv2dGradNode},
{"batch_norm", NG_OPS::BuildBatchNormNode},
{"batch_norm_grad", NG_OPS::BuildBatchNormGradNode},
{"cross_entropy", NG_OPS::BuildCrossEntropyNode},
{"cross_entropy_grad", NG_OPS::BuildCrossEntropyGradNode},
{"elementwise_add", NG_OPS::BuildElementwiseAddNode},
{"elementwise_add_grad", NG_OPS::BuildElementwiseAddGradNode},
{"fill_constant", NG_OPS::BuildFillConstantNode},
......
......@@ -26,6 +26,7 @@ limitations under the License. */
#include "ops/batch_norm_op.h"
#include "ops/binary_unary_op.h"
#include "ops/conv2d_op.h"
#include "ops/cross_entropy_op.h"
#include "ops/elementwise_add_op.h"
#include "ops/fill_constant_op.h"
#include "ops/mean_op.h"
......
......@@ -44,6 +44,10 @@ void BuildBatchNormNode(
const float epsilon = op_attrs.Get<float>("epsilon");
const float momentum = op_attrs.Get<float>("momentum");
PADDLE_ENFORCE(
data_layout == "NHWC" || data_layout == "NCHW" || data_layout == "NC",
"The BatchNorm operator only supports NHWC/NCHW/NC data format");
if (data_layout == "NHWC") {
x = paddle::platform::Nhwc2Nchw(x);
}
......@@ -110,6 +114,9 @@ void BuildBatchNormGradNode(
"BN grap input size needs to be 2 or 4");
PADDLE_ENFORCE_EQ(x_shape.size(), dy_shape.size(),
"BN grap input and delta size needs to be equal");
PADDLE_ENFORCE(
data_layout == "NHWC" || data_layout == "NCHW" || data_layout == "NC",
"The BatchNorm operator only supports NHWC/NCHW/NC data format");
if (x_shape.size() == 2) {
x = std::make_shared<ngraph::op::Reshape>(
......
/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <functional>
#include <string>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/platform/ngraph_helper.h"
namespace paddle {
namespace operators {
namespace ngraphs {
void BuildCrossEntropyNode(
const std::shared_ptr<paddle::framework::OperatorBase>& op,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
auto label = paddle::platform::GetInputNode(op, "Label", ngb_node_map);
auto label_shape = label->get_shape();
auto x_shape = x->get_shape();
auto label_rank = label_shape.size();
auto x_rank = x_shape.size();
std::shared_ptr<ngraph::Node> x_2d = x, label_2d = label;
auto label_2d_shape = label_shape, x_2d_shape = x_shape;
if (label_rank > 2) {
label_2d_shape = paddle::platform::FlattenTo2d(label_shape, label_rank - 1);
label_2d = paddle::platform::NgReshaper(label, label_2d_shape);
}
if (x_rank > 2) {
x_2d_shape = paddle::platform::FlattenTo2d(x_shape, x_rank - 1);
x_2d = paddle::platform::NgReshaper(x, x_2d_shape);
}
auto batch_size = x_2d_shape.at(0);
auto op_attrs = paddle::framework::AttrReader(op->Attrs());
const bool is_soft_label = op_attrs.Get<bool>("soft_label");
std::shared_ptr<ngraph::Node> node_1_hot = label_2d;
if (!is_soft_label) {
auto label_1d = paddle::platform::NgReshaper(
label_2d, ngraph::Shape{label_2d_shape.at(0)});
node_1_hot = std::make_shared<ngraph::op::OneHot>(label_1d, x_2d_shape, 1);
}
if (x->get_element_type() != node_1_hot->get_element_type()) {
node_1_hot = std::make_shared<ngraph::op::Convert>(node_1_hot,
x->get_element_type());
}
auto node_log = std::make_shared<ngraph::op::Log>(x_2d);
auto high_clip = ngraph::op::Constant::create(node_log->get_element_type(),
node_log->get_shape(), {1e20});
auto low_clip = ngraph::op::Constant::create(node_log->get_element_type(),
node_log->get_shape(), {-1e20});
auto node_min = std::make_shared<ngraph::op::Minimum>(node_log, high_clip);
auto node_max = std::make_shared<ngraph::op::Maximum>(node_min, low_clip);
auto node_mul = node_1_hot * node_log;
auto node_sum =
std::make_shared<ngraph::op::Sum>(node_mul, ngraph::AxisSet{1});
auto node_neg = std::make_shared<ngraph::op::Negative>(node_sum);
auto xe =
paddle::platform::NgReshaper(node_neg, ngraph::Shape{batch_size, 1});
if (!is_soft_label) {
auto ignore_index = op_attrs.Get<int>("ignore_index");
auto ignore_node = ngraph::op::Constant::create(
label->get_element_type(), label_2d_shape, {ignore_index});
auto not_equal_node =
std::make_shared<ngraph::op::NotEqual>(label_2d, ignore_node);
auto mask = std::make_shared<ngraph::op::Convert>(not_equal_node,
xe->get_element_type());
xe = xe * mask;
}
paddle::platform::SetOutputNode(op, "Y", xe, ngb_node_map);
}
void BuildCrossEntropyGradNode(
const std::shared_ptr<paddle::framework::OperatorBase>& op,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
auto op_attrs = paddle::framework::AttrReader(op->Attrs());
const bool is_soft_label = op_attrs.Get<bool>("soft_label");
auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
auto label = paddle::platform::GetInputNode(op, "Label", ngb_node_map);
auto dy = paddle::platform::GetInputNode(op, "Y@GRAD", ngb_node_map);
auto x_shape = x->get_shape();
auto rank = x_shape.size();
std::shared_ptr<ngraph::Node> mask;
if (!is_soft_label) {
auto label_shape = label->get_shape();
label_shape.pop_back();
label = paddle::platform::NgReshaper(label, label_shape);
auto ignore_index = op_attrs.Get<int>("ignore_index");
auto ignore_node = ngraph::op::Constant::create(
label->get_element_type(), label_shape, {ignore_index});
auto not_equal_node =
std::make_shared<ngraph::op::NotEqual>(label, ignore_node);
mask = std::make_shared<ngraph::op::Convert>(not_equal_node,
x->get_element_type());
mask = std::make_shared<ngraph::op::Broadcast>(mask, x_shape,
ngraph::AxisSet{rank - 1});
label = std::make_shared<ngraph::op::OneHot>(label, x_shape, rank - 1);
}
auto dy_shape = dy->get_shape();
dy_shape.pop_back();
auto dy_reshape = paddle::platform::NgReshaper(dy, dy_shape);
auto dy_bcast = std::make_shared<ngraph::op::Broadcast>(
dy_reshape, x_shape, ngraph::AxisSet{rank - 1});
if (x->get_element_type() != label->get_element_type()) {
label = std::make_shared<ngraph::op::Convert>(label, x->get_element_type());
}
auto xe_grad = -label * dy_bcast / x;
if (!is_soft_label) {
xe_grad = xe_grad * mask;
}
paddle::platform::SetOutputNode(op, "X@GRAD", xe_grad, ngb_node_map);
}
} // namespace ngraphs
} // namespace operators
} // namespace paddle
......@@ -46,8 +46,6 @@ void BuildFillConstantNode(
ng_dtype = ngraph::element::i64;
} else if (data_type == paddle::framework::proto::VarType::INT32) {
ng_dtype = ngraph::element::i32;
} else if (data_type == paddle::framework::proto::VarType::BOOL) {
ng_dtype = ngraph::element::boolean;
} else {
PADDLE_THROW("unsupported data type: %s", data_type);
}
......
......@@ -109,23 +109,23 @@ from future subsequences in a computationally efficient manner to improve
unidirectional recurrent neural networks. The row convolution operator is
different from the 1D sequence convolution, and is computed as follows:
Given an input sequence $in$ of length $t$ and input dimension $d$,
and a filter ($W$) of size $context \times d$,
Given an input sequence $X$ of length $t$ and input dimension $D$,
and a filter ($W$) of size $context \times D$,
the output sequence is convolved as:
$$
out_{i, :} = \\sum_{j=i}^{i + context} in_{j,:} \\cdot W_{i-j, :}
out_{i} = \\sum_{j=i}^{i + context - 1} X_{j} \\cdot W_{j-i}
$$
In the above equation:
* $Out_{i}$: The i-th row of output variable with shape [1, D].
* $\\tau$: Future context size.
* $context$: Future context size.
* $X_{j}$: The j-th row of input variable with shape [1, D].
* $W_{i-j}$: The (i-j)-th row of parameters with shape [1, D].
* $W_{j-i}$: The (j-i)-th row of parameters with shape [1, D].
For more details about row_conv, please refer to
the design document
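To make the formula above concrete, a hedged NumPy sketch of the row convolution (per-row elementwise products summed over the future context window; positions past the end of the sequence are assumed to contribute zero):

```python
import numpy as np

def row_conv_ref(x, w):
    # x: (t, D) input sequence, w: (context, D) filter
    # out[i] = sum_{j=0}^{context-1} x[i + j] * w[j]  (elementwise over D)
    t, d = x.shape
    context = w.shape[0]
    out = np.zeros_like(x)
    for i in range(t):
        for j in range(context):
            if i + j < t:
                out[i] += x[i + j] * w[j]
    return out

x = np.random.rand(5, 3).astype('float32')
w = np.random.rand(2, 3).astype('float32')   # context = 2
print(row_conv_ref(x, w).shape)  # (5, 3)
```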
......
......@@ -233,9 +233,11 @@ inline void throw_on_error(ncclResult_t stat, const std::string& msg) {
#endif // __APPLE__ and windows
#endif // PADDLE_WITH_CUDA
#define PADDLE_THROW(...) \
throw ::paddle::platform::EnforceNotMet( \
::paddle::string::Sprintf(__VA_ARGS__), __FILE__, __LINE__)
#define PADDLE_THROW(...) \
do { \
throw ::paddle::platform::EnforceNotMet( \
::paddle::string::Sprintf(__VA_ARGS__), __FILE__, __LINE__); \
} while (0)
#define PADDLE_ENFORCE(COND, ...) \
do { \
......@@ -270,23 +272,25 @@ inline void throw_on_error(ncclResult_t stat, const std::string& msg) {
* extra messages is also supported, for example:
* PADDLE_ENFORCE(a, b, "some simple enforce failed between %d numbers", 2)
*/
#define PADDLE_ENFORCE_NOT_NULL(__VAL, ...) \
do { \
if (UNLIKELY(nullptr == (__VAL))) { \
PADDLE_THROW(#__VAL " should not be null\n%s", \
paddle::string::Sprintf("" __VA_ARGS__)); \
} \
#define PADDLE_ENFORCE_NOT_NULL(__VAL, ...) \
do { \
if (UNLIKELY(nullptr == (__VAL))) { \
PADDLE_THROW(#__VAL " should not be null\n%s", \
::paddle::string::Sprintf(__VA_ARGS__)); \
} \
} while (0)
#define __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, __CMP, __INV_CMP, ...) \
do { \
if (UNLIKELY(!((__VAL0)__CMP(__VAL1)))) { \
auto __cond1__ = (__VAL0); \
auto __cond2__ = (__VAL1); \
if (UNLIKELY(!((__cond1__)__CMP(__cond2__)))) { \
PADDLE_THROW("Enforce failed. Expected %s " #__CMP \
" %s, but received %s:%s " #__INV_CMP " %s:%s.\n%s", \
#__VAL0, #__VAL1, #__VAL0, \
paddle::string::to_string(__VAL0), #__VAL1, \
paddle::string::to_string(__VAL1), \
paddle::string::Sprintf("" __VA_ARGS__)); \
::paddle::string::to_string(__cond1__), #__VAL1, \
::paddle::string::to_string(__cond2__), \
::paddle::string::Sprintf(__VA_ARGS__)); \
} \
} while (0)
......
......@@ -13,10 +13,12 @@
// limitations under the License.
#include "paddle/fluid/pybind/ir.h"
#include <algorithm>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/node.h"
#include "paddle/fluid/framework/op_desc.h"
......@@ -27,6 +29,10 @@ namespace py = pybind11;
using paddle::framework::ir::Graph;
using paddle::framework::ir::Node;
using paddle::framework::ir::GraphSafeRemoveNodes;
using paddle::framework::ir::HasCircle;
using paddle::framework::ir::GraphNum;
using paddle::framework::ir::TopologySortOperations;
using paddle::framework::ir::BuildOperationAdjList;
using paddle::framework::OpDesc;
using paddle::framework::ProgramDesc;
using paddle::framework::VarDesc;
......@@ -36,6 +42,12 @@ namespace paddle {
namespace pybind {
void BindGraph(py::module *m) {
m->def("graph_safe_remove_nodes", GraphSafeRemoveNodes);
m->def("has_circle", HasCircle);
m->def("graph_num", GraphNum);
m->def("topology_sort", TopologySortOperations,
return_value_policy::reference);
m->def("build_adjacency_list", BuildOperationAdjList,
return_value_policy::reference);
py::class_<Graph, std::shared_ptr<Graph>>(
*m, "Graph",
"The graph is a Directed Acyclic Single Static Assignment Graph, see "
......@@ -46,7 +58,6 @@ void BindGraph(py::module *m) {
.def("get_float", &Graph::Get<float>)
.def("get_double", &Graph::Get<double>)
.def("get_string", &Graph::Get<std::string>)
.def("get_program", &Graph::Get<ProgramDesc>)
.def("get_marked_nodes", &Graph::Get<std::unordered_set<const Node *>>)
.def("set", [](Graph &self, const std::string &attr_name,
int attr) { return self.Set(attr_name, new int(attr)); })
......@@ -63,11 +74,6 @@ void BindGraph(py::module *m) {
[](Graph &self, const std::string &attr_name, double attr) {
return self.Set(attr_name, new double(attr));
})
.def("set",
[](Graph &self, const std::string &attr_name,
const ProgramDesc &attr) {
return self.Set(attr_name, new ProgramDesc(attr));
})
.def("set",
[](Graph &self, const std::string &attr_name,
const std::unordered_set<const Node *> &attr) {
......@@ -108,42 +114,42 @@ void BindNode(py::module *m) {
.def("is_op", &Node::IsOp)
.def("is_var", &Node::IsVar)
.def("is_ctrl_var", &Node::IsCtrlVar)
.def("clear_inputs", [](Node &self) { self.inputs.clear(); })
.def("inputs_remove",
[](Node &self, int node_id) {
for (auto it = self.inputs.begin(); it != self.inputs.end();
it++) {
if ((*it)->id() == node_id) {
self.inputs.erase(it);
}
auto pos = std::find_if(
self.inputs.begin(), self.inputs.end(),
[&node_id](const Node *n) { return n->id() == node_id; });
if (pos != self.inputs.end()) {
self.inputs.erase(pos);
}
})
.def("inputs_remove",
[](Node &self, Node &node) {
for (auto it = self.inputs.begin(); it != self.inputs.end();
it++) {
if (*it == &node) {
self.inputs.erase(it);
}
auto pos =
std::find(self.inputs.begin(), self.inputs.end(), &node);
if (pos != self.inputs.end()) {
self.inputs.erase(pos);
}
})
.def("inputs_append",
[](Node &self, Node &node) { self.inputs.push_back(&node); })
.def("clear_outputs", [](Node &self) { self.outputs.clear(); })
.def("outputs_remove",
[](Node &self, int node_id) {
for (auto it = self.outputs.begin(); it != self.outputs.end();
it++) {
if ((*it)->id() == node_id) {
self.outputs.erase(it);
}
auto pos = std::find_if(
self.outputs.begin(), self.outputs.end(),
[&node_id](const Node *n) { return n->id() == node_id; });
if (pos != self.outputs.end()) {
self.outputs.erase(pos);
}
})
.def("outputs_remove",
[](Node &self, Node &node) {
for (auto it = self.outputs.begin(); it != self.outputs.end();
it++) {
if (*it == &node) {
self.outputs.erase(it);
}
auto pos =
std::find(self.outputs.begin(), self.outputs.end(), &node);
if (pos != self.outputs.end()) {
self.outputs.erase(pos);
}
})
.def("outputs_append",
......
......@@ -373,7 +373,13 @@ PYBIND11_MODULE(core, m) {
PADDLE_ENFORCE(CheckLoD(new_lod, vectorize(self.dims()).front()),
"the provided lod info is invalid");
self.set_lod(new_lod);
})
},
py::arg("lod"), R"DOC(
Set LoD of the LoDTensor.
Args:
lod (List[List[int]]): the lod to be set.
)DOC")
.def("set_recursive_sequence_lengths",
[](LoDTensor &self, const std::vector<std::vector<size_t>>
&recursive_sequence_lengths) {
......@@ -389,7 +395,17 @@ PYBIND11_MODULE(core, m) {
CheckLoD(new_offset_lod, vectorize(self.dims()).front()),
"the provided recursive_sequence_lengths info is invalid");
self.set_lod(new_offset_lod);
})
},
py::arg("recursive_sequence_lengths"), R"DOC(
Set LoD of the LoDTensor according to recursive sequence length.
For example, if recursive_sequence_lengths=[[2, 3]], meaning that
there are two sequences with length 2 and 3 respectively, the
corresponding lod would be [[0, 2, 2+3]], i.e., [[0, 2, 5]].
Args:
recursive_sequence_lengths (List[List[int]]): sequence lengths.
)DOC")
.def("lod",
[](LoDTensor &self) -> std::vector<std::vector<size_t>> {
// output the offset-based lod info
......@@ -398,7 +414,13 @@ PYBIND11_MODULE(core, m) {
new_lod.reserve(lod.size());
std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
return new_lod;
})
},
R"DOC(
Return the LoD of the LoDTensor.
Returns:
out (List[List[int]]): the lod of the LoDTensor.
)DOC")
// Set above comments of set_lod.
.def("recursive_sequence_lengths",
[](LoDTensor &self) -> std::vector<std::vector<size_t>> {
......@@ -408,12 +430,25 @@ PYBIND11_MODULE(core, m) {
new_lod.reserve(lod.size());
std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
return new_lod;
})
.def("has_valid_recursive_sequence_lengths", [](LoDTensor &self) -> bool {
// Check that the lod info is valid and match the outermost
// dimension of the LoDTensor data
return CheckLoD(self.lod(), vectorize(self.dims()).front());
});
},
R"DOC(
Return the sequence lengths of the LoDTensor corresponding to its LoD.
Returns:
out (List[List[int]]): the sequence lengths.
)DOC")
.def("has_valid_recursive_sequence_lengths",
[](LoDTensor &self) -> bool {
// Check that the lod info is valid and match the outermost
// dimension of the LoDTensor data
return CheckLoD(self.lod(), vectorize(self.dims()).front());
},
R"DOC(
Check whether the lod of the LoDTensor is valid.
Returns:
out (bool): whether the lod is valid.
)DOC");
py::class_<SelectedRows>(m, "SelectedRows")
.def("__init__",
......@@ -549,11 +584,45 @@ All parameter, weight, gradient are variables in Paddle.
[](Scope &self, const std::string &name) -> Variable * {
return self.Var(name);
},
py::arg("name"),
R"DOC(
Find or create variable named :code:`name` in the current scope.
If the variable named :code:`name` does not exist in the
current scope, the variable would be created. Otherwise,
return the existing variable.
Args:
name (str): the variable name.
Returns:
out (core.Variable): the found or created variable.
)DOC",
py::return_value_policy::reference)
.def("find_var", &Scope::FindVar, py::arg("name"),
R"DOC(
Find variable named :code:`name` in the current scope or
its parent scope. Return None if not found.
Args:
name (str): the variable name.
Returns:
out (core.Variable|None): the found variable or None.
)DOC",
py::return_value_policy::reference)
.def("find_var", &Scope::FindVar, py::return_value_policy::reference)
.def("new_scope", [](Scope &self) -> Scope * { return &self.NewScope(); },
R"DOC(
Create a new sub-scope of the current scope.
Returns:
out (core._Scope): the created sub-scope.
)DOC",
py::return_value_policy::reference)
.def("drop_kids", &Scope::DropKids);
.def("drop_kids", &Scope::DropKids,
R"DOC(
Delete all sub-scopes of the current scope.
)DOC");
m.def("Scope",
[]() -> Scope * {
......@@ -561,6 +630,12 @@ All parameter, weight, gradient are variables in Paddle.
ScopePool::Instance().Insert(std::unique_ptr<Scope>(s));
return s;
},
R"DOC(
Create a new scope.
Returns:
out (core._Scope): the created scope.
)DOC",
py::return_value_policy::reference);
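A hedged sketch of the Scope API documented above (the variable name is arbitrary, not tied to any particular model):

```python
import paddle.fluid as fluid

scope = fluid.core.Scope()             # create a new scope
scope.var("fc_0.w_0")                  # find or create a variable in this scope
assert scope.find_var("fc_0.w_0") is not None
assert scope.find_var("not_created") is None
sub_scope = scope.new_scope()          # sub-scopes can see parent variables
assert sub_scope.find_var("fc_0.w_0") is not None
scope.drop_kids()                      # delete all sub-scopes
```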
//! @note: Be careful! PyBind will return std::string as an unicode, not
......@@ -789,11 +864,13 @@ All parameter, weight, gradient are variables in Paddle.
self[i].ShareDataWith(t);
self[i].set_lod(t.lod());
})
.def("append", [](LoDTensorArray &self, const LoDTensor &t) {
self.emplace_back();
self.back().ShareDataWith(t);
self.back().set_lod(t.lod());
});
.def("append",
[](LoDTensorArray &self, const LoDTensor &t) {
self.emplace_back();
self.back().ShareDataWith(t);
self.back().set_lod(t.lod());
},
py::arg("tensor"), "Append a LoDensor to LoDTensorArray.");
m.def("IsInplace",
[](std::string op) -> bool { return operators::IsInplace(op); });
......@@ -829,8 +906,7 @@ All parameter, weight, gradient are variables in Paddle.
m.def("disable_profiler", platform::DisableProfiler);
m.def("is_profiler_enabled", platform::IsProfileEnabled);
m.def("reset_profiler", platform::ResetProfiler);
m.def("get_pass", [](const py::bytes &binary_str) {
std::string pass_type(binary_str);
m.def("get_pass", [](const std::string &pass_type) {
auto pass = framework::ir::PassRegistry::Instance().Get(pass_type);
return std::shared_ptr<framework::ir::Pass>(std::move(pass));
});
......@@ -838,10 +914,9 @@ All parameter, weight, gradient are variables in Paddle.
py::class_<ir::Pass, std::shared_ptr<ir::Pass>> pass(m, "Pass");
pass.def(py::init())
.def("has", &ir::Pass::Has)
.def("set",
[](ir::Pass &self, const std::string &attr_name,
const ProgramDesc &attr) {
return self.Set(attr_name, new ProgramDesc(attr));
.def("set_not_owned",
[](ir::Pass &self, const std::string &attr_name, ProgramDesc &attr) {
self.SetNotOwned<ProgramDesc>(attr_name, &attr);
})
.def(
"set",
......@@ -850,7 +925,6 @@ All parameter, weight, gradient are variables in Paddle.
})
.def("set", [](ir::Pass &self, const std::string &name,
int val) { self.Set<const int>(name, new int(val)); })
.def("get_program", &ir::Pass::Get<ProgramDesc>)
.def("type", &ir::Pass::Type)
.def("apply", [](ir::Pass &self, std::shared_ptr<ir::Graph> graph) {
std::unique_ptr<ir::Graph> origin_graph(graph.get());
......
......@@ -88,6 +88,7 @@ function cmake_gen() {
-DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.5/include/python3.5m/
-DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.5/lib/libpython3.5m.dylib"
WITH_FLUID_ONLY=${WITH_FLUID_ONLY:-ON}
pip3.5 uninstall -y protobuf
pip3.5 install --user -r ${PADDLE_ROOT}/python/requirements.txt
else
exit 1
......@@ -101,6 +102,7 @@ function cmake_gen() {
-DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.6/include/python3.6m/
-DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.6/lib/libpython3.6m.dylib"
WITH_FLUID_ONLY=${WITH_FLUID_ONLY:-ON}
pip3.6 uninstall -y protobuf
pip3.6 install --user -r ${PADDLE_ROOT}/python/requirements.txt
else
exit 1
......@@ -114,6 +116,7 @@ function cmake_gen() {
-DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.7/include/python3.7m/
-DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.7/lib/libpython3.7m.dylib"
WITH_FLUID_ONLY=${WITH_FLUID_ONLY:-ON}
pip3.7 uninstall -y protobuf
pip3.7 install --user -r ${PADDLE_ROOT}/python/requirements.txt
else
exit 1
......@@ -128,31 +131,44 @@ function cmake_gen() {
PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27m/bin/python
-DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27m/include/python2.7
-DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs2/lib/libpython2.7.so"
pip uninstall -y protobuf
pip install -r ${PADDLE_ROOT}/python/requirements.txt
elif [ "$1" == "cp27-cp27mu" ]; then
export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.11-ucs2/lib:}
export PATH=/opt/python/cp27-cp27mu/bin/:${PATH}
PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27mu/bin/python
-DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27mu/include/python2.7
-DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs4/lib/libpython2.7.so"
pip uninstall -y protobuf
pip install -r ${PADDLE_ROOT}/python/requirements.txt
elif [ "$1" == "cp35-cp35m" ]; then
export LD_LIBRARY_PATH=/opt/_internal/cpython-3.5.1/lib/:${LD_LIBRARY_PATH}
export PATH=/opt/_internal/cpython-3.5.1/bin/:${PATH}
export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.5.1/bin/python3
-DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.5.1/include/python3.5m
-DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.5.1/lib/libpython3.so"
pip3.5 uninstall -y protobuf
pip3.5 install -r ${PADDLE_ROOT}/python/requirements.txt
elif [ "$1" == "cp36-cp36m" ]; then
export LD_LIBRARY_PATH=/opt/_internal/cpython-3.6.0/lib/:${LD_LIBRARY_PATH}
export PATH=/opt/_internal/cpython-3.6.0/bin/:${PATH}
export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.6.0/bin/python3
-DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.6.0/include/python3.6m
-DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.6.0/lib/libpython3.so"
pip3.6 uninstall -y protobuf
pip3.6 install -r ${PADDLE_ROOT}/python/requirements.txt
elif [ "$1" == "cp37-cp37m" ]; then
export LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH}
export PATH=/opt/_internal/cpython-3.7.0/bin/:${PATH}
export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.7.0/bin/python3.7
-DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.7.0/include/python3.7m
-DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.7.0/lib/libpython3.so"
pip3.7 uninstall -y protobuf
pip3.7 install -r ${PADDLE_ROOT}/python/requirements.txt
fi
else
pip uninstall -y protobuf
pip install -r ${PADDLE_ROOT}/python/requirements.txt
fi
fi
......
......@@ -64,6 +64,7 @@ if (WITH_TESTING)
add_subdirectory(paddle/dataset/tests)
add_subdirectory(paddle/fluid/tests)
add_subdirectory(paddle/fluid/contrib/tests)
add_subdirectory(paddle/fluid/contrib/slim/tests)
endif()
install(DIRECTORY ${PADDLE_PYTHON_PACKAGE_DIR}
DESTINATION opt/paddle/share/wheels
......
......@@ -177,7 +177,10 @@ class CompiledProgram(object):
# FIXME(dzhwinter): enable_inplace should be after memory_optimize
# if turn on python memory optimize, turn off the inplace_pass.
self._build_strategy.enable_inplace = False if self._program._is_mem_optimized else True
if self._build_strategy.memory_optimize is None:
self._build_strategy.memory_optimize = False if main._is_mem_optimized else True
if self._build_strategy.enable_inplace is None:
self._build_strategy.enable_inplace = False if main._is_mem_optimized else True
if self._build_strategy.num_trainers > 1 and trainers_endpoints:
assert self._build_strategy.num_trainers == len(
......
......@@ -63,10 +63,10 @@ Notes:
## 4. How to reproduce the results
* Small dataset
```bash
python python/paddle/fluid/contrib/tests/test_calibration.py
FLAGS_use_mkldnn=true python python/paddle/fluid/contrib/tests/test_calibration.py
```
* Full dataset
```bash
DATASET=full python python/paddle/fluid/contrib/tests/test_calibration.py
FLAGS_use_mkldnn=true DATASET=full python python/paddle/fluid/contrib/tests/test_calibration.py
```
file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
foreach(src ${TEST_OPS})
py_test(${src} SRCS ${src}.py)
endforeach()
version: 1.0
include: ["./unitest/configs/pruners.yaml", "./unitest/configs/pruners_0.yaml"]
include: ["./configs/pruners.yaml", "./configs/pruners_0.yaml"]
pruners:
pruner_1:
class: 'RatioPruner'
......
......@@ -18,7 +18,7 @@ import unittest
class TestFactory(unittest.TestCase):
def test_parse(self):
factory = ConfigFactory('./unitest/configs/config.yaml')
factory = ConfigFactory('./configs/config.yaml')
pruner = factory.instance('pruner_1')
self.assertEquals(pruner.ratios['conv1_1.w'], 0.3)
......
# copyright (c) 2018 paddlepaddle authors. all rights reserved.
#
# licensed under the apache license, version 2.0 (the "license");
# you may not use this file except in compliance with the license.
# you may obtain a copy of the license at
#
# http://www.apache.org/licenses/license-2.0
#
# unless required by applicable law or agreed to in writing, software
# distributed under the license is distributed on an "as is" basis,
# without warranties or conditions of any kind, either express or implied.
# see the license for the specific language governing permissions and
# limitations under the license.
from __future__ import print_function
import unittest
import paddle.fluid as fluid
import six
from paddle.fluid.framework import IrGraph
from paddle.fluid import core
def residual_block(num):
def conv_bn_layer(input,
ch_out,
filter_size,
stride,
padding,
act='relu',
bias_attr=False):
tmp = fluid.layers.conv2d(
input=input,
filter_size=filter_size,
num_filters=ch_out,
stride=stride,
padding=padding,
act=None,
bias_attr=bias_attr)
return fluid.layers.batch_norm(input=tmp, act=act)
data = fluid.layers.data(name='image', shape=[1, 32, 32], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
hidden = data
for _ in six.moves.xrange(num):
conv = conv_bn_layer(hidden, 16, 3, 1, 1, act=None, bias_attr=True)
short = conv_bn_layer(hidden, 16, 1, 1, 0, act=None)
hidden = fluid.layers.elementwise_add(x=conv, y=short, act='relu')
fc = fluid.layers.fc(input=hidden, size=10)
loss = fluid.layers.cross_entropy(input=fc, label=label)
loss = fluid.layers.mean(loss)
return loss
class TestGraph(unittest.TestCase):
def test_graph_functions(self):
main = fluid.Program()
startup = fluid.Program()
with fluid.program_guard(main, startup):
loss = residual_block(2)
opt = fluid.optimizer.Adam(learning_rate=0.001)
opt.minimize(loss)
graph = IrGraph(core.Graph(main.desc), for_test=False)
marked_nodes = set()
for op in graph.all_ops():
if op.name().find('conv2d') > -1:
marked_nodes.add(op)
graph.draw('.', 'residual', marked_nodes)
self.assertFalse(graph.has_circle())
self.assertEqual(graph.graph_num(), 1)
nodes = graph.topology_sort()
self.assertEqual(len(nodes), len(graph.all_ops()))
nodes_map = graph.build_adjacency_list()
self.assertEqual(len(nodes_map), len(graph.all_ops()))
nodes_num = len(graph.all_nodes())
graph.safe_remove_nodes(marked_nodes)
self.assertEqual(len(graph.all_nodes()), nodes_num - len(marked_nodes))
if __name__ == '__main__':
unittest.main()
......@@ -17,9 +17,12 @@ import random
import numpy as np
import paddle.fluid as fluid
import six
from paddle.fluid.framework import Program
import paddle
from paddle.fluid.framework import IrGraph
from paddle.fluid.contrib.slim.quantization import QuantizationTransformPass
from paddle.fluid.contrib.slim.quantization import QuantizationFreezePass
from paddle.fluid.contrib.slim.quantization import ConvertToInt8Pass
from paddle.fluid.contrib.slim.quantization import TransformForMobilePass
from paddle.fluid import core
......@@ -65,6 +68,28 @@ def residual_block(num):
return loss
def conv_net(img, label):
conv_pool_1 = fluid.nets.simple_img_conv_pool(
input=img,
filter_size=5,
num_filters=20,
pool_size=2,
pool_stride=2,
act="relu")
conv_pool_1 = fluid.layers.batch_norm(conv_pool_1)
conv_pool_2 = fluid.nets.simple_img_conv_pool(
input=conv_pool_1,
filter_size=5,
num_filters=50,
pool_size=2,
pool_stride=2,
act="relu")
prediction = fluid.layers.fc(input=conv_pool_2, size=10, act='softmax')
loss = fluid.layers.cross_entropy(input=prediction, label=label)
avg_loss = fluid.layers.mean(loss)
return avg_loss
class TestQuantizationTransformPass(unittest.TestCase):
def setUp(self):
self.quantizable_op_and_inputs = {
......@@ -171,5 +196,177 @@ class TestQuantizationTransformPass(unittest.TestCase):
self.residual_block_quant('range_abs_max')
class TestQuantizationFreezePass(unittest.TestCase):
def freeze_graph(self, use_cuda, seed, quant_type):
def build_program(main, startup, is_test):
main.random_seed = seed
startup.random_seed = seed
with fluid.unique_name.guard():
with fluid.program_guard(main, startup):
img = fluid.layers.data(
name='image', shape=[1, 28, 28], dtype='float32')
label = fluid.layers.data(
name='label', shape=[1], dtype='int64')
loss = conv_net(img, label)
if not is_test:
opt = fluid.optimizer.Adam(learning_rate=0.001)
opt.minimize(loss)
return [img, label], loss
random.seed(0)
np.random.seed(0)
main = fluid.Program()
startup = fluid.Program()
test_program = fluid.Program()
feeds, loss = build_program(main, startup, False)
build_program(test_program, startup, True)
test_program = test_program.clone(for_test=True)
main_graph = IrGraph(core.Graph(main.desc), for_test=False)
test_graph = IrGraph(core.Graph(test_program.desc), for_test=True)
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
scope = fluid.Scope()
with fluid.scope_guard(scope):
exe.run(startup)
transform_pass = QuantizationTransformPass(
scope=scope, program_exe=exe, activation_quantize_type=quant_type)
transform_pass.apply(main_graph)
transform_pass.apply(test_graph)
dev_name = '_gpu_' if use_cuda else '_cpu_'
marked_nodes = set()
for op in main_graph.all_ops():
if op.name().find('quantize') > -1:
marked_nodes.add(op)
main_graph.draw('.', 'main' + dev_name + quant_type, marked_nodes)
marked_nodes = set()
for op in test_graph.all_ops():
if op.name().find('quantize') > -1:
marked_nodes.add(op)
test_graph.draw('.', 'test' + dev_name + quant_type, marked_nodes)
quantized_main_program = main_graph.to_program()
quantized_test_program = test_graph.to_program()
iters = 5
batch_size = 8
#train_exe = fluid.ParallelExecutor(
# main_program=quantized_main_program,
# use_cuda=bool(use_cuda),
# loss_name=loss.name,
# scope=scope)
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.mnist.train(), buf_size=500),
batch_size=batch_size)
test_reader = paddle.batch(
paddle.dataset.mnist.test(), batch_size=batch_size)
feeder = fluid.DataFeeder(feed_list=feeds, place=place)
with fluid.scope_guard(scope):
for _ in range(iters):
data = next(train_reader())
loss_v = exe.run(program=quantized_main_program,
feed=feeder.feed(data),
fetch_list=[loss])
#loss_v = train_exe.run(feed=feeder.feed(data),
# fetch_list=[loss.name])
#print('{}: {}'.format('loss' + dev_name + quant_type, loss_v))
test_data = next(test_reader())
with fluid.program_guard(quantized_test_program):
w_var = fluid.framework._get_var('conv2d_1.w_0.quantized',
quantized_test_program)
# Testing
with fluid.scope_guard(scope):
test_loss1, w_quant = exe.run(program=quantized_test_program,
feed=feeder.feed(test_data),
fetch_list=[loss, w_var])
# Freeze graph for inference, but the weight of fc/conv is still float type.
freeze_pass = QuantizationFreezePass(scope=scope, place=place)
freeze_pass.apply(test_graph)
marked_nodes = set()
for op in test_graph.all_ops():
if op.name().find('quantize') > -1:
marked_nodes.add(op)
test_graph.draw('.', 'test_freeze' + dev_name + quant_type,
marked_nodes)
server_program = test_graph.to_program()
with fluid.scope_guard(scope):
test_loss2, = exe.run(program=server_program,
feed=feeder.feed(test_data),
fetch_list=[loss])
self.assertAlmostEqual(test_loss1, test_loss2, delta=5e-3)
#print('{}: {}'.format('test_loss1' + dev_name + quant_type, test_loss1))
#print('{}: {}'.format('test_loss2' + dev_name + quant_type, test_loss2))
w_freeze = np.array(scope.find_var('conv2d_1.w_0').get_tensor())
# Maybe failed, this is due to the calculation precision
# self.assertAlmostEqual(np.sum(w_freeze), np.sum(w_quant))
#print('{}: {}'.format('w_freeze' + dev_name + quant_type,
# np.sum(w_freeze)))
#print('{}: {}'.format('w_quant' + dev_name + quant_type,
# np.sum(w_quant)))
# Convert parameter to 8-bit.
convert_int8_pass = ConvertToInt8Pass(scope=scope, place=place)
convert_int8_pass.apply(test_graph)
marked_nodes = set()
for op in test_graph.all_ops():
if op.name().find('quantize') > -1:
marked_nodes.add(op)
test_graph.draw('.', 'test_int8' + dev_name + quant_type, marked_nodes)
server_program_int8 = test_graph.to_program()
# Save the 8-bit parameter and model file.
with fluid.scope_guard(scope):
fluid.io.save_inference_model('server_int8' + dev_name + quant_type,
['image', 'label'], [loss], exe,
server_program_int8)
# Test whether the 8-bit parameter and model file can be loaded successfully.
[infer, feed, fetch] = fluid.io.load_inference_model(
'server_int8' + dev_name + quant_type, exe)
# Check the loaded 8-bit weight.
w_8bit = np.array(scope.find_var('conv2d_1.w_0.int8').get_tensor())
self.assertEqual(w_8bit.dtype, np.int8)
self.assertEqual(np.sum(w_8bit), np.sum(w_freeze))
#print('{}: {}'.format('w_8bit' + dev_name + quant_type, np.sum(w_8bit)))
#print('{}: {}'.format('w_freeze' + dev_name + quant_type,
# np.sum(w_freeze)))
mobile_pass = TransformForMobilePass()
mobile_pass.apply(test_graph)
marked_nodes = set()
for op in test_graph.all_ops():
if op.name().find('quantize') > -1:
marked_nodes.add(op)
test_graph.draw('.', 'test_mobile' + dev_name + quant_type,
marked_nodes)
mobile_program = test_graph.to_program()
with fluid.scope_guard(scope):
fluid.io.save_inference_model('mobile_int8' + dev_name + quant_type,
['image', 'label'], [loss], exe,
mobile_program)
def test_freeze_graph_cuda_dynamic(self):
if fluid.core.is_compiled_with_cuda():
with fluid.unique_name.guard():
self.freeze_graph(True, seed=1, quant_type='abs_max')
def test_freeze_graph_cpu_dynamic(self):
with fluid.unique_name.guard():
self.freeze_graph(False, seed=2, quant_type='abs_max')
def test_freeze_graph_cuda_static(self):
if fluid.core.is_compiled_with_cuda():
with fluid.unique_name.guard():
self.freeze_graph(True, seed=1, quant_type='range_abs_max')
def test_freeze_graph_cpu_static(self):
with fluid.unique_name.guard():
self.freeze_graph(False, seed=2, quant_type='range_abs_max')
if __name__ == '__main__':
unittest.main()
......@@ -6,5 +6,9 @@ if(APPLE OR WIN32 OR NOT WITH_MKL)
endif()
foreach(src ${TEST_OPS})
py_test(${src} SRCS ${src}.py)
if(src MATCHES "test_calibration")
py_test(${src} SRCS ${src}.py ENVS FLAGS_use_mkldnn=true)
else()
py_test(${src} SRCS ${src}.py)
endif()
endforeach()
......@@ -199,7 +199,6 @@ class TestCalibrationForResnet50(unittest.TestCase):
def run_program(self, model_path, generate_int8=False, algo='direct'):
image_shape = [3, 224, 224]
os.environ['FLAGS_use_mkldnn'] = 'True'
fluid.memory_optimize(fluid.default_main_program())
......@@ -241,9 +240,6 @@ class TestCalibrationForResnet50(unittest.TestCase):
label = label.reshape([-1, 1])
running_program = calibrator.sampling_program.clone(
) if generate_int8 else infer_program.clone()
for op in running_program.current_block().ops:
if op.has_attr("use_mkldnn"):
op._set_attr("use_mkldnn", True)
t1 = time.time()
_, acc1, _ = exe.run(
......
......@@ -204,9 +204,11 @@ class TestQuantizeTranspiler(unittest.TestCase):
build_program(test_program, startup, True)
test_program = test_program.clone(for_test=True)
quant_transpiler = QuantizeTranspiler()
quant_transpiler.training_transpile(main)
quant_transpiler.training_transpile(test_program)
quant_type = 'range_abs_max' # 'range_abs_max' or 'abs_max'
quant_transpiler = QuantizeTranspiler(
activation_quantize_type=quant_type)
quant_transpiler.training_transpile(main, startup)
quant_transpiler.training_transpile(test_program, startup)
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
......
......@@ -16,6 +16,8 @@ from __future__ import print_function
import collections
from collections import defaultdict
from collections import Iterable
import contextlib
from .wrapped_decorator import signature_safe_contextmanager
import os
import re
......@@ -555,7 +557,8 @@ class OpProtoHolder(object):
return {
core.op_proto_and_checker_maker.kOpRoleAttrName(),
core.op_proto_and_checker_maker.kOpRoleVarAttrName(),
core.op_proto_and_checker_maker.kOpNameScopeAttrName()
core.op_proto_and_checker_maker.kOpNameScopeAttrName(),
core.op_proto_and_checker_maker.kOpCreationCallstackAttrName()
}
......@@ -1529,12 +1532,16 @@ class Block(object):
class IrGraph(object):
"""
IrGraph uses core.Graph as the delegation to accomplish the manipulation.
Python IrGraph. Beneath it is a core.Graph, which is used to
create a C++ IR pass graph. An IrGraph is just a graph view of
a Program. In an IrGraph, both Variables and Operators are graph
nodes.
"""
def __init__(self, graph, for_test=False):
"""
Construct the IrGraph using core.Graph.
Construct an IrGraph using core.Graph.
Args:
graph(core.Graph): C++ Graph.
for_test(bool): True for the test graph and false for the train graph.
......@@ -1545,23 +1552,81 @@ class IrGraph(object):
self._for_test = for_test
def is_test(self):
"""
If the graph is used for testing, the function returns true. Otherwise, returns false.
"""
return self._for_test
def all_parameters(self):
param_nodes = set()
for node in self.graph.nodes():
if node.is_var() and node.var() is not None and node.var(
).persistable():
param_nodes.add(node)
return param_nodes
def all_nodes(self):
"""
Return all nodes included in the graph as a set.
"""
return {node for node in self.graph.nodes()}
def all_vars(self):
"""
Return all variable nodes included in the graph as a set.
"""
return {node for node in self.graph.nodes() if node.is_var()}
def all_persistable_vars(self):
"""
Return all persistable variable nodes included in the graph as a set.
"""
persistable_nodes = set()
for node in self.graph.nodes():
if node.is_var() and node.var() is not None and node.var(
).persistable():
persistable_nodes.add(node)
return persistable_nodes
def all_ops(self):
"""
Return all operator nodes included in the graph as a set.
"""
return {node for node in self.graph.nodes() if node.is_op()}
def var_node(self, name):
"""
Get a variable node by name from the graph.
Args:
name(str): the name of the variable node.
Raises:
ValueError: If the input's type is not str, or this graph
doesn't have a variable with the given name.
Returns:
core.Node: the variable node with the given name.
"""
if not isinstance(name, six.string_types):
raise TypeError(
"var require string as parameter, but get %s instead." %
(type(name)))
target_var_node = None
var_nodes = self.all_vars()
for var_node in var_nodes:
if var_node.name() == name:
target_var_node = var_node
if target_var_node is None:
raise ValueError("var_node %s not in this graph" % name)
return target_var_node
def create_param_node(self, name, var_type, shape, var_dtype):
"""
Create a persistable variable node in the graph. In IrGraph,
persistable variables and parameters are not distinguished.
Args:
name(str): the name of the persistable variable node.
var_type(core.VarDesc.VarType): the type of the persistable variable node.
shape(list): the shape of the persistable variable node.
var_dtype(core.VarDesc.VarType): the data type of the persistable variable node.
Returns:
core.Node: the created persistable variable node.
"""
var_desc = core.VarDesc(name)
var_desc.set_type(var_type)
var_desc.set_shape(shape)
......@@ -1570,6 +1635,20 @@ class IrGraph(object):
return self.graph.create_var_node(var_desc)
def create_var_node(self, name, var_type, shape, var_dtype):
"""
Create a variable node in the graph. The created variable node is
not persistable.
Args:
name(str): the name of the variable node.
var_type(core.VarDesc.VarType): the type of the variable node.
shape(list): the shape of the variable node.
var_dtype(core.VarDesc.VarType): the data type of the variable node.
Returns:
core.Node: the created variable node.
"""
var_desc = core.VarDesc(name)
var_desc.set_type(var_type)
var_desc.set_shape(shape)
......@@ -1577,19 +1656,41 @@ class IrGraph(object):
return self.graph.create_var_node(var_desc)
def create_var_node_from_desc(self, var_desc):
"""
Create a variable node by using an existing VarDesc in the graph.
Depending on the given VarDesc, the created variable node may be persistable.
Args:
var_desc(core.VarDesc): the given variable description.
Returns:
core.Node: the created variable node.
"""
return self.graph.create_var_node(var_desc)
def create_op_node(self, op_type, attrs, inputs, outputs):
"""
Create an operator node in the graph.
Args:
op_type(str): the type of the operator node.
attrs(dict): the attributes of the operator node.
inputs(dict): the inputs of the operator node.
outputs(dict): the outputs of the operator node.
Returns:
core.Node: the created operator node.
"""
op_desc = core.OpDesc()
op_desc.set_type(op_type)
for attr, value in attrs.iteritems():
for attr, value in six.iteritems(attrs):
self._update_desc_attr(op_desc, attr, value)
for input_name, var_nodes in inputs.iteritems():
for input_name, var_nodes in six.iteritems(inputs):
if not isinstance(var_nodes, list):
var_nodes = [var_nodes]
op_desc.set_input(input_name,
[var_node.name() for var_node in var_nodes])
for output_name, var_nodes in outputs.iteritems():
for output_name, var_nodes in six.iteritems(outputs):
if not isinstance(var_nodes, list):
var_nodes = [var_nodes]
op_desc.set_output(output_name,
......@@ -1597,11 +1698,29 @@ class IrGraph(object):
return self.graph.create_op_node(op_desc)
def create_op_node_from_desc(self, op_desc):
"""
Create an operator node by using an existing OpDesc in the graph.
Args:
op_desc(core.OpDesc): the given operator description.
Returns:
core.Node: the created operator node.
"""
return self.graph.create_op_node(op_desc)
def update_input_link(self, old_input_node, new_input_node, op_node):
assert old_input_node in self.graph.nodes() and new_input_node in self.graph.nodes() and \
op_node in self.graph.nodes(), 'Th three arguments must be in the graph nodes.'
"""
Update the input link of an operator node.
Args:
old_input_node(core.Node): the old input node of the given op_node.
new_input_node(core.Node): the new input node of the given op_node.
op_node(core.Node): the operator node whose input link needs to be updated.
"""
assert old_input_node in self.graph.nodes() and new_input_node in \
self.graph.nodes() and op_node in self.graph.nodes(), \
'The three arguments(old_input_node&new_input_node&op_node) must be in the graph nodes.'
old_input_node.outputs_remove(op_node)
op_node.inputs_remove(old_input_node)
new_input_node.outputs_append(op_node)
......@@ -1609,17 +1728,85 @@ class IrGraph(object):
op_node.op()._rename_input(old_input_node.name(), new_input_node.name())
def link_to(self, node_in, node_out):
"""
Connect two nodes.
Args:
node_in(core.Node): the input node.
node_out(core.Node): the output node.
"""
assert node_in in self.graph.nodes() and node_out in self.graph.nodes(), \
'Th two arguments must be in the graph nodes.'
'The two arguments(node_in&node_out) must be in the graph nodes.'
node_in.outputs_append(node_out)
node_out.inputs_append(node_in)
def safe_remove_nodes(self, remove_nodes):
"""
Remove nodes safely since links connected to these removed nodes are
also removed.
Args:
remove_nodes(set): the nodes prepared to be removed.
"""
if not isinstance(remove_nodes, set):
remove_nodes = set(remove_nodes)
if isinstance(remove_nodes, Iterable):
remove_nodes = set(remove_nodes)
else:
remove_nodes = {remove_nodes}
core.graph_safe_remove_nodes(self.graph, remove_nodes)
def draw(self, save_path, name, marked_nodes=None):
def has_circle(self):
"""
Check if the graph has a circle.
Returns:
bool: True if the graph has a circle else False.
"""
return core.has_circle(self.graph)
def graph_num(self):
"""
Count the number of unconnected graphs in this graph.
Returns:
int: the number of unconnected graphs.
"""
return core.graph_num(self.graph)
def topology_sort(self):
"""
Perform the topology sort operation on the graph.
Notes: the `graph` cannot contain a circle.
Returns:
set(core.Node): nodes in topology order.
"""
return core.topology_sort(self.graph)
def build_adjacency_list(self):
"""
Build an adjacency list of operations for the `graph`.
Returns:
dict{core.Node: set(core.Node)}: the adjacency list.
"""
return core.build_adjacency_list(self.graph)
def draw(self, save_path, name, marked_nodes=None, remove_ctr_var=True):
"""
Draw the graph. If `dot` command is installed, the drawn graph
will be saved as pdf file type, otherwise dot file type is used.
Args:
save_path(str): the save path of drawn graph.
name(str): the name of drawn graph.
marked_nodes(set(core.Node)): nodes that need to be marked.
Default value is None.
remove_ctr_var(bool): If it is set True, all control variable nodes
in the graph will be removed. Default value is True.
"""
def _convert_to_pdf(dot_file_path):
pdf_save_path = os.path.splitext(dot_file_path)[0] + '.pdf'
exited_code = subprocess.call('dot -Tpdf ' + dot_file_path \
......@@ -1629,15 +1816,17 @@ class IrGraph(object):
print('The {} is saved as the dot filetype.'.format(
dot_file_path))
remove_ctr_vars = set()
if remove_ctr_var:
remove_ctr_vars = set()
for node in self.graph.nodes():
if node.is_ctrl_var():
remove_ctr_vars.add(node)
self.safe_remove_nodes(remove_ctr_vars)
ops_num = 0
for node in self.graph.nodes():
if node.is_ctrl_var():
remove_ctr_vars.add(node)
elif node.is_op():
if node.is_op():
ops_num += 1
print('Total ops num = {}.'.format(ops_num))
self.safe_remove_nodes(remove_ctr_vars)
if marked_nodes is not None:
if not isinstance(marked_nodes, set):
marked_nodes = set(marked_nodes)
......@@ -1652,10 +1841,20 @@ class IrGraph(object):
_convert_to_pdf(viz_dot_path)
def to_program(self):
"""
Convert the graph into a Program.
Notes: When the graph includes backward operator nodes, the
conversion process may fail. Usually, this function is
only used to convert a test graph.
Returns:
Program: a program converted from the graph.
"""
convert_pass = core.get_pass('graph_to_program_pass')
convert_pass.set('program', Program().desc)
desc = core.ProgramDesc()
convert_pass.set_not_owned('program', desc)
convert_pass.apply(self.graph)
desc = convert_pass.get_program('program')
program = Program._construct_from_desc(desc)
return program
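A minimal sketch of the updated to_program flow (a forward-only graph built from a small program; the layer names are illustrative):

```python
import paddle.fluid as fluid
from paddle.fluid import core
from paddle.fluid.framework import IrGraph

main = fluid.Program()
startup = fluid.Program()
with fluid.program_guard(main, startup):
    x = fluid.layers.data(name='x', shape=[8], dtype='float32')
    y = fluid.layers.fc(input=x, size=2)

# Build a test graph, optionally run IR passes on it, then convert it
# back into a Program; graph_to_program_pass now fills a ProgramDesc
# owned by the caller via set_not_owned.
graph = IrGraph(core.Graph(main.desc), for_test=True)
inference_program = graph.to_program()
```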
......
......@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import contextlib
import sys
import numpy as np
......@@ -30,31 +31,45 @@ class Layer(core.Layer):
def __init__(self, dtype=core.VarDesc.VarType.FP32, name=None):
self._built = False
self._dtype = dtype
self._parameters = collections.OrderedDict()
self._sub_layers = collections.OrderedDict()
def parameters(self, include_sublayers=True):
"""Returns a list of Parameters from current and sub-layers.
Args:
include_sublayers: If true, also include the parameters from
sublayers.
Returns a list of Parameters.
"""
ret = [p for p in self._parameters.values()]
if include_sublayers:
for l in self._sub_layers.values():
for p in l.parameters(include_sublayers):
ret.append(p)
return ret
def sublayers(self, include_sublayers=True):
"""Returns a list of sub layers.
def parameters(self):
params = []
for key in self.__dict__.keys():
value = self.__dict__[key]
if isinstance(value, framework.Parameter):
params.append(value)
elif isinstance(value, core.Layer):
params.extend(value.parameters())
elif isinstance(value, collections.Container):
if len(value) == 0:
continue
if isinstance(value[0], framework.Parameter):
params.extend(value)
elif isinstance(value[0], core.Layer):
for v in value:
params.extend(v.parameters())
return params
Args:
include_sublayers: If true, also include the layers from sublayers.
Returns a list of sub layers.
"""
ret = [l for l in self._sub_layers.values()]
if include_sublayers:
for l in self._sub_layers.values():
for sub_l in l.sublayers(include_sublayers):
ret.append(sub_l)
return ret
def clear_gradients(self):
for p in self.parameters():
p._clear_gradient()
def _build_once(self, inputs):
def _build_once(self, *args):
pass
def __call__(self, *inputs):
......@@ -71,6 +86,66 @@ class Layer(core.Layer):
def backward(self, *inputs):
raise ValueError("Layer shouldn't implement backward")
def add_sublayer(self, name, sublayer):
"""Adds a sub Layer instance.
The added sublayer can be accessed like self.name.
Args:
name: name of this sublayer.
sublayer: an instance of Layer.
Returns:
the sublayer passed in.
"""
assert isinstance(sublayer, core.Layer)
self._sub_layers[name] = sublayer
return sublayer
def add_parameter(self, name, parameter):
"""Adds a Parameter instance.
The added parameter can be accessed like self.name.
Args:
name: name of this parameter.
parameter: an instance of Parameter.
Returns:
the parameter passed in.
"""
assert isinstance(parameter, framework.Parameter)
self._parameters[name] = parameter
return parameter
def __getattr__(self, name):
if name in self._parameters:
return self._parameters[name]
elif name in self._sub_layers:
return self._sub_layers[name]
def __setattr__(self, name, value):
if isinstance(value, framework.Parameter):
params = self.__dict__.get('_parameters', None)
if params is None:
raise ValueError(
"super(YourLayer, self).__init__() should be called first")
params[name] = value
elif isinstance(value, core.Layer):
layers = self.__dict__.get('_sub_layers', None)
if layers is None:
raise ValueError(
"super(YourLayer, self).__init__() should be called first")
layers[name] = value
else:
object.__setattr__(self, name, value)
def __delattr__(self, name):
if name in self._parameters:
del self._parameters[name]
elif name in self._sub_layers:
del self._sub_layers[name]
else:
object.__delattr__(self, name)
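A hedged sketch of how the hooks above fit together: assigning a Parameter or Layer attribute after super().__init__() registers it via __setattr__, so parameters() and sublayers() can collect it; add_sublayer/add_parameter do the same explicitly. MyModel and the FC sizes are illustrative, not part of this patch:
import paddle.fluid as fluid
from paddle.fluid.imperative.nn import FC
class MyModel(fluid.imperative.Layer):
    def __init__(self):
        super(MyModel, self).__init__()          # must run first so _parameters/_sub_layers exist
        self._fc1 = FC(size=10)                  # stored in self._sub_layers by __setattr__
        self._fc2 = self.add_sublayer('fc2', FC(size=1))
    def forward(self, inputs):
        return self._fc2(self._fc1(inputs))
with fluid.imperative.guard():
    model = MyModel()
    print(len(model.sublayers()))                # 2
    # FC creates its weights lazily in _build_once, so parameters() is only
    # fully populated after the first forward call.
    print(len(model.parameters()))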
class PyLayer(core.PyLayer):
"""Layers composed of user-defined python codes."""
......
......@@ -225,9 +225,6 @@ class FC(layers.Layer):
act=act,
name=name)
def parameters(self):
return [self._w, self._b]
def _build_once(self, input):
input_shape = input.shape
param_shape = [
......@@ -478,9 +475,6 @@ class Embedding(layers.Layer):
dtype=self._dtype,
is_bias=False)
def parameters(self):
return [self._w]
def forward(self, input):
out = self._helper.create_variable_for_type_inference(self._dtype)
self._helper.append_op(
......
......@@ -506,9 +506,9 @@ class While(object):
while loop control flow.
Args:
cond (Variable): condition used to compare.
cond(Variable): condition used to compare.
is_test(bool): A flag indicating whether execution is in test phase.
name (str): The name of this layer.
name(str): The name of this layer.
Examples:
.. code-block:: python
......@@ -589,7 +589,8 @@ class While(object):
def lod_rank_table(x, level=0):
"""LoD Rank Table Operator. Given an input variable **x** and a level number
"""
LoD Rank Table Operator. Given an input variable **x** and a level number
of LoD, this layer creates a LodRankTable object. A LoDRankTable object
contains a list of bi-element tuples. Each tuple consists of an index and
a length, both of which are int type. Referring to the specified level of LoD,
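A small hedged sketch of building a LoDRankTable from a level-1 sequence input (the variable name x is illustrative):
import paddle.fluid as fluid
x = fluid.layers.data(name='x', shape=[10], dtype='float32', lod_level=1)
table = fluid.layers.lod_rank_table(x=x, level=0)   # (index, length) tuples for level-0 sequences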
......@@ -883,10 +884,8 @@ def less_than(x, y, force_cpu=None, cond=None, **ignored):
return cond
def equal(x, y, cond=None, **ignored):
def equal(x, y, cond=None):
"""
**equal**
This layer returns the truth value of :math:`x == y` elementwise.
Args:
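For instance, with the new (x, y, cond) signature the layer can be used as in this hedged sketch (variable names are illustrative):
import paddle.fluid as fluid
x = fluid.layers.data(name='x', shape=[1], dtype='int64')
y = fluid.layers.data(name='y', shape=[1], dtype='int64')
out = fluid.layers.equal(x=x, y=y)   # element-wise x == y, returned as a bool Variable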
......@@ -1458,7 +1457,6 @@ class DynamicRNN(object):
Returns:
The current timestep in the input sequence.
"""
self._assert_in_rnn_block_("step_input")
if not isinstance(x, Variable):
......@@ -1535,8 +1533,7 @@ class DynamicRNN(object):
@signature_safe_contextmanager
def block(self):
"""
The block for user to define operators in RNN. See the class docstring
for more details.
The block for user to define operators in RNN.
"""
if self.status != DynamicRNN.BEFORE_RNN:
raise ValueError("rnn.block() can only be invoke once")
......@@ -1640,8 +1637,7 @@ class DynamicRNN(object):
dtype(str|numpy.dtype): The data type of the initialized memory.
Returns:
the memory variable.
The memory variable.
"""
self._assert_in_rnn_block_('memory')
self._init_zero_idx_()
......@@ -1740,7 +1736,7 @@ class DynamicRNN(object):
def output(self, *outputs):
"""
mark the RNN output variables.
Mark the RNN output variables.
Args:
outputs: The output variables.
......
......@@ -56,7 +56,10 @@ def data(name,
Args:
name(str): The name/alias of the function
shape(list): Tuple declaring the shape.
shape(list): Tuple declaring the shape. If :code:`append_batch_size` is
True and there is no -1 inside :code:`shape`, it should be
considered as the shape of each sample. Otherwise, it
should be considered as the shape of the batched data.
append_batch_size(bool):
1. If true, it prepends -1 to the shape.
For example if shape=[1], the resulting shape is [-1, 1].
......
......@@ -24,7 +24,7 @@ from ..framework import OpProtoHolder, Variable, core, convert_np_dtype_to_dtype
from ..layer_helper import LayerHelper
__all__ = [
'deprecated', 'generate_layer_fn', 'generate_layer_fn_noattr', 'autodoc',
'deprecated', 'generate_layer_fn', 'generate_activation_fn', 'autodoc',
'templatedoc'
]
......@@ -89,6 +89,9 @@ def _generate_doc_string_(op_proto, additional_args_lines=None):
buf.write('\n')
skip_attrs = OpProtoHolder.generated_op_attr_names()
# The attrs use_mkldnn and is_test should also not be visible to users.
skip_attrs.add("use_mkldnn")
skip_attrs.add("is_test")
for each_attr in op_proto.attrs:
if each_attr.name in skip_attrs:
......@@ -226,7 +229,7 @@ def generate_layer_fn(op_type):
return func
def generate_layer_fn_noattr(op_type):
def generate_activation_fn(op_type):
"""Register the Python layer for an Operator without Attribute.
Args:
......@@ -246,6 +249,7 @@ def generate_layer_fn_noattr(op_type):
func.__name__ = op_type
func.__doc__ = _generate_doc_string_(op_proto)
return func
......
......@@ -3236,7 +3236,7 @@ def group_norm(input,
# create output
mean_out = helper.create_variable(dtype=dtype, stop_gradient=True)
variance_out = helper.create_variable(dtype=dtype, stop_gradient=True)
group_norm_out = helper.create_variable(dtype)
group_norm_out = helper.create_variable(dtype=dtype)
helper.append_op(
type="group_norm",
......@@ -5936,13 +5936,10 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=False, name=None):
than :attr:`shape`.
act (str): The non-linear activation to be applied to the reshaped tensor
variable.
inplace(bool): Must use :attr:`False` if :attr:`x` is used in multiple
operators. If this flag is set :attr:`True`, reuse input
:attr:`x` to reshape, which will change the shape of
tensor variable :attr:`x` and might cause errors when
:attr:`x` is used in multiple operators. If :attr:`False`,
preserve the shape :attr:`x` and create a new output tensor
variable whose data is copied from input x but reshaped.
inplace(bool): If ``inplace`` is `True`, the input and output of ``layers.reshape``
are the same variable, otherwise, the input and output of
``layers.reshape`` are different variables. Note that if :attr:`x`
is more than one layer's input, ``inplace`` must be :attr:`False`.
name (str): The name of this layer. It is optional.
Returns:
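A hedged sketch of the safe (inplace=False) mode described above; with inplace=True the returned variable would be x itself, which is only safe when x feeds a single operator:
import paddle.fluid as fluid
x = fluid.layers.data(name='x', shape=[2, 4, 6], dtype='float32')
# inplace=False: a new output Variable is created and x keeps its original shape.
y = fluid.layers.reshape(x=x, shape=[-1, 12], inplace=False)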
......@@ -8335,6 +8332,46 @@ def stack(x, axis=0):
If :code:`axis` < 0, it would be replaced with :code:`axis+rank(x[0])+1`.
If :code:`axis` is None, it would be replaced with 0.
For Example:
.. code-block:: text
Case 1:
Input:
x[0].data = [ [1.0 , 2.0 ] ]
x[0].dims = [1, 2]
x[1].data = [ [3.0 , 4.0 ] ]
x[1].dims = [1, 2]
x[2].data = [ [5.0 , 6.0 ] ]
x[2].dims = [1, 2]
Attrs:
axis = 0
Output:
Out.data =[ [ [1.0, 2.0] ],
[ [3.0, 4.0] ],
[ [5.0, 6.0] ] ]
Out.dims = [3, 1, 2]
Case 2:
Given
x[0].data = [ [1.0 , 2.0 ] ]
x[0].dims = [1, 2]
x[1].data = [ [3.0 , 4.0 ] ]
x[1].dims = [1, 2]
x[2].data = [ [5.0 , 6.0 ] ]
x[2].dims = [1, 2]
Attrs:
axis = 1 or axis = -2
Output:
Out.data =[ [ [1.0, 2.0]
[3.0, 4.0]
[5.0, 6.0] ] ]
Out.dims = [1, 3, 2]
Args:
x (Variable|list(Variable)|tuple(Variable)): Input variables.
axis (int|None): The axis along which all inputs are stacked.
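A hedged runnable check of Case 1 above (executor on CPU, names are illustrative):
import numpy as np
import paddle.fluid as fluid
x0 = fluid.layers.assign(np.array([[1.0, 2.0]], dtype='float32'))
x1 = fluid.layers.assign(np.array([[3.0, 4.0]], dtype='float32'))
x2 = fluid.layers.assign(np.array([[5.0, 6.0]], dtype='float32'))
out = fluid.layers.stack([x0, x1, x2], axis=0)   # expected shape [3, 1, 2], as in Case 1
exe = fluid.Executor(fluid.CPUPlace())
res = exe.run(fluid.default_main_program(), fetch_list=[out])
print(res[0].shape)   # (3, 1, 2)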
......
......@@ -14,7 +14,7 @@
from __future__ import print_function
import os
from .layer_function_generator import generate_layer_fn, generate_layer_fn_noattr
from .layer_function_generator import generate_layer_fn, generate_activation_fn
from .. import core
from ..framework import convert_np_dtype_to_dtype_
......@@ -53,7 +53,7 @@ globals()['_elementwise_div'] = generate_layer_fn('elementwise_div')
__all__ += __activations_noattr__
for _OP in set(__activations_noattr__):
globals()[_OP] = generate_layer_fn_noattr(_OP)
globals()[_OP] = generate_activation_fn(_OP)
__all__ += ["uniform_random"]
......
......@@ -567,7 +567,7 @@ def ones(shape, dtype, force_cpu=False):
It also sets *stop_gradient* to True.
Args:
shape(tuple|list|None): Shape of output tensor
shape(tuple|list): Shape of output tensor
dtype(np.dtype|core.VarDesc.VarType|str): Data type of output tensor
Returns:
......@@ -578,6 +578,10 @@ def ones(shape, dtype, force_cpu=False):
data = fluid.layers.ones(shape=[1], dtype='int64')
"""
assert isinstance(shape, list) or isinstance(
shape, tuple), "The shape's type should be list or tuple."
assert reduce(lambda x, y: x * y,
shape) > 0, "The shape is invalid: %s." % (str(shape))
return fill_constant(value=1.0, **locals())
......
......@@ -1368,9 +1368,9 @@ class FtrlOptimizer(Optimizer):
Args:
learning_rate (float|Variable): global learning rate.
l1 (float):
l2 (float):
lr_power (float):
l1 (float): L1 regularization strength.
l2 (float): L2 regularization strength.
lr_power (float): Learning Rate Power.
regularization: A Regularizer, such as
fluid.regularizer.L2DecayRegularizer.
name: An optional name prefix.
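For instance, a hedged sketch of constructing the optimizer with the documented arguments (coefficients are illustrative, and avg_loss is assumed to be an existing loss Variable):
import paddle.fluid as fluid
optimizer = fluid.optimizer.FtrlOptimizer(
    learning_rate=0.1,
    l1=0.001,      # L1 regularization strength
    l2=0.001,      # L2 regularization strength
    lr_power=-0.5, # learning rate power
    regularization=fluid.regularizer.L2DecayRegularizer(regularization_coeff=1e-4))
# optimizer.minimize(avg_loss)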
......
......@@ -148,6 +148,8 @@ class ParallelExecutor(object):
else framework.default_main_program()
# FIXME(dzhwinter): enable_inplace should be after memory_optimize
# If Python memory optimize is turned on, turn off the inplace pass.
if build_strategy.memory_optimize is None:
build_strategy.memory_optimize = False if main._is_mem_optimized else True
if build_strategy.enable_inplace is None:
build_strategy.enable_inplace = False if main._is_mem_optimized else True
scope = scope if scope is not None else executor.global_scope()
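A hedged sketch of setting these build-strategy flags explicitly instead of relying on the fallback above (loss is assumed to be an existing loss Variable and the startup program to have been run):
import paddle.fluid as fluid
build_strategy = fluid.BuildStrategy()
build_strategy.memory_optimize = False   # skip the IR memory-optimize pass
build_strategy.enable_inplace = False    # skip the inplace pass as well
pe = fluid.ParallelExecutor(use_cuda=False,
                            loss_name=loss.name,
                            build_strategy=build_strategy)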
......
......@@ -77,6 +77,7 @@ list(REMOVE_ITEM TEST_OPS test_bilinear_interp_op)
list(REMOVE_ITEM TEST_OPS test_nearest_interp_op)
list(REMOVE_ITEM TEST_OPS test_imperative_resnet)
list(REMOVE_ITEM TEST_OPS test_imperative_optimizer)
list(REMOVE_ITEM TEST_OPS test_ir_memory_optimize_transformer)
foreach(TEST_OP ${TEST_OPS})
py_test_modules(${TEST_OP} MODULES ${TEST_OP})
endforeach(TEST_OP)
......@@ -107,6 +108,9 @@ py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SE
py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL)
set_tests_properties(test_parallel_executor_fetch_feed PROPERTIES TIMEOUT 450)
py_test_modules(test_parallel_executor_transformer MODULES test_parallel_executor_transformer SERIAL)
if(NOT WIN32)
py_test_modules(test_ir_memory_optimize_transformer MODULES test_ir_memory_optimize_transformer SERIAL)
endif()
if(NOT APPLE)
py_test_modules(test_image_classification_resnet MODULES test_image_classification_resnet SERIAL)
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
......
......@@ -15,39 +15,7 @@
from __future__ import print_function
import unittest
import numpy as np
from paddle.fluid.tests.unittests.op_test import OpTest
class TestNGRAPHAccuracyOp(OpTest):
def setUp(self):
self.op_type = "accuracy"
self.dtype = np.float32
self.init_dtype()
n = 128
infer = np.random.random((n, 1)).astype(self.dtype)
indices = np.random.randint(0, 2, (n, 1))
label = np.random.randint(0, 2, (n, 1))
self.inputs = {'Out': infer, 'Indices': indices, "Label": label}
num_correct = 0
for rowid in range(n):
for ele in indices[rowid]:
if ele == label[rowid]:
num_correct += 1
break
self.outputs = {
'Accuracy': np.array([num_correct / float(n)]).astype(self.dtype),
'Correct': np.array([num_correct]).astype("int64"),
'Total': np.array([n]).astype("int64")
}
self._cpu_only = True
def init_dtype(self):
pass
def test_check_output(self):
self.check_output()
from paddle.fluid.tests.unittests.test_accuracy_op import TestAccuracyOp
if __name__ == '__main__':
unittest.main()
......@@ -17,21 +17,5 @@ from __future__ import print_function
import unittest
from paddle.fluid.tests.unittests.test_batch_norm_op import TestBatchNormOpTraining, TestBatchNormOpInference
class TestNGRAPHBatchNormOpTraining(TestBatchNormOpTraining):
def init_kernel_type(self):
super(TestNGRAPHBatchNormOpTraining, self).init_kernel_type()
class TestNGRAPHBatchNormOpInference(TestBatchNormOpInference):
def init_kernel_type(self):
super(TestNGRAPHBatchNormOpInference, self).init_kernel_type()
class TestNGRAPHBatchNormOpWithReluInference(TestBatchNormOpInference):
def init_kernel_type(self):
super(TestNGRAPHBatchNormOpWithReluInference, self).init_kernel_type()
if __name__ == '__main__':
unittest.main()
......@@ -17,60 +17,5 @@ from __future__ import print_function
import unittest
from paddle.fluid.tests.unittests.test_conv2d_op import TestConv2dOp, TestWithPad, TestWithStride, TestWithGroup, TestWith1x1, TestWithInput1x1Filter1x1
class TestNGRAPH(TestConv2dOp):
def setUp(self):
super(TestNGRAPH, self).setUp()
self._cpu_only = True
def init_kernel_type(self):
super(TestNGRAPH, self).init_kernel_type()
class TestNGRAPHWithPad(TestWithPad):
def setUp(self):
super(TestNGRAPHWithPad, self).setUp()
self._cpu_only = True
def init_kernel_type(self):
super(TestNGRAPHWithPad, self).init_kernel_type()
class TestNGRAPHWithStride(TestWithStride):
def setUp(self):
super(TestNGRAPHWithStride, self).setUp()
self._cpu_only = True
def init_kernel_type(self):
super(TestNGRAPHWithStride, self).init_kernel_type()
class TestNGRAPHWithGroup(TestWithGroup):
def setUp(self):
super(TestNGRAPHWithGroup, self).setUp()
self._cpu_only = True
def init_kernel_type(self):
super(TestNGRAPHWithGroup, self).init_kernel_type()
class TestNGRAPHWith1x1(TestWith1x1):
def setUp(self):
super(TestNGRAPHWith1x1, self).setUp()
self._cpu_only = True
def init_kernel_type(self):
super(TestNGRAPHWith1x1, self).init_kernel_type()
class TestNGRAPHWithInput1x1Filter1x1(TestWithInput1x1Filter1x1):
def setUp(self):
super(TestNGRAPHWithInput1x1Filter1x1, self).setUp()
self._cpu_only = True
def init_kernel_type(self):
super(TestNGRAPHWithInput1x1Filter1x1, self).init_kernel_type()
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
import paddle.fluid.core as core
from paddle.fluid.tests.unittests.op_test import OpTest, randomize_probability
class TestCrossEntropyOp(OpTest):
"""Test cross-entropy with discrete one-hot labels.
"""
def setUp(self):
self.op_type = "cross_entropy"
self.soft_label = False
self.ignore_index = -100
self.dtype = np.float64
self.batch_size = 30
self.class_num = 10
self._cpu_only = True
self.init_dtype_type()
self.init_attr_type()
self.init_bs_class_num()
self.init_x()
self.init_label()
self.get_cross_entropy()
self.inputs = {"X": self.x, "Label": self.label}
self.outputs = {"Y": self.cross_entropy}
self.attrs = {
"soft_label": self.soft_label,
"ignore_index": self.ignore_index
}
def init_x(self):
self.x = randomize_probability(
self.batch_size, self.class_num, dtype=self.dtype)
def init_label(self):
self.label = np.random.randint(
0, self.class_num, (self.batch_size, 1), dtype="int64")
def get_cross_entropy(self):
self.cross_entropy = np.asmatrix(
[[-np.log(self.x[i][self.label[i][0]])]
for i in range(self.x.shape[0])],
dtype="float64")
def init_attr_type(self):
pass
def init_dtype_type(self):
pass
def init_bs_class_num(self):
pass
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(["X"], "Y", numeric_grad_delta=0.001)
class TestCrossEntropyOp2(TestCrossEntropyOp):
"""Test cross-entropy with vectorized soft labels.
"""
def init_label(self):
self.label = np.random.uniform(
0.1, 1.0, [self.batch_size, self.class_num]).astype(self.dtype)
self.label /= self.label.sum(axis=1, keepdims=True)
def get_cross_entropy(self):
self.cross_entropy = (-self.label * np.log(self.x)).sum(
axis=1, keepdims=True).astype(self.dtype)
def init_attr_type(self):
self.soft_label = True
def init_dtype_type(self):
self.dtype = np.float32
def init_bs_class_num(self):
self.batch_size = 5
self.class_num = 37
def test_check_grad(self):
self.check_grad(
["X"], "Y", max_relative_error=0.05, numeric_grad_delta=0.001)
class TestCrossEntropyOp3(TestCrossEntropyOp):
"""Test cross-entropy with vectorized one-hot representation of labels.
"""
def init_label(self):
self.label_index = np.random.randint(0, self.class_num,
(self.batch_size))
self.label = np.zeros(self.x.shape).astype(self.dtype)
self.label[np.arange(self.batch_size), self.label_index] = 1
def get_cross_entropy(self):
self.cross_entropy = np.asmatrix(
[[-np.log(self.x[i][self.label_index[i]])]
for i in range(self.x.shape[0])]).astype(self.dtype)
def init_attr_type(self):
self.soft_label = True
def init_dtype_type(self):
self.dtype = np.float32
def init_bs_class_num(self):
self.batch_size = 5
self.class_num = 17
def test_check_grad(self):
self.check_grad(
["X"], "Y", max_relative_error=0.05, numeric_grad_delta=0.001)
class TestCrossEntropyOp4(TestCrossEntropyOp):
"""Test high rank tensor cross-entropy with discrete one-hot labels.
"""
def init_x(self):
self.shape = [10, 2, 4]
self.ins_num = np.prod(np.array(self.shape))
self.X_2d = randomize_probability(self.ins_num,
self.class_num).astype(self.dtype)
self.x = self.X_2d.reshape(self.shape + [self.class_num])
def init_label(self):
self.label_2d = np.random.randint(
0, self.class_num, (self.ins_num, 1), dtype="int64")
self.label = self.label_2d.reshape(self.shape + [1])
def get_cross_entropy(self):
cross_entropy_2d = np.asmatrix(
[[-np.log(self.X_2d[i][self.label_2d[i][0]])]
for i in range(self.X_2d.shape[0])]).astype(self.dtype)
self.cross_entropy = np.array(cross_entropy_2d).reshape(self.shape +
[1])
def init_attr_type(self):
self.soft_label = False
def init_dtype_type(self):
self.dtype = np.float64
def init_bs_class_num(self):
self.class_num = 10
class TestCrossEntropyOp5(TestCrossEntropyOp):
"""Test high rank tensor cross-entropy with vectorized soft labels.
"""
def init_x(self):
self.shape = [4, 3]
self.ins_num = np.prod(np.array(self.shape))
self.X_2d = randomize_probability(self.ins_num,
self.class_num).astype(self.dtype)
self.x = self.X_2d.reshape(self.shape + [self.class_num])
def init_label(self):
self.label_2d = np.random.uniform(
0.1, 1.0, [self.ins_num, self.class_num]).astype(self.dtype)
self.label_2d /= self.label_2d.sum(axis=1, keepdims=True)
self.label = self.label_2d.reshape(self.shape + [self.class_num])
def get_cross_entropy(self):
cross_entropy_2d = (-self.label_2d * np.log(self.X_2d)).sum(
axis=1, keepdims=True).astype(self.dtype)
self.cross_entropy = np.array(cross_entropy_2d).reshape(self.shape +
[1])
def init_attr_type(self):
self.soft_label = True
def init_dtype_type(self):
self.dtype = np.float32
def init_bs_class_num(self):
self.class_num = 37
def test_check_grad(self):
self.check_grad(
["X"], "Y", max_relative_error=0.05, numeric_grad_delta=0.001)
class TestCrossEntropyOp6(TestCrossEntropyOp):
"""Test high rank tensor cross-entropy with vectorized one-hot representation of labels.
"""
def init_x(self):
self.shape = [4, 3, 2]
self.ins_num = np.prod(np.array(self.shape))
self.X_2d = randomize_probability(self.ins_num,
self.class_num).astype(self.dtype)
self.x = self.X_2d.reshape(self.shape + [self.class_num])
def init_label(self):
self.label_index_2d = np.random.randint(
0, self.class_num, (self.ins_num), dtype="int64")
label_2d = np.zeros(self.X_2d.shape)
label_2d[np.arange(self.ins_num), self.label_index_2d] = 1
self.label = label_2d.reshape(self.shape + [self.class_num]).astype(
self.dtype)
def get_cross_entropy(self):
cross_entropy_2d = np.asmatrix(
[[-np.log(self.X_2d[i][self.label_index_2d[i]])]
for i in range(self.X_2d.shape[0])])
self.cross_entropy = np.array(cross_entropy_2d).reshape(
self.shape + [1]).astype(self.dtype)
def init_attr_type(self):
self.soft_label = True
def init_dtype_type(self):
self.dtype = np.float32
def init_bs_class_num(self):
self.class_num = 17
def test_check_grad(self):
self.check_grad(
["X"], "Y", max_relative_error=0.05, numeric_grad_delta=0.001)
class TestCrossEntropyOp7(TestCrossEntropyOp):
"""Test cross-entropy with ignore index.
"""
def init_label(self):
self.label = np.random.randint(
0, self.class_num, (self.batch_size, 1), dtype="int64")
def get_cross_entropy(self):
self.cross_entropy = np.asmatrix(
[[-np.log(self.x[i][self.label[i][0]])]
if self.label[i][0] != self.ignore_index else [0]
for i in range(self.x.shape[0])]).astype(self.dtype)
def init_attr_type(self):
self.soft_label = False
self.ignore_index = 3
def init_dtype_type(self):
self.dtype = np.float64
def init_bs_class_num(self):
self.batch_size = 30
self.class_num = 10
if __name__ == "__main__":
unittest.main()
......@@ -13,18 +13,9 @@
# limitations under the License.
from __future__ import print_function
import unittest
from paddle.fluid.tests.unittests.test_elementwise_add_op import TestElementwiseAddOp
class TestNGRAPHElementwiseAddOp(TestElementwiseAddOp):
def setUp(self):
super(TestNGRAPHElementwiseAddOp, self).setUp()
self._cpu_only = True
def init_input_output(self):
super(TestNGRAPHElementwiseAddOp, self).init_input_output()
import unittest
from paddle.fluid.tests.unittests.test_elementwise_add_op import TestElementwiseAddOp, TestElementwiseAddOp_broadcast_0
if __name__ == '__main__':
unittest.main()
......@@ -13,24 +13,34 @@
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
from paddle.fluid.tests.unittests.test_fill_constant_op import TestFillConstantOp1, TestFillConstantOp2, TestFillConstantOpWithSelectedRows
class TestNGRAPHFillConstantOp1(TestFillConstantOp1):
class TestNGRAPHFillConstantFP64(TestFillConstantOp1):
def setUp(self):
super(TestNGRAPHFillConstantOp1, self).setUp()
super(TestNGRAPHFillConstantFP64, self).setUp()
self.attrs = {'shape': [123, 92], 'value': 3.8, 'dtype': 6}
self.outputs = {'Out': np.full((123, 92), 3.8)}
class TestNGRAPHFillConstantOp2(TestFillConstantOp2):
class TestNGRAPHFillConstantINT32(TestFillConstantOp2):
def setUp(self):
super(TestNGRAPHFillConstantOp2, self).setUp()
super(TestNGRAPHFillConstantINT32, self).setUp()
self.attrs = {'shape': [123, 92], 'dtype': 2}
self.outputs = {'Out': np.full((123, 92), 0)}
class TestNGRAPHFillConstantOpWithSelectedRows(
TestFillConstantOpWithSelectedRows):
class TestNGRAPHFillConstantINT64(TestFillConstantOp2):
def setUp(self):
super(TestFillConstantOpWithSelectedRows, self).setUp()
super(TestNGRAPHFillConstantINT64, self).setUp()
self.attrs = {'shape': [123, 92], 'dtype': 3}
self.outputs = {'Out': np.full((123, 92), 0)}
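# Note: the integer 'dtype' attrs are core.VarDesc.VarType values,
# e.g. 2 -> INT32, 3 -> INT64, 6 -> FP64, matching the class names above.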
if __name__ == "__main__":
......
......@@ -16,12 +16,5 @@ from __future__ import print_function
import unittest
from paddle.fluid.tests.unittests.test_mean_op import TestMeanOp
class TestNGRAPHMeanOp(TestMeanOp):
def setUp(self):
super(TestNGRAPHMeanOp, self).setUp()
self._cpu_only = True
if __name__ == "__main__":
unittest.main()
......@@ -15,39 +15,7 @@
from __future__ import print_function
import unittest
import numpy as np
from paddle.fluid.tests.unittests.op_test import OpTest
class TestNGRAPHMulOp(OpTest):
def setUp(self):
self.op_type = "mul"
self.dtype = np.float32
self.init_dtype_type()
self.inputs = {
'X': np.random.random((2, 4)).astype(self.dtype),
'Y': np.random.random((4, 4)).astype(self.dtype)
}
self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])}
self._cpu_only = True
def init_dtype_type(self):
pass
def test_check_output(self):
self.check_output()
def test_check_grad_normal(self):
self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5)
def test_check_grad_ingore_x(self):
self.check_grad(
['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X"))
def test_check_grad_ingore_y(self):
self.check_grad(
['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))
from paddle.fluid.tests.unittests.test_mul_op import TestMulOp, TestMulOp2
if __name__ == "__main__":
unittest.main()
......@@ -14,61 +14,25 @@
from __future__ import print_function
from paddle.fluid.tests.unittests.test_pool2d_op import TestPool2D_Op, TestCase1, TestCase2, TestCase3, TestCase4, TestCase5
class TestNGRAPHPool2D_Op(TestPool2D_Op):
def setUp(self):
super(TestNGRAPHPool2D_Op, self).setUp()
self._cpu_only = True
def init_test_case(self):
super(TestNGRAPHPool2D_Op, self).init_test_case()
class TestNGRAPHCase1(TestCase1):
def setUp(self):
super(TestNGRAPHCase1, self).setUp()
self._cpu_only = True
def init_test_case(self):
super(TestNGRAPHCase1, self).init_test_case()
import unittest
class TestNGRAPHCase2(TestCase2):
def setUp(self):
super(TestNGRAPHCase2, self).setUp()
self._cpu_only = True
def init_test_case(self):
super(TestNGRAPHCase2, self).init_test_case()
class TestNGRAPHCase3(TestCase3):
def setUp(self):
super(TestNGRAPHCase3, self).setUp()
self._cpu_only = True
def init_pool_type(self):
super(TestNGRAPHCase3, self).init_pool_type()
from paddle.fluid.tests.unittests.test_pool2d_op import TestPool2D_Op, TestCase1, TestCase2, TestCase3, TestCase4, TestCase5
class TestNGRAPHCase4(TestCase4):
class TestNGRAPHCeilMode(TestCase1):
def setUp(self):
super(TestNGRAPHCase4, self).setUp()
self._cpu_only = True
super(TestNGRAPHCeilMode, self).setUp()
def init_pool_type(self):
super(TestNGRAPHCase4, self).init_pool_type()
def init_ceil_mode(self):
self.ceil_mode = True
class TestNGRAPHCase5(TestCase5):
class TestNGRAPHAdaptive(TestCase1):
def setUp(self):
super(TestNGRAPHCase5, self).setUp()
self._cpu_only = True
super(TestNGRAPHAdaptive, self).setUp()
def init_pool_type(self):
super(TestNGRAPHCase5, self).init_pool_type()
def init_adaptive(self):
self.adaptive = True
if __name__ == '__main__':
......
......@@ -15,24 +15,5 @@ from __future__ import print_function
import unittest
from paddle.fluid.tests.unittests.test_scale_op import TestScaleOp, TestScaleOpSelectedRows
class TestNGRAPHScaleOp(TestScaleOp):
def setUp(self):
super(TestNGRAPHScaleOp, self).setUp()
self._cpu_only = True
def init_dtype_type(self):
pass
class TestNGRAPHScaleOpSelectedRows(TestScaleOpSelectedRows):
def setUp(self):
super(TestNGRAPHScaleOpSelectedRows, self).setUp()
self._cpu_only = True
def init_dtype_type(self):
pass
if __name__ == "__main__":
unittest.main()
......@@ -16,11 +16,5 @@ from __future__ import print_function
import unittest
from paddle.fluid.tests.unittests.test_softmax_op import TestSoftmaxOp
class TestSoftmaxNGRAPHOp(TestSoftmaxOp):
def setUp(self):
super(TestSoftmaxNGRAPHOp, self).setUp()
if __name__ == "__main__":
unittest.main()
......@@ -16,30 +16,5 @@ from __future__ import print_function
import unittest
from paddle.fluid.tests.unittests.test_top_k_op import TestTopkOp, TestTopkOp3d, TestTopkOp2, TestTopkOp3, TestTopkOp4
class TestNGRAPHTopkOp(TestTopkOp):
def setUp(self):
super(TestNGRAPHTopkOp, self).setUp()
self._cpu_only = True
class TestNGRAPHTopkOp2(TestTopkOp2):
def setUp(self):
super(TestNGRAPHTopkOp2, self).setUp()
self._cpu_only = True
class TestNGRAPHTopkOp3(TestTopkOp3):
def setUp(self):
super(TestNGRAPHTopkOp3, self).setUp()
self._cpu_only = True
class TestNGRAPHTopkOp4(TestTopkOp4):
def setUp(self):
super(TestNGRAPHTopkOp4, self).setUp()
self._cpu_only = True
if __name__ == "__main__":
unittest.main()
......@@ -14,6 +14,7 @@
from __future__ import print_function
import os
import unittest
import numpy as np
import random
......@@ -374,6 +375,9 @@ class OpTest(unittest.TestCase):
return []
places = [fluid.CPUPlace()]
cpu_only = self._cpu_only if hasattr(self, '_cpu_only') else False
use_ngraph = bool(os.getenv("FLAGS_use_ngraph", False))
if use_ngraph:
cpu_only = True
if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type)\
and not cpu_only:
places.append(core.CUDAPlace(0))
......
......@@ -79,7 +79,7 @@ class TestParallelExecutorBase(unittest.TestCase):
if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce
build_strategy.fuse_elewise_add_act_ops = fuse_elewise_add_act_ops
build_strategy.fuse_relu_depthwise_conv = fuse_relu_depthwise_conv
build_strategy.memory_optimize = use_ir_memory_optimize
build_strategy.memory_optimize = False if memory_opt else use_ir_memory_optimize
# Python memory optimization conflicts with the inplace pass.
# Using IR graph memory optimization after the inplace pass is the correct way.
build_strategy.enable_inplace = False if memory_opt else enable_inplace
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
import paddle.fluid as fluid
from paddle.fluid.layer_helper import LayerHelper
class L1(fluid.imperative.Layer):
def __init__(self):
super(L1, self).__init__()
self._helper = LayerHelper(
'MyLayer',
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.1)))
self.w1 = self._helper.create_parameter(
attr=self._helper.param_attr,
shape=[2, 2],
dtype='float32',
is_bias=False)
self.w2 = self._helper.create_parameter(
attr=self._helper.param_attr,
shape=[2, 2],
dtype='float32',
is_bias=False)
def forward(self):
return self.w1 + self.w2
class L2(fluid.imperative.Layer):
def __init__(self):
super(L2, self).__init__()
self.layer1 = L1()
self.layer2 = L1()
def forward(self):
return self.layer1() + self.layer2()
class L3(fluid.imperative.Layer):
def __init__(self):
super(L3, self).__init__()
self.layer1 = L2()
self.layer2 = L2()
def forward(self):
return self.layer1() + self.layer2()
class TestBaseLayer(unittest.TestCase):
def test_one_level(self):
with fluid.imperative.guard():
l = L1()
ret = l()
self.assertEqual(l.w1.name, "MyLayer_0.w_0")
self.assertEqual(l.w2.name, "MyLayer_0.w_1")
self.assertTrue(np.allclose(ret._numpy(), 0.2 * np.ones([2, 2])))
def test_three_level(self):
with fluid.imperative.guard():
l = L3()
ret = l()
self.assertTrue(np.allclose(ret._numpy(), 0.8 * np.ones([2, 2])))
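# Expected values follow from the Constant(0.1) initializer: L1 returns
# w1 + w2 = 0.2, L2 sums two L1 outputs (0.4), and L3 sums two L2 outputs (0.8).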
if __name__ == '__main__':
unittest.main()
......@@ -22,6 +22,9 @@ import six
import unittest
import numpy as np
import gc
gc.set_debug(gc.DEBUG_COLLECTABLE)
import paddle.fluid as fluid
......@@ -99,6 +102,12 @@ class TranspilerTest(unittest.TestCase):
with fluid.unique_name.guard():
with fluid.program_guard(main, startup):
self.transpiler_test_impl()
# NOTE: run gc.collect to eliminate pybind-side objects and
# prevent random double-deallocation when inherited in Python.
del self.transpiler
del main
del startup
gc.collect()
class TestBasicModel(TranspilerTest):
......@@ -797,6 +806,7 @@ class TestNCCL2Transpile(TranspilerTest):
print([op.type for op in startup.global_block().ops])
self.assertEqual(startup.global_block().ops[-1].type, "gen_nccl_id")
self.assertIsNotNone(startup.global_block().vars.get("NCCLID"))
gc.collect()
else:
pass
......
......@@ -121,6 +121,8 @@ class TestMNIST(TestParallelExecutorBase):
regularization=fluid.regularizer.L2Decay(1e-6))
return optimizer
# NOTE(dzh):
# need to make it compatible with elewise fuse act
not_fuse_op_first_loss, not_fuse_op_last_loss = self.check_network_convergence(
model,
feed_dict={"image": img,
......@@ -128,6 +130,7 @@ class TestMNIST(TestParallelExecutorBase):
use_cuda=use_cuda,
fuse_elewise_add_act_ops=False,
memory_opt=False,
use_ir_memory_optimize=False,
optimizer=_optimizer)
fuse_op_first_loss, fuse_op_last_loss = self.check_network_convergence(
model,
......@@ -136,6 +139,7 @@ class TestMNIST(TestParallelExecutorBase):
use_cuda=use_cuda,
fuse_elewise_add_act_ops=True,
memory_opt=False,
use_ir_memory_optimize=False,
optimizer=_optimizer)
for loss in zip(not_fuse_op_first_loss, fuse_op_first_loss):
......
......@@ -333,6 +333,18 @@ class TestImperative(unittest.TestCase):
self.assertTrue(np.allclose(dy_out, static_out))
self.assertTrue(np.allclose(dy_grad, static_grad))
params = mlp.parameters(True)
self.assertEqual("FC_0.w_0", params[0].name)
self.assertEqual("FC_0.b_0", params[1].name)
self.assertEqual("FC_1.w_0", params[2].name)
self.assertEqual("FC_1.b_0", params[3].name)
self.assertEqual(len(params), 4)
sublayers = mlp.sublayers(True)
self.assertEqual(mlp._fc1, sublayers[0])
self.assertEqual(mlp._fc2, sublayers[1])
self.assertEqual(len(sublayers), 2)
def test_rnn(self):
np_inp = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0],
[10.0, 11.0, 12.0]])
......
......@@ -33,9 +33,6 @@ class Discriminator(fluid.imperative.Layer):
self._fc1 = FC(size=32, act='elu', name="d_fc1")
self._fc2 = FC(size=1, name="d_fc2")
def parameters(self):
return self._fc1.parameters() + self._fc2.parameters()
def forward(self, inputs):
x = self._fc1(inputs)
return self._fc2(x)
......@@ -48,10 +45,6 @@ class Generator(fluid.imperative.Layer):
self._fc2 = FC(size=64, act='elu', name="g_fc2")
self._fc3 = FC(size=1, name="g_fc3")
def parameters(self):
return self._fc1.parameters() + self._fc2.parameters(
) + self._fc3.parameters()
def forward(self, inputs):
x = self._fc1(inputs)
x = self._fc2(x)
......
......@@ -75,16 +75,6 @@ class SimpleLSTMRNN(fluid.imperative.Layer):
self.hidden_array.append(pre_hidden)
self.cell_array.append(pre_cell)
def parameters(self):
parameters = list()
for param in self.weight_1_arr:
parameters.append(param)
for param in self.weight_2_arr:
parameters.append(param)
for bias in self.bias_arr:
parameters.append(bias)
return parameters
def forward(self, input_embedding, init_hidden=None, init_cell=None):
res = []
for index in range(self._num_steps):
......@@ -177,12 +167,6 @@ class PtbModel(fluid.imperative.Layer):
def _build_once(self, input, label, init_hidden, init_cell):
pass
def parameters(self):
parameters = self.simple_lstm_rnn.parameters() + [
self.softmax_weight, self.softmax_bias
] + self.embedding.parameters()
return parameters
def forward(self, input, label, init_hidden, init_cell):
init_h = fluid.layers.reshape(
......
......@@ -21,7 +21,6 @@ import paddle
import paddle.fluid as fluid
from paddle.fluid import core
from paddle.fluid.layer_helper import LayerHelper
from paddle.fluid.optimizer import SGDOptimizer
from paddle.fluid.imperative.nn import Conv2D, Pool2D, BatchNorm, FC
from paddle.fluid.imperative.base import to_variable
from test_imperative_base import new_program_scope
......@@ -173,11 +172,13 @@ class ResNet(fluid.imperative.Layer):
for block in range(len(depth)):
shortcut = False
for i in range(depth[block]):
bottleneck_block = BottleneckBlock(
num_channels=num_channels,
num_filters=num_filters[block],
stride=2 if i == 0 and block != 0 else 1,
shortcut=shortcut)
bottleneck_block = self.add_sublayer(
'bb_%d_%d' % (block, i),
BottleneckBlock(
num_channels=num_channels,
num_filters=num_filters[block],
stride=2 if i == 0 and block != 0 else 1,
shortcut=shortcut))
num_channels = bottleneck_block._num_channels_out
self.bottleneck_block_list.append(bottleneck_block)
shortcut = True
......@@ -223,8 +224,7 @@ class TestImperativeResnet(unittest.TestCase):
batch_size=batch_size)
dy_param_init_value = {}
for param in fluid.default_main_program().global_block(
).all_parameters():
for param in resnet.parameters():
dy_param_init_value[param.name] = param._numpy()
for batch_id, data in enumerate(train_reader()):
......@@ -247,16 +247,14 @@ class TestImperativeResnet(unittest.TestCase):
dy_out = avg_loss._numpy()
if batch_id == 0:
for param in fluid.default_main_program().global_block(
).all_parameters():
for param in resnet.parameters():
if param.name not in dy_param_init_value:
dy_param_init_value[param.name] = param._numpy()
avg_loss._backward()
dy_grad_value = {}
for param in fluid.default_main_program().global_block(
).all_parameters():
for param in resnet.parameters():
if not param.stop_gradient:
np_array = np.array(param._ivar._grad_ivar().value()
.get_tensor())
......@@ -267,8 +265,7 @@ class TestImperativeResnet(unittest.TestCase):
resnet.clear_gradients()
dy_param_value = {}
for param in fluid.default_main_program().global_block(
).all_parameters():
for param in resnet.parameters():
dy_param_value[param.name] = param._numpy()
with new_program_scope():
......@@ -349,6 +346,7 @@ class TestImperativeResnet(unittest.TestCase):
self.assertTrue(np.allclose(static_out, dy_out))
self.assertEqual(len(dy_param_init_value), len(static_param_init_value))
for key, value in six.iteritems(static_param_init_value):
self.assertTrue(np.allclose(value, dy_param_init_value[key]))
self.assertTrue(np.isfinite(value.all()))
......
requests==2.9.2
numpy>=1.12
protobuf==3.1
protobuf>=3.6
recordio>=0.1.0
matplotlib==2.2.3 # TODO: let python3 paddlepaddle package use latest matplotlib
rarfile
......
This diff is collapsed.