Commit 16f09947 authored by: S sneaxiy

Merge develop

test=develop
...@@ -64,6 +64,7 @@ option(WITH_DISTRIBUTE "Compile with distributed support" OFF)
option(WITH_PSLIB "Compile with pslib support" OFF)
option(WITH_CONTRIB "Compile the third-party contributation" OFF)
option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better debug." OFF)
# TODO(Superjomn) Remove WITH_ANAKIN option if not needed latter.
option(WITH_ANAKIN "Compile with Anakin library" OFF)
option(ANAKIN_BUILD_FAT_BIN "Build anakin cuda fat-bin lib for all device plantform, ignored when WITH_ANAKIN=OFF" OFF)
option(ANAKIN_BUILD_CROSS_PLANTFORM "Build anakin lib for any nvidia device plantform. ignored when WITH_ANAKIN=OFF" ON)
...@@ -190,6 +191,7 @@ include(configure) # add paddle env configuration
if(WITH_GPU)
include(cuda)
include(tensorrt)
include(anakin_subgraph)
endif()
if(WITH_MKL OR WITH_MKLML)
include(external/anakin)
......
...@@ -156,7 +156,7 @@ python \
This will enable VLOG messages generated by `buddy_allocator.{h,cc}` within the verbose range of 0 to 3, so you will see the example VLOG message above, which is at level 3. This suggests that we output general messages at lower verbose levels, so that they display with higher probability. When coding C++, please follow this verbose-level convention (a minimal usage sketch follows the list):
- verbose level 1: [framework](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/framework)
- verbose level 3: [operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/operators)
- verbose level 5: [memory](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/memory), [platform](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/platform)
- verbose level 7: [math](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/operators/math/)
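As a minimal, hypothetical sketch of this convention (the function and messages below are invented for illustration; only the VLOG levels follow the table above), a message is emitted only when the effective verbose level, e.g. GLOG_v=3 or a matching GLOG_vmodule entry, is at least n:

#include "glog/logging.h"

void RunExample() {
  VLOG(1) << "framework-level message";        // shown when verbose level >= 1
  VLOG(3) << "operator-level message";         // shown when verbose level >= 3
  VLOG(5) << "memory/platform-level message";  // shown when verbose level >= 5
  VLOG(7) << "math-kernel detail message";     // shown when verbose level >= 7
}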
if(NOT WITH_GPU)
return()
endif()
set(ANAKIN_ROOT "/usr" CACHE PATH "ANAKIN ROOT")
find_path(ANAKIN_INCLUDE_DIR anakin_config.h
PATHS ${ANAKIN_ROOT} ${ANAKIN_ROOT}/include
$ENV{ANAKIN_ROOT} $ENV{ANAKIN_ROOT}/include
NO_DEFAULT_PATH
)
find_library(ANAKIN_LIBRARY NAMES libanakin_saber_common.so libanakin.so
PATHS ${ANAKIN_ROOT}
$ENV{ANAKIN_ROOT} $ENV{ANAKIN_ROOT}/lib
NO_DEFAULT_PATH
DOC "Path to ANAKIN library.")
if(ANAKIN_INCLUDE_DIR AND ANAKIN_LIBRARY)
if(WITH_DSO)
set(ANAKIN_FOUND ON)
endif(WITH_DSO)
else()
set(ANAKIN_FOUND OFF)
endif()
if(ANAKIN_FOUND)
message(STATUS "Current ANAKIN header is ${ANAKIN_INCLUDE_DIR}/anakin_config.h. ")
include_directories(${ANAKIN_ROOT}/include)
include_directories(${ANAKIN_ROOT}/include/saber)
link_directories(${ANAKIN_ROOT})
add_definitions(-DPADDLE_WITH_ANAKIN)
endif()
...@@ -33,5 +33,6 @@ if(TENSORRT_FOUND)
message(STATUS "Current TensorRT header is ${TENSORRT_INCLUDE_DIR}/NvInfer.h. "
"Current TensorRT version is v${TENSORRT_MAJOR_VERSION}. ")
include_directories(${TENSORRT_INCLUDE_DIR})
link_directories(${TENSORRT_LIBRARY})
add_definitions(-DPADDLE_WITH_TENSORRT)
endif()
...@@ -520,6 +520,7 @@ paddle.fluid.unique_name.guard (ArgSpec(args=['new_generator'], varargs=None, ke
paddle.fluid.recordio_writer.convert_reader_to_recordio_file (ArgSpec(args=['filename', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None)), ('document', '65c7523e86f0c50bb729b01667f36310'))
paddle.fluid.recordio_writer.convert_reader_to_recordio_files (ArgSpec(args=['filename', 'batch_per_file', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None)), ('document', 'bc643f0f5f1b9db57ff0d8a57d379bd7'))
paddle.fluid.Scope Scope() -> paddle.fluid.core._Scope
paddle.fluid.install_check.run_check (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', '66b7c84a17ed32fec2df9628367be2b9'))
paddle.reader.cache (ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None), ('document', '1676886070eb607cb608f7ba47be0d3c'))
paddle.reader.map_readers (ArgSpec(args=['func'], varargs='readers', keywords=None, defaults=None), ('document', '77cbadb09df588e21e5cc0819b69c87d'))
paddle.reader.buffered (ArgSpec(args=['reader', 'size'], varargs=None, keywords=None, defaults=None), ('document', '0d6186f109feceb99f60ec50a0a624cb'))
......
...@@ -5,6 +5,7 @@ cc_library(scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc DEPS op_h
cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry)
cc_library(rpc_op_handle SRCS rpc_op_handle.cc DEPS framework_proto scope place operator op_registry)
cc_library(fetch_barrier_op_handle SRCS fetch_barrier_op_handle.cc DEPS framework_proto scope place operator op_registry)
cc_library(multi_devices_helper SRCS multi_devices_helper.cc DEPS graph graph_helper)
cc_library(multi_devices_graph_print_pass SRCS multi_devices_graph_print_pass.cc DEPS multi_devices_helper)
...@@ -72,7 +73,7 @@ cc_library(sequential_execution_pass SRCS sequential_execution_pass.cc DEPS grap
cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_helper pass)
cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle
scale_loss_grad_op_handle rpc_op_handle fetch_barrier_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle fused_broadcast_op_handle)
cc_library(fuse_all_reduce_op_pass SRCS fuse_all_reduce_op_pass.cc DEPS graph graph_helper fused_all_reduce_op_handle)
......
...@@ -13,6 +13,7 @@
// limitations under the License.
#include <algorithm>
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
...@@ -52,13 +53,28 @@ std::unique_ptr<ir::Graph> AllReduceDepsPass::ApplyImpl(
// Note that must assert topology sort is stable
auto& ops = graph->Get<const std::vector<OpDesc*>>(kStaleProgramOpDescs);
for (auto* op_desc : ops) {
try {
bool is_bk_op =
static_cast<bool>(boost::get<int>(op_desc->GetAttr(
OpProtoAndCheckerMaker::OpRoleAttrName())) &
static_cast<int>(OpRole::kBackward));
if (!is_bk_op) continue;
auto backward_vars =
boost::get<std::vector<std::string>>(op_desc->GetNullableAttr(
OpProtoAndCheckerMaker::OpRoleVarAttrName()));
PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0);
auto outputs = op_desc->Outputs();
for (auto& o_it : outputs) {
for (auto& v : o_it.second) {  // values
vars[v] = order;
VLOG(1) << "in all_reduce_deps_pass:" << v;
}
}
order++;
} catch (boost::bad_get e) {
}
}
std::vector<OpHandleBase*> dist_ops;
......
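For readers unfamiliar with the attribute check above: the op-role attribute is an integer bit mask, so membership in the backward role is tested with a bitwise AND. A standalone sketch with made-up enum values (illustration only; the real OpRole enum is defined alongside OpProtoAndCheckerMaker):

// Hypothetical bit-mask roles; values chosen only to show the check pattern.
enum class DemoOpRole : int { kForward = 0x0, kBackward = 0x1, kOptimize = 0x2 };

inline bool IsBackward(int role_attr) {
  return (role_attr & static_cast<int>(DemoOpRole::kBackward)) != 0;
}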
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/fetch_barrier_op_handle.h"
#include <string>
namespace paddle {
namespace framework {
namespace details {
FetchBarrierOpHandle::FetchBarrierOpHandle(
ir::Node *node, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places)
// fetch_barrier op always run on place0, but output on all places.
: OpHandleBase(node),
op_(framework::OpRegistry::CreateOp(*node->Op())),
local_scopes_(local_scopes),
places_(places),
run_scope_(local_scopes[0]),
place_(places[0]) {
for (auto &p : places) {
this->SetDeviceContext(p, platform::DeviceContextPool::Instance().Get(p));
}
}
bool FetchBarrierOpHandle::IsMultiDeviceTransfer() {
// override IsMultiDeviceTransfer to return true
return true;
}
void FetchBarrierOpHandle::RunImpl() {
WaitInputVarGenerated(place_);
auto run_func = [this]() {
op_->Run(*run_scope_->FindVar(kLocalExecScopeName)->Get<Scope *>(), place_);
};
if (is_lock_and_record_event_free_) {
run_func();
} else {
this->RunAndRecordEvent(run_func);
}
}
bool FetchBarrierOpHandle::NeedWait(VarHandleBase *in_var) {
bool need_wait =
in_var && in_var->GeneratedOp() &&
in_var->GeneratedOp()->DeviceContext(place_) != dev_ctxes_.at(place_);
return need_wait;
}
std::string FetchBarrierOpHandle::Name() const { return op_->Type(); }
} // namespace details
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/feed_fetch_type.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace framework {
namespace details {
// **NOTE**: fetch_barrier op is special it outputs all recved variables on
// all places if there are multiple places, must init with
// multiple dev_ctxes_ !!!!
struct FetchBarrierOpHandle : public OpHandleBase {
public:
FetchBarrierOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places);
bool IsMultiDeviceTransfer() override;
std::string Name() const override;
protected:
void RunImpl() override;
bool NeedWait(VarHandleBase *in_var) override;
private:
std::unique_ptr<OperatorBase> op_;
std::vector<Scope *> local_scopes_;
std::vector<platform::Place> places_;
Scope *run_scope_;
platform::Place place_;
bool is_lock_and_record_event_free_{false};
};
} // namespace details
} // namespace framework
} // namespace paddle
...@@ -17,6 +17,8 @@
#include <deque>
#include <iterator>
#include <memory>
#include <queue>
#include <sstream>
#include <stack>
#include <string>
#include <unordered_map>
...@@ -148,12 +150,14 @@ std::unique_ptr<ir::Graph> InplacePass::ApplyImpl(
view_.Build(graph.get());
InitSSAGraphNodes();
auto cnt = 0;
for (auto* op : view_.AllOps()) {
VLOG(4) << "Handle op " << cnt++ << ": " << op->Name();
if (FLAGS_enable_inplace_whitelist && !whitelist_.count(op->Name()))
continue;
TryInplaceOpInputOutput(op, graph.get());
}
// graph->ResolveHazard(var_nodes_);
return graph;
}
...@@ -264,13 +268,10 @@ void InplacePass::WithdrawModify(const NodeSwapQueue& nodes,
void InplacePass::TryInplaceOpInputOutput(ir::Node* op,
ir::Graph* graph) const {
VLOG(4) << "Try to inplace op " << op->Name();
// PADDLE_ENFORCE(op->Op() != nullptr && op->Op()->Block() != nullptr,
//                "op_desc is nullptr");
// some pre-requirments need to meet if the op want to inplaced.
PADDLE_ENFORCE(op->Op() != nullptr, "op_desc is nullptr");
auto* op_desc = op->Op();
auto& infer_inplace =
...@@ -281,21 +282,58 @@ void InplacePass::TryInplaceOpInputOutput(ir::Node* op,
PADDLE_ENFORCE(static_cast<bool>(infer_inplace),
"%s's infer_inplace has not been registered", op_desc->Type());
auto in_to_outs = infer_inplace(*op_desc);
auto& all_ops = view_.AllOps();
auto cursor = std::find(all_ops.begin(), all_ops.end(), op);
size_t idx = std::distance(all_ops.begin(), cursor);
for (auto& pair : in_to_outs) {
auto& in_para_name = pair.first;
auto& out_para_name = pair.second;
auto input_vars = op->Op()->Input(in_para_name);
if (!input_vars.size()) {
VLOG(4) << "Parameter " << in_para_name << " is empty skip "
<< in_para_name << " => " << out_para_name << " pair";
continue;
}
auto output_vars = op->Op()->Output(out_para_name);
if (!output_vars.size()) {
VLOG(4) << "Parameter " << out_para_name << " is empty skip "
<< in_para_name << " => " << out_para_name << " pair";
continue;
}
auto in_var_name = input_vars.at(0);
auto out_var_name = output_vars.at(0);
auto* in_node = view_.GetNodeByName(in_var_name, op->inputs);
auto* out_node = view_.GetNodeByName(out_var_name, op->outputs);
VLOG(4) << "Try to inplace " << in_var_name << " with " << out_var_name;
bool can_replace = true;
if (in_var_name == out_var_name) {
can_replace = false;
VLOG(4) << "SKIP: Input variable " << in_var_name << " & Output variable "
<< out_var_name << " are the same";
} else if (!NodeCanReused(in_node)) {
can_replace = false;
VLOG(4) << "SKIP: Input varialbe " << in_var_name << "cannot be reused";
} else if (!NodeCanReused(out_node)) {
can_replace = false;
VLOG(4) << "SKIP: Output variable " << out_var_name
<< " cannot be reused";
} else if (details::NodeSize(*in_node->Var()) !=
details::NodeSize(*out_node->Var())) {
can_replace = false;
VLOG(4) << "SKIP: Input and Output varialbe size not match";
}
if (!can_replace) continue;
// 2. there is no external pending op on the input node
// if (view_.PendingOpsOnVar(in_node).size() > 1) {
if (in_node->outputs.size() > 1 && !view_.CheckDeps(in_node, op)) {
VLOG(4) << string::Sprintf(
"Skiped pair %s => %s. %s input has external dependency."
"inplace such pair will overwrite the memory.",
...@@ -342,6 +380,97 @@ void InplacePass::TryInplaceOpInputOutput(ir::Node* op,
}
}
void GraphView::TopoSort(ir::Graph* graph) {
//
ops_.clear();
auto deps_num = [](ir::Node* op) {
auto cnt = 0;
for (auto& var : op->inputs)
if (var->inputs.size() > 0) ++cnt;
return cnt;
};
std::queue<std::pair<ir::Node*, uint32_t>> ready_ops;
int level = 0;
auto nodes = graph->Nodes();
std::unordered_map<ir::Node*, uint32_t> deps_map;
for (auto& node : nodes) {
if (node->IsOp() && node->Op() != nullptr) {
deps_map[node] = deps_num(node);
if (0 == deps_map[node]) {
ready_ops.push({node, level});
}
}
}
while (!ready_ops.empty()) {
auto item = ready_ops.front();
ready_ops.pop();
ops_.emplace_back(item.first);
// record level when pop from queue
op_level_[item.first] = item.second;
for (auto node : item.first->outputs) {
for (auto op : node->outputs) {
--deps_map[op];
if (deps_map[op] == 0) ready_ops.push({op, item.second + 1});
}
}
}
bool all_ops_checked = true;
for (auto& node : nodes) {
if (node->IsOp() && node->Op() != nullptr && deps_map[node] > 0) {
all_ops_checked = false;
break;
}
}
PADDLE_ENFORCE(all_ops_checked, "All ops deps should be 0 after analysis");
}
// return true if current op node depeneds on all other op that use the same
// variable node
bool GraphView::CheckDeps(ir::Node* var, ir::Node* current_op) const {
// get op list that rely on the same variable
auto op_list = var->outputs;
for (auto& op : op_list) {
if (op == current_op) continue;
VLOG(4) << " GraphView::CheckDeps : " << op->Name() << " & "
<< current_op->Name();
if (!CheckOpDeps(op, current_op)) return false;
VLOG(4) << "";
}
return true;
}
// check if op2 depends on op1's output
bool GraphView::CheckOpDeps(ir::Node* op1, ir::Node* op2) const {
auto print_op = [&](ir::Node* op, const char* name) {
std::ostringstream os;
os << " " << name << " : " << op->Name() << " ";
os << "Input args : ";
for (auto& arg : op->inputs) os << arg->Name() << " ";
os << "Output args : ";
for (auto& arg : op->outputs) os << arg->Name() << " ";
os << "Level : " << op_level_.at(op);
VLOG(4) << os.str();
};
print_op(op1, "OP1");
print_op(op2, "OP2");
if (op1 == op2) return true;
if (op_level_.at(op1) >= op_level_.at(op2)) return false;
for (auto& var : op2->inputs)
if (var->inputs.size() > 0 && CheckOpDeps(op1, var->inputs[0])) return true;
return false;
}
ir::Node* GraphView::GetNodeByName(const std::string& name,
const std::vector<ir::Node*>& nodes) const {
// nodes should be op->inputs/outputs
...@@ -387,22 +516,7 @@ void GraphView::Build(ir::Graph* g) {
// Because we insert some new created node. Which may have data race between
// nodes.
// resolve data harzards depends on the var nodes in right order.
TopoSort(g);
// 1. track the nodes which reused previous node in Python memory optimize.
// these node can not be inplaced, otherwise may generate a circle in graph.
std::unordered_set<std::string> all_vars;
for (auto& node : g->Nodes()) {
if (node->IsVar()) continue;
for (auto& out : node->outputs) {
if (out->IsCtrlVar() || out->Var() == nullptr) continue;
if (all_vars.count(out->Name())) {
dup_nodes_.emplace(out->Name());
} else {
all_vars.emplace(out->Name());
}
}
}
// 2. track the nodes which used by parameter server.
// these node can not be inplaced, otherwise trainer
......
...@@ -14,6 +14,7 @@
#pragma once
#include <map>
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
...@@ -50,10 +51,15 @@ class GraphView {
// map the parameter and gradient, must be skipped.
bool InSkipSet(const std::string& var) const;
bool CheckDeps(ir::Node* var, ir::Node* current_op) const;
bool CheckOpDeps(ir::Node* op1, ir::Node* op2) const;
void TopoSort(ir::Graph* g);
private:
std::vector<ir::Node*> ops_;
std::unordered_set<std::string> dup_nodes_;  // mem opt affect nodes
std::map<ir::Node*, std::unordered_set<ir::Node*>> adj_list_;
std::unordered_map<ir::Node*, uint32_t> op_level_;
};
// swap pairs in sequence
......
...@@ -190,7 +190,7 @@ struct NodeComparator {
auto rhs_shape = rhs_desc->GetShape();
if ((lhs_shape[0] == -1 && rhs_shape[0] == -1) ||
(lhs_shape[0] != -1 && rhs_shape[0] != -1)) {
return NodeSize(lhs) == NodeSize(rhs);
} else {
return false;
}
...@@ -449,6 +449,7 @@ void ControlFlowGraph::LiveVariableAnalysis() {
live_in_[op].insert(var);
}
for (auto& var : defs_[op]) {
if (uses_[op].count(var)) continue;
live_in_[op].erase(var);
}
......
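For context, the guard added above keeps a variable that an op both defines and uses in the op's live-in set; this matches the standard backward dataflow equation for liveness (textbook background, not text from this patch):

\mathrm{live\_in}(op) = \mathrm{use}(op) \cup \big(\mathrm{live\_out}(op) \setminus \mathrm{def}(op)\big)

Erasing every defined variable unconditionally would drop a variable that the same op also reads, so the `continue` is needed to stay consistent with this equation.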
...@@ -142,15 +142,16 @@ TEST(OrderedSet, FindBestFitNode) {
for (auto& node : nodes) {
pool.Insert(node.get());
}
// FIXME(liuwei1031) this API has changed,
// disable these tests temporarily
// FindNextBestFitNode
// auto* n = nodes[0].get();
// auto* cache = pool.FindBestFitNode(n);
// PADDLE_ENFORCE(cache->Name() == "a");
// cache = pool.FindNextBestFitNode(n, cache);
// PADDLE_ENFORCE(cache->Name() == "c");
// cache = pool.FindNextBestFitNode(n, cache);
// PADDLE_ENFORCE(cache->Name() == "b");
}
} // namespace details
......
...@@ -23,6 +23,7 @@
#include "paddle/fluid/framework/details/all_reduce_op_handle.h"
#include "paddle/fluid/framework/details/broadcast_op_handle.h"
#include "paddle/fluid/framework/details/computation_op_handle.h"
#include "paddle/fluid/framework/details/fetch_barrier_op_handle.h"
#include "paddle/fluid/framework/details/fused_broadcast_op_handle.h"
#include "paddle/fluid/framework/details/reduce_op_handle.h"
#include "paddle/fluid/framework/details/rpc_op_handle.h"
...@@ -851,9 +852,17 @@ int DistSSAGraphBuilder::CreateRPCOp(ir::Graph *result, ir::Node *node) const {
PADDLE_ENFORCE(op_dev_id != -1, "can not find the right place for rpc op: %s",
node->Op()->Type());
// Create fetch_barrier op handle to enable output on all devices.
// **NOTE** fetch_barrier should output variables list same as recv op does.
if (node->Op()->Type() == "fetch_barrier") {
result->Get<GraphOps>(kGraphOps).emplace_back(new FetchBarrierOpHandle(
result->CreateOpNode(node->Op()), local_scopes_, places_));
} else {
result->Get<GraphOps>(kGraphOps).emplace_back(new RPCOpHandle(
result->CreateOpNode(node->Op()), *node->Op(), local_scopes_[op_dev_id],
node->Op()->Type(), places_[op_dev_id]));
}
if (node->Op()->Type() == "send") { if (node->Op()->Type() == "send") {
CreateOpHandleIOs(result, node, op_dev_id); CreateOpHandleIOs(result, node, op_dev_id);
......
...@@ -55,7 +55,7 @@ void OpHandleBase::Run(bool use_cuda) {
if (out_var_handle) {
int dev_id =
boost::get<platform::CUDAPlace>(out_var_handle->place()).device;
out_var_handle->SetGenerateEvent(events_.at(dev_id));
}
}
} else {
...@@ -71,7 +71,7 @@ void OpHandleBase::Run(bool use_cuda) {
"The place of input(%s) is not consistent with the "
"place of current op(%s).",
out_var_handle->Name(), Name());
out_var_handle->SetGenerateEvent(events_.at(dev_id));
}
}
}
......
...@@ -209,9 +209,9 @@ struct OpInfoFiller<T, kShapeInference> {
template <typename T>
struct OpInfoFiller<T, kInplaceOpInference> {
void operator()(const char* op_type, OpInfo* info) const {
info->infer_inplace_ = [](const OpDesc& op_desc) {
T infer;
return infer(op_desc);
};
}
};
......
...@@ -17,8 +17,8 @@
#include <numeric>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include "glog/logging.h"
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/details/memory_optimize_helper.h"
#include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/type_defs.h"
...@@ -32,55 +32,22 @@ namespace framework {
then Out will inplaced use X's memory. The base class will do
legality validation for both variables.
*/
class InplaceOpInference {
public:
virtual ~InplaceOpInference() {}
virtual std::unordered_map<std::string, std::string> operator()(
const OpDesc& op_desc) const = 0;
};
class InplaceInToOut : public InplaceOpInference {
public:
std::unordered_map<std::string, std::string> operator()(
const OpDesc& op_desc, BlockDesc* block) const {
std::unordered_map<std::string, std::string> ret;
auto in_out_var_names_pair = this->Apply(op_desc, block);
for (auto& pair : in_out_var_names_pair) {
PADDLE_ENFORCE(!op_desc.Input(pair.first).empty(),
string::Sprintf("op %s do not have input of %s!",
op_desc.Type(), pair.first));
PADDLE_ENFORCE(!op_desc.Output(pair.second).empty(),
string::Sprintf("op %s do not have output of %s!",
op_desc.Type(), pair.second));
auto& in_name = op_desc.Input(pair.first).at(0);
auto& out_name = op_desc.Output(pair.second).at(0);
auto in = block->FindRecursiveOrCreateVar(in_name);
auto out = block->FindRecursiveOrCreateVar(out_name);
if (TryInplaceInputOutput(in, out)) ret.insert({in_name, out_name});
}
return ret;
}
protected:
virtual std::unordered_map<std::string, std::string> Apply(
const OpDesc& op_desc, BlockDesc* block) const = 0;
bool TryInplaceInputOutput(const VarDesc& in, const VarDesc& out) const {
return in.Name() != out.Name() && details::NodeCanReused(in) &&
details::NodeCanReused(out) &&
details::NodeSize(out) <= details::NodeSize(in);
}
};
/*
Inplace In and Out for operator only have an Input and an Output.
For example, activation op.
*/
class SingleOpInplaceInToOut : public InplaceOpInference {
public:
std::unordered_map<std::string, std::string> operator()(
const OpDesc& op_desc) const override {
PADDLE_ENFORCE(!op_desc.InputNames().empty(),
"Op inputs must not be empty");
PADDLE_ENFORCE(!op_desc.OutputNames().empty(),
...@@ -95,10 +62,10 @@ class SingleOpInplaceInToOut : public InplaceInToOut {
Gradient op. Inplace output use it's Input.
For example, Input@Grad->Input reuse strategy.
*/
class GradOpInplaceInToOut : public InplaceOpInference {
public:
std::unordered_map<std::string, std::string> operator()(
const OpDesc& op_desc) const override {
std::unordered_map<std::string, std::string> ret;
std::unordered_set<std::string> output_names(op_desc.OutputNames().begin(),
op_desc.OutputNames().end());
......
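With the simplified interface above, a per-op inference now only maps input parameter slots to output parameter slots; the pass itself resolves the variable names and checks whether the reuse is legal. A hypothetical sketch (the class, op, and parameter names below are invented for illustration and assume the inplace_op_inference.h header from this patch is available):

// Hypothetical inference for an op whose output "Out" may reuse input "X".
class DemoReluInplaceInference : public paddle::framework::InplaceOpInference {
 public:
  std::unordered_map<std::string, std::string> operator()(
      const paddle::framework::OpDesc& op_desc) const override {
    // Returns parameter-name pairs, not variable names; legality checks
    // (same size, reusable nodes, no external dependency) happen in the pass.
    return {{"X", "Out"}};
  }
};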
...@@ -127,26 +127,20 @@ class MultiOutGradShapeInference : public framework::InferShapeBase {
}
};
class MultiOutInplaceInToOut : public framework::InplaceOpInference {
public:
std::unordered_map<std::string, std::string> operator()(
const OpDesc& op_desc) const override {
protected:
std::unordered_map<std::string, std::string> Apply(
const OpDesc& op_desc, BlockDesc* block) const override {
return std::unordered_map<std::string, std::string>{
{"X", "Out"}, {"Y", "YOut"}, {"Z", "ZOut"},
};
}
};
class MultiOutGradInplaceInToOut : public framework::InplaceOpInference {
public:
std::unordered_map<std::string, std::string> operator()(
const OpDesc& op_desc) const override {
protected:
std::unordered_map<std::string, std::string> Apply(
const OpDesc& op_desc, BlockDesc* block) const override {
return std::unordered_map<std::string, std::string>{
{framework::GradVarName("YOut"), framework::GradVarName("Y")},
{framework::GradVarName("Out"), framework::GradVarName("X")},
...@@ -171,118 +165,118 @@ REGISTER_OPERATOR(multi_out_grad, f::NOP, f::MultiOutGradInplaceInToOut,
namespace paddle {
namespace framework {
// TEST(InferInplace, SingleOpInplaceInToOut) {
//   ProgramDesc prog;
//   auto* op = prog.MutableBlock(0)->AppendOp();
//   op->SetType("single_op");
//   op->SetInput("X", {"test2_a", "test2_b", "test2_c"});
//   op->SetOutput("Out", {"test2_out"});
//
//   prog.MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR);
//   prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 64, 128, 128});
//   prog.MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR);
//   prog.MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR);
//   prog.MutableBlock(0)->Var("test2_out");
//   prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16, 128, 128});
//
//   auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
//   auto in_to_outs = infer_inplace(*op);
//   EXPECT_EQ(in_to_outs.size(), 1ul);
//   auto it = in_to_outs.begin();
//   EXPECT_EQ(it->first, "test2_a");
//   EXPECT_EQ(it->second, "test2_out");
// }
//
// TEST(InferInplace, SingleGradOpInplaceInToOut) {
//   ProgramDesc prog;
//   auto* op = prog.MutableBlock(0)->AppendOp();
//   op->SetType("single_op_grad");
//   op->SetInput(GradVarName("Out"), {"test2_out"});
//   op->SetOutput(GradVarName("X"), {"test2_a", "test2_b", "test2_c"});
//
//   prog.MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR);
//   prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 16, 1024, 1024});
//   prog.MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR);
//   prog.MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR);
//   prog.MutableBlock(0)->Var("test2_out");
//   prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16, 1024, 1024});
//
//   auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
//   auto in_to_outs = infer_inplace(*op);
//   EXPECT_EQ(in_to_outs.size(), 1ul);
//   auto it = in_to_outs.begin();
//   EXPECT_EQ(it->first, "test2_out");
//   EXPECT_EQ(it->second, "test2_a");
// }
//
// TEST(InferInplace, MultiOutInplaceInToOut) {
//   ProgramDesc prog;
//   auto* op = prog.MutableBlock(0)->AppendOp();
//   op->SetType("multi_out_op");
//   op->SetInput("X", {"a0", "a1"});
//   op->SetInput("Y", {"b0"});
//   op->SetInput("Z", {"c0", "c1"});
//   op->SetOutput("Out", {"o0"});
//   op->SetOutput("YOut", {"y0"});
//   op->SetOutput("ZOut", {"z0"});
//
//   prog.MutableBlock(0)->Var("a0")->SetType(proto::VarType::LOD_TENSOR);
//   prog.MutableBlock(0)->Var("b0")->SetType(proto::VarType::LOD_TENSOR);
//   prog.MutableBlock(0)->Var("c0")->SetType(proto::VarType::LOD_TENSOR);
//   prog.MutableBlock(0)->Var("c1")->SetType(proto::VarType::LOD_TENSOR);
//   prog.MutableBlock(0)->Var("o0");
//   prog.MutableBlock(0)->Var("y0");
//   prog.MutableBlock(0)->Var("z0");
//   prog.MutableBlock(0)->Var("a0")->SetShape({32, 16, 1024, 1024});
//   prog.MutableBlock(0)->Var("b0")->SetShape({32, 16, 1024, 1024});
//   prog.MutableBlock(0)->Var("c0")->SetShape({32, 16, 1024, 1024});
//   prog.MutableBlock(0)->Var("o0")->SetShape({32, 16, 1024, 1024});
//   prog.MutableBlock(0)->Var("y0")->SetShape({32, 16, 1024, 1024});
//   prog.MutableBlock(0)->Var("z0")->SetShape({32, 16, 1024, 1024});
//
//   auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
//   auto in_to_outs = infer_inplace(*op);
//   EXPECT_EQ(in_to_outs.size(), 3ul);
//   std::unordered_map<std::string, std::string> expects = {
//       {"a0", "o0"}, {"b0", "y0"}, {"c0", "z0"},
//   };
//   EXPECT_TRUE(expects == in_to_outs);
// }
//
// TEST(InferInplace, MultiGradInplaceInToOut) {
//   ProgramDesc prog;
//   auto* op = prog.MutableBlock(0)->AppendOp();
//   op->SetType("multi_out_grad");
//   op->SetInput(GradVarName("Out"), {"o0"});
//   op->SetInput(GradVarName("YOut"), {"y0"});
//   op->SetInput(GradVarName("ZOut"), {"z0"});
//   op->SetOutput(GradVarName("X"), {"a0", "a1"});
//   op->SetOutput(GradVarName("Y"), {"b0"});
//   op->SetOutput(GradVarName("Z"), {"c0", "c1"});
//
//   prog.MutableBlock(0)->Var("a0")->SetType(proto::VarType::LOD_TENSOR);
//   prog.MutableBlock(0)->Var("b0")->SetType(proto::VarType::LOD_TENSOR);
//   prog.MutableBlock(0)->Var("c0")->SetType(proto::VarType::LOD_TENSOR);
//   prog.MutableBlock(0)->Var("c1")->SetType(proto::VarType::LOD_TENSOR);
//   prog.MutableBlock(0)->Var("o0");
//   prog.MutableBlock(0)->Var("y0");
//   prog.MutableBlock(0)->Var("z0");
//   prog.MutableBlock(0)->Var("a0")->SetShape({32, 16, 1024, 1024});
//   prog.MutableBlock(0)->Var("b0")->SetShape({32, 16, 1024, 1024});
//   prog.MutableBlock(0)->Var("c0")->SetShape({32, 16, 1024, 1024});
//   prog.MutableBlock(0)->Var("o0")->SetShape({32, 16, 1024, 1024});
//   prog.MutableBlock(0)->Var("y0")->SetShape({32, 16, 1024, 1024});
//   prog.MutableBlock(0)->Var("z0")->SetShape({32, 16, 1024, 1024});
//
//   auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
//   auto in_to_outs = infer_inplace(*op);
//
//   EXPECT_EQ(in_to_outs.size(), 3ul);
//   std::unordered_map<std::string, std::string> expects = {
//       {"o0", "a0"}, {"y0", "b0"}, {"z0", "c0"},
//   };
//   EXPECT_TRUE(expects == in_to_outs);
// }
} // namespace framework
} // namespace paddle
...@@ -46,9 +46,6 @@ cc_library(fuse_pass_base SRCS fuse_pass_base.cc DEPS pass)
pass_library(graph_to_program_pass base)
pass_library(graph_viz_pass base)
pass_library(lock_free_optimize_pass base)
pass_library(cpu_quantize_placement_pass base)
pass_library(cpu_quantize_pass inference)
pass_library(cpu_quantize_squash_pass inference)
pass_library(fc_fuse_pass inference)
pass_library(attention_lstm_fuse_pass inference)
pass_library(infer_clean_graph_pass inference)
...@@ -71,22 +68,31 @@ pass_library(transpose_flatten_concat_fuse_pass inference)
pass_library(identity_scale_op_clean_pass base)
pass_library(sync_batch_norm_pass base)
pass_library(runtime_context_cache_pass base)
pass_library(simplify_anakin_detection_pattern_pass inference)
pass_library(anakin_fillconstant_elementwisemul_fuse inference)
# There may be many transpose-flatten structures in a model, and the output of
# these structures will be used as inputs to the concat Op. This pattern will
# be detected by our pass. The index here represents the number of structures in the
# pattern. We use index 3 ~ 6, because these quantities of structures are
# common in the models.
foreach (index RANGE 2 6)
file(APPEND ${pass_file} "USE_PASS(transpose_flatten${index}_concat_fuse_pass);\n")
endforeach()
foreach (index RANGE 2 6)
file(APPEND ${pass_file} "USE_PASS(simplify_anakin_detection_pattern_pass${index});\n")
endforeach()
if(WITH_MKLDNN)
pass_library(mkldnn_placement_pass base mkldnn)
pass_library(depthwise_conv_mkldnn_pass base mkldnn)
pass_library(conv_bias_mkldnn_fuse_pass inference mkldnn)
pass_library(conv_relu_mkldnn_fuse_pass inference mkldnn)
pass_library(conv_elementwise_add_mkldnn_fuse_pass inference mkldnn)
pass_library(cpu_quantize_placement_pass base mkldnn)
pass_library(cpu_quantize_pass inference mkldnn)
pass_library(cpu_quantize_squash_pass inference mkldnn)
endif()
cc_library(fuse_elewise_add_act_pass SRCS fuse_elewise_add_act_pass.cc DEPS pass graph_pattern_detector )
...@@ -105,9 +111,6 @@ cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS g
cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto)
cc_test(test_seqpool_concat_fuse_pass SRCS seqpool_concat_fuse_pass_tester.cc DEPS seqpool_concat_fuse_pass framework_proto)
cc_test(test_is_test_pass SRCS is_test_pass_tester.cc DEPS is_test_pass)
cc_test(test_cpu_quantize_placement_pass SRCS cpu_quantize_placement_pass_tester.cc DEPS cpu_quantize_placement_pass)
cc_test(test_cpu_quantize_pass SRCS cpu_quantize_pass_tester.cc DEPS cpu_quantize_pass naive_executor)
cc_test(test_cpu_quantize_squash_pass SRCS cpu_quantize_squash_pass_tester.cc DEPS cpu_quantize_squash_pass naive_executor)
if(NOT WIN32)
cc_test(test_sync_batch_norm_pass SRCS sync_batch_norm_pass_tester.cc DEPS sync_batch_norm_pass)
endif()
...@@ -117,4 +120,7 @@ if (WITH_MKLDNN)
cc_test(test_conv_relu_mkldnn_fuse_pass SRCS mkldnn/conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass)
cc_test(test_conv_elementwise_add_mkldnn_fuse_pass SRCS mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc DEPS conv_elementwise_add_mkldnn_fuse_pass)
cc_test(test_mkldnn_placement_pass SRCS mkldnn/mkldnn_placement_pass_tester.cc DEPS mkldnn_placement_pass)
cc_test(test_cpu_quantize_placement_pass SRCS mkldnn/cpu_quantize_placement_pass_tester.cc DEPS cpu_quantize_placement_pass)
cc_test(test_cpu_quantize_pass SRCS mkldnn/cpu_quantize_pass_tester.cc DEPS cpu_quantize_pass naive_executor)
cc_test(test_cpu_quantize_squash_pass SRCS mkldnn/cpu_quantize_squash_pass_tester.cc DEPS cpu_quantize_squash_pass naive_executor)
endif ()
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <memory>
#include <string>
#include "paddle/fluid/framework/ir/anakin_fillconstant_elementwisemul_fuse.h"
#include "paddle/fluid/framework/ir/graph_viz_pass.h"
namespace paddle {
namespace framework {
namespace ir {
#define GET_IR_NODE(node__) GET_IR_NODE_FROM_SUBGRAPH(node__, node__, pattern);
#define GET_NODES \
GET_IR_NODE(fill_constant); \
GET_IR_NODE(fill_constant_out); \
GET_IR_NODE(elementwise_mul); \
GET_IR_NODE(elementwise_mul_out);
std::unique_ptr<ir::Graph> AnakinFillconstantElementwisemulFuse::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
const std::string pattern_name = "anakin_fillconstant_elementwisemul_fuse";
FusePassBase::Init(pattern_name, graph.get());
GraphPatternDetector gpd;
auto* x = gpd.mutable_pattern()
->NewNode("x")
->assert_is_op_input("elementwise_mul", "X")
->AsInput();
patterns::AnakinFillConstantElementWiseMulFuse pattern(gpd.mutable_pattern(),
pattern_name);
pattern(x);
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
GET_NODES;
PADDLE_ENFORCE(subgraph.count(x));
auto* elementwise_in = subgraph.at(x);
float constant_value =
boost::get<float>(fill_constant->Op()->GetAttr("value"));
framework::OpDesc new_op_desc;
new_op_desc.SetType("scale");
new_op_desc.SetInput("X", {elementwise_in->Name()});
new_op_desc.SetAttr("scale", constant_value);
new_op_desc.SetAttr("bias", static_cast<float>(0.0));
new_op_desc.SetAttr("bias_after_scale", true);
new_op_desc.SetOutput("Out", {elementwise_mul_out->Name()});
new_op_desc.Flush();
// Create a new node for the fused op.
auto* scale_op = graph->CreateOpNode(&new_op_desc);
IR_NODE_LINK_TO(elementwise_in, scale_op); // Input
IR_NODE_LINK_TO(scale_op, elementwise_mul_out); // Output
// Delete the unneeded nodes.
GraphSafeRemoveNodes(graph.get(),
{fill_constant, fill_constant_out, elementwise_mul});
};
gpd(graph.get(), handler);
return graph;
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(anakin_fillconstant_elementwisemul_fuse,
paddle::framework::ir::AnakinFillconstantElementwisemulFuse);
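As a reading aid (standard algebra, not text from the patch): the fuse above is value-preserving because multiplying an input elementwise by a tensor filled with a single constant c equals a scale op with scale = c and bias = 0,

y = x \odot \mathrm{fill\_constant}(c) = c \cdot x + 0.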
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
namespace paddle {
namespace framework {
namespace ir {
class AnakinFillconstantElementwisemulFuse : public FusePassBase {
public:
virtual ~AnakinFillconstantElementwisemulFuse() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
};
} // namespace ir
} // namespace framework
} // namespace paddle
...@@ -1470,6 +1470,171 @@ PDNode *patterns::TransposeFlattenConcat::operator()(
return concat_out;
}
PDNode *patterns::AnakinDetectionPattern::operator()(
std::vector<PDNode *> conv_in, int times) {
// `times` is the number of repetitions of the per-branch subgraph
// {prior_box, box_out, reshape1, reshape1_out, box_var_out, reshape2,
//  reshape2_out}; each repetition contributes kNumFields nodes below.
const int kNumFields = 7;
const int kPriorBoxLocOffset = 1;
const int kReshape1Offset = 2;
const int kReshape1OutOffset = 3;
const int kPriorBoxVarOffset = 4;
const int kReshape2Offset = 5;
const int kReshape2OutOffset = 6;
const int kBoxCoderThirdInputOffset = times;
const int kMultiClassSecondInputNmsOffset = times + 1;
std::vector<PDNode *> nodes;
for (int i = 0; i < times; i++) {
nodes.push_back(
pattern->NewNode(GetNodeName("prior_box" + std::to_string(i)))
->assert_is_op("density_prior_box"));
nodes.push_back(pattern->NewNode(GetNodeName("box_out" + std::to_string(i)))
->assert_is_op_output("density_prior_box", "Boxes")
->assert_is_op_input("reshape2", "X")
->AsIntermediate());
nodes.push_back(
pattern->NewNode(GetNodeName("reshape1" + std::to_string(i)))
->assert_is_op("reshape2"));
nodes.push_back(
pattern->NewNode(GetNodeName("reshape1_out" + std::to_string(i)))
->assert_is_op_output("reshape2")
->assert_is_op_nth_input("concat", "X", i)
->AsIntermediate());
nodes.push_back(
pattern->NewNode(GetNodeName("box_var_out" + std::to_string(i)))
->assert_is_op_output("density_prior_box", "Variances")
->assert_is_op_input("reshape2", "X")
->AsIntermediate());
nodes.push_back(
pattern->NewNode(GetNodeName("reshape2" + std::to_string(i)))
->assert_is_op("reshape2"));
nodes.push_back(
pattern->NewNode(GetNodeName("reshape2_out" + std::to_string(i)))
->assert_is_op_output("reshape2")
->assert_is_op_nth_input("concat", "X", i)
->AsIntermediate());
}
auto concat_op1 = pattern->NewNode(GetNodeName("concat1"))
->assert_is_op("concat")
->assert_op_has_n_inputs("concat", times);
auto concat_out1 = pattern->NewNode(GetNodeName("concat1_out"))
->assert_is_op_output("concat")
->AsIntermediate();
auto concat_op2 = pattern->NewNode(GetNodeName("concat2"))
->assert_is_op("concat")
->assert_op_has_n_inputs("concat", times);
auto concat_out2 = pattern->NewNode(GetNodeName("concat2_out"))
->assert_is_op_output("concat")
->AsIntermediate();
auto box_coder_op = pattern->NewNode(GetNodeName("box_coder"))
->assert_is_op("box_coder")
->assert_op_has_n_inputs("box_coder", 3);
auto box_coder_out = pattern->NewNode(GetNodeName("box_coder_out"))
->assert_is_op_output("box_coder")
->AsIntermediate();
auto transpose_before_nms =
pattern->NewNode(GetNodeName("transpose_before_nms"))
->assert_is_op("transpose2");
auto transpose_before_nms_out =
pattern->NewNode(GetNodeName("transpose_before_nms_out"))
->assert_is_op_output("transpose2")
->assert_is_op_input("multiclass_nms", "Scores")
->AsIntermediate();
auto multiclass_nms_op = pattern->NewNode(GetNodeName("multiclass_nms"))
->assert_is_op("multiclass_nms")
->assert_op_has_n_inputs("multiclass_nms", 2);
auto multiclass_nms_out = pattern->NewNode(GetNodeName("multiclass_nms_out"))
->assert_is_op_output("multiclass_nms")
->AsOutput();
std::vector<PDNode *> reshape1_outs;
std::vector<PDNode *> reshape2_outs;
for (int i = 0; i < times; i++) {
conv_in[i]->AsInput();
// prior_box
nodes[i * kNumFields]->LinksFrom({conv_in[i]});
// prior_box box out
nodes[i * kNumFields + kPriorBoxLocOffset]->LinksFrom(
{nodes[i * kNumFields]});
// reshape
nodes[i * kNumFields + kReshape1Offset]->LinksFrom(
{nodes[i * kNumFields + kPriorBoxLocOffset]});
// reshape_out
nodes[i * kNumFields + kReshape1OutOffset]->LinksFrom(
{nodes[i * kNumFields + kReshape1Offset]});
nodes[i * kNumFields + kPriorBoxVarOffset]->LinksFrom(
{nodes[i * kNumFields]});
// reshape
nodes[i * kNumFields + kReshape2Offset]->LinksFrom(
{nodes[i * kNumFields + kPriorBoxVarOffset]});
// reshape_out
nodes[i * kNumFields + kReshape2OutOffset]->LinksFrom(
{nodes[i * kNumFields + kReshape2Offset]});
reshape1_outs.push_back(nodes[i * kNumFields + kReshape1OutOffset]);
reshape2_outs.push_back(nodes[i * kNumFields + kReshape2OutOffset]);
}
concat_op1->LinksFrom(reshape1_outs);
concat_op2->LinksFrom(reshape2_outs);
concat_out1->LinksFrom({concat_op1});
concat_out2->LinksFrom({concat_op2});
conv_in[kBoxCoderThirdInputOffset]->AsInput();
conv_in[kMultiClassSecondInputNmsOffset]->AsInput();
box_coder_op->LinksFrom(
{concat_out1, concat_out2, conv_in[kBoxCoderThirdInputOffset]});
box_coder_out->LinksFrom({box_coder_op});
transpose_before_nms->LinksFrom({conv_in[kMultiClassSecondInputNmsOffset]});
transpose_before_nms_out->LinksFrom({transpose_before_nms});
multiclass_nms_op->LinksFrom({box_coder_out, transpose_before_nms_out})
.LinksTo({multiclass_nms_out});
return multiclass_nms_out;
}
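// Summary of the subgraph matched above (descriptive only): for each
// i in [0, times), conv_in[i] feeds a density_prior_box whose Boxes and
// Variances outputs each pass through a reshape2 and feed concat1 / concat2
// respectively; concat1, concat2 and conv_in[times] feed box_coder;
// conv_in[times + 1] passes through transpose2 into multiclass_nms (Scores)
// together with the box_coder output, and multiclass_nms_out is returned as
// the pattern's output node.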
PDNode *patterns::AnakinFillConstantElementWiseMulFuse::operator()(
PDNode *elementwise_op_input) {
auto fill_constant =
pattern->NewNode(fill_constant_repr())->assert_is_op("fill_constant");
auto fill_constant_out = pattern->NewNode(fill_constant_out_repr())
->assert_is_op_output("fill_constant")
->assert_is_op_input("elementwise_mul", "Y")
->AsIntermediate();
auto elementwise_mul_op =
pattern->NewNode(elementwise_mul_repr())->assert_is_op("elementwise_mul");
auto elementwise_mul_out = pattern->NewNode(elementwise_mul_out_repr())
->assert_is_op_output("elementwise_mul")
->AsOutput();
fill_constant_out->LinksFrom({fill_constant});
elementwise_mul_op->LinksFrom({elementwise_op_input, fill_constant_out});
elementwise_mul_out->LinksFrom({elementwise_mul_op});
return elementwise_mul_out;
}
} // namespace ir } // namespace ir
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -844,6 +844,36 @@ struct TransposeFlattenConcat : public PatternBase { ...@@ -844,6 +844,36 @@ struct TransposeFlattenConcat : public PatternBase {
} }
}; };
struct AnakinDetectionPattern : public PatternBase {
AnakinDetectionPattern(PDPattern* pattern, const std::string& name_scope)
: PatternBase(pattern, name_scope, "anakin_detect_pattern") {}
PDNode* operator()(std::vector<PDNode*> conv_inputs, int times);
std::string GetNodeName(const std::string& op_type) {
return PDNodeName(name_scope_, repr_, id_, op_type);
}
PDNode* GetPDNode(const std::string& op_type) {
return pattern->RetrieveNode(GetNodeName(op_type));
}
};
struct AnakinFillConstantElementWiseMulFuse : public PatternBase {
AnakinFillConstantElementWiseMulFuse(PDPattern* pattern,
const std::string& name_scope)
: PatternBase(pattern, name_scope,
"anakin_fillconstant_elementwisemul_fuse") {}
PDNode* operator()(PDNode* elementwise_op_input);
// declare operator node's name
PATTERN_DECL_NODE(fill_constant);
PATTERN_DECL_NODE(fill_constant_out);
PATTERN_DECL_NODE(elementwise_mul);
PATTERN_DECL_NODE(elementwise_mul_out);
};
} // namespace patterns } // namespace patterns
// Link two ir::Nodes from each other. // Link two ir::Nodes from each other.
......
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/ir/cpu_quantize_pass.h" #include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h"
#include <utility> #include <utility>
#include <vector> #include <vector>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
......
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/ir/cpu_quantize_pass.h" #include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h"
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include "paddle/fluid/framework/naive_executor.h" #include "paddle/fluid/framework/naive_executor.h"
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
......
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/framework/ir/cpu_quantize_placement_pass.h" #include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.h"
#include <string> #include <string>
#include <unordered_set> #include <unordered_set>
......
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/ir/cpu_quantize_placement_pass.h" #include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.h"
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include <boost/logic/tribool.hpp> #include <boost/logic/tribool.hpp>
......
...@@ -13,7 +13,7 @@ ...@@ -13,7 +13,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/ir/cpu_quantize_squash_pass.h" #include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h"
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
......
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/ir/cpu_quantize_squash_pass.h" #include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h"
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include "paddle/fluid/framework/naive_executor.h" #include "paddle/fluid/framework/naive_executor.h"
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <string>
#include <vector>
#include "paddle/fluid/framework/ir/graph_viz_pass.h"
#include "paddle/fluid/framework/ir/node.h"
#include "paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.h"
namespace paddle {
namespace framework {
namespace ir {
template <int times>
std::unique_ptr<ir::Graph> SimplifyAnakinDetectionPatternPass<times>::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
const std::string pattern_name =
"simplify_anakin_detection_pattern_pass" + std::to_string(times);
FusePassBase::Init(pattern_name, graph.get());
GraphPatternDetector gpd;
std::vector<PDNode *> input_nodes;
for (int i = 0; i < times; i++) {
input_nodes.push_back(gpd.mutable_pattern()
->NewNode("x" + std::to_string(i))
->assert_is_op_input("density_prior_box", "Input")
->AsInput());
}
input_nodes.push_back(gpd.mutable_pattern()
->NewNode("x" + std::to_string(times))
->assert_is_op_input("box_coder", "TargetBox")
->AsInput());
input_nodes.push_back(gpd.mutable_pattern()
->NewNode("x" + std::to_string(times + 1))
->assert_is_op_input("transpose2")
->AsInput());
patterns::AnakinDetectionPattern pattern(gpd.mutable_pattern(), pattern_name);
pattern(input_nodes, times);
auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph,
Graph *g) {
const int kNumFields = 7;
const int kPriorBoxLocOffset = 1;
const int kReshape1Offset = 2;
const int kReshape1OutOffset = 3;
const int kPriorBoxVarOffset = 4;
const int kReshape2Offset = 5;
const int kReshape2OutOffset = 6;
std::vector<Node *> nodes;
for (int i = 0; i < times; i++) {
PADDLE_ENFORCE(
subgraph.at(pattern.GetPDNode("prior_box" + std::to_string(i))));
PADDLE_ENFORCE(
subgraph.at(pattern.GetPDNode("box_out" + std::to_string(i))));
PADDLE_ENFORCE(
subgraph.at(pattern.GetPDNode("reshape1" + std::to_string(i))));
PADDLE_ENFORCE(
subgraph.at(pattern.GetPDNode("reshape1_out" + std::to_string(i))));
PADDLE_ENFORCE(
subgraph.at(pattern.GetPDNode("reshape2" + std::to_string(i))));
PADDLE_ENFORCE(
subgraph.at(pattern.GetPDNode("reshape2_out" + std::to_string(i))));
PADDLE_ENFORCE(
subgraph.at(pattern.GetPDNode("box_var_out" + std::to_string(i))));
nodes.push_back(
subgraph.at(pattern.GetPDNode("prior_box" + std::to_string(i))));
nodes.push_back(
subgraph.at(pattern.GetPDNode("box_out" + std::to_string(i))));
nodes.push_back(
subgraph.at(pattern.GetPDNode("reshape1" + std::to_string(i))));
nodes.push_back(
subgraph.at(pattern.GetPDNode("reshape1_out" + std::to_string(i))));
nodes.push_back(
subgraph.at(pattern.GetPDNode("box_var_out" + std::to_string(i))));
nodes.push_back(
subgraph.at(pattern.GetPDNode("reshape2" + std::to_string(i))));
nodes.push_back(
subgraph.at(pattern.GetPDNode("reshape2_out" + std::to_string(i))));
}
Node *concat_op1 = subgraph.at(pattern.GetPDNode("concat1"));
Node *concat_out1 = subgraph.at(pattern.GetPDNode("concat1_out"));
Node *concat_op2 = subgraph.at(pattern.GetPDNode("concat2"));
Node *concat_out2 = subgraph.at(pattern.GetPDNode("concat2_out"));
Node *box_coder_third_input = subgraph.at(input_nodes[times]);
Node *box_coder_op = subgraph.at(pattern.GetPDNode("box_coder"));
Node *box_coder_out = subgraph.at(pattern.GetPDNode("box_coder_out"));
Node *multiclass_nms_second_input = subgraph.at(input_nodes[times + 1]);
Node *transpose_before_nms =
subgraph.at(pattern.GetPDNode("transpose_before_nms"));
Node *transpose_before_nms_out =
subgraph.at(pattern.GetPDNode("transpose_before_nms_out"));
Node *multiclass_nms = subgraph.at(pattern.GetPDNode("multiclass_nms"));
Node *multiclass_nms_out =
subgraph.at(pattern.GetPDNode("multiclass_nms_out"));
std::string code_type =
boost::get<std::string>(box_coder_op->Op()->GetAttr("code_type"));
bool box_normalized =
boost::get<bool>(box_coder_op->Op()->GetAttr("box_normalized"));
// auto variance =
// boost::get<std::vector<float>>(box_coder_op->Op()->GetAttr("variance"));
int background_label =
boost::get<int>(multiclass_nms->Op()->GetAttr("background_label"));
float score_threshold =
boost::get<float>(multiclass_nms->Op()->GetAttr("score_threshold"));
int nms_top_k = boost::get<int>(multiclass_nms->Op()->GetAttr("nms_top_k"));
float nms_threshold =
boost::get<float>(multiclass_nms->Op()->GetAttr("nms_threshold"));
float nms_eta = boost::get<float>(multiclass_nms->Op()->GetAttr("nms_eta"));
int keep_top_k =
boost::get<int>(multiclass_nms->Op()->GetAttr("keep_top_k"));
std::vector<std::string> concat1_input_names;
for (int i = 0; i < times; i++) {
concat1_input_names.push_back(
nodes[i * kNumFields + kPriorBoxLocOffset]->Name());
}
// int axis = boost::get<int>(concat_op1->Op()->GetAttr("axis"));
framework::OpDesc concat1_desc;
concat1_desc.SetType("concat");
concat1_desc.SetInput("X", concat1_input_names);
concat1_desc.SetAttr("axis", 2);
concat1_desc.SetOutput("Out", {concat_out1->Name()});
auto *new_add_concat_op = graph->CreateOpNode(&concat1_desc);
for (int i = 0; i < times; i++) {
nodes[i * kNumFields + kPriorBoxLocOffset]->outputs.push_back(
new_add_concat_op);
new_add_concat_op->inputs.push_back(
nodes[i * kNumFields + kPriorBoxLocOffset]);
}
framework::OpDesc new_op_desc;
new_op_desc.SetType("detection_out");
new_op_desc.SetInput("PriorBox", {concat_out1->Name()});
new_op_desc.SetInput("TargetBox", {box_coder_third_input->Name()});
new_op_desc.SetInput("Scores", {multiclass_nms_second_input->Name()});
new_op_desc.SetAttr("code_type", code_type);
new_op_desc.SetAttr("box_normalized", box_normalized);
new_op_desc.SetAttr("background_label", background_label);
new_op_desc.SetAttr("score_threshold", score_threshold);
new_op_desc.SetAttr("nms_top_k", nms_top_k);
new_op_desc.SetAttr("nms_threshold", nms_threshold);
new_op_desc.SetAttr("nms_eta", nms_eta);
new_op_desc.SetAttr("keep_top_k", keep_top_k);
new_op_desc.SetOutput("Out", {multiclass_nms_out->Name()});
new_op_desc.Flush();
// Create a new node for the fused op.
auto *detection_out_op = graph->CreateOpNode(&new_op_desc);
std::unordered_set<const Node *> delete_nodes;
for (int i = 0; i < times; i++) {
nodes[i * kNumFields + kPriorBoxLocOffset]->outputs.push_back(concat_op1);
delete_nodes.insert(nodes[i * kNumFields + kReshape1Offset]);
delete_nodes.insert(nodes[i * kNumFields + kReshape1OutOffset]);
delete_nodes.insert(nodes[i * kNumFields + kPriorBoxVarOffset]);
delete_nodes.insert(nodes[i * kNumFields + kReshape2Offset]);
delete_nodes.insert(nodes[i * kNumFields + kReshape2OutOffset]);
}
delete_nodes.insert(concat_op1);
delete_nodes.insert(concat_op2);
delete_nodes.insert(concat_out2);
delete_nodes.insert(box_coder_op);
delete_nodes.insert(box_coder_out);
delete_nodes.insert(transpose_before_nms);
delete_nodes.insert(transpose_before_nms_out);
delete_nodes.insert(multiclass_nms);
new_add_concat_op->outputs.push_back(concat_out1);
concat_out1->inputs.push_back(new_add_concat_op);
detection_out_op->inputs.push_back(concat_out1);
detection_out_op->inputs.push_back(box_coder_third_input);
detection_out_op->inputs.push_back(multiclass_nms_second_input);
detection_out_op->outputs.push_back(multiclass_nms_out);
concat_out1->outputs.push_back(detection_out_op);
box_coder_third_input->outputs.push_back(detection_out_op);
multiclass_nms_second_input->outputs.push_back(detection_out_op);
multiclass_nms_out->inputs.push_back(detection_out_op);
// Delete the unneeded nodes.
GraphSafeRemoveNodes(graph.get(), delete_nodes);
};
gpd(graph.get(), handler);
return graph;
}
template class SimplifyAnakinDetectionPatternPass<1>;
template class SimplifyAnakinDetectionPatternPass<2>;
template class SimplifyAnakinDetectionPatternPass<3>;
template class SimplifyAnakinDetectionPatternPass<4>;
template class SimplifyAnakinDetectionPatternPass<5>;
template class SimplifyAnakinDetectionPatternPass<6>;
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(simplify_anakin_detection_pattern_pass,
paddle::framework::ir::SimplifyAnakinDetectionPatternPass<1>);
REGISTER_PASS(simplify_anakin_detection_pattern_pass2,
paddle::framework::ir::SimplifyAnakinDetectionPatternPass<2>);
REGISTER_PASS(simplify_anakin_detection_pattern_pass3,
paddle::framework::ir::SimplifyAnakinDetectionPatternPass<3>);
REGISTER_PASS(simplify_anakin_detection_pattern_pass4,
paddle::framework::ir::SimplifyAnakinDetectionPatternPass<4>);
REGISTER_PASS(simplify_anakin_detection_pattern_pass5,
paddle::framework::ir::SimplifyAnakinDetectionPatternPass<5>);
REGISTER_PASS(simplify_anakin_detection_pattern_pass6,
paddle::framework::ir::SimplifyAnakinDetectionPatternPass<6>);
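// Hedged usage note (not part of this commit): because the repeat count is a
// template parameter, each supported count is registered under its own pass
// name (simplify_anakin_detection_pattern_pass for times == 1, then
// ..._pass2 ... ..._pass6). A caller is expected to pick the name matching
// its model, e.g. (assuming the PassRegistry / Pass::Apply signatures that
// the ApplyImpl above is written against):
//
//   std::string name = "simplify_anakin_detection_pattern_pass";
//   if (times > 1) name += std::to_string(times);
//   auto pass = framework::ir::PassRegistry::Instance().Get(name);
//   graph = pass->Apply(std::move(graph));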
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <unordered_set>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
namespace paddle {
namespace framework {
namespace ir {
// A detection model may contain several branches of
// {density_prior_box -> reshape2 -> concat} followed by box_coder,
// transpose2 and multiclass_nms. This pass matches that structure and fuses
// it into a single detection_out op. The template parameter `times` is the
// repeat count of the per-branch structure.
template <int times>
class SimplifyAnakinDetectionPatternPass : public FusePassBase {
public:
virtual ~SimplifyAnakinDetectionPatternPass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
};
} // namespace ir
} // namespace framework
} // namespace paddle
...@@ -12,7 +12,9 @@ ...@@ -12,7 +12,9 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include <memory>
#include <string> #include <string>
#include <unordered_set>
#include <vector> #include <vector>
#include "paddle/fluid/framework/ir/graph_viz_pass.h" #include "paddle/fluid/framework/ir/graph_viz_pass.h"
...@@ -123,6 +125,7 @@ std::unique_ptr<ir::Graph> TransposeFlattenConcatFusePass<times>::ApplyImpl( ...@@ -123,6 +125,7 @@ std::unique_ptr<ir::Graph> TransposeFlattenConcatFusePass<times>::ApplyImpl(
} }
template class TransposeFlattenConcatFusePass<1>; template class TransposeFlattenConcatFusePass<1>;
template class TransposeFlattenConcatFusePass<2>;
template class TransposeFlattenConcatFusePass<3>; template class TransposeFlattenConcatFusePass<3>;
template class TransposeFlattenConcatFusePass<4>; template class TransposeFlattenConcatFusePass<4>;
template class TransposeFlattenConcatFusePass<5>; template class TransposeFlattenConcatFusePass<5>;
...@@ -135,6 +138,9 @@ template class TransposeFlattenConcatFusePass<6>; ...@@ -135,6 +138,9 @@ template class TransposeFlattenConcatFusePass<6>;
REGISTER_PASS(transpose_flatten_concat_fuse_pass, REGISTER_PASS(transpose_flatten_concat_fuse_pass,
paddle::framework::ir::TransposeFlattenConcatFusePass<1>); paddle::framework::ir::TransposeFlattenConcatFusePass<1>);
REGISTER_PASS(transpose_flatten2_concat_fuse_pass,
paddle::framework::ir::TransposeFlattenConcatFusePass<2>);
REGISTER_PASS(transpose_flatten3_concat_fuse_pass, REGISTER_PASS(transpose_flatten3_concat_fuse_pass,
paddle::framework::ir::TransposeFlattenConcatFusePass<3>); paddle::framework::ir::TransposeFlattenConcatFusePass<3>);
......
...@@ -65,9 +65,9 @@ static DDim GetDims(const Scope& scope, const std::string& name, ...@@ -65,9 +65,9 @@ static DDim GetDims(const Scope& scope, const std::string& name,
if (var->IsType<LoDTensor>()) { if (var->IsType<LoDTensor>()) {
const LoDTensor& tensor = var->Get<LoDTensor>(); const LoDTensor& tensor = var->Get<LoDTensor>();
if (UNLIKELY(!tensor.IsInitialized())) { // if (UNLIKELY(!tensor.IsInitialized())) {
return DDim({-1}); // return DDim({-1});
} // }
return tensor.dims(); return tensor.dims();
} else if (var->IsType<SelectedRows>()) { } else if (var->IsType<SelectedRows>()) {
if (get_actual_dim) { if (get_actual_dim) {
...@@ -133,9 +133,9 @@ static LoD GetLoD(const Scope& scope, const std::string& name) { ...@@ -133,9 +133,9 @@ static LoD GetLoD(const Scope& scope, const std::string& name) {
if (var->IsType<LoDTensor>()) { if (var->IsType<LoDTensor>()) {
const LoDTensor& tensor = var->Get<LoDTensor>(); const LoDTensor& tensor = var->Get<LoDTensor>();
if (UNLIKELY(!tensor.IsInitialized())) { // if (UNLIKELY(!tensor.IsInitialized())) {
return default_lod; // return default_lod;
} // }
return tensor.lod(); return tensor.lod();
} else { } else {
return default_lod; return default_lod;
......
...@@ -60,7 +60,7 @@ using InferVarTypeFN = ...@@ -60,7 +60,7 @@ using InferVarTypeFN =
using InferShapeFN = std::function<void(InferShapeContext*)>; using InferShapeFN = std::function<void(InferShapeContext*)>;
using InplacePair = std::unordered_map<std::string, std::string>; using InplacePair = std::unordered_map<std::string, std::string>;
using InferInplaceOpFN = std::function<InplacePair(const OpDesc&, BlockDesc*)>; using InferInplaceOpFN = std::function<InplacePair(const OpDesc&)>;
using InferNoNeedBufferVarsFN = std::function<std::unordered_set<std::string>( using InferNoNeedBufferVarsFN = std::function<std::unordered_set<std::string>(
const VariableNameMap& /*inputs*/, const VariableNameMap& /*outputs*/, const VariableNameMap& /*inputs*/, const VariableNameMap& /*outputs*/,
......
...@@ -315,6 +315,9 @@ std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() { ...@@ -315,6 +315,9 @@ std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() {
for (size_t i = 0; i < outputs.size(); ++i) { for (size_t i = 0; i < outputs.size(); ++i) {
framework::Variable* grad = outputs[i]->var_; framework::Variable* grad = outputs[i]->var_;
framework::Variable* orig_grad = origin_outputs[i]->var_; framework::Variable* orig_grad = origin_outputs[i]->var_;
VLOG(3) << "AddTo Called with orig_grad is: "
<< origin_outputs[i]->name_ << " Grad to be added is "
<< outputs[i]->name_;
AddTo(grad, orig_grad, place_); AddTo(grad, orig_grad, place_);
delete grad; delete grad;
} }
......
...@@ -277,6 +277,7 @@ std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, ...@@ -277,6 +277,7 @@ std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
VarBase* var = current_vars_map[var_it->second]; VarBase* var = current_vars_map[var_it->second];
InitGrad(var, prepared_op.GetDeviceContext()); InitGrad(var, prepared_op.GetDeviceContext());
grad_out_vars.push_back(var->grads_); grad_out_vars.push_back(var->grads_);
VLOG(3) << "grads output var name: " << var->name_;
} }
} }
} }
......
...@@ -16,7 +16,10 @@ add_subdirectory(utils) ...@@ -16,7 +16,10 @@ add_subdirectory(utils)
if (TENSORRT_FOUND) if (TENSORRT_FOUND)
add_subdirectory(tensorrt) add_subdirectory(tensorrt)
endif() endif()
# add_subdirectory(anakin)
if (ANAKIN_FOUND)
add_subdirectory(anakin)
endif()
get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES) get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
get_property(cuda_modules GLOBAL PROPERTY CUDA_MODULES) get_property(cuda_modules GLOBAL PROPERTY CUDA_MODULES)
......
cc_library(anakin_engine SRCS engine.cc) cc_library(anakin_engine SRCS engine.cc DEPS framework_proto)
cc_library(anakin_op_teller SRCS op_teller.cc DEPS framework_proto)
target_link_libraries(anakin_engine anakin anakin_saber_common) target_link_libraries(anakin_engine anakin anakin_saber_common)
cc_test(test_anakin_engine SRCS test_anakin_engine.cc DEPS anakin_engine) cc_test(test_anakin_engine SRCS test_anakin_engine.cc DEPS anakin_engine)
add_subdirectory(convert) add_subdirectory(convert)
cc_library(anakin_op_converter SRCS fc.cc registrar.cc DEPS anakin_engine framework_proto scope) cc_library(anakin_op_converter SRCS fc.cc conv2d.cc conv2d_fusion.cc
cc_test(test_anakin_fc SRCS test_fc_op.cc DEPS anakin_op_converter mul_op) elementwise.cc activation.cc pool2d.cc concat.cc split.cc relu.cc softmax.cc batch_norm.cc reshape.cc flatten.cc transpose.cc density_prior_box.cc detection_out.cc scale.cc dropout.cc im2sequence.cc sum.cc DEPS anakin_engine framework_proto scope op_registry)
cc_test(test_anakin_fc SRCS test_fc_op.cc DEPS anakin_op_converter mul_op SERIAL)
cc_test(test_anakin_conv2d SRCS test_conv2d_op.cc DEPS anakin_op_converter conv_op im2col vol2col depthwise_conv SERIAL)
cc_test(test_anakin_activation SRCS test_activation_op.cc DEPS activation_op anakin_op_converter SERIAL)
cc_test(test_anakin_pool2d SRCS test_pool2d_op.cc DEPS anakin_op_converter pool_op pooling SERIAL)
cc_test(test_anakin_concat SRCS test_concat_op.cc DEPS anakin_op_converter concat_op concat_and_split SERIAL)
cc_test(test_anakin_split SRCS test_split_op.cc DEPS anakin_op_converter split_op concat_and_split SERIAL)
cc_test(test_anakin_elementwise SRCS test_elementwise_op.cc DEPS anakin_op_converter elementwise_add_op elementwise_mul_op SERIAL)
cc_test(test_anakin_relu SRCS test_relu_op.cc DEPS activation_op anakin_op_converter SERIAL)
cc_test(test_anakin_softmax SRCS test_softmax_op.cc DEPS anakin_op_converter softmax_op softmax SERIAL)
cc_test(test_anakin_reshape SRCS test_reshape_op.cc DEPS anakin_op_converter reshape_op SERIAL)
cc_test(test_anakin_flatten SRCS test_flatten_op.cc DEPS anakin_op_converter flatten_op reshape_op SERIAL)
cc_test(test_anakin_transpose SRCS test_transpose_op.cc DEPS anakin_op_converter transpose_op SERIAL)
cc_test(test_anakin_batch_norm SRCS test_batch_norm_op.cc DEPS anakin_op_converter batch_norm_op SERIAL)
cc_test(test_anakin_dropout SRCS test_dropout_op.cc DEPS anakin_op_converter dropout_op SERIAL)
#cc_test(test_anakin_im2sequence SRCS test_im2sequence_op.cc DEPS anakin_op_converter im2sequence_op im2col)
cc_test(test_anakin_sum SRCS test_sum_op.cc DEPS anakin_op_converter sum_op selected_rows_functor SERIAL)
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/anakin/convert/activation.h"
#include <algorithm>
#include <map>
using anakin::graph::GraphGlobalMem;
using anakin::AK_FLOAT;
using anakin::saber::NV;
using anakin::saber::Shape;
namespace paddle {
namespace inference {
namespace anakin {
ActivationOpConverter::ActivationOpConverter(const std::string &op_type)
: op_type_(op_type) {
auto it = anakin_op_types_.find(op_type_);
PADDLE_ENFORCE(it != anakin_op_types_.end(),
"activation op type is not support");
anakin_op_type_ = it->second;
}
void ActivationOpConverter::operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope,
bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
auto input_name = op_desc.Input("X").front();
auto output_name = op_desc.Output("Out").front();
engine_->AddOp(op_name, "Activation", {input_name}, {output_name});
engine_->AddOpAttr(op_name, "type", anakin_op_type_);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
REGISTER_ANAKIN_OP_CONVERTER(sigmoid, SigmoidOpConverter);
REGISTER_ANAKIN_OP_CONVERTER(tanh, TanhOpConverter);
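// Standalone illustration (hypothetical, mirrors the constructor above): only
// op types present in anakin_op_types_ ("tanh" -> "TanH", "sigmoid" ->
// "Sigmoid") can be converted; any other type fails the PADDLE_ENFORCE.
#include <iostream>
#include <map>
#include <string>
int main() {
  const std::map<std::string, std::string> anakin_op_types{
      {"tanh", "TanH"}, {"sigmoid", "Sigmoid"}};
  for (const std::string op : {"tanh", "sigmoid", "relu"}) {
    auto it = anakin_op_types.find(op);
    if (it == anakin_op_types.end()) {
      std::cout << op << ": activation op type is not supported\n";
    } else {
      std::cout << op << " -> " << it->second << "\n";
    }
  }
  return 0;
}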
...@@ -14,45 +14,39 @@ ...@@ -14,45 +14,39 @@
#pragma once #pragma once
#include <functional>
#include <map> #include <map>
#include <memory>
#include <string> #include <string>
#include <utility> #include "paddle/fluid/inference/anakin/convert/op_converter.h"
namespace paddle { namespace paddle {
namespace inference { namespace inference {
namespace anakin { namespace anakin {
class AnakinOpConverter; class ActivationOpConverter : public AnakinOpConverter {
class OpRegister {
public: public:
OpRegister() = default; explicit ActivationOpConverter(const std::string &op_type);
std::shared_ptr<AnakinOpConverter> Get(const std::string &name);
static OpRegister *instance(); virtual void operator()(const framework::proto::OpDesc &op,
void OpRegisterFn(const std::string &name, const framework::Scope &scope,
std::function<std::shared_ptr<AnakinOpConverter>()> fn) { bool test_mode) override;
registry_[name] = fn; virtual ~ActivationOpConverter() {}
}
private: private:
using RegisterFnType = std::function<std::shared_ptr<AnakinOpConverter>()>; std::string op_type_;
std::map<std::string, std::function<std::shared_ptr<AnakinOpConverter>()>> std::string anakin_op_type_;
registry_; std::map<std::string, std::string> anakin_op_types_{{"tanh", "TanH"},
{"sigmoid", "Sigmoid"}};
}; };
template <typename T, typename... Args> class TanhOpConverter : public ActivationOpConverter {
class Registrar {
public: public:
Registrar(const std::string &name, Args... args) { TanhOpConverter() : ActivationOpConverter("tanh") {}
std::shared_ptr<AnakinOpConverter> converter =
std::make_shared<T>(std::move(args)...);
OpRegister::instance()->OpRegisterFn(name,
[converter]() { return converter; });
}
}; };
class SigmoidOpConverter : public ActivationOpConverter {
public:
SigmoidOpConverter() : ActivationOpConverter("sigmoid") {}
};
} // namespace anakin } // namespace anakin
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/anakin/convert/batch_norm.h"
#include <math.h>
#include <algorithm>
#include <map>
#include <string>
#include <vector>
using anakin::graph::GraphGlobalMem;
using anakin::AK_FLOAT;
using anakin::saber::NV;
using anakin::saber::Shape;
namespace paddle {
namespace inference {
namespace anakin {
void BatchNormOpConverter::operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope,
bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
PADDLE_ENFORCE_EQ(op_desc.Output("Y").size(), 1);
std::map<std::string, std::string> inputs;
for (auto k : {"X", "Scale", "Bias", "Mean", "Variance"}) {
PADDLE_ENFORCE_EQ(op_desc.Input(k).size(), 1UL);
auto v = op_desc.Input(k).front();
inputs.insert({k, v});
}
auto output = op_desc.Output("Y").front();
auto op_name = op_desc.Type() + ":" + op_desc.Output("Y").front();
auto epsilon = boost::get<float>(op_desc.GetAttr("epsilon"));
// auto momentum = boost::get<float>(op_desc.GetAttr("momentum"));
auto bn_op_name = op_name + ":bn";
auto bn_output = bn_op_name + "_output";
engine_->AddOp(bn_op_name, "BatchNorm", {inputs["X"]}, {bn_output});
engine_->AddOpAttr(bn_op_name, "epsilon", epsilon);
engine_->AddOpAttr(bn_op_name, "momentum", static_cast<float>(1.0));
auto scale_op_name = op_name + ":scale";
auto get_lod_tensor = [this, &scope, &op_name](const std::string &var_name,
framework::LoDTensor *tensor) {
auto *v = scope.FindVar(var_name);
PADDLE_ENFORCE_NOT_NULL(v);
auto *t = v->GetMutable<framework::LoDTensor>();
tensor->Resize(t->dims());
TensorCopySync(*t, platform::CPUPlace(), tensor);
};
framework::LoDTensor bias_t;
framework::LoDTensor mean_t;
framework::LoDTensor scale_t;
framework::LoDTensor variance_t;
get_lod_tensor(inputs["Bias"], &bias_t);
get_lod_tensor(inputs["Mean"], &mean_t);
get_lod_tensor(inputs["Scale"], &scale_t);
get_lod_tensor(inputs["Variance"], &variance_t);
auto fill_shape = [](size_t n, std::vector<int> shape) {
shape.insert(shape.begin(), 1);
if (shape.size() < n) {
shape.insert(shape.end(), n - shape.size(), 1);
}
return shape;
};
Shape shape1(fill_shape(4, framework::vectorize2int(mean_t.dims())));
Shape shape2(fill_shape(4, framework::vectorize2int(variance_t.dims())));
auto *weight1 =
GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(shape1);
auto *mean_data = static_cast<float *>(weight1->h_tensor().mutable_data());
std::copy_n(mean_t.data<float>(), mean_t.numel(), mean_data);
engine_->AddOpAttr(bn_op_name, "weight_1", *weight1);
auto *weight2 =
GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(shape2);
auto *variance_data =
static_cast<float *>(weight2->h_tensor().mutable_data());
std::copy_n(variance_t.data<float>(), variance_t.numel(), variance_data);
engine_->AddOpAttr(bn_op_name, "weight_2", *weight2);
Shape shape3(std::vector<int>({1, 1, 1, 1}));
auto *weight3 =
GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(shape3);
auto *alpha_data = static_cast<float *>(weight3->h_tensor().mutable_data());
float weight3_data[] = {1};
std::copy(std::begin(weight3_data), std::end(weight3_data), alpha_data);
engine_->AddOpAttr(bn_op_name, "weight_3", *weight3);
Shape scale_shape(fill_shape(4, framework::vectorize2int(scale_t.dims())));
auto *scale =
GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(scale_shape);
auto *scale_data = static_cast<float *>(scale->h_tensor().mutable_data());
std::copy_n(scale_t.data<float>(), scale_t.numel(), scale_data);
Shape bias_shape(fill_shape(4, framework::vectorize2int(bias_t.dims())));
auto *bias =
GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(bias_shape);
auto *bias_data = static_cast<float *>(bias->h_tensor().mutable_data());
std::copy_n(bias_t.data<float>(), bias_t.numel(), bias_data);
engine_->AddOp(scale_op_name, "Scale", {bn_output}, {output});
engine_->AddOpAttr(scale_op_name, "axis", 1);
engine_->AddOpAttr(scale_op_name, "num_axes", 1);
engine_->AddOpAttr(scale_op_name, "bias_term", true);
engine_->AddOpAttr(scale_op_name, "weight_1", *scale);
engine_->AddOpAttr(scale_op_name, "weight_2", *bias);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
REGISTER_ANAKIN_OP_CONVERTER(batch_norm, BatchNormOpConverter);
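// Numeric sketch (standalone, illustrative only): the BatchNorm + Scale
// wiring above computes Paddle's inference-time batch_norm formula
//   y = scale * (x - mean) / sqrt(variance + epsilon) + bias
// in two stages, with weight_3 == 1 so the first stage only normalizes.
#include <cassert>
#include <cmath>
int main() {
  const float x = 3.f, mean = 1.f, variance = 4.f, epsilon = 1e-5f;
  const float scale = 0.5f, bias = 0.25f;
  // Stage 1: Anakin BatchNorm with weight_1 = mean, weight_2 = variance,
  // weight_3 = 1 (pure normalization).
  const float bn = (x - mean) / std::sqrt(variance + epsilon);
  // Stage 2: Anakin Scale with weight_1 = scale, weight_2 = bias.
  const float y = scale * bn + bias;
  // Reference single-formula result.
  const float ref = scale * (x - mean) / std::sqrt(variance + epsilon) + bias;
  assert(std::fabs(y - ref) < 1e-6f);
  return 0;
}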
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/inference/anakin/convert/op_converter.h"
namespace paddle {
namespace inference {
namespace anakin {
class BatchNormOpConverter : public AnakinOpConverter {
public:
BatchNormOpConverter() = default;
virtual void operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope,
bool test_mode) override;
virtual ~BatchNormOpConverter() {}
};
} // namespace anakin
} // namespace inference
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/anakin/convert/concat.h"
#include <algorithm>
using anakin::graph::GraphGlobalMem;
using anakin::AK_FLOAT;
using anakin::Precision;
using anakin::saber::NV;
using anakin::saber::X86;
using anakin::saber::Shape;
using anakin::PBlock;
using anakin::PTuple;
namespace paddle {
namespace inference {
namespace anakin {
void ConcatOpConverter::operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope,
bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
int axis = boost::get<int>(op_desc.GetAttr("axis"));
auto input_names = op_desc.Input("X");
// PADDLE_ENFORCE(axis > 0,
// "The axis attr of Concat op should be large than 0 for trt");
auto y_name = op_desc.Output("Out").front();
auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
engine_->AddOp(op_name, "Concat", input_names, {y_name});
engine_->AddOpAttr(op_name, "axis", axis);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
REGISTER_ANAKIN_OP_CONVERTER(concat, ConcatOpConverter);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/inference/anakin/convert/op_converter.h"
namespace paddle {
namespace inference {
namespace anakin {
class ConcatOpConverter : public AnakinOpConverter {
public:
ConcatOpConverter() = default;
virtual void operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope,
bool test_mode) override;
virtual ~ConcatOpConverter() {}
private:
};
} // namespace anakin
} // namespace inference
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/anakin/convert/conv2d.h"
#include <algorithm>
#include <memory>
#include <vector>
using anakin::graph::GraphGlobalMem;
using anakin::AK_FLOAT;
using anakin::saber::NV;
using anakin::saber::Shape;
using anakin::PTuple;
namespace paddle {
namespace inference {
namespace anakin {
void Conv2dOpConverter::operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope,
bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
PADDLE_ENFORCE_EQ(op_desc.Input("Input").size(), 1UL);
PADDLE_ENFORCE_EQ(op_desc.Input("Filter").size(), 1UL);
PADDLE_ENFORCE_EQ(op_desc.Output("Output").size(), 1UL);
auto input_name = op_desc.Input("Input").front();
auto output_name = op_desc.Output("Output").front();
auto op_name = op_desc.Type() + ":" + op_desc.Output("Output").front();
engine_->AddOp(op_name, "Convolution", {input_name}, {output_name});
auto *filter_v = scope.FindVar(op_desc.Input("Filter").front());
PADDLE_ENFORCE_NOT_NULL(filter_v);
auto *filter_t = filter_v->GetMutable<framework::LoDTensor>();
std::unique_ptr<framework::LoDTensor> weight_tensor(
new framework::LoDTensor());
weight_tensor->Resize(filter_t->dims());
TensorCopySync((*filter_t), platform::CPUPlace(), weight_tensor.get());
PADDLE_ENFORCE_EQ(weight_tensor->dims().size(), 4UL);
// const int n_output = weight_tensor->dims()[0];
// const int n_input = weight_tensor->dims()[1];
const int filter_h = weight_tensor->dims()[2];
const int filter_w = weight_tensor->dims()[3];
// auto filter_num = n_input * filter_h * filter_w ;
auto filter_num = weight_tensor->dims()[0];
engine_->AddOpAttr<int>(op_name, "filter_num", filter_num);
engine_->AddOpAttr<PTuple<int>>(op_name, "kernel_size", {filter_h, filter_w});
auto strides = boost::get<std::vector<int>>(op_desc.GetAttr("strides"));
engine_->AddOpAttr<PTuple<int>>(op_name, "strides", strides);
auto paddings = boost::get<std::vector<int>>(op_desc.GetAttr("paddings"));
engine_->AddOpAttr<PTuple<int>>(op_name, "padding", paddings);
auto dilations = boost::get<std::vector<int>>(op_desc.GetAttr("dilations"));
engine_->AddOpAttr<PTuple<int>>(op_name, "dilation_rate", dilations);
const int groups = boost::get<int>(op_desc.GetAttr("groups"));
engine_->AddOpAttr(op_name, "group", groups);
engine_->AddOpAttr(op_name, "axis", 1);
engine_->AddOpAttr(op_name, "bias_term", false);
auto weight_shape = framework::vectorize2int(filter_t->dims());
Shape anakin_shape(weight_shape);
auto *weight1 =
GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(anakin_shape);
float *cpu_data = static_cast<float *>(weight1->h_tensor().mutable_data());
std::copy_n(weight_tensor->data<float>(), weight_tensor->numel(), cpu_data);
weight1->d_tensor().set_shape(anakin_shape);
weight1->d_tensor().copy_from(weight1->h_tensor());
engine_->AddOpAttr(op_name, "weight_1", *weight1);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
REGISTER_ANAKIN_OP_CONVERTER(conv2d, Conv2dOpConverter);
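// Worked example (hypothetical shapes, descriptive only): for a Paddle filter
// of dims [64, 3, 3, 3] with strides {2, 2}, paddings {1, 1}, dilations
// {1, 1} and groups 1, the attrs written above become:
//   filter_num = 64, kernel_size = {3, 3}, strides = {2, 2},
//   padding = {1, 1}, dilation_rate = {1, 1}, group = 1,
//   axis = 1, bias_term = false, weight_1 = the copied filter data.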
...@@ -12,22 +12,23 @@ ...@@ -12,22 +12,23 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/inference/anakin/convert/registrar.h" #pragma once
#include "paddle/fluid/inference/anakin/convert/op_converter.h"
namespace paddle { namespace paddle {
namespace inference { namespace inference {
namespace anakin { namespace anakin {
std::shared_ptr<AnakinOpConverter> OpRegister::Get(const std::string &name) { class Conv2dOpConverter : public AnakinOpConverter {
auto it = registry_.find(name); public:
if (it == registry_.end()) return nullptr; Conv2dOpConverter() = default;
return it->second();
}
OpRegister *OpRegister::instance() { virtual void operator()(const framework::proto::OpDesc &op,
static OpRegister factory; const framework::Scope &scope,
return &factory; bool test_mode) override;
} virtual ~Conv2dOpConverter() {}
};
} // namespace anakin } // namespace anakin
} // namespace inference } // namespace inference
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/anakin/convert/conv2d_fusion.h"
#include <algorithm>
#include <memory>
#include <vector>
using anakin::graph::GraphGlobalMem;
using anakin::AK_FLOAT;
using anakin::saber::NV;
using anakin::saber::Shape;
using anakin::PTuple;
namespace paddle {
namespace inference {
namespace anakin {
void Conv2dFusionOpConverter::operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope,
bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
PADDLE_ENFORCE_EQ(op_desc.Input("Input").size(), 1UL);
PADDLE_ENFORCE_EQ(op_desc.Input("Filter").size(), 1UL);
PADDLE_ENFORCE_EQ(op_desc.Input("Bias").size(), 1UL);
PADDLE_ENFORCE_EQ(op_desc.Output("Output").size(), 1UL);
auto input_name = op_desc.Input("Input").front();
auto output_name = op_desc.Output("Output").front();
auto op_name = op_desc.Type() + ":" + op_desc.Output("Output").front();
engine_->AddOp(op_name, "Convolution", {input_name}, {output_name});
auto *filter_v = scope.FindVar(op_desc.Input("Filter").front());
PADDLE_ENFORCE_NOT_NULL(filter_v);
auto *filter_t = filter_v->GetMutable<framework::LoDTensor>();
auto *b_v = scope.FindVar(op_desc.Input("Bias").front());
PADDLE_ENFORCE_NOT_NULL(b_v);
auto *b_t = b_v->GetMutable<framework::LoDTensor>();
std::unique_ptr<framework::LoDTensor> weight_tensor(
new framework::LoDTensor());
weight_tensor->Resize(filter_t->dims());
TensorCopySync((*filter_t), platform::CPUPlace(), weight_tensor.get());
PADDLE_ENFORCE_EQ(weight_tensor->dims().size(), 4UL);
// const int n_output = weight_tensor->dims()[0];
// const int n_input = weight_tensor->dims()[1];
const int filter_h = weight_tensor->dims()[2];
const int filter_w = weight_tensor->dims()[3];
// auto filter_num = n_input * filter_h * filter_w ;
auto filter_num = weight_tensor->dims()[0];
engine_->AddOpAttr<int>(op_name, "filter_num", filter_num);
engine_->AddOpAttr<PTuple<int>>(op_name, "kernel_size", {filter_h, filter_w});
auto strides = boost::get<std::vector<int>>(op_desc.GetAttr("strides"));
engine_->AddOpAttr<PTuple<int>>(op_name, "strides", strides);
auto paddings = boost::get<std::vector<int>>(op_desc.GetAttr("paddings"));
engine_->AddOpAttr<PTuple<int>>(op_name, "padding", paddings);
auto dilations = boost::get<std::vector<int>>(op_desc.GetAttr("dilations"));
engine_->AddOpAttr<PTuple<int>>(op_name, "dilation_rate", dilations);
const int groups = boost::get<int>(op_desc.GetAttr("groups"));
engine_->AddOpAttr(op_name, "group", groups);
engine_->AddOpAttr(op_name, "axis", 1);
engine_->AddOpAttr(op_name, "bias_term", true);
auto weight_shape = framework::vectorize2int(filter_t->dims());
Shape anakin_shape(weight_shape);
auto *weight1 =
GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(anakin_shape);
float *cpu_data = static_cast<float *>(weight1->h_tensor().mutable_data());
std::copy_n(weight_tensor->data<float>(), weight_tensor->numel(), cpu_data);
weight1->d_tensor().set_shape(anakin_shape);
weight1->d_tensor().copy_from(weight1->h_tensor());
engine_->AddOpAttr(op_name, "weight_1", *weight1);
auto bias_shape = framework::vectorize2int(b_t->dims());
framework::LoDTensor bias_tensor;
bias_tensor.Resize(b_t->dims());
TensorCopySync((*b_t), platform::CPUPlace(), &bias_tensor);
auto *bias_data = bias_tensor.data<float>();
bias_shape.insert(bias_shape.begin(), 1);
bias_shape.insert(bias_shape.begin(), 1);
bias_shape.insert(bias_shape.begin(), 1);
// bias_shape.push_back(1);
// bias_shape.push_back(1);
Shape anakin_bias_shape(bias_shape);
auto *weight2 = GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(
anakin_bias_shape);
float *cpu_data2 = static_cast<float *>(weight2->h_tensor().mutable_data());
std::copy_n(bias_data, bias_tensor.numel(), cpu_data2);
weight2->d_tensor().set_shape(anakin_bias_shape);
weight2->d_tensor().copy_from(weight2->h_tensor());
engine_->AddOpAttr(op_name, "weight_2", *weight2);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
REGISTER_ANAKIN_OP_CONVERTER(conv2d_fusion, Conv2dFusionOpConverter);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/inference/anakin/convert/op_converter.h"
namespace paddle {
namespace inference {
namespace anakin {
class Conv2dFusionOpConverter : public AnakinOpConverter {
public:
Conv2dFusionOpConverter() = default;
virtual void operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope,
bool test_mode) override;
virtual ~Conv2dFusionOpConverter() {}
};
} // namespace anakin
} // namespace inference
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/anakin/convert/density_prior_box.h"
#include <algorithm>
#include <map>
#include <vector>
using anakin::graph::GraphGlobalMem;
using anakin::AK_FLOAT;
using anakin::saber::NV;
using anakin::saber::Shape;
using anakin::PTuple;
namespace paddle {
namespace inference {
namespace anakin {
void DensityPriorBoxOpConverter::operator()(const framework::proto::OpDesc& op,
const framework::Scope& scope,
bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
auto input_name = op_desc.Input("Input").front();
auto image_name = op_desc.Input("Image").front();
auto output_name = op_desc.Output("Boxes").front();
auto op_name = op_desc.Type() + ":" + op_desc.Output("Boxes").front();
auto fixed_sizes =
boost::get<std::vector<float>>(op_desc.GetAttr("fixed_sizes"));
auto fixed_ratios =
boost::get<std::vector<float>>(op_desc.GetAttr("fixed_ratios"));
auto densities = boost::get<std::vector<int>>(op_desc.GetAttr("densities"));
std::vector<float> dens;
for (auto& ele : densities) {
dens.push_back(static_cast<float>(ele));
}
// flip is not provided by the op; is_flip is set to false below.
// auto clip = boost::get<bool>(op_desc.GetAttr("clip"));
auto variances = boost::get<std::vector<float>>(op_desc.GetAttr("variances"));
for (auto& ele : variances) {
LOG(INFO) << ele;
}
// img_h and img_w are not provided by the op; both are set to 0 below.
auto step_h = boost::get<float>(op_desc.GetAttr("step_h"));
auto step_w = boost::get<float>(op_desc.GetAttr("step_w"));
auto offset = boost::get<float>(op_desc.GetAttr("offset"));
PTuple<std::string> t_order;
t_order.push_back("MIN");
t_order.push_back("COM");
t_order.push_back("MAX");
std::vector<float> temp_v = {};
engine_->AddOp(op_name, "PriorBox", {input_name, image_name}, {output_name});
engine_->AddOpAttr<PTuple<float>>(op_name, "min_size", temp_v);
engine_->AddOpAttr<PTuple<float>>(op_name, "max_size", temp_v);
engine_->AddOpAttr<PTuple<float>>(op_name, "aspect_ratio", temp_v);
engine_->AddOpAttr<PTuple<float>>(op_name, "fixed_size", fixed_sizes);
engine_->AddOpAttr<PTuple<float>>(op_name, "fixed_ratio", fixed_ratios);
engine_->AddOpAttr<PTuple<float>>(op_name, "density", dens);
engine_->AddOpAttr(op_name, "is_flip", static_cast<bool>(false));
engine_->AddOpAttr(op_name, "is_clip", static_cast<bool>(false));
engine_->AddOpAttr<PTuple<float>>(op_name, "variance", variances);
engine_->AddOpAttr(op_name, "img_h", static_cast<int>(0));
engine_->AddOpAttr(op_name, "img_w", static_cast<int>(0));
engine_->AddOpAttr(op_name, "step_h", step_h);
engine_->AddOpAttr(op_name, "step_w", step_w);
engine_->AddOpAttr(op_name, "offset", offset);
engine_->AddOpAttr<PTuple<std::string>>(op_name, "order", t_order);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
REGISTER_ANAKIN_OP_CONVERTER(density_prior_box, DensityPriorBoxOpConverter);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <map>
#include <string>
#include "paddle/fluid/inference/anakin/convert/op_converter.h"
namespace paddle {
namespace inference {
namespace anakin {
class DensityPriorBoxOpConverter : public AnakinOpConverter {
public:
DensityPriorBoxOpConverter() = default;
virtual void operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope,
bool test_mode) override;
virtual ~DensityPriorBoxOpConverter() {}
};
} // namespace anakin
} // namespace inference
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/anakin/convert/detection_out.h"
#include <algorithm>
#include <map>
using anakin::graph::GraphGlobalMem;
using anakin::AK_FLOAT;
using anakin::saber::NV;
using anakin::saber::Shape;
namespace paddle {
namespace inference {
namespace anakin {
void DetectionOutOpConverter::operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope,
bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
auto target_name = op_desc.Input("TargetBox").front();
auto prior_box_name = op_desc.Input("PriorBox").front();
auto scores_name = op_desc.Input("Scores").front();
auto output_name = op_desc.Output("Out").front();
auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
auto code_type = boost::get<std::string>(op_desc.GetAttr("code_type"));
auto background_label = boost::get<int>(op_desc.GetAttr("background_label"));
auto score_threshold = boost::get<float>(op_desc.GetAttr("score_threshold"));
auto nms_top_k = boost::get<int>(op_desc.GetAttr("nms_top_k"));
auto nms_threshold = boost::get<float>(op_desc.GetAttr("nms_threshold"));
auto nms_eta = boost::get<float>(op_desc.GetAttr("nms_eta"));
auto keep_top_k = boost::get<int>(op_desc.GetAttr("keep_top_k"));
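  // Map fluid's code_type string onto Anakin's naming; only decode_center_size
  // is supported here.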
std::string anakin_code_type;
if (code_type == "decode_center_size") {
anakin_code_type = "CENTER_SIZE";
} else if (code_type == "encode_center_size") {
    PADDLE_THROW(
        "The encode_center_size code_type is not supported in Anakin's DetectionOut");
}
engine_->AddOp(op_name, "DetectionOutput",
{target_name, scores_name, prior_box_name}, {output_name});
engine_->AddOpAttr(op_name, "share_location", true);
engine_->AddOpAttr(op_name, "variance_encode_in_target", false);
engine_->AddOpAttr(op_name, "class_num", static_cast<int>(0));
engine_->AddOpAttr(op_name, "background_id", background_label);
engine_->AddOpAttr(op_name, "keep_top_k", keep_top_k);
engine_->AddOpAttr(op_name, "code_type", anakin_code_type);
engine_->AddOpAttr(op_name, "conf_thresh", score_threshold);
engine_->AddOpAttr(op_name, "nms_top_k", nms_top_k);
engine_->AddOpAttr(op_name, "nms_thresh", nms_threshold);
engine_->AddOpAttr(op_name, "nms_eta", nms_eta);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
REGISTER_ANAKIN_OP_CONVERTER(detection_out, DetectionOutOpConverter);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <map>
#include <string>
#include "paddle/fluid/inference/anakin/convert/op_converter.h"
namespace paddle {
namespace inference {
namespace anakin {
class DetectionOutOpConverter : public AnakinOpConverter {
public:
DetectionOutOpConverter() = default;
virtual void operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope,
bool test_mode) override;
virtual ~DetectionOutOpConverter() {}
};
} // namespace anakin
} // namespace inference
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/anakin/convert/dropout.h"
#include <algorithm>
#include <string>
#include <vector>
using anakin::graph::GraphGlobalMem;
using anakin::AK_FLOAT;
using anakin::Precision;
using anakin::saber::NV;
using anakin::saber::X86;
using anakin::saber::Shape;
using anakin::PBlock;
using anakin::PTuple;
namespace paddle {
namespace inference {
namespace anakin {
void DropoutOpConverter::operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope,
bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
PADDLE_ENFORCE_EQ(op_desc.Output("Mask").size(), 1);
PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
auto x_name = op_desc.Input("X").front();
auto out_name = op_desc.Output("Out").front();
auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
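  // At inference time dropout only rescales its input, so it is lowered to
  // Anakin's Scale op with a constant factor of (1 - dropout_prob); the Mask
  // output is ignored.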
engine_->AddOp(op_name, "Scale", {x_name}, {out_name});
auto dropout_prob = boost::get<float>(op_desc.GetAttr("dropout_prob"));
auto factor = 1 - dropout_prob;
Shape shape1(std::vector<int>({1, 1, 1, 1}));
auto *weight1 =
GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(shape1);
auto *factor_data = static_cast<float *>(weight1->h_tensor().mutable_data());
float weight1_data[] = {factor};
std::copy(std::begin(weight1_data), std::end(weight1_data), factor_data);
engine_->AddOpAttr(op_name, "weight_1", *weight1);
engine_->AddOpAttr(op_name, "axis", 0);
engine_->AddOpAttr(op_name, "num_axes", 0);
engine_->AddOpAttr(op_name, "bias_term", false);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
REGISTER_ANAKIN_OP_CONVERTER(dropout, DropoutOpConverter);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/inference/anakin/convert/op_converter.h"
namespace paddle {
namespace inference {
namespace anakin {
class DropoutOpConverter : public AnakinOpConverter {
public:
DropoutOpConverter() = default;
virtual void operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope,
bool test_mode) override;
virtual ~DropoutOpConverter() {}
private:
};
} // namespace anakin
} // namespace inference
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/anakin/convert/elementwise.h"
#include <algorithm>
#include <string>
#include <vector>
using anakin::graph::GraphGlobalMem;
using anakin::AK_FLOAT;
using anakin::Precision;
using anakin::saber::NV;
using anakin::saber::X86;
using anakin::saber::Shape;
using anakin::PBlock;
using anakin::PTuple;
namespace paddle {
namespace inference {
namespace anakin {
void ElementwiseAddOpConverter::operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope,
bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1);
PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
auto x_name = op_desc.Input("X").front();
auto y_name = op_desc.Input("Y").front();
auto out_name = op_desc.Output("Out").front();
auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
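  // elementwise_add maps onto Anakin's Eltwise op in "Add" mode with unit
  // coefficients for both inputs.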
engine_->AddOp(op_name, "Eltwise", {x_name, y_name}, {out_name});
std::string elementwise_type = "Add";
engine_->AddOpAttr<std::string>(op_name, "type", elementwise_type);
std::vector<float> coeff = {1.0, 1.0};
engine_->AddOpAttr<PTuple<float>>(op_name, "coeff", coeff);
}
void ElementwiseMulOpConverter::operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope,
bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1);
PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
auto x_name = op_desc.Input("X").front();
auto y_name = op_desc.Input("Y").front();
auto out_name = op_desc.Output("Out").front();
auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
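  // elementwise_mul is lowered to Anakin's Scale op with Y passed as the
  // runtime scale input.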
engine_->AddOp(op_name, "Scale", {x_name, y_name}, {out_name});
  // weight_1 is only a placeholder filled with 1; the real scale factors come
  // from the Y input at runtime.
Shape shape1(std::vector<int>({1, 1, 1, 1}));
auto *weight1 =
GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(shape1);
auto *placeholder_data =
static_cast<float *>(weight1->h_tensor().mutable_data());
float weight1_data[] = {1};
std::copy(std::begin(weight1_data), std::end(weight1_data), placeholder_data);
engine_->AddOpAttr(op_name, "weight_1", *weight1);
auto axis = boost::get<int>(op_desc.GetAttr("axis"));
engine_->AddOpAttr(op_name, "axis", axis);
engine_->AddOpAttr(op_name, "num_axes", 1);
engine_->AddOpAttr(op_name, "bias_term", false);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
REGISTER_ANAKIN_OP_CONVERTER(elementwise_add, ElementwiseAddOpConverter);
REGISTER_ANAKIN_OP_CONVERTER(elementwise_mul, ElementwiseMulOpConverter);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/inference/anakin/convert/op_converter.h"
namespace paddle {
namespace inference {
namespace anakin {
class ElementwiseAddOpConverter : public AnakinOpConverter {
public:
ElementwiseAddOpConverter() = default;
virtual void operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope,
bool test_mode) override;
virtual ~ElementwiseAddOpConverter() {}
private:
};
class ElementwiseMulOpConverter : public AnakinOpConverter {
public:
ElementwiseMulOpConverter() = default;
virtual void operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope,
bool test_mode) override;
virtual ~ElementwiseMulOpConverter() {}
private:
};
} // namespace anakin
} // namespace inference
} // namespace paddle
...@@ -14,60 +14,108 @@ ...@@ -14,60 +14,108 @@
#include "paddle/fluid/inference/anakin/convert/fc.h" #include "paddle/fluid/inference/anakin/convert/fc.h"
#include <algorithm> #include <algorithm>
#include <string>
#include <vector>
using anakin::graph::GraphGlobalMem; using anakin::graph::GraphGlobalMem;
using anakin::AK_FLOAT; using anakin::AK_FLOAT;
using anakin::Precision;
using anakin::saber::NV; using anakin::saber::NV;
using anakin::saber::X86;
using anakin::saber::Shape; using anakin::saber::Shape;
using anakin::PBlock;
using anakin::PTuple;
namespace paddle { namespace paddle {
namespace inference { namespace inference {
namespace anakin { namespace anakin {
void FcOpConverter::operator()(const framework::proto::OpDesc &op, void FcBaseOpConverter::operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope, bool test_mode) { const framework::Scope &scope,
bool test_mode) {
framework::OpDesc op_desc(op, nullptr); framework::OpDesc op_desc(op, nullptr);
PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); auto input_names = op_desc.InputNames();
PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1); bool with_bias = input_names.size() == 3;
PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
std::string w_name = "Y";
std::string i_name = "X";
if (with_bias) {
w_name = "W";
i_name = "Input";
}
auto x_name = op_desc.Input("X").front();
auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front(); auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
auto *y_v = scope.FindVar(op_desc.Input("Y").front());
// get weights
auto *y_v = scope.FindVar(op_desc.Input(w_name).front());
PADDLE_ENFORCE_NOT_NULL(y_v); PADDLE_ENFORCE_NOT_NULL(y_v);
auto *y_t = y_v->GetMutable<framework::LoDTensor>(); auto *y_t = y_v->GetMutable<framework::LoDTensor>();
auto input_name = op_desc.Input("X").front(); auto input_name = op_desc.Input(i_name).front();
auto output_name = op_desc.Output("Out").front(); auto output_name = op_desc.Output("Out").front();
auto weight_shape = framework::vectorize2int(y_t->dims());
engine_->AddOp(op_name, "Dense", {input_name}, {output_name}); engine_->AddOp(op_name, "Dense", {input_name}, {output_name});
engine_->AddOpAttr(op_name, "bias_term", false); engine_->AddOpAttr(op_name, "bias_term", with_bias);
engine_->AddOpAttr(op_name, "axis", 1); engine_->AddOpAttr(op_name, "axis", 1);
auto weight_shape = framework::vectorize2int(y_t->dims());
int out_dim = weight_shape[1]; int out_dim = weight_shape[1];
engine_->AddOpAttr(op_name, "out_dim", out_dim); engine_->AddOpAttr(op_name, "out_dim", out_dim);
const int w_m = weight_shape[0];
const int w_k = weight_shape[1];
weight_shape.push_back(1); if (weight_shape.size() < 4UL) {
weight_shape.push_back(1); weight_shape.insert(weight_shape.begin(), 4UL - weight_shape.size(), 1);
}
Shape anakin_shape(weight_shape); Shape anakin_shape(weight_shape);
framework::LoDTensor weight_tensor; framework::LoDTensor weight_tensor;
weight_tensor.Resize(y_t->dims()); weight_tensor.Resize(y_t->dims());
TensorCopySync((*y_t), platform::CPUPlace(), &weight_tensor); TensorCopySync((*y_t), platform::CPUPlace(), &weight_tensor);
auto *weight_data = weight_tensor.data<float>();
PADDLE_ENFORCE(w_m * w_k == weight_tensor.numel());
std::vector<float> trans_weight_data(weight_tensor.numel());
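  // Transpose the (w_m x w_k) row-major fluid weight into (w_k x w_m) order
  // before handing it to Anakin's Dense op.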
for (int i = 0; i < w_m; i++) {
for (int j = 0; j < w_k; j++) {
trans_weight_data[i + j * w_m] = weight_data[i * w_k + j];
}
}
auto *weight1 = auto *weight1 =
GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(anakin_shape); GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(anakin_shape);
float *cpu_data = static_cast<float *>(weight1->h_tensor().mutable_data()); float *cpu_data = static_cast<float *>(weight1->h_tensor().mutable_data());
std::copy_n(weight_tensor.data<float>(), weight_tensor.numel(), cpu_data); std::copy_n(trans_weight_data.data(), weight_tensor.numel(), cpu_data);
weight1->d_tensor().set_shape(anakin_shape); weight1->d_tensor().set_shape(anakin_shape);
weight1->d_tensor().copy_from(weight1->h_tensor()); weight1->d_tensor().copy_from(weight1->h_tensor());
engine_->AddOpAttr(op_name, "weight_1", *weight1); engine_->AddOpAttr(op_name, "weight_1", *weight1);
// get bias
if (with_bias) {
auto *b_v = scope.FindVar(op_desc.Input("Bias").front());
PADDLE_ENFORCE_NOT_NULL(b_v);
auto *b_t = b_v->GetMutable<framework::LoDTensor>();
auto bias_shape = framework::vectorize2int(b_t->dims());
framework::LoDTensor bias_tensor;
bias_tensor.Resize(b_t->dims());
TensorCopySync((*b_t), platform::CPUPlace(), &bias_tensor);
auto *bias_data = bias_tensor.data<float>();
bias_shape.insert(bias_shape.begin(), 1);
bias_shape.insert(bias_shape.begin(), 1);
bias_shape.insert(bias_shape.begin(), 1);
// bias_shape.push_back(1);
// bias_shape.push_back(1);
Shape anakin_bias_shape(bias_shape);
auto *weight2 = GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(
anakin_bias_shape);
float *cpu_data2 = static_cast<float *>(weight2->h_tensor().mutable_data());
std::copy_n(bias_data, bias_tensor.numel(), cpu_data2);
weight2->d_tensor().set_shape(anakin_bias_shape);
weight2->d_tensor().copy_from(weight2->h_tensor());
engine_->AddOpAttr(op_name, "weight_2", *weight2);
}
} }
} // namespace anakin } // namespace anakin
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
REGISTER_ANAKIN_OP_CONVERTER(mul, MulOpConverter);
REGISTER_ANAKIN_OP_CONVERTER(fc, FcOpConverter);
...@@ -20,19 +20,28 @@ namespace paddle { ...@@ -20,19 +20,28 @@ namespace paddle {
namespace inference { namespace inference {
namespace anakin { namespace anakin {
class FcOpConverter : public AnakinOpConverter { class FcBaseOpConverter : public AnakinOpConverter {
public: public:
FcOpConverter() = default; FcBaseOpConverter() = default;
virtual void operator()(const framework::proto::OpDesc &op, virtual void operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope, const framework::Scope &scope,
bool test_mode) override; bool test_mode) override;
virtual ~FcOpConverter() {} virtual ~FcBaseOpConverter() {}
};
private: // with bias
class FcOpConverter : public FcBaseOpConverter {
public:
FcOpConverter() = default;
};
// without bias
class MulOpConverter : public FcBaseOpConverter {
public:
MulOpConverter() = default;
}; };
static Registrar<FcOpConverter> register_fc_op_converter("fc");
} // namespace anakin } // namespace anakin
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/anakin/convert/flatten.h"
#include <vector>
using anakin::graph::GraphGlobalMem;
using anakin::AK_FLOAT;
using anakin::saber::NV;
using anakin::saber::Shape;
using anakin::PTuple;
namespace paddle {
namespace inference {
namespace anakin {
void FlattenOpConverter::operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope,
bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1UL);
PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1UL);
auto input = op_desc.Input("X").front();
auto output = op_desc.Output("Out").front();
int axis = boost::get<int>(op_desc.GetAttr("axis"));
  PADDLE_ENFORCE(axis == 1,
                 "the anakin flatten op converter currently only supports axis == 1.");
std::vector<int> out_dims = {0, -1, 1, 1};
auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
engine_->AddOp(op_name, "Reshape", {input}, {output});
engine_->AddOpAttr<PTuple<int>>(op_name, "dims", out_dims);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
REGISTER_ANAKIN_OP_CONVERTER(flatten, FlattenOpConverter);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/inference/anakin/convert/op_converter.h"
namespace paddle {
namespace inference {
namespace anakin {
class FlattenOpConverter : public AnakinOpConverter {
public:
FlattenOpConverter() = default;
virtual void operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope,
bool test_mode) override;
virtual ~FlattenOpConverter() {}
};
} // namespace anakin
} // namespace inference
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/anakin/convert/im2sequence.h"
#include <algorithm>
#include <string>
#include <vector>
using anakin::graph::GraphGlobalMem;
using anakin::AK_FLOAT;
using anakin::Precision;
using anakin::saber::NV;
using anakin::saber::X86;
using anakin::saber::Shape;
using anakin::PBlock;
using anakin::PTuple;
namespace paddle {
namespace inference {
namespace anakin {
void Im2SequenceConverter::operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope,
bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
PADDLE_ENFORCE_EQ(op_desc.Output("Y").size(), 0);
PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
auto x_name = op_desc.Input("X").front();
auto out_name = op_desc.Output("Out").front();
auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
engine_->AddOp(op_name, "Im2Sequence", {x_name}, {out_name});
std::vector<int> dilations = {1, 1};
auto paddings = boost::get<std::vector<int>>(op_desc.GetAttr("paddings"));
auto strides = boost::get<std::vector<int>>(op_desc.GetAttr("strides"));
auto kernels = boost::get<std::vector<int>>(op_desc.GetAttr("kernels"));
engine_->AddOpAttr<PTuple<int>>(op_name, "paddings", paddings);
engine_->AddOpAttr<PTuple<int>>(op_name, "strides", strides);
engine_->AddOpAttr<PTuple<int>>(op_name, "window_size", kernels);
engine_->AddOpAttr<PTuple<int>>(op_name, "dilations", dilations);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
REGISTER_ANAKIN_OP_CONVERTER(im2sequence, Im2SequenceConverter);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/inference/anakin/convert/op_converter.h"
namespace paddle {
namespace inference {
namespace anakin {
class Im2SequenceConverter : public AnakinOpConverter {
public:
Im2SequenceConverter() = default;
virtual void operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope,
bool test_mode) override;
virtual ~Im2SequenceConverter() {}
private:
};
} // namespace anakin
} // namespace inference
} // namespace paddle
...@@ -14,15 +14,16 @@ ...@@ -14,15 +14,16 @@
#pragma once #pragma once
#include <map>
#include <memory> #include <memory>
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
#include <unordered_set> #include <unordered_set>
#include <vector>
#include "framework/core/types.h" #include "framework/core/types.h"
#include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/anakin/convert/registrar.h"
#include "paddle/fluid/inference/anakin/engine.h" #include "paddle/fluid/inference/anakin/engine.h"
#include "paddle/fluid/inference/utils/singleton.h" #include "paddle/fluid/inference/utils/singleton.h"
#include "saber/saber_types.h" #include "saber/saber_types.h"
...@@ -46,19 +47,14 @@ class AnakinOpConverter { ...@@ -46,19 +47,14 @@ class AnakinOpConverter {
bool test_mode = false) { bool test_mode = false) {
framework::OpDesc op_desc(op, nullptr); framework::OpDesc op_desc(op, nullptr);
std::string op_type = op_desc.Type(); std::string op_type = op_desc.Type();
std::shared_ptr<AnakinOpConverter> it{nullptr}; AnakinOpConverter *it = nullptr;
if (op_type == "mul") { if (op_type == "reshape2") op_type = "reshape";
PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1UL); if (op_type == "transpose2") op_type = "transpose";
std::string Y = op_desc.Input("Y")[0]; if (op_type == "flatten2") op_type = "flatten";
std::cout << Y << parameters.count(Y) << std::endl;
if (parameters.count(Y)) {
it = OpRegister::instance()->Get("fc");
}
}
if (!it) { if (!it) {
it = OpRegister::instance()->Get(op_type); it = Registry<AnakinOpConverter>::Global().Lookup(op_type);
} }
PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]", op_type); PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]", op_type);
it->SetEngine(engine); it->SetEngine(engine);
...@@ -74,6 +70,63 @@ class AnakinOpConverter { ...@@ -74,6 +70,63 @@ class AnakinOpConverter {
ConvertOp(op, parameters, scope, engine); ConvertOp(op, parameters, scope, engine);
} }
} }
  // The scope here should be initialized with the parameter vars.
void ConvertBlockToAnakinEngine(
framework::BlockDesc *block_desc, framework::Scope *scope,
const std::vector<std::string> &inputs,
const std::unordered_set<std::string> &parameters,
const std::vector<std::string> &outputs, AnakinNvEngine *engine) {
framework::proto::BlockDesc *block_proto = block_desc->Proto();
ConvertBlock(*block_proto, parameters, *scope, engine);
engine->Freeze();
    // The max batch size must be set through config->EnableAnakinEngine.
    int max_batch_size = engine->GetMaxBatchSize();
    PADDLE_ENFORCE(max_batch_size > 0,
                   "the max_batch_size set from config->EnableAnakinEngine "
                   "must be larger than 0");
// If the user does not specify this variable, we use the input shape from
// the block_desc.
auto max_input_shape = engine->GetMaxInputShape();
std::map<std::string, std::vector<int>> temp_max_input_shape;
for (auto &input : inputs) {
if (parameters.count(input)) continue;
std::vector<int> input_shape;
input_shape.resize(4);
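      // Anakin uses 4-D input shapes; dim 0 is overwritten with the engine's
      // max batch size.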
input_shape[0] = max_batch_size;
if (max_input_shape.count(input)) {
PADDLE_ENFORCE(max_input_shape[input].size() == 4,
"the dimensions of max_input_shape setted from "
"config->EnableAnakinEngine must be 4");
for (int i = 1; i < 4; i++) {
input_shape[i] = max_input_shape[input][i];
}
} else {
auto *var = block_desc->FindVar(input);
PADDLE_ENFORCE(var, "no variable called %s", input);
auto var_shape = var->GetShape();
std::cout << "input :" << input << std::endl;
PADDLE_ENFORCE(var_shape.size() == 4);
for (size_t i = 1; i < var_shape.size(); i++) {
input_shape[i] = var_shape[i];
}
}
temp_max_input_shape[input] = input_shape;
engine->SetInputShape(input, input_shape);
      engine->Graph()->RegistVar(input);  // Register the var so it can share data with the fluid tensor.
}
engine->SetMaxInputShape(temp_max_input_shape);
engine->Optimize();
    // Allocate temporary memory so Anakin can share tensors with fluid.
engine->AllocTmpMem();
engine->InitGraph();
}
void SetEngine(AnakinNvEngine *engine) { engine_ = engine; } void SetEngine(AnakinNvEngine *engine) { engine_ = engine; }
virtual ~AnakinOpConverter() {} virtual ~AnakinOpConverter() {}
...@@ -91,22 +144,23 @@ class AnakinOpConverter { ...@@ -91,22 +144,23 @@ class AnakinOpConverter {
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
#define REGISTER_ANAKIN_OP_CONVERTER(op_type__, Converter__) \ #define REGISTER_ANAKIN_OP_CONVERTER(op_type__, Converter__) \
struct anakin_##op_type__##_converter \ struct anakin_##op_type__##_converter \
: public ::paddle::framework::Registrar { \ : public ::paddle::framework::Registrar { \
anakin_##op_type__##_converter() { \ anakin_##op_type__##_converter() { \
::paddle::inference:: \ LOG(INFO) << "register convert " << #op_type__; \
Registry<paddle::inference::anakin::AnakinOpConverter>::Register< \ ::paddle::inference::Registry< \
::paddle::inference::anakin::Converter__>(#op_type__); \ ::paddle::inference::anakin::AnakinOpConverter>::Global() \
} \ .Register<::paddle::inference::anakin::Converter__>(#op_type__); \
}; \ } \
anakin_##op_type__##_converter anakin_##op_type__##_converter__; \ }; \
int TouchConverterRegister_anakin_##op_type__() { \ anakin_##op_type__##_converter anakin_##op_type__##_converter__; \
anakin_##op_type__##_converter__.Touch(); \ int TouchConverterRegister_anakin_##op_type__() { \
return 0; \ anakin_##op_type__##_converter__.Touch(); \
return 0; \
} }
#define USE_ANAKIN_CONVERTER(op_type__) \ #define USE_ANAKIN_CONVERTER(op_type__) \
extern int TouchConverterRegister_anakin_##op_type__(); \ extern int TouchConverterRegister_anakin_##op_type__(); \
static int use_op_converter_anakin_##op_type__ __attribute__((unused)) = \ int use_op_converter_anakin_##op_type__ __attribute__((unused)) = \
TouchConverterRegister_anakin_##op_type__(); TouchConverterRegister_anakin_##op_type__();
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/anakin/convert/pool2d.h"
#include <algorithm>
#include <string>
#include <vector>
using anakin::graph::GraphGlobalMem;
using anakin::AK_FLOAT;
using anakin::Precision;
using anakin::saber::NV;
using anakin::saber::X86;
using anakin::saber::Shape;
using anakin::PBlock;
using anakin::PTuple;
namespace paddle {
namespace inference {
namespace anakin {
void Pool2dOpConverter::operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope,
bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
auto x_name = op_desc.Input("X").front();
auto y_name = op_desc.Output("Out").front();
auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
bool global_pooling = boost::get<bool>(op_desc.GetAttr("global_pooling"));
std::string pool_type =
boost::get<std::string>(op_desc.GetAttr("pooling_type"));
std::vector<int> ksize =
boost::get<std::vector<int>>(op_desc.GetAttr("ksize"));
std::vector<int> strides =
boost::get<std::vector<int>>(op_desc.GetAttr("strides"));
std::vector<int> paddings =
boost::get<std::vector<int>>(op_desc.GetAttr("paddings"));
bool ceil_mode = boost::get<bool>(op_desc.GetAttr("ceil_mode"));
std::string anakin_pool_type;
if (pool_type == "max") {
anakin_pool_type = "MAX";
} else if (pool_type == "avg") {
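    // With non-zero paddings, use Anakin's exclusive average pooling (AVGEXC)
    // so padded zeros are not counted in the average.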
if (paddings[0] || paddings[1]) {
anakin_pool_type = "AVGEXC";
} else {
anakin_pool_type = "AVG";
}
} else {
PADDLE_THROW("TensorRT unsupported pooling type!");
}
engine_->AddOp(op_name, "Pooling", {x_name}, {y_name});
engine_->AddOpAttr<PTuple<int>>(op_name, "pool_size", ksize);
engine_->AddOpAttr<PTuple<int>>(op_name, "strides", strides);
engine_->AddOpAttr<PTuple<int>>(op_name, "padding", paddings);
engine_->AddOpAttr(op_name, "method", anakin_pool_type);
engine_->AddOpAttr(op_name, "global_pooling", global_pooling);
engine_->AddOpAttr(op_name, "cmp_out_shape_floor_as_conv", !ceil_mode);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
REGISTER_ANAKIN_OP_CONVERTER(pool2d, Pool2dOpConverter);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/inference/anakin/convert/op_converter.h"
namespace paddle {
namespace inference {
namespace anakin {
class Pool2dOpConverter : public AnakinOpConverter {
public:
Pool2dOpConverter() = default;
virtual void operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope,
bool test_mode) override;
virtual ~Pool2dOpConverter() {}
private:
};
} // namespace anakin
} // namespace inference
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/anakin/convert/relu.h"
#include <algorithm>
#include <map>
using anakin::graph::GraphGlobalMem;
using anakin::AK_FLOAT;
using anakin::saber::NV;
using anakin::saber::Shape;
namespace paddle {
namespace inference {
namespace anakin {
void ReluOpConverter::operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope,
bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
auto input_name = op_desc.Input("X").front();
auto output_name = op_desc.Output("Out").front();
engine_->AddOp(op_name, "ReLU", {input_name}, {output_name});
engine_->AddOpAttr(op_name, "alpha", 0);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
REGISTER_ANAKIN_OP_CONVERTER(relu, ReluOpConverter);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <map>
#include <string>
#include "paddle/fluid/inference/anakin/convert/op_converter.h"
namespace paddle {
namespace inference {
namespace anakin {
class ReluOpConverter : public AnakinOpConverter {
public:
ReluOpConverter() = default;
virtual void operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope,
bool test_mode) override;
virtual ~ReluOpConverter() {}
};
} // namespace anakin
} // namespace inference
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/anakin/convert/reshape.h"
#include <vector>
using anakin::graph::GraphGlobalMem;
using anakin::AK_FLOAT;
using anakin::saber::NV;
using anakin::saber::Shape;
using anakin::PTuple;
namespace paddle {
namespace inference {
namespace anakin {
void ReshapeOpConverter::operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope,
bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1UL);
PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1UL);
auto input = op_desc.Input("X").front();
auto output = op_desc.Output("Out").front();
auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
engine_->AddOp(op_name, "Reshape", {input}, {output});
auto shape = boost::get<std::vector<int>>(op_desc.GetAttr("shape"));
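  // Pad the requested shape with trailing 1s, since the Anakin engine works
  // on 4-D shapes.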
if (shape.size() < 4) {
shape.insert(shape.end(), 4 - shape.size(), 1);
}
engine_->AddOpAttr<PTuple<int>>(op_name, "dims", shape);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
REGISTER_ANAKIN_OP_CONVERTER(reshape, ReshapeOpConverter);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/inference/anakin/convert/op_converter.h"
namespace paddle {
namespace inference {
namespace anakin {
class ReshapeOpConverter : public AnakinOpConverter {
public:
ReshapeOpConverter() = default;
virtual void operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope,
bool test_mode) override;
virtual ~ReshapeOpConverter() {}
};
} // namespace anakin
} // namespace inference
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/anakin/convert/scale.h"
#include <algorithm>
#include <map>
using anakin::graph::GraphGlobalMem;
using anakin::AK_FLOAT;
using anakin::saber::NV;
using anakin::saber::Shape;
namespace paddle {
namespace inference {
namespace anakin {
void ScaleOpConverter::operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope,
bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
auto input_name = op_desc.Input("X").front();
auto output_name = op_desc.Output("Out").front();
float scale = boost::get<float>(op_desc.GetAttr("scale"));
float bias = boost::get<float>(op_desc.GetAttr("bias"));
  bool bias_after_scale =
      boost::get<bool>(op_desc.GetAttr("bias_after_scale"));
  PADDLE_ENFORCE(bias_after_scale,
                 "The anakin scale layer only supports bias after scale for now.");
engine_->AddOp(op_name, "Power", {input_name}, {output_name});
engine_->AddOpAttr(op_name, "shift", bias);
engine_->AddOpAttr(op_name, "scale", scale);
engine_->AddOpAttr(op_name, "power", static_cast<float>(1.0));
}
} // namespace anakin
} // namespace inference
} // namespace paddle
REGISTER_ANAKIN_OP_CONVERTER(scale, ScaleOpConverter);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <map>
#include <string>
#include "paddle/fluid/inference/anakin/convert/op_converter.h"
namespace paddle {
namespace inference {
namespace anakin {
class ScaleOpConverter : public AnakinOpConverter {
public:
ScaleOpConverter() = default;
virtual void operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope,
bool test_mode) override;
virtual ~ScaleOpConverter() {}
};
} // namespace anakin
} // namespace inference
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/anakin/convert/softmax.h"
using anakin::graph::GraphGlobalMem;
using anakin::AK_FLOAT;
using anakin::saber::NV;
using anakin::saber::Shape;
namespace paddle {
namespace inference {
namespace anakin {
void SoftMaxOpConverter::operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope,
bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1UL);
auto input = op_desc.Input("X").front();
auto output = op_desc.Output("Out").front();
auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
engine_->AddOp(op_name, "Softmax", {input}, {output});
engine_->AddOpAttr(op_name, "axis", 2);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
REGISTER_ANAKIN_OP_CONVERTER(softmax, SoftMaxOpConverter);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/inference/anakin/convert/op_converter.h"
namespace paddle {
namespace inference {
namespace anakin {
class SoftMaxOpConverter : public AnakinOpConverter {
public:
SoftMaxOpConverter() = default;
virtual void operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope,
bool test_mode) override;
virtual ~SoftMaxOpConverter() {}
};
} // namespace anakin
} // namespace inference
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/anakin/convert/split.h"
#include <algorithm>
#include <vector>
using anakin::graph::GraphGlobalMem;
using anakin::AK_FLOAT;
using anakin::Precision;
using anakin::saber::NV;
using anakin::saber::X86;
using anakin::saber::Shape;
using anakin::PBlock;
using anakin::PTuple;
namespace paddle {
namespace inference {
namespace anakin {
void SplitOpConverter::operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope,
bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
auto input_name = op_desc.Input("X").front();
auto y_names = op_desc.Output("Out");
auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
int axis = boost::get<int>(op_desc.GetAttr("axis"));
std::vector<int> output_lengths =
boost::get<std::vector<int>>(op_desc.GetAttr("sections"));
int split_num = output_lengths.size();
  PADDLE_ENFORCE(split_num > 1,
                 "anakin split op converter: the split num should be > 1");
int num_sum = 0;
std::vector<int> slice_point;
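  // Anakin's Slice op takes cumulative offsets along the axis, so turn the
  // section lengths into running sums (the last section is implied).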
for (int i = 0; i < split_num - 1; i++) {
num_sum += output_lengths[i];
slice_point.push_back(num_sum);
}
engine_->AddOp(op_name, "Slice", {input_name}, y_names);
engine_->AddOpAttr(op_name, "axis", axis);
engine_->AddOpAttr<PTuple<int>>(op_name, "slice_point", slice_point);
  // slice_dim is ignored by Anakin, so a fixed value is fine here.
engine_->AddOpAttr(op_name, "slice_dim", 4);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
REGISTER_ANAKIN_OP_CONVERTER(split, SplitOpConverter);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/inference/anakin/convert/op_converter.h"
namespace paddle {
namespace inference {
namespace anakin {
class SplitOpConverter : public AnakinOpConverter {
public:
SplitOpConverter() = default;
virtual void operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope,
bool test_mode) override;
virtual ~SplitOpConverter() {}
private:
};
} // namespace anakin
} // namespace inference
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/anakin/convert/sum.h"
#include <algorithm>
#include <string>
#include <vector>
using anakin::graph::GraphGlobalMem;
using anakin::AK_FLOAT;
using anakin::Precision;
using anakin::saber::NV;
using anakin::saber::X86;
using anakin::saber::Shape;
using anakin::PBlock;
using anakin::PTuple;
namespace paddle {
namespace inference {
namespace anakin {
void SumOpConverter::operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope, bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 2);
PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
auto input_names = op_desc.Input("X");
auto out_name = op_desc.Output("Out").front();
auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
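  // sum of two tensors is expressed as Eltwise Add with coefficients {1, 1}.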
std::vector<float> coeff = {1, 1};
std::string elementwise_type = "Add";
engine_->AddOp(op_name, "Eltwise", input_names, {out_name});
engine_->AddOpAttr<PTuple<float>>(op_name, "coeff", coeff);
engine_->AddOpAttr<std::string>(op_name, "type", elementwise_type);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
REGISTER_ANAKIN_OP_CONVERTER(sum, SumOpConverter);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/inference/anakin/convert/op_converter.h"
namespace paddle {
namespace inference {
namespace anakin {
class SumOpConverter : public AnakinOpConverter {
public:
SumOpConverter() = default;
virtual void operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope,
bool test_mode) override;
virtual ~SumOpConverter() {}
private:
};
} // namespace anakin
} // namespace inference
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/fluid/inference/anakin/convert/activation.h"
#include "paddle/fluid/inference/anakin/convert/op_converter.h"
#include "paddle/fluid/inference/anakin/convert/ut_helper.h"
namespace paddle {
namespace inference {
namespace anakin {
static void test_activation_op(const std::string &op_type) {
auto *converter = Registry<AnakinOpConverter>::Global().Lookup(op_type);
PADDLE_ENFORCE(converter != nullptr);
std::unordered_set<std::string> parameters;
framework::Scope scope;
AnakinConvertValidation validator(parameters, &scope);
validator.DeclInputVar("act-X", {10, 6, 1, 1});
validator.DeclOutputVar("act-Out", {10, 6, 1, 1});
framework::OpDesc desc;
desc.SetType(op_type);
desc.SetInput("X", {"act-X"});
desc.SetOutput("Out", {"act-Out"});
LOG(INFO) << "set OP";
validator.SetOp(*desc.Proto());
LOG(INFO) << "execute";
validator.Execute(5);
}
TEST(sigm_op, test) { test_activation_op("sigmoid"); }
TEST(tanh_op, test) { test_activation_op("tanh"); }
} // namespace anakin
} // namespace inference
} // namespace paddle
USE_OP(sigmoid);
USE_OP(tanh);
USE_ANAKIN_CONVERTER(sigmoid);
USE_ANAKIN_CONVERTER(tanh);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/fluid/inference/anakin/convert/ut_helper.h"
namespace paddle {
namespace inference {
namespace anakin {
TEST(batch_norm_op, test) {
std::unordered_set<std::string> parameters(
{"batch_norm_scale", "batch_norm_bias", "batch_norm_mean",
"batch_norm_variance"});
framework::Scope scope;
AnakinConvertValidation validator(parameters, &scope);
std::vector<int> param_shape{2};
validator.DeclInputVar("batch_norm_X", {1, 2, 5, 5});
validator.DeclParamVar("batch_norm_scale", param_shape);
validator.DeclParamVar("batch_norm_bias", param_shape);
validator.DeclParamVar("batch_norm_mean", param_shape);
validator.DeclParamVar("batch_norm_variance", param_shape);
validator.DeclOutputVar("batch_norm_Y", {1, 2, 5, 5});
validator.DeclOutputVar("batch_norm_save_mean", param_shape);
validator.DeclOutputVar("batch_norm_save_variance", param_shape);
// Prepare Op description
framework::OpDesc desc;
desc.SetType("batch_norm");
desc.SetInput("X", {"batch_norm_X"});
desc.SetInput("Scale", {"batch_norm_scale"});
desc.SetInput("Bias", {"batch_norm_bias"});
desc.SetInput("Mean", {"batch_norm_mean"});
desc.SetInput("Variance", {"batch_norm_variance"});
desc.SetOutput("Y", {"batch_norm_Y"});
desc.SetOutput("MeanOut", {"batch_norm_mean"});
desc.SetOutput("VarianceOut", {"batch_norm_variance"});
desc.SetOutput("SavedMean", {"batch_norm_save_mean"});
desc.SetOutput("SavedVariance", {"batch_norm_save_variance"});
float eps = 1e-5f;
bool is_test = true;
desc.SetAttr("epsilon", eps);
desc.SetAttr("is_test", is_test);
validator.SetOp(*desc.Proto());
std::unordered_set<std::string> neglected_output = {
"batch_norm_save_mean", "batch_norm_save_variance", "batch_norm_mean",
"batch_norm_variance"};
validator.Execute(1, neglected_output);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
USE_OP(batch_norm);
USE_ANAKIN_CONVERTER(batch_norm);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/fluid/inference/anakin/convert/concat.h"
#include "paddle/fluid/inference/anakin/convert/op_converter.h"
#include "paddle/fluid/inference/anakin/convert/ut_helper.h"
namespace paddle {
namespace inference {
namespace anakin {
TEST(concat_op, test) {
std::unordered_set<std::string> parameters({""});
framework::Scope scope;
AnakinConvertValidation validator(parameters, &scope);
validator.DeclInputVar("concat_x1", {1, 2, 1, 1});
validator.DeclInputVar("concat_x2", {1, 3, 1, 1});
validator.DeclInputVar("concat_x3", {1, 1, 1, 1});
validator.DeclOutputVar("concat_out", {1, 6, 1, 1});
// Prepare Op description
framework::OpDesc desc;
desc.SetType("concat");
desc.SetInput("X", {"concat_x1", "concat_x2", "concat_x3"});
desc.SetOutput("Out", {"concat_out"});
int axis = 1;
desc.SetAttr("axis", axis);
validator.SetOp(*desc.Proto());
validator.Execute(1);
}
TEST(concat_op, test2) {
std::unordered_set<std::string> parameters({""});
framework::Scope scope;
AnakinConvertValidation validator(parameters, &scope);
validator.DeclInputVar("concat_x1", {1, 4});
validator.DeclInputVar("concat_x2", {3, 4});
validator.DeclInputVar("concat_x3", {2, 4});
validator.DeclOutputVar("concat_out", {6, 4});
// Prepare Op description
framework::OpDesc desc;
desc.SetType("concat");
desc.SetInput("X", {"concat_x1", "concat_x2", "concat_x3"});
desc.SetOutput("Out", {"concat_out"});
int axis = 0;
desc.SetAttr("axis", axis);
validator.SetOp(*desc.Proto());
validator.Execute(1);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
USE_OP(concat);
USE_ANAKIN_CONVERTER(concat);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/fluid/inference/anakin/convert/conv2d.h"
#include "paddle/fluid/inference/anakin/convert/op_converter.h"
#include "paddle/fluid/inference/anakin/convert/ut_helper.h"
namespace paddle {
namespace inference {
namespace anakin {
TEST(conv2d_op, test) {
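// 1x1 convolution mapping 3 input channels to 4 output channels on a 3x3 feature map.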
auto* conv2d_converter =
Registry<AnakinOpConverter>::Global().Lookup("conv2d");
ASSERT_TRUE(conv2d_converter != nullptr);
std::unordered_set<std::string> parameters({"conv2d-Y"});
framework::Scope scope;
AnakinConvertValidation validator(parameters, &scope);
validator.DeclInputVar("conv2d-X", {1, 3, 3, 3});
validator.DeclParamVar("conv2d-Y", {4, 3, 1, 1});
validator.DeclOutputVar("conv2d-Out", {1, 4, 3, 3});
// Prepare Op description
framework::OpDesc desc;
desc.SetType("conv2d");
desc.SetInput("Input", {"conv2d-X"});
desc.SetInput("Filter", {"conv2d-Y"});
desc.SetOutput("Output", {"conv2d-Out"});
const std::vector<int> strides({1, 1});
const std::vector<int> paddings({0, 0});
const std::vector<int> dilations({1, 1});
const int groups = 1;
desc.SetAttr("strides", strides);
desc.SetAttr("paddings", paddings);
desc.SetAttr("dilations", dilations);
desc.SetAttr("groups", groups);
validator.SetOp(*desc.Proto());
validator.Execute(3);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
USE_OP(conv2d);
USE_ANAKIN_CONVERTER(conv2d);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/fluid/inference/anakin/convert/dropout.h"
#include "paddle/fluid/inference/anakin/convert/op_converter.h"
#include "paddle/fluid/inference/anakin/convert/ut_helper.h"
namespace paddle {
namespace inference {
namespace anakin {
TEST(dropout_op, native) {
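// Run dropout in inference mode (is_test = true); the mask output is not compared.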
std::unordered_set<std::string> parameters;
framework::Scope scope;
AnakinConvertValidation validator(parameters, &scope);
validator.DeclInputVar("x", {1, 1, 2, 2});
validator.DeclOutputVar("out", {1, 1, 2, 2});
validator.DeclOutputVar("mask", {1, 1, 2, 2});
// Prepare Op description
framework::OpDesc desc;
desc.SetType("dropout");
desc.SetInput("X", {"x"});
desc.SetOutput("Out", {"out"});
desc.SetOutput("Mask", {"mask"});
float dropout_prob = 0.5;
desc.SetAttr("dropout_prob", dropout_prob);
desc.SetAttr("is_test", true);
validator.SetOp(*desc.Proto());
std::unordered_set<std::string> neglected_output = {"mask"};
validator.Execute(1, neglected_output);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
USE_OP(dropout);
USE_ANAKIN_CONVERTER(dropout);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/fluid/inference/anakin/convert/elementwise.h"
#include "paddle/fluid/inference/anakin/convert/op_converter.h"
#include "paddle/fluid/inference/anakin/convert/ut_helper.h"
namespace paddle {
namespace inference {
namespace anakin {
static void test_elementwise_op(const std::string &op_type) {
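// Shared helper: both inputs have the same shape, so broadcasting with axis = -1 is trivial.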
std::unordered_set<std::string> parameters;
framework::Scope scope;
AnakinConvertValidation validator(parameters, &scope);
validator.DeclInputVar("x", {1, 1, 2, 2});
validator.DeclInputVar("y", {1, 1, 2, 2});
validator.DeclOutputVar("out", {1, 1, 2, 2});
// Prepare Op description
framework::OpDesc desc;
desc.SetType(op_type);
desc.SetInput("X", {"x"});
desc.SetInput("Y", {"y"});
desc.SetOutput("Out", {"out"});
int axis = -1;
desc.SetAttr("axis", axis);
validator.SetOp(*desc.Proto());
validator.Execute(1);
}
TEST(elementwise_op, native_add) { test_elementwise_op("elementwise_add"); }
TEST(elementwise_op, native_mul) { test_elementwise_op("elementwise_mul"); }
} // namespace anakin
} // namespace inference
} // namespace paddle
USE_OP(elementwise_add);
USE_ANAKIN_CONVERTER(elementwise_add);
USE_OP(elementwise_mul);
USE_ANAKIN_CONVERTER(elementwise_mul);
...@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include "paddle/fluid/inference/anakin/convert/fc.h"
#include "paddle/fluid/inference/anakin/convert/op_converter.h" #include "paddle/fluid/inference/anakin/convert/op_converter.h"
#include "paddle/fluid/inference/anakin/convert/ut_helper.h" #include "paddle/fluid/inference/anakin/convert/ut_helper.h"
...@@ -22,17 +21,15 @@ namespace inference { ...@@ -22,17 +21,15 @@ namespace inference {
namespace anakin { namespace anakin {
TEST(fc_op, test) { TEST(fc_op, test) {
auto fc_converter = OpRegister::instance()->Get("fc"); auto* fc_converter = Registry<AnakinOpConverter>::Global().Lookup("fc");
ASSERT_TRUE(fc_converter != nullptr); ASSERT_TRUE(fc_converter);
// Registrar<FcOpConverter> register_fc("fc");
// auto fc = std::make_shared<FcOpConverter>();
std::unordered_set<std::string> parameters({"mul_y"}); std::unordered_set<std::string> parameters({"mul_y"});
framework::Scope scope; framework::Scope scope;
AnakinConvertValidation validator(parameters, scope); AnakinConvertValidation validator(parameters, &scope);
validator.DeclInputVar("mul_x", {1, 1, 1, 1}); validator.DeclInputVar("mul_x", {1, 1, 2, 2});
validator.DeclParamVar("mul_y", {1, 2}); validator.DeclParamVar("mul_y", {4, 2});
validator.DeclOutputVar("mul_out", {1, 1, 1, 2}); validator.DeclOutputVar("mul_out", {1, 2});
// Prepare Op description // Prepare Op description
framework::OpDesc desc; framework::OpDesc desc;
...@@ -40,8 +37,6 @@ TEST(fc_op, test) { ...@@ -40,8 +37,6 @@ TEST(fc_op, test) {
desc.SetInput("X", {"mul_x"}); desc.SetInput("X", {"mul_x"});
desc.SetInput("Y", {"mul_y"}); desc.SetInput("Y", {"mul_y"});
desc.SetOutput("Out", {"mul_out"}); desc.SetOutput("Out", {"mul_out"});
int num_flatten_dims = 3;
desc.SetAttr("x_num_col_dims", num_flatten_dims);
validator.SetOp(*desc.Proto()); validator.SetOp(*desc.Proto());
validator.Execute(10); validator.Execute(10);
...@@ -52,3 +47,4 @@ TEST(fc_op, test) { ...@@ -52,3 +47,4 @@ TEST(fc_op, test) {
} // namespace paddle } // namespace paddle
USE_OP(mul); USE_OP(mul);
USE_ANAKIN_CONVERTER(fc);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/fluid/inference/anakin/convert/op_converter.h"
#include "paddle/fluid/inference/anakin/convert/ut_helper.h"
namespace paddle {
namespace inference {
namespace anakin {
TEST(flatten_op, test) {
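// Flatten from axis 1 onward: {3, 10, 10, 4} -> {3, 400}, declared as {3, 400, 1, 1} for the validator.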
auto *converter = Registry<AnakinOpConverter>::Global().Lookup("flatten");
ASSERT_TRUE(converter);
std::unordered_set<std::string> parameters;
framework::Scope scope;
AnakinConvertValidation validator(parameters, &scope);
validator.DeclInputVar("flatten-X", {3, 10, 10, 4});
validator.DeclOutputVar("flatten-Out", {3, 400, 1, 1});
framework::OpDesc desc;
desc.SetType("flatten");
desc.SetInput("X", {"flatten-X"});
desc.SetOutput("Out", {"flatten-Out"});
desc.SetAttr("axis", 1);
LOG(INFO) << "set OP";
validator.SetOp(*desc.Proto());
LOG(INFO) << "execute";
validator.Execute(5);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
USE_OP(reshape);
USE_OP_ITSELF(flatten);
USE_ANAKIN_CONVERTER(flatten);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/fluid/inference/anakin/convert/im2sequence.h"
#include "paddle/fluid/inference/anakin/convert/op_converter.h"
#include "paddle/fluid/inference/anakin/convert/ut_helper.h"
namespace paddle {
namespace inference {
namespace anakin {
TEST(im2sequence_op, native) {
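// im2sequence with a 6x1 kernel, unit strides and zero padding.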
std::unordered_set<std::string> parameters;
framework::Scope scope;
AnakinConvertValidation validator(parameters, &scope);
std::vector<int> kernels = {6, 1};
std::vector<int> strides = {1, 1};
std::vector<int> paddings = {0, 0, 0, 0};
validator.DeclInputVar("x", {1, 1, 2, 2});
validator.DeclOutputVar("out", {1, 1 * kernels[0] * kernels[1]});
// Prepare Op description
framework::OpDesc desc;
desc.SetType("im2sequence");
desc.SetInput("X", {"x"});
desc.SetOutput("Out", {"out"});
desc.SetAttr("kernels", kernels);
desc.SetAttr("strides", strides);
desc.SetAttr("paddings", paddings);
validator.SetOp(*desc.Proto());
validator.Execute(1);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
USE_OP(im2sequence);
USE_ANAKIN_CONVERTER(im2sequence);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/fluid/inference/anakin/convert/op_converter.h"
#include "paddle/fluid/inference/anakin/convert/ut_helper.h"
namespace paddle {
namespace inference {
namespace anakin {
void test_pool2d(bool global_pooling, bool ceil_mode,
std::string pool_type = "max") {
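// 2x2 pooling with stride 2 and no padding; the expected output shape depends on global_pooling and ceil_mode.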
auto* pool2d_converter =
Registry<AnakinOpConverter>::Global().Lookup("pool2d");
ASSERT_TRUE(pool2d_converter);
framework::Scope scope;
std::unordered_set<std::string> parameters;
AnakinConvertValidation validator(parameters, &scope);
// Declare the input and output with full NCHW shapes (batch size 1).
validator.DeclInputVar("pool2d_x", {1, 3, 6, 7});
if (global_pooling)
validator.DeclOutputVar("pool2d_out", {1, 3, 1, 1});
else if (ceil_mode)
validator.DeclOutputVar("pool2d_out", {1, 3, 3, 4});
else
validator.DeclOutputVar("pool2d_out", {1, 3, 3, 3});
// Prepare Op description
framework::OpDesc desc;
desc.SetType("pool2d");
desc.SetInput("X", {"pool2d_x"});
desc.SetOutput("Out", {"pool2d_out"});
std::vector<int> ksize({2, 2});
std::vector<int> strides({2, 2});
std::vector<int> paddings({0, 0});
std::string pooling_t = pool_type;
desc.SetAttr("pooling_type", pooling_t);
desc.SetAttr("ksize", ksize);
desc.SetAttr("strides", strides);
desc.SetAttr("paddings", paddings);
desc.SetAttr("global_pooling", global_pooling);
desc.SetAttr("ceil_mode", ceil_mode);
LOG(INFO) << "set OP";
validator.SetOp(*desc.Proto());
LOG(INFO) << "execute";
validator.Execute(1);
}
void test_pool2d2(bool global_pooling, bool ceil_mode,
std::string pool_type = "max") {
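// 3x3 pooling with stride 1 and padding 1; ceil_mode is forced on, so the 17x17 spatial size is preserved.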
auto* pool2d_converter =
Registry<AnakinOpConverter>::Global().Lookup("pool2d");
ASSERT_TRUE(pool2d_converter);
framework::Scope scope;
std::unordered_set<std::string> parameters;
AnakinConvertValidation validator(parameters, &scope);
// Declare the input and output with full NCHW shapes (batch size 1).
validator.DeclInputVar("pool2d_x", {1, 1, 17, 17});
validator.DeclOutputVar("pool2d_out", {1, 1, 17, 17});
// Prepare Op description
framework::OpDesc desc;
desc.SetType("pool2d");
desc.SetInput("X", {"pool2d_x"});
desc.SetOutput("Out", {"pool2d_out"});
std::vector<int> ksize({3, 3});
std::vector<int> strides({1, 1});
std::vector<int> paddings({1, 1});
std::string pooling_t = pool_type;
desc.SetAttr("pooling_type", pooling_t);
desc.SetAttr("ksize", ksize);
desc.SetAttr("strides", strides);
desc.SetAttr("paddings", paddings);
desc.SetAttr("global_pooling", global_pooling);
desc.SetAttr("ceil_mode", true);
LOG(INFO) << "set OP";
validator.SetOp(*desc.Proto());
LOG(INFO) << "execute";
validator.Execute(1);
}
TEST(Pool2dOpConverter, normal) { test_pool2d(false, false); }
TEST(Pool2dOpConverter, test_global_pooling) { test_pool2d(true, false); }
TEST(Pool2dOpConverter, max_ceil_test) { test_pool2d(false, true); }
TEST(Pool2dOpConverter, avg_ceil_test) { test_pool2d(false, true, "avg"); }
TEST(Pool2dOpConverter, avg_ceil_test2) { test_pool2d2(false, true, "avg"); }
} // namespace anakin
} // namespace inference
} // namespace paddle
USE_OP(pool2d);
USE_ANAKIN_CONVERTER(pool2d);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/fluid/inference/anakin/convert/op_converter.h"
#include "paddle/fluid/inference/anakin/convert/relu.h"
#include "paddle/fluid/inference/anakin/convert/ut_helper.h"
namespace paddle {
namespace inference {
namespace anakin {
static void test_activation_op(const std::string &op_type) {
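// Build a single activation op on a {10, 6, 1, 1} input and validate the converted result against Fluid.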
auto *converter = Registry<AnakinOpConverter>::Global().Lookup(op_type);
PADDLE_ENFORCE(converter != nullptr);
std::unordered_set<std::string> parameters;
framework::Scope scope;
AnakinConvertValidation validator(parameters, &scope);
validator.DeclInputVar("act-X", {10, 6, 1, 1});
validator.DeclOutputVar("act-Out", {10, 6, 1, 1});
framework::OpDesc desc;
desc.SetType(op_type);
desc.SetInput("X", {"act-X"});
desc.SetOutput("Out", {"act-Out"});
LOG(INFO) << "set OP";
validator.SetOp(*desc.Proto());
LOG(INFO) << "execute";
validator.Execute(5);
}
TEST(relu_op, test) { test_activation_op("relu"); }
} // namespace anakin
} // namespace inference
} // namespace paddle
USE_OP(relu);
USE_ANAKIN_CONVERTER(relu);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/fluid/inference/anakin/convert/op_converter.h"
#include "paddle/fluid/inference/anakin/convert/ut_helper.h"
namespace paddle {
namespace inference {
namespace anakin {
TEST(reshape, test) {
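// Reshape {1, 2, 4, 1} into {1, 8, 1, 1} using an explicit target shape.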
auto* converter = Registry<AnakinOpConverter>::Global().Lookup("reshape");
ASSERT_TRUE(converter);
framework::Scope scope;
std::unordered_set<std::string> parameters;
AnakinConvertValidation validator(parameters, &scope);
// validator.DeclInputVar("reshape-X", {2, 3, 3, 1});
// validator.DeclOutputVar("reshape-Out", {3, 2, 1, 3});
validator.DeclInputVar("reshape-X", {1, 2, 4, 1});
validator.DeclOutputVar("reshape-Out", {1, 8, 1, 1});
framework::OpDesc desc;
desc.SetType("reshape");
desc.SetInput("X", {"reshape-X"});
desc.SetOutput("Out", {"reshape-Out"});
// desc.SetAttr("shape", std::vector<int>({3, 2, 1, 3}));
desc.SetAttr("shape", std::vector<int>({1, 8, 1, 1}));
LOG(INFO) << "set OP";
validator.SetOp(*desc.Proto());
LOG(INFO) << "execute";
validator.Execute(1);
}
TEST(reshape, test2) {
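// Use 0 (keep dim) and -1 (infer dim) in the target shape: {1, 2, 4} -> {1, 4, 2}.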
framework::Scope scope;
std::unordered_set<std::string> parameters;
AnakinConvertValidation validator(parameters, &scope);
validator.DeclInputVar("reshape-X", {1, 2, 4});
validator.DeclOutputVar("reshape-Out", {1, 4, 2});
framework::OpDesc desc;
desc.SetType("reshape");
desc.SetInput("X", {"reshape-X"});
desc.SetOutput("Out", {"reshape-Out"});
// desc.SetAttr("shape", std::vector<int>({3, 2, 1, 3}));
desc.SetAttr("shape", std::vector<int>({0, -1, 2}));
LOG(INFO) << "set OP";
validator.SetOp(*desc.Proto());
LOG(INFO) << "execute";
validator.Execute(1);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
USE_OP(reshape);
USE_ANAKIN_CONVERTER(reshape);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/fluid/inference/anakin/convert/op_converter.h"
#include "paddle/fluid/inference/anakin/convert/ut_helper.h"
namespace paddle {
namespace inference {
namespace anakin {
TEST(softmax, test) {
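// Validate the softmax converter on a {1, 10, 2} input.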
auto* converter = Registry<AnakinOpConverter>::Global().Lookup("softmax");
ASSERT_TRUE(converter);
framework::Scope scope;
std::unordered_set<std::string> parameters;
AnakinConvertValidation validator(parameters, &scope);
validator.DeclInputVar("softmax-X", {1, 10, 2});
validator.DeclOutputVar("softmax-Out", {1, 10, 2});
framework::OpDesc desc;
desc.SetType("softmax");
desc.SetInput("X", {"softmax-X"});
desc.SetOutput("Out", {"softmax-Out"});
LOG(INFO) << "set OP";
validator.SetOp(*desc.Proto());
LOG(INFO) << "execute";
validator.Execute(1);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
USE_OP(softmax);
USE_ANAKIN_CONVERTER(softmax);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/fluid/inference/anakin/convert/op_converter.h"
#include "paddle/fluid/inference/anakin/convert/split.h"
#include "paddle/fluid/inference/anakin/convert/ut_helper.h"
namespace paddle {
namespace inference {
namespace anakin {
template <int Axis>
void AnakinSliceTest(const std::vector<int> &in_shape,
const std::vector<int> &sections) {
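// Split the input along the Axis template parameter into one output per section and validate each.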
std::unordered_set<std::string> parameters({""});
framework::Scope scope;
AnakinConvertValidation validator(parameters, &scope);
validator.DeclInputVar("split_input", in_shape);
std::vector<std::string> output_vars;
for (size_t i = 0; i < sections.size(); ++i) {
auto out_shape = in_shape;
out_shape[Axis] = sections[i];
std::string output_name = "split_out" + std::to_string(i);
validator.DeclOutputVar(output_name, out_shape);
output_vars.push_back(output_name);
}
// Prepare Op description
framework::OpDesc desc;
desc.SetType("split");
desc.SetInput("X", {"split_input"});
desc.SetOutput("Out", output_vars);
desc.SetAttr("axis", Axis);
desc.SetAttr("num", 0);
desc.SetAttr("sections", sections);
validator.SetOp(*desc.Proto());
validator.Execute(1);
}
// batch = 0, axis = 1, same shape
TEST(split_op, test_same_shape_axis1_batch1) {
AnakinSliceTest<1>({1, 4, 2, 2}, {2, 2});
}
// batch = 0, axis = 1, different shape
TEST(split_op, test_different_shape_axis1_batch1) {
AnakinSliceTest<1>({1, 3, 2, 2}, {2, 1});
}
// batch = 10, axis = 1, same shape
TEST(split_op, test_same_shape_axis1_batch10) {
AnakinSliceTest<1>({1, 4, 2, 2}, {2, 2});
}
// batch = 10, axis = 1, different shape
TEST(split_op, test_different_shape_axis1_batch10) {
AnakinSliceTest<1>({1, 3, 2, 2}, {2, 1});
}
// batch = 0, axis = 2, same shape
TEST(split_op, test_same_shape_axis2_batch1) {
AnakinSliceTest<2>({1, 3, 4, 2}, {2, 2});
}
// batch = 0, axis = 2, different shape
TEST(split_op, test_different_shape_axis2_batch1) {
AnakinSliceTest<2>({1, 3, 3, 2}, {2, 1});
}
// batch = 10, axis = 2, same shape
TEST(split_op, test_same_shape_axis2_batch10) {
AnakinSliceTest<2>({1, 3, 4, 2}, {2, 2});
}
// batch = 10, axis = 2, different shape
TEST(split_op, test_different_shape_axis2_batch10) {
AnakinSliceTest<2>({1, 3, 3, 2}, {2, 1});
}
// batch = 0, axis = 3, same shape
TEST(split_op, test_same_shape_axis3_batch1) {
AnakinSliceTest<3>({1, 3, 2, 4}, {2, 2});
}
// batch = 0, axis = 3, different shape
TEST(split_op, test_different_shape_axis3_batch1) {
AnakinSliceTest<3>({1, 3, 2, 3}, {2, 1});
}
// batch = 10, axis = 3, same shape
TEST(split_op, test_same_shape_axis3_batch10) {
AnakinSliceTest<3>({1, 3, 2, 4}, {2, 2});
}
// batch = 10, axis = 3, different shape
TEST(split_op, test_different_shape_axis3_batch10) {
AnakinSliceTest<3>({1, 3, 2, 3}, {2, 1});
}
} // namespace anakin
} // namespace inference
} // namespace paddle
USE_OP(split);
USE_ANAKIN_CONVERTER(split);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/fluid/inference/anakin/convert/op_converter.h"
#include "paddle/fluid/inference/anakin/convert/sum.h"
#include "paddle/fluid/inference/anakin/convert/ut_helper.h"
#include "paddle/fluid/operators/sum_op.h"
namespace paddle {
namespace inference {
namespace anakin {
TEST(sum, native) {
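// Element-wise sum of two tensors with identical shapes.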
std::unordered_set<std::string> parameters;
framework::Scope scope;
AnakinConvertValidation validator(parameters, &scope);
validator.DeclInputVar("sum_x1", {1, 2, 1, 2});
validator.DeclInputVar("sum_x2", {1, 2, 1, 2});
validator.DeclOutputVar("sum_out", {1, 2, 1, 2});
// Prepare Op description
framework::OpDesc desc;
desc.SetType("sum");
desc.SetInput("X", {"sum_x1", "sum_x2"});
desc.SetOutput("Out", {"sum_out"});
validator.SetOp(*desc.Proto());
validator.Execute(1);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
USE_OP(sum);
USE_ANAKIN_CONVERTER(sum);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/fluid/inference/anakin/convert/op_converter.h"
#include "paddle/fluid/inference/anakin/convert/ut_helper.h"
namespace paddle {
namespace inference {
namespace anakin {
TEST(transpose_op, test) {
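// Permute a 4-D tensor with axis = {2, 0, 3, 1}: {2, 3, 4, 5} -> {4, 2, 5, 3}.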
auto* converter = Registry<AnakinOpConverter>::Global().Lookup("transpose");
ASSERT_TRUE(converter != nullptr);
std::unordered_set<std::string> parameters;
framework::Scope scope;
AnakinConvertValidation validator(parameters, &scope);
validator.DeclInputVar("transpose-X", {2, 3, 4, 5});
validator.DeclOutputVar("transpose-Out", {4, 2, 5, 3});
// Prepare Op description
framework::OpDesc desc;
desc.SetType("transpose");
desc.SetInput("X", {"transpose-X"});
desc.SetOutput("Out", {"transpose-Out"});
desc.SetAttr("axis", std::vector<int>({2, 0, 3, 1}));
LOG(INFO) << "set OP";
validator.SetOp(*desc.Proto());
LOG(INFO) << "execute";
validator.Execute(3);
}
// Test an input whose rank is less than 4.
TEST(transpose_op, test2) {
std::unordered_set<std::string> parameters;
framework::Scope scope;
AnakinConvertValidation validator(parameters, &scope);
validator.DeclInputVar("transpose-X", {3, 4, 5});
validator.DeclOutputVar("transpose-Out", {3, 5, 4});
// Prepare Op description
framework::OpDesc desc;
desc.SetType("transpose");
desc.SetInput("X", {"transpose-X"});
desc.SetOutput("Out", {"transpose-Out"});
desc.SetAttr("axis", std::vector<int>({0, 2, 1}));
LOG(INFO) << "set OP";
validator.SetOp(*desc.Proto());
LOG(INFO) << "execute";
validator.Execute(1);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
USE_OP(transpose);
USE_ANAKIN_CONVERTER(transpose);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/anakin/convert/transpose.h"
#include <algorithm>
#include <string>
#include <vector>
using anakin::graph::GraphGlobalMem;
using anakin::AK_FLOAT;
using anakin::saber::NV;
using anakin::saber::Shape;
using anakin::PTuple;
namespace paddle {
namespace inference {
namespace anakin {
void TransposeOpConverter::operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope,
bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
auto input = op_desc.Input("X").front();
auto output = op_desc.Output("Out").front();
auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
engine_->AddOp(op_name, "Permute", {input}, {output});
auto axis = boost::get<std::vector<int>>(op_desc.GetAttr("axis"));
size_t axis_size = axis.size();
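// Pad the permutation to four dims (identity indices for the trailing axes), since the Anakin Permute attribute used here is 4-D.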
while (axis.size() < 4) {
axis.push_back(axis_size);
axis_size += 1;
}
engine_->AddOpAttr<PTuple<int>>(op_name, "dims", axis);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
REGISTER_ANAKIN_OP_CONVERTER(transpose, TransposeOpConverter);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/inference/anakin/convert/op_converter.h"
namespace paddle {
namespace inference {
namespace anakin {
class TransposeOpConverter : public AnakinOpConverter {
public:
TransposeOpConverter() = default;
virtual void operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope,
bool test_mode) override;
virtual ~TransposeOpConverter() {}
};
} // namespace anakin
} // namespace inference
} // namespace paddle
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once #pragma once
#include <gtest/gtest.h>
#include <map> #include <map>
#include <memory> #include <memory>
#include <string> #include <string>
...@@ -24,6 +25,7 @@ limitations under the License. */ ...@@ -24,6 +25,7 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/inference/anakin/convert/op_converter.h"
#include "paddle/fluid/inference/anakin/engine.h" #include "paddle/fluid/inference/anakin/engine.h"
#include "paddle/fluid/inference/analysis/helper.h" #include "paddle/fluid/inference/analysis/helper.h"
#include "paddle/fluid/inference/utils/singleton.h" #include "paddle/fluid/inference/utils/singleton.h"
...@@ -82,7 +84,7 @@ class AnakinConvertValidation { ...@@ -82,7 +84,7 @@ class AnakinConvertValidation {
AnakinConvertValidation() = delete; AnakinConvertValidation() = delete;
AnakinConvertValidation(const std::unordered_set<std::string>& parameters, AnakinConvertValidation(const std::unordered_set<std::string>& parameters,
const framework::Scope& scope) framework::Scope* scope)
: parameters_(parameters), scope_(scope), place_(0) { : parameters_(parameters), scope_(scope), place_(0) {
PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0); PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0);
engine_.reset(new AnakinEngine<NV, Precision::FP32>(true)); engine_.reset(new AnakinEngine<NV, Precision::FP32>(true));
...@@ -106,7 +108,7 @@ class AnakinConvertValidation { ...@@ -106,7 +108,7 @@ class AnakinConvertValidation {
void DeclVar(const std::string& name, const std::vector<int> dim_vec) { void DeclVar(const std::string& name, const std::vector<int> dim_vec) {
platform::CUDADeviceContext ctx(place_); platform::CUDADeviceContext ctx(place_);
auto* x = scope_.Var(name); auto* x = scope_->Var(name);
auto* x_tensor = x->GetMutable<framework::LoDTensor>(); auto* x_tensor = x->GetMutable<framework::LoDTensor>();
x_tensor->Resize(framework::make_ddim(dim_vec)); x_tensor->Resize(framework::make_ddim(dim_vec));
RandomizeTensor(x_tensor, place_, ctx); RandomizeTensor(x_tensor, place_, ctx);
...@@ -118,15 +120,22 @@ class AnakinConvertValidation { ...@@ -118,15 +120,22 @@ class AnakinConvertValidation {
// should init anakin engine here. // should init anakin engine here.
Singleton<AnakinOpConverter>::Global().ConvertOp( Singleton<AnakinOpConverter>::Global().ConvertOp(
desc, parameters_, scope_, engine_.get(), true /*test_mode*/); desc, parameters_, *scope_, engine_.get(), true /*test_mode*/);
engine_->Freeze(); engine_->Freeze();
std::map<std::string, std::vector<int>> temp_max_input_shape;
for (const auto& input : op_desc_->InputArgumentNames()) { for (const auto& input : op_desc_->InputArgumentNames()) {
if (parameters_.count(input)) continue; if (parameters_.count(input)) continue;
auto& t = inference::analysis::GetFromScope<framework::LoDTensor>(scope_, auto& t = inference::analysis::GetFromScope<framework::LoDTensor>(*scope_,
input); input);
auto t_shape = framework::vectorize2int(t.dims()); auto t_shape = framework::vectorize2int(t.dims());
while (t_shape.size() < 4) {
t_shape.push_back(1);
}
engine_->SetInputShape(input, t_shape); engine_->SetInputShape(input, t_shape);
temp_max_input_shape[input] = t_shape;
} }
engine_->SetMaxInputShape(temp_max_input_shape);
engine_->Optimize(); engine_->Optimize();
engine_->InitGraph(); engine_->InitGraph();
} }
...@@ -138,14 +147,14 @@ class AnakinConvertValidation { ...@@ -138,14 +147,14 @@ class AnakinConvertValidation {
std::unordered_set<std::string> neglected_output = {}) { std::unordered_set<std::string> neglected_output = {}) {
// Execute Fluid Op // Execute Fluid Op
platform::CUDADeviceContext ctx(place_); platform::CUDADeviceContext ctx(place_);
op_->Run(scope_, place_); op_->Run(*scope_, place_);
// std::vector<framework::LoDTensor> input_vector; // std::vector<framework::LoDTensor> input_vector;
// std::vector<framework::LoDTensor> output_vector; // std::vector<framework::LoDTensor> output_vector;
std::map<std::string, framework::LoDTensor*> inputs; std::map<std::string, framework::LoDTensor*> inputs;
for (const auto& input : op_desc_->InputArgumentNames()) { for (const auto& input : op_desc_->InputArgumentNames()) {
if (parameters_.count(input)) continue; if (parameters_.count(input)) continue;
auto* var = scope_.FindVar(input); auto* var = scope_->FindVar(input);
auto tensor = var->GetMutable<framework::LoDTensor>(); auto tensor = var->GetMutable<framework::LoDTensor>();
inputs.insert({input, tensor}); inputs.insert({input, tensor});
} }
...@@ -155,45 +164,38 @@ class AnakinConvertValidation { ...@@ -155,45 +164,38 @@ class AnakinConvertValidation {
for (const auto& output : op_desc_->OutputArgumentNames()) { for (const auto& output : op_desc_->OutputArgumentNames()) {
if (neglected_output.count(output)) continue; if (neglected_output.count(output)) continue;
std::vector<float> fluid_out; std::vector<float> fluid_out;
auto* var = scope_.FindVar(output); auto* var = scope_->FindVar(output);
auto tensor = var->GetMutable<framework::LoDTensor>(); auto tensor = var->GetMutable<framework::LoDTensor>();
framework::TensorToVector(*tensor, ctx, &fluid_out); framework::TensorToVector(*tensor, ctx, &fluid_out);
fluid_outputs.push_back(fluid_out); fluid_outputs.push_back(fluid_out);
// size_t fluid_out_size = fluid_out.size();
/*for (size_t i = 0; i < fluid_out_size; i++) {
std::cout << fluid_out[i] << std::endl;
}*/
outputs.insert({output, tensor}); outputs.insert({output, tensor});
} }
engine_->Execute(inputs, outputs); engine_->Execute(inputs, outputs, stream_);
int i_output = 0; int i_output = 0;
for (const auto& output : op_desc_->OutputArgumentNames()) { for (const auto& output : op_desc_->OutputArgumentNames()) {
if (neglected_output.count(output)) continue; if (neglected_output.count(output)) continue;
std::vector<float> anakin_out; std::vector<float> anakin_out;
auto* var = scope_.FindVar(output); auto* var = scope_->FindVar(output);
auto tensor = var->GetMutable<framework::LoDTensor>(); auto tensor = var->GetMutable<framework::LoDTensor>();
framework::TensorToVector(*tensor, ctx, &anakin_out); framework::TensorToVector(*tensor, ctx, &anakin_out);
size_t anakin_out_size = anakin_out.size(); size_t anakin_out_size = anakin_out.size();
auto fluid_out = fluid_outputs[i_output++]; auto fluid_out = fluid_outputs[i_output++];
for (size_t i = 0; i < anakin_out_size; i++) { for (size_t i = 0; i < anakin_out_size; i++) {
LOG(INFO) << "Output[" << i << "]: anakin[" << anakin_out[i] << "], " EXPECT_LT(std::abs(fluid_out[i] - anakin_out[i]), 1e-3);
<< "fluid[" << fluid_out[i] << "]";
} }
} }
} }
framework::Scope& scope() { return scope_; }
private: private:
std::unique_ptr<AnakinNvEngineT> engine_{nullptr}; std::unique_ptr<AnakinNvEngineT> engine_{nullptr};
cudaStream_t stream_; cudaStream_t stream_;
std::unique_ptr<framework::OperatorBase> op_; std::unique_ptr<framework::OperatorBase> op_;
std::unique_ptr<framework::OpDesc> op_desc_; std::unique_ptr<framework::OpDesc> op_desc_;
const std::unordered_set<std::string>& parameters_; const std::unordered_set<std::string>& parameters_;
framework::Scope& scope_; framework::Scope* scope_;
platform::CUDAPlace place_; platform::CUDAPlace place_;
}; };
......
...@@ -33,9 +33,15 @@ namespace inference { ...@@ -33,9 +33,15 @@ namespace inference {
namespace anakin { namespace anakin {
template <typename TargetT, Precision PrecisionType, OpRunType RunType> template <typename TargetT, Precision PrecisionType, OpRunType RunType>
AnakinEngine<TargetT, PrecisionType, RunType>::AnakinEngine(bool need_summary) AnakinEngine<TargetT, PrecisionType, RunType>::AnakinEngine(
bool need_summary, int device, int max_batch_size,
std::map<std::string, std::vector<int>> max_input_shape)
: graph_(new AnakinGraphT<TargetT, PrecisionType>()), : graph_(new AnakinGraphT<TargetT, PrecisionType>()),
net_(new AnakinNetT<TargetT, PrecisionType, RunType>(need_summary)) {} net_(new AnakinNetT<TargetT, PrecisionType, RunType>(need_summary)) {
device_ = device;
max_batch_size_ = max_batch_size;
max_input_shape_ = max_input_shape;
}
template <typename TargetT, Precision PrecisionType, OpRunType RunType> template <typename TargetT, Precision PrecisionType, OpRunType RunType>
AnakinEngine<TargetT, PrecisionType, RunType>::~AnakinEngine() {} AnakinEngine<TargetT, PrecisionType, RunType>::~AnakinEngine() {}
...@@ -63,34 +69,53 @@ void AnakinEngine<TargetT, PrecisionType, RunType>::AddOp( ...@@ -63,34 +69,53 @@ void AnakinEngine<TargetT, PrecisionType, RunType>::AddOp(
template <typename TargetT, Precision PrecisionType, OpRunType RunType> template <typename TargetT, Precision PrecisionType, OpRunType RunType>
void AnakinEngine<TargetT, PrecisionType, RunType>::Execute( void AnakinEngine<TargetT, PrecisionType, RunType>::Execute(
const std::map<std::string, framework::LoDTensor *> &inputs, const std::map<std::string, framework::LoDTensor *> &inputs,
const std::map<std::string, framework::LoDTensor *> &outputs) { const std::map<std::string, framework::LoDTensor *> &outputs,
cudaStream_t stream) {
cudaDeviceSynchronize();
for (const auto &input : inputs) { for (const auto &input : inputs) {
auto *tensor = input.second; auto *tensor = input.second;
auto *data = tensor->data<float>(); auto *data = tensor->data<float>();
auto shape = framework::vectorize2int(tensor->dims());
::anakin::saber::Shape anakin_shape(shape); auto fluid_input_shape = framework::vectorize2int(tensor->dims());
while (fluid_input_shape.size() < 4) {
fluid_input_shape.push_back(1);
}
auto *anakin_input = net_->get_in(input.first); auto *anakin_input = net_->get_in(input.first);
std::vector<int> max_input_shape = max_input_shape_[input.first];
int max_shape_sum =
std::accumulate(max_input_shape.begin(), max_input_shape.end(), 1,
std::multiplies<int>());
PADDLE_ENFORCE(max_shape_sum >= tensor->numel(),
"The anakin input max shape should be greater than"
" or equal to the real input shape, Please set the max "
"input shape using EnableAnakinEngine");
anakin_input->reshape(fluid_input_shape);
::anakin::saber::Tensor<TargetT> tmp_anakin_tensor(data, TargetT(), 0, ::anakin::saber::Tensor<TargetT> tmp_anakin_tensor(data, TargetT(), 0,
anakin_shape); fluid_input_shape);
anakin_input->share_from(tmp_anakin_tensor); anakin_input->copy_from(tmp_anakin_tensor);
} }
net_->prediction();
cudaDeviceSynchronize();
for (const auto &output : outputs) { for (const auto &output : outputs) {
platform::CUDAPlace gpu_place(device_);
auto *tensor = output.second; auto *tensor = output.second;
auto *data = tensor->data<float>();
auto shape = framework::vectorize2int(tensor->dims());
::anakin::saber::Shape anakin_shape(shape);
auto *anakin_output = net_->get_out(output.first); auto *anakin_output = net_->get_out(output.first);
::anakin::saber::Tensor<TargetT> tmp_anakin_tensor(data, TargetT(), 0, auto *anakin_data = anakin_output->data();
anakin_shape); auto anakin_output_shape = anakin_output->valid_shape();
anakin_output->share_from(tmp_anakin_tensor); tensor->Resize(framework::make_ddim(anakin_output_shape));
auto *fluid_data = tensor->mutable_data<float>(gpu_place);
memory::Copy(gpu_place, static_cast<void *>(fluid_data), gpu_place,
static_cast<void *>(anakin_data),
tensor->numel() * sizeof(float), stream);
} }
net_->prediction(); cudaDeviceSynchronize();
} }
template <typename TargetT, Precision PrecisionType, OpRunType RunType> template <typename TargetT, Precision PrecisionType, OpRunType RunType>
void AnakinEngine<TargetT, PrecisionType, RunType>::Freeze() { void AnakinEngine<TargetT, PrecisionType, RunType>::Freeze() {
PADDLE_ENFORCE(graph_->Freeze(), "Freeze anakin subgraph."); PADDLE_ENFORCE(graph_->Freeze_v3(), "Freeze anakin subgraph.");
} }
template <typename TargetT, Precision PrecisionType, OpRunType RunType> template <typename TargetT, Precision PrecisionType, OpRunType RunType>
......
...@@ -15,9 +15,11 @@ ...@@ -15,9 +15,11 @@
#pragma once #pragma once
#include <algorithm> #include <algorithm>
#include <functional>
#include <map> #include <map>
#include <memory> #include <memory>
#include <string> #include <string>
#include <unordered_map>
#include <vector> #include <vector>
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/inference/engine.h" #include "paddle/fluid/inference/engine.h"
...@@ -26,8 +28,12 @@ ...@@ -26,8 +28,12 @@
#include "framework/core/net/net.h" #include "framework/core/net/net.h"
#include "framework/core/types.h" #include "framework/core/types.h"
#include "framework/graph/graph.h" #include "framework/graph/graph.h"
#include "framework/graph/graph_global_mem.h"
#include "saber/saber_types.h" #include "saber/saber_types.h"
using anakin::Precision;
using anakin::saber::NV;
namespace anakin { namespace anakin {
template <typename, Precision, OpRunType> template <typename, Precision, OpRunType>
...@@ -46,8 +52,13 @@ namespace anakin { ...@@ -46,8 +52,13 @@ namespace anakin {
template <typename TargetT, ::anakin::Precision PrecisionType, template <typename TargetT, ::anakin::Precision PrecisionType,
::anakin::OpRunType RunType = ::anakin::OpRunType::ASYNC> ::anakin::OpRunType RunType = ::anakin::OpRunType::ASYNC>
class AnakinEngine { class AnakinEngine {
using NetT = ::anakin::Net<TargetT, PrecisionType, RunType>;
using GraphT = ::anakin::graph::Graph<TargetT, PrecisionType>;
public: public:
explicit AnakinEngine(bool need_summary = false); explicit AnakinEngine(
bool need_summary = false, int device = 0, int max_batch_size = 1,
std::map<std::string, std::vector<int>> max_input_shape = {});
~AnakinEngine(); ~AnakinEngine();
void InitGraph(); void InitGraph();
void SetInputShape(const std::string &name, std::vector<int> shape); void SetInputShape(const std::string &name, std::vector<int> shape);
...@@ -61,20 +72,72 @@ class AnakinEngine { ...@@ -61,20 +72,72 @@ class AnakinEngine {
PADDLE_ENFORCE(graph_->AddOpAttr(op_name, attr_name, attr_value), PADDLE_ENFORCE(graph_->AddOpAttr(op_name, attr_name, attr_value),
"Add operation's attribution."); "Add operation's attribution.");
} }
NetT *Net() { return net_.get(); }
GraphT *Graph() { return graph_.get(); }
std::unique_ptr<AnakinEngine> Clone(); std::unique_ptr<AnakinEngine> Clone();
const std::map<std::string, std::vector<int>> &GetMaxInputShape() {
return max_input_shape_;
}
void SetMaxInputShape(std::map<std::string, std::vector<int>> shape) {
max_input_shape_ = shape;
}
int GetMaxBatchSize() { return max_batch_size_; }
void Freeze(); void Freeze();
void Optimize(); void Optimize();
void AllocTmpMem() {
PADDLE_ENFORCE(net_->alloc_memory_first(*graph_),
"anakin alloc temp memory first failed");
}
void Save(std::string path) { graph_->save(path); }
bool IsInit() { return initialized_; }
int GetDevice() { return device_; }
void Execute(const std::map<std::string, framework::LoDTensor *> &inputs, void Execute(const std::map<std::string, framework::LoDTensor *> &inputs,
const std::map<std::string, framework::LoDTensor *> &outputs); const std::map<std::string, framework::LoDTensor *> &outputs,
cudaStream_t stream);
private: private:
using NetT = ::anakin::Net<TargetT, PrecisionType, RunType>; bool initialized_{false};
using GraphT = ::anakin::graph::Graph<TargetT, PrecisionType>; int max_batch_size_;
std::map<std::string, std::vector<int>> max_input_shape_;
int device_;
std::unique_ptr<GraphT> graph_; std::unique_ptr<GraphT> graph_;
std::unique_ptr<NetT> net_; std::unique_ptr<NetT> net_;
}; };
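// Manages one AnakinEngine per engine name; Create() is guarded by a mutex.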
class AnakinEngineManager {
using AnakinNvEngineT = AnakinEngine<NV, Precision::FP32>;
public:
bool HasEngine(const std::string &name) const {
if (engines_.count(name) == 0) return false;
return engines_.at(name).get() != nullptr;
}
AnakinNvEngineT *Get(const std::string &name) const {
return engines_.at(name).get();
}
AnakinNvEngineT *Create(
bool need_summary, int device, int max_batch_size,
std::map<std::string, std::vector<int>> max_input_shape,
std::string engine_name) {
std::unique_lock<std::mutex> lk(mut_);
auto *p = new AnakinEngine<NV, Precision::FP32>(
need_summary, device, max_batch_size, max_input_shape);
engines_[engine_name].reset(p);
return p;
}
void DeleteALL() {
for (auto &item : engines_) {
item.second.reset(nullptr);
}
}
private:
std::unordered_map<std::string, std::unique_ptr<AnakinNvEngineT>> engines_;
std::mutex mut_;
};
} // namespace anakin } // namespace anakin
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/anakin/op_teller.h"
namespace paddle {
namespace inference {
namespace anakin {
// Just tell by the op_types.
struct SimpleOpTypeSetTeller : public Teller {
SimpleOpTypeSetTeller() {
teller_set.insert("mul");
teller_set.insert("fc");
teller_set.insert("conv2d_fusion");
teller_set.insert("split");
teller_set.insert("relu");
teller_set.insert("pool2d");
teller_set.insert("elementwise_add");
teller_set.insert("elementwise_mul");
teller_set.insert("concat");
teller_set.insert("tanh");
teller_set.insert("conv2d");
teller_set.insert("batch_norm");
teller_set.insert("softmax");
teller_set.insert("flatten2");
teller_set.insert("reshape2");
teller_set.insert("transpose2");
teller_set.insert("density_prior_box");
teller_set.insert("detection_out");
teller_set.insert("dropout");
teller_set.insert("sigmoid");
teller_set.insert("sum");
}
bool operator()(const std::string& op_type,
const framework::OpDesc& desc) override {
return teller_set.count(op_type);
}
private:
std::unordered_set<std::string> teller_set;
};
bool OpTeller::Tell(const std::string& op_type, const framework::OpDesc& desc) {
for (auto& teller : tellers_) {
if ((*teller)(op_type, desc)) return true;
}
return false;
}
OpTeller::OpTeller() { tellers_.emplace_back(new SimpleOpTypeSetTeller); }
} // namespace anakin
} // namespace inference
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/op_desc.h"
namespace paddle {
namespace inference {
namespace anakin {
/*
* Single Op teller definition.
* One can override this to define a more complex tell logic that considers more
* issues, such as the op_desc.
*/
struct Teller {
virtual bool operator()(const std::string& op_type,
const framework::OpDesc& desc) = 0;
virtual ~Teller() = default;
};
/*
* A real example:
*
* struct SomeTeller : public Teller {
* bool operator()(const std::string& op_type,
* const framework::OpDesc& desc) override {
* return op_type == "fc" && desc.Inputs().size() == 2;
* }
*};
*/
/*
* class OpTeller helps to tell whether a fluid
* operator can be transformed to an Anakin layer.
*/
class OpTeller {
public:
static OpTeller& Global() {
static std::unique_ptr<OpTeller> x(new OpTeller);
return *x;
}
bool Tell(const std::string& op_type, const framework::OpDesc& desc);
private:
OpTeller();
private:
std::vector<std::unique_ptr<Teller>> tellers_;
};
} // namespace anakin
} // namespace inference
} // namespace paddle
...@@ -17,9 +17,6 @@ limitations under the License. */ ...@@ -17,9 +17,6 @@ limitations under the License. */
#include <map> #include <map>
#include "framework/core/net/net.h"
#include "framework/graph/graph.h"
#include "framework/graph/graph_global_mem.h"
#include "paddle/fluid/inference/anakin/engine.h" #include "paddle/fluid/inference/anakin/engine.h"
using anakin::graph::GraphGlobalMem; using anakin::graph::GraphGlobalMem;
...@@ -84,7 +81,9 @@ TEST_F(TestAnakinEngine, Execute) { ...@@ -84,7 +81,9 @@ TEST_F(TestAnakinEngine, Execute) {
auto *y_data = y.mutable_data<float>(platform::CUDAPlace()); auto *y_data = y.mutable_data<float>(platform::CUDAPlace());
std::map<std::string, framework::LoDTensor *> outputs = {{"y", &y}}; std::map<std::string, framework::LoDTensor *> outputs = {{"y", &y}};
engine_->Execute(inputs, outputs); cudaStream_t stream;
engine_->Execute(inputs, outputs, stream);
auto *y_data_gpu = y_data; auto *y_data_gpu = y_data;
float y_data_cpu[2]; float y_data_cpu[2];
cudaMemcpy(y_data_cpu, y_data_gpu, sizeof(float) * 2, cudaMemcpyDeviceToHost); cudaMemcpy(y_data_cpu, y_data_gpu, sizeof(float) * 2, cudaMemcpyDeviceToHost);
......
...@@ -23,6 +23,7 @@ ...@@ -23,6 +23,7 @@
#pragma once #pragma once
#include <map>
#include <memory> #include <memory>
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
...@@ -41,8 +42,11 @@ namespace inference { ...@@ -41,8 +42,11 @@ namespace inference {
namespace analysis { namespace analysis {
using framework::ir::Graph; using framework::ir::Graph;
#ifdef PADDLE_WITH_MKLDNN
using VarQuantScale = using VarQuantScale =
std::unordered_map<std::string, std::pair<bool, framework::LoDTensor>>; std::unordered_map<std::string, std::pair<bool, framework::LoDTensor>>;
#endif
/* /*
* The argument definition of both Pass and PassManagers. * The argument definition of both Pass and PassManagers.
...@@ -55,6 +59,8 @@ struct Argument { ...@@ -55,6 +59,8 @@ struct Argument {
using unique_ptr_t = std::unique_ptr<void, std::function<void(void*)>>; using unique_ptr_t = std::unique_ptr<void, std::function<void(void*)>>;
using fusion_statis_t = std::unordered_map<std::string, int>; using fusion_statis_t = std::unordered_map<std::string, int>;
using engine_opt_info_t = std::map<std::string, std::string>;
using anakin_max_shape_t = std::map<std::string, std::vector<int>>;
bool Has(const std::string& key) const { return valid_fields_.count(key); } bool Has(const std::string& key) const { return valid_fields_.count(key); }
...@@ -107,12 +113,14 @@ struct Argument { ...@@ -107,12 +113,14 @@ struct Argument {
private: \ private: \
unique_ptr_t field__##_; unique_ptr_t field__##_;
DECL_ARGUMENT_FIELD(predictor_id, PredictorID, int);
// Model path // Model path
DECL_ARGUMENT_FIELD(model_dir, ModelDir, std::string); DECL_ARGUMENT_FIELD(model_dir, ModelDir, std::string);
// Model specified with program and parameters files. // Model specified with program and parameters files.
DECL_ARGUMENT_FIELD(model_program_path, ModelProgramPath, std::string); DECL_ARGUMENT_FIELD(model_program_path, ModelProgramPath, std::string);
DECL_ARGUMENT_FIELD(model_params_path, ModelParamsPath, std::string); DECL_ARGUMENT_FIELD(model_params_path, ModelParamsPath, std::string);
DECL_ARGUMENT_FIELD(model_from_memory, ModelFromMemory, bool); DECL_ARGUMENT_FIELD(model_from_memory, ModelFromMemory, bool);
DECL_ARGUMENT_FIELD(engine_opt_info, EngineOptInfo, engine_opt_info_t);
// The overall graph to work on. // The overall graph to work on.
DECL_ARGUMENT_UNIQUE_FIELD(main_graph, MainGraph, framework::ir::Graph); DECL_ARGUMENT_UNIQUE_FIELD(main_graph, MainGraph, framework::ir::Graph);
...@@ -132,6 +140,7 @@ struct Argument { ...@@ -132,6 +140,7 @@ struct Argument {
DECL_ARGUMENT_FIELD(mkldnn_enabled_op_types, MKLDNNEnabledOpTypes, DECL_ARGUMENT_FIELD(mkldnn_enabled_op_types, MKLDNNEnabledOpTypes,
std::unordered_set<std::string>); std::unordered_set<std::string>);
#ifdef PADDLE_WITH_MKLDNN
// A set of op types to enable their quantized kernels // A set of op types to enable their quantized kernels
DECL_ARGUMENT_FIELD(quantize_enabled_op_types, QuantizeEnabledOpTypes, DECL_ARGUMENT_FIELD(quantize_enabled_op_types, QuantizeEnabledOpTypes,
std::unordered_set<std::string>); std::unordered_set<std::string>);
...@@ -142,6 +151,7 @@ struct Argument { ...@@ -142,6 +151,7 @@ struct Argument {
// Scales for variables to be quantized // Scales for variables to be quantized
DECL_ARGUMENT_FIELD(quant_var_scales, QuantVarScales, VarQuantScale); DECL_ARGUMENT_FIELD(quant_var_scales, QuantVarScales, VarQuantScale);
#endif
// Passed from config. // Passed from config.
DECL_ARGUMENT_FIELD(use_gpu, UseGPU, bool); DECL_ARGUMENT_FIELD(use_gpu, UseGPU, bool);
...@@ -155,6 +165,11 @@ struct Argument { ...@@ -155,6 +165,11 @@ struct Argument {
DECL_ARGUMENT_FIELD(tensorrt_use_static_engine, TensorRtUseStaticEngine, DECL_ARGUMENT_FIELD(tensorrt_use_static_engine, TensorRtUseStaticEngine,
bool); bool);
DECL_ARGUMENT_FIELD(anakin_max_input_shape, AnakinMaxInputShape,
anakin_max_shape_t);
DECL_ARGUMENT_FIELD(anakin_max_batch_size, AnakinMaxBatchSize, int);
DECL_ARGUMENT_FIELD(use_anakin, UseAnakin, bool);
// Memory optimized related. // Memory optimized related.
DECL_ARGUMENT_FIELD(enable_memory_optim, EnableMemoryOptim, bool); DECL_ARGUMENT_FIELD(enable_memory_optim, EnableMemoryOptim, bool);
DECL_ARGUMENT_FIELD(static_memory_optim, StaticMemoryOptim, bool); DECL_ARGUMENT_FIELD(static_memory_optim, StaticMemoryOptim, bool);
......
...@@ -13,9 +13,12 @@ ...@@ -13,9 +13,12 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/inference/analysis/ir_pass_manager.h" #include "paddle/fluid/inference/analysis/ir_pass_manager.h"
#include <map>
#include <memory>
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
#include <unordered_set> #include <unordered_set>
#include <utility>
#include <vector> #include <vector>
#include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph.h"
...@@ -61,6 +64,7 @@ void IRPassManager::CreatePasses(Argument *argument, ...@@ -61,6 +64,7 @@ void IRPassManager::CreatePasses(Argument *argument,
pass->Set("mkldnn_enabled_op_types", pass->Set("mkldnn_enabled_op_types",
new std::unordered_set<std::string>( new std::unordered_set<std::string>(
argument->mkldnn_enabled_op_types())); argument->mkldnn_enabled_op_types()));
#ifdef PADDLE_WITH_MKLDNN
} else if (pass_name == "cpu_quantize_placement_pass") { } else if (pass_name == "cpu_quantize_placement_pass") {
pass->Set("quantize_enabled_op_types", pass->Set("quantize_enabled_op_types",
new std::unordered_set<std::string>( new std::unordered_set<std::string>(
...@@ -71,6 +75,7 @@ void IRPassManager::CreatePasses(Argument *argument, ...@@ -71,6 +75,7 @@ void IRPassManager::CreatePasses(Argument *argument,
} else if (pass_name == "cpu_quantize_pass") { } else if (pass_name == "cpu_quantize_pass") {
pass->Set("quant_var_scales", pass->Set("quant_var_scales",
new VarQuantScale(argument->quant_var_scales())); new VarQuantScale(argument->quant_var_scales()));
#endif
} else if (pass_name == "tensorrt_subgraph_pass") { } else if (pass_name == "tensorrt_subgraph_pass") {
pass->Set("workspace_size", new int(argument->tensorrt_workspace_size())); pass->Set("workspace_size", new int(argument->tensorrt_workspace_size()));
pass->Set("max_batch_size", new int(argument->tensorrt_max_batch_size())); pass->Set("max_batch_size", new int(argument->tensorrt_max_batch_size()));
...@@ -83,16 +88,40 @@ void IRPassManager::CreatePasses(Argument *argument, ...@@ -83,16 +88,40 @@ void IRPassManager::CreatePasses(Argument *argument,
AnalysisConfig::Precision::kInt8; AnalysisConfig::Precision::kInt8;
pass->Set("enable_int8", new bool(enable_int8)); pass->Set("enable_int8", new bool(enable_int8));
std::string model_opt_cache_dir =
argument->Has("model_dir") bool use_static_engine = argument->tensorrt_use_static_engine();
? argument->model_dir() bool model_from_memory = argument->model_from_memory();
: GetDirRoot(argument->model_program_path()); bool int8_valid = !(model_from_memory && enable_int8);
pass->Set( PADDLE_ENFORCE(int8_valid,
"model_opt_cache_dir", "TRT INT8 Now don't support model load from memory.");
new std::string(GetOrCreateModelOptCacheDir(model_opt_cache_dir)));
if ((!model_from_memory && use_static_engine) || enable_int8) {
std::string model_opt_cache_dir =
argument->Has("model_dir")
? argument->model_dir()
: GetDirRoot(argument->model_program_path());
pass->Set(
"model_opt_cache_dir",
new std::string(GetOrCreateModelOptCacheDir(model_opt_cache_dir)));
}
pass->Set("gpu_device_id", new int(argument->gpu_device_id()));
pass->Set("use_static_engine", new bool(use_static_engine));
pass->Set("model_from_memory", new bool(argument->model_from_memory()));
pass->Set("engine_opt_info", new std::map<std::string, std::string>(
argument->engine_opt_info()));
}
if (pass_name == "anakin_subgraph_pass") {
pass->Set("program",
new framework::ProgramDesc *(&argument->main_program()));
pass->Set("gpu_device_id", new int(argument->gpu_device_id())); pass->Set("gpu_device_id", new int(argument->gpu_device_id()));
pass->Set("use_static_engine", pass->Set("model_from_memory", new bool(argument->model_from_memory()));
new bool(argument->tensorrt_use_static_engine())); pass->Set("engine_opt_info", new std::map<std::string, std::string>(
argument->engine_opt_info()));
pass->Set("predictor_id", new int(argument->predictor_id()));
pass->Set("max_input_shape", new std::map<std::string, std::vector<int>>(
argument->anakin_max_input_shape()));
pass->Set("max_batch_size", new int(argument->anakin_max_batch_size()));
} }
pre_pass = pass_name; pre_pass = pass_name;
......
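Note on the mechanism used above: attributes attached to a pass with pass->Set(key, new T(...)) are owned by the pass and read back inside its ApplyImpl with Get<T>(key). A minimal sketch, reusing the keys set above (illustrative, not part of this diff):

    // Inside AnakinSubgraphPass::CreateAnakinOp (see the pass source below):
    int max_batch = Get<int>("max_batch_size");
    auto max_shapes =
        Get<std::map<std::string, std::vector<int>>>("max_input_shape");
    int device_id = Get<int>("gpu_device_id");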
cc_library(subgraph_detector SRCS subgraph_detector.cc DEPS proto_desc) cc_library(subgraph_detector SRCS subgraph_detector.cc subgraph_util.cc DEPS proto_desc)
if(WITH_TESTING) if(WITH_TESTING)
add_dependencies(subgraph_detector gtest) add_dependencies(subgraph_detector gtest)
endif() endif()
...@@ -14,3 +14,15 @@ if (WITH_GPU AND TENSORRT_FOUND) ...@@ -14,3 +14,15 @@ if (WITH_GPU AND TENSORRT_FOUND)
file(APPEND ${pass_file} "USE_PASS(tensorrt_subgraph_pass);\n") file(APPEND ${pass_file} "USE_PASS(tensorrt_subgraph_pass);\n")
set(INFER_IR_PASSES ${INFER_IR_PASSES} tensorrt_subgraph_pass CACHE INTERNAL "") set(INFER_IR_PASSES ${INFER_IR_PASSES} tensorrt_subgraph_pass CACHE INTERNAL "")
endif() endif()
if (ANAKIN_FOUND)
cc_library(anakin_subgraph_pass SRCS anakin_subgraph_pass.cc DEPS subgraph_detector anakin_op_teller)
set(analysis_deps ${analysis_deps}
subgraph_detector anakin_subgraph_pass
CACHE INTERNAL "")
set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h)
file(APPEND ${pass_file} "USE_PASS(anakin_subgraph_pass);\n")
set(INFER_IR_PASSES ${INFER_IR_PASSES} anakin_subgraph_pass CACHE INTERNAL "")
endif()
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <algorithm>
#include <map>
#include <memory>
#include <set>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/inference/anakin/convert/op_converter.h"
#include "paddle/fluid/inference/anakin/op_teller.h"
#include "paddle/fluid/inference/analysis/helper.h"
#include "paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.h"
#include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h"
#include "paddle/fluid/string/pretty_log.h"
namespace paddle {
namespace inference {
namespace analysis {
using framework::ir::Node;
std::unique_ptr<framework::ir::Graph> analysis::AnakinSubgraphPass::ApplyImpl(
std::unique_ptr<framework::ir::Graph> graph) const {
framework::ir::FusePassBase::Init("anakin_subgraph_pass", graph.get());
auto teller = [](const framework::ir::Node *node) {
if (!node->IsOp() || !node->Op()) return false;
return anakin::OpTeller::Global().Tell(node->Op()->Type(), *node->Op());
};
SubGraphFuser fuser(graph.get(), teller, 6 /* min_subgraph_size */);
fuser();
std::vector<std::string> graph_param_names =
ExtractParameters(graph->Nodes());
// Those parameters already exist in anakin and should not have another copy
// in fluid.
std::vector<std::string> repetitive_params;
for (auto *node : graph->Nodes()) {
if (node->IsOp() && !Agent(node).subgraph()->empty()) {
CreateAnakinOp(node, graph.get(), graph_param_names, &repetitive_params);
std::unordered_set<const Node *> nodes2remove(
Agent(node).subgraph()->begin(), Agent(node).subgraph()->end());
framework::ir::GraphSafeRemoveNodes(graph.get(), nodes2remove);
}
}
std::unordered_set<const Node *> nodes2remove;
for (auto *node : graph->Nodes()) {
if (node->IsOp() && Agent(node).deleted()) {
nodes2remove.insert(node);
}
}
framework::ir::GraphSafeRemoveNodes(graph.get(), nodes2remove);
graph->Set(framework::ir::kRepetitiveParamAttr,
new std::vector<std::string>(repetitive_params));
return graph;
}
std::string GenerateAnakinEngineKey(const std::set<std::string> &engine_inputs,
const std::set<std::string> &engine_outputs,
std::string id) {
std::string engine_hash_key = "";
for (auto name : engine_inputs) {
engine_hash_key += name;
}
for (auto name : engine_outputs) {
engine_hash_key += name;
}
engine_hash_key += id;
auto engine_key = std::to_string(std::hash<std::string>()(engine_hash_key));
return engine_key;
}
void AnakinSubgraphPass::CreateAnakinOp(
framework::ir::Node *node, Graph *graph,
const std::vector<std::string> &graph_params,
std::vector<std::string> *repetitive_params) const {
auto *op_desc = node->Op();
auto &subgraph = *Agent(node).subgraph();
PADDLE_ENFORCE(!subgraph.empty());
framework::ProgramDesc *program_desc =
Get<framework::ProgramDesc *>("program");
// Add a new block for the Anakin engine op.
const framework::BlockDesc &main_block =
program_desc->Block(framework::kRootBlockIndex);
// const framework::BlockDesc& main_block = program_desc->Block(0);
framework::BlockDesc *new_block = program_desc->AppendBlock(main_block);
// A fake block desc.
framework::proto::BlockDesc block_proto;
framework::BlockDesc block_desc(nullptr, &block_proto);
block_desc.Proto()->set_parent_idx(-1);
block_desc.Proto()->set_idx(0);
string::PrettyLogDetail("--- detect a sub-graph with %d nodes",
subgraph.size());
for (auto *node : subgraph) {
auto *new_block_op = new_block->AppendOp();
auto *op = block_desc.AppendOp();
*new_block_op->Proto() = *node->Op()->Proto();
*op->Proto() = *node->Op()->Proto();
}
// Then, we will use the input_names_with_id and output_names_with_id to
// generate the engine key.
// So, we use set instead of unordered_set here to ensure that the engine key
// is unique.
std::set<std::string> input_names;
std::set<std::string> input_names_with_id;
std::vector<std::string> params;
for (auto *x : node->inputs) {
input_names.insert(x->Name());
input_names_with_id.insert(x->Name() + std::to_string(x->id()));
if (std::count(graph_params.begin(), graph_params.end(), x->Name()) > 0) {
params.push_back(x->Name());
}
}
std::copy(params.begin(), params.end(),
std::back_inserter(*repetitive_params));
op_desc->SetInput(
"Xs", std::vector<std::string>(input_names.begin(), input_names.end()));
std::set<std::string> output_names;
std::set<std::string> output_names_with_id;
for (auto *x : node->outputs) {
output_names.insert(x->Name());
output_names_with_id.insert(x->Name() + std::to_string(x->id()));
}
op_desc->SetOutput(
"Ys", std::vector<std::string>(output_names.begin(), output_names.end()));
op_desc->SetType("anakin_engine");
std::unordered_map<std::string, std::string> output_name_map;
auto &subgraph_nodes = *Agent(node).subgraph();
// The following procedure is used to rename all the intermediate
// variables and the output variables of the subgraph.
RenameAndGetOutputs(subgraph_nodes, &block_desc, input_names_with_id,
&output_names_with_id, &output_names, &output_name_map,
false);
// When the anakin engine runs at the end of the operation,
// output_mapping helps us copy the data from the renamed ITensor
// back to the Tensor.
std::vector<std::string> output_mapping;
for (auto name : output_names) {
PADDLE_ENFORCE(output_name_map.count(name) != 0);
output_mapping.push_back(output_name_map[name]);
}
auto *vars = block_desc.Proto()->mutable_vars();
for (framework::ir::Node *node : graph->Nodes()) {
if (node->IsVar() && node->Var()) {
*vars->Add() = *node->Var()->Proto();
}
}
PADDLE_ENFORCE(!block_desc.Proto()->vars().empty(),
"the block has no var-desc");
PADDLE_ENFORCE(!output_mapping.empty());
op_desc->SetBlockAttr("sub_block", new_block);
SetAttr(op_desc->Proto(), "subgraph",
block_desc.Proto()->SerializeAsString());
// Set attrs
SetAttr(op_desc->Proto(), "parameters", params);
SetAttr(op_desc->Proto(), "output_name_mapping", output_mapping);
int predictor_id = Get<int>("predictor_id");
auto engine_key = GenerateAnakinEngineKey(
input_names_with_id, output_names_with_id, std::to_string(predictor_id));
SetAttr(op_desc->Proto(), "engine_key", engine_key);
auto max_input_shape =
Get<std::map<std::string, std::vector<int>>>("max_input_shape");
auto max_batch_size = Get<int>("max_batch_size");
auto *anakin_engine =
inference::Singleton<anakin::AnakinEngineManager>::Global().Create(
true, Get<int>("gpu_device_id"), max_batch_size, max_input_shape,
engine_key);
auto *scope = param_scope();
std::unordered_set<std::string> param_set(params.begin(), params.end());
framework::BlockDesc block_desc_temp(nullptr, block_desc.Proto());
inference::Singleton<inference::anakin::AnakinOpConverter>::Global()
.ConvertBlockToAnakinEngine(
&block_desc_temp, scope,
std::vector<std::string>(input_names.begin(), input_names.end()),
param_set, output_mapping, anakin_engine);
}
} // namespace analysis
} // namespace inference
} // namespace paddle
REGISTER_PASS(anakin_subgraph_pass,
paddle::inference::analysis::AnakinSubgraphPass);
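A hedged usage sketch (the framework calls below are assumed, not part of this commit): once registered, the pass is retrieved and applied by name, which is how the "anakin_subgraph_pass" entry in kAnakinSubgraphPasses takes effect:

    auto pass =
        framework::ir::PassRegistry::Instance().Get("anakin_subgraph_pass");
    // Required attributes ("max_batch_size", "max_input_shape", "program",
    // "predictor_id", ...) are set by IRPassManager::CreatePasses as shown
    // earlier in this diff.
    graph = pass->Apply(std::move(graph));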
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <paddle/fluid/framework/ir/fuse_pass_base.h>
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/inference/anakin/engine.h"
#include "paddle/fluid/inference/analysis/ir_passes/subgraph_util.h"
using anakin::Precision;
using anakin::saber::NV;
namespace paddle {
namespace inference {
namespace analysis {
class AnakinSubgraphPass : public framework::ir::FusePassBase {
public:
std::unique_ptr<framework::ir::Graph> ApplyImpl(
std::unique_ptr<framework::ir::Graph> graph) const override;
private:
void CreateAnakinOp(framework::ir::Node *x, framework::ir::Graph *graph,
const std::vector<std::string> &graph_params,
std::vector<std::string> *repetitive_params) const;
void CleanIntermediateOutputs(framework::ir::Node *node);
};
} // namespace analysis
} // namespace inference
} // namespace paddle
...@@ -14,6 +14,8 @@ limitations under the License. */ ...@@ -14,6 +14,8 @@ limitations under the License. */
#include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h" #include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h"
#include <string> #include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility> #include <utility>
#include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
...@@ -418,7 +420,7 @@ void SubGraphFuser::ReplaceNodesWithSubGraphs() { ...@@ -418,7 +420,7 @@ void SubGraphFuser::ReplaceNodesWithSubGraphs() {
// Node that contains this subgraph 2. Mark the nodes inside the sub-graph // Node that contains this subgraph 2. Mark the nodes inside the sub-graph
// as deleted. 3. Replace the deleted node with the new Block Node. // as deleted. 3. Replace the deleted node with the new Block Node.
framework::OpDesc empty_desc; framework::OpDesc empty_desc;
empty_desc.SetType("tensorrt_engine"); empty_desc.SetType("anakin_engine");
auto *block_node = graph_->CreateOpNode(&empty_desc); auto *block_node = graph_->CreateOpNode(&empty_desc);
Agent(block_node).set_subgraph({}); Agent(block_node).set_subgraph({});
auto io = ExtractInputAndOutputOfSubGraph(subgraph); auto io = ExtractInputAndOutputOfSubGraph(subgraph);
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
/*
* This file defines the class to partition a graph.
*/
#include "paddle/fluid/inference/analysis/ir_passes/subgraph_util.h"
#include <algorithm>
#include <string>
namespace paddle {
namespace inference {
namespace analysis {
using framework::ir::Node;
std::vector<std::string> ExtractParameters(
const std::unordered_set<Node *> &nodes) {
// We can judge whether a variable is a parameter by
// its persistable property, but sometimes the persistable
// property of a feed op output is also true, so we have to filter those out.
std::vector<std::string> feed_outputs;
for (const auto &node : nodes) {
if (!node->IsOp()) continue;
std::string op_type = node->Op()->Type();
if (op_type == "feed" || op_type == "fetch") {
std::vector<std::string> output_names = node->Op()->OutputArgumentNames();
std::copy(output_names.begin(), output_names.end(),
std::back_inserter(feed_outputs));
}
}
std::vector<std::string> parameters;
for (const auto &node : nodes) {
if (!node->IsVar()) continue;
if (node->Var()->Persistable() &&
std::find(feed_outputs.begin(), feed_outputs.end(), node->Name()) ==
feed_outputs.end()) {
parameters.push_back(node->Name());
}
}
return parameters;
}
void RenameAndGetOutputs(
const std::vector<framework::ir::Node *> &subgraph_nodes,
framework::BlockDesc *block_desc,
const std::set<std::string> &input_names_with_id,
std::set<std::string> *output_names_with_id,
std::set<std::string> *output_names,
std::unordered_map<std::string, std::string> *output_name_map,
bool is_trt) {
//// In the normal case, paddle-trt has a bug when running GoogLeNet.
// When there are more than two 1 * 1 convolutions with the same input,
// paddle-tensorrt will do a merging optimization that fuses those convs
// into one conv and then triggers the bug. So we use a strategy to avoid
// this optimization for the time being. This bug will be fixed in the future.
std::unordered_map<std::string /*name*/, int /*ITensor_quote_num*/>
same_hierarchy_conv2d_num_map;
for (size_t index = 0; index < block_desc->OpSize(); ++index) {
framework::proto::OpDesc *op = block_desc->Op(index)->Proto();
framework::OpDesc op_desc(*op, nullptr);
auto correspond_node = subgraph_nodes[index];
PADDLE_ENFORCE_EQ(correspond_node->Name(), op->type());
std::unordered_map<std::string, size_t> var2id;
std::unordered_map<std::string, framework::ir::Node *> in_vars;
for (auto *in_var : correspond_node->inputs) {
var2id[in_var->Name()] = in_var->id();
in_vars[in_var->Name()] = in_var;
}
// rename for the input variables of op inside subgraph
for (int i = 0; i < op->inputs_size(); i++) {
// one input
auto *in_var = op->mutable_inputs(i);
std::vector<std::string> replaced_names;
for (int k = 0; k < in_var->arguments_size(); k++) { // all the arguments
std::string arg_value = in_var->arguments(k);
std::string arg_value_with_id =
arg_value + std::to_string(var2id[arg_value]);
if (input_names_with_id.count(arg_value_with_id)) {
replaced_names.push_back(arg_value);
} else {
replaced_names.push_back(arg_value_with_id);
}
}
in_var->clear_arguments();
for (size_t k = 0; k < replaced_names.size(); k++) {
in_var->add_arguments(replaced_names[k]);
}
}
var2id.clear();
for (auto out_var : correspond_node->outputs) {
var2id[out_var->Name()] = out_var->id();
}
if (op_desc.Type() == "conv2d" && is_trt) {
auto input_var_name = op_desc.Input("Input").front();
auto filter_var_name = op_desc.Input("Filter").front();
auto out_var_name = op_desc.Output("Output").front();
auto filter_shape = in_vars[filter_var_name]->Var()->GetShape();
const std::vector<int> strides =
boost::get<std::vector<int>>(op_desc.GetAttr("strides"));
const std::vector<int> paddings =
boost::get<std::vector<int>>(op_desc.GetAttr("paddings"));
if (same_hierarchy_conv2d_num_map[input_var_name] > 0) {
(*output_names_with_id)
.insert(out_var_name + std::to_string(var2id[out_var_name]));
(*output_names).insert(out_var_name);
} else if (filter_shape[2] == 1 && filter_shape[3] == 1 &&
strides[0] == 1 && strides[1] == 1 && paddings[0] == 0 &&
paddings[1] == 0) {
same_hierarchy_conv2d_num_map[input_var_name] += 1;
}
}
// rename for the output variables of op inside subgraph
for (int i = 0; i < op->outputs_size(); i++) {
framework::proto::OpDesc_Var *out_var = op->mutable_outputs(i);
std::vector<std::string> replaced_names;
for (int k = 0; k < out_var->arguments_size(); k++) {
std::string arg_value = out_var->arguments(k);
std::string arg_value_with_id =
arg_value + std::to_string(var2id[arg_value]);
if (output_names_with_id->count(arg_value_with_id)) {
(*output_name_map)[arg_value] = arg_value_with_id;
}
replaced_names.push_back(arg_value_with_id);
}
out_var->clear_arguments();
for (size_t k = 0; k < replaced_names.size(); k++) {
out_var->add_arguments(replaced_names[k]);
}
}
}
}
} // namespace analysis
} // namespace inference
} // namespace paddle
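For illustration (the example names are hypothetical): under the renaming scheme above, an output argument "conv_out" produced by the graph node with id 7 is rewritten to "conv_out7" inside the engine's block desc, and output_name_map records the pair ("conv_out" -> "conv_out7") so the engine op can later map the renamed tensors back to the original output names.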
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
/*
* This file defines the class to partition a graph.
*/
#pragma once
#include <set>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_traits.h"
#include "paddle/fluid/framework/ir/node.h"
namespace paddle {
namespace inference {
namespace analysis {
using framework::ir::Node;
std::vector<std::string> ExtractParameters(
const std::unordered_set<Node *> &nodes);
void RenameAndGetOutputs(
const std::vector<framework::ir::Node *> &subgraph_nodes,
framework::BlockDesc *block_desc,
const std::set<std::string> &input_names_with_id,
std::set<std::string> *output_names_with_id,
std::set<std::string> *output_names,
std::unordered_map<std::string, std::string> *output_name_map,
bool is_trt = true);
} // namespace analysis
} // namespace inference
} // namespace paddle
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
// limitations under the License. // limitations under the License.
#include <algorithm> #include <algorithm>
#include <map>
#include <set> #include <set>
#include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
...@@ -30,17 +31,6 @@ namespace analysis { ...@@ -30,17 +31,6 @@ namespace analysis {
using framework::ir::Node; using framework::ir::Node;
std::vector<std::string> ExtractParameters(
const std::unordered_set<Node *> &nodes);
void RenameAndGetOutputs(
const std::vector<framework::ir::Node *> &subgraph_nodes,
framework::BlockDesc *block_desc,
const std::set<std::string> &input_names_with_id,
std::set<std::string> *output_names_with_id,
std::set<std::string> *output_names,
std::unordered_map<std::string, std::string> *output_name_map);
std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl( std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl(
std::unique_ptr<framework::ir::Graph> graph) const { std::unique_ptr<framework::ir::Graph> graph) const {
framework::ir::FusePassBase::Init("tensorrt_subgraph_pass", graph.get()); framework::ir::FusePassBase::Init("tensorrt_subgraph_pass", graph.get());
...@@ -209,186 +199,86 @@ void TensorRtSubgraphPass::CreateTensorRTOp( ...@@ -209,186 +199,86 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
SetAttr(op_desc->Proto(), "parameters", params); SetAttr(op_desc->Proto(), "parameters", params);
auto enable_int8 = Get<bool>("enable_int8"); auto enable_int8 = Get<bool>("enable_int8");
auto use_static_engine = Get<bool>("use_static_engine");
auto engine_key = GenerateEngineKey(input_names_with_id, output_names_with_id, auto engine_key = GenerateEngineKey(input_names_with_id, output_names_with_id,
std::to_string(0)); std::to_string(0));
// Get "" when there is no cached calibration table data. // Get "" when there is no cached calibration table data.
std::string calibration_data = GetTrtCalibTableData( bool load_from_memory = Get<bool>("model_from_memory");
Get<std::string>("model_opt_cache_dir"), engine_key, enable_int8); std::string calibration_data = "";
if (enable_int8) {
calibration_data = GetTrtCalibTableData(
Get<std::string>("model_opt_cache_dir"), engine_key, enable_int8);
}
SetAttr(op_desc->Proto(), "calibration_data", calibration_data); SetAttr(op_desc->Proto(), "calibration_data", calibration_data);
SetAttr(op_desc->Proto(), "enable_int8", enable_int8); SetAttr(op_desc->Proto(), "enable_int8", enable_int8);
SetAttr(op_desc->Proto(), "engine_key", engine_key); SetAttr(op_desc->Proto(), "engine_key", engine_key);
SetAttr(op_desc->Proto(), "engine_serialized_data", std::string("")); std::string trt_engine_serialized_data = "";
SetAttr(op_desc->Proto(), "engine_serialized_data",
trt_engine_serialized_data);
std::unique_ptr<tensorrt::TRTInt8Calibrator> calibrator; std::unique_ptr<tensorrt::TRTInt8Calibrator> calibrator;
if (enable_int8 && calibration_data.size() != 0) { if (enable_int8 && calibration_data.size() != 0) {
calibrator.reset(new tensorrt::TRTInt8Calibrator(calibration_data)); calibrator.reset(new tensorrt::TRTInt8Calibrator(calibration_data));
} }
bool use_static_engine = Get<bool>("use_static_engine");
// When in int8 mode and calibration_mode, the program just produces the // When in int8 mode and calibration_mode, the program just produces the
// calibration table data. // calibration table data.
bool calibration_mode = (enable_int8 && calibration_data.size() == 0); bool calibration_mode = (enable_int8 && calibration_data.size() == 0);
if (!calibration_mode && use_static_engine) { if (calibration_mode) {
std::copy(params.begin(), params.end(), // calibration mode means generating the int8 calibration table data.
std::back_inserter(*repetitive_params)); return;
std::string trt_engine_serialized_data = GetTrtEngineSerializedData( }
Get<std::string>("model_opt_cache_dir"), engine_key);
if (trt_engine_serialized_data.empty()) { std::copy(params.begin(), params.end(),
LOG(INFO) << "Prepare TRT engine (Optimize model structure, Select OP " std::back_inserter(*repetitive_params));
"kernel etc). This process may cost a lot of time."; bool need_serialize = (use_static_engine && !load_from_memory);
std::unique_ptr<tensorrt::TensorRTEngine> trt_engine(
new tensorrt::TensorRTEngine( if (need_serialize) {
Get<int>("max_batch_size"), Get<int>("workspace_size"), trt_engine_serialized_data = GetTrtEngineSerializedData(
enable_int8, calibrator.get(), Get<int>("gpu_device_id"))); Get<std::string>("model_opt_cache_dir"), engine_key);
auto *scope = param_scope(); // we can load the previously serialized engine info from disk.
framework::BlockDesc block_desc_temp(nullptr, block_desc.Proto()); if (!trt_engine_serialized_data.empty()) {
std::unordered_set<std::string> param_set(params.begin(), params.end()); SetAttr(op_desc->Proto(), "engine_serialized_data",
inference::Singleton<inference::tensorrt::OpConverter>::Global() trt_engine_serialized_data);
.ConvertBlockToTRTEngine(
&block_desc_temp, *scope,
std::vector<std::string>(input_names.begin(), input_names.end()),
param_set, output_mapping, trt_engine.get());
nvinfer1::IHostMemory *serialized_engine_data = trt_engine->Serialize();
trt_engine_serialized_data =
std::string((const char *)serialized_engine_data->data(),
serialized_engine_data->size());
SaveTrtEngineSerializedDataToFile(
GetTrtEngineSerializedPath(Get<std::string>("model_opt_cache_dir"),
engine_key),
trt_engine_serialized_data);
} else {
LOG(INFO) << "Load TRT Optimized Info from " LOG(INFO) << "Load TRT Optimized Info from "
<< GetTrtEngineSerializedPath( << GetTrtEngineSerializedPath(
Get<std::string>("model_opt_cache_dir"), engine_key); Get<std::string>("model_opt_cache_dir"), engine_key);
} return;
SetAttr(op_desc->Proto(), "engine_serialized_data",
trt_engine_serialized_data);
}
}
std::vector<std::string> ExtractParameters(
const std::unordered_set<Node *> &nodes) {
// We can judge whether a variable is a parameter by
// its presistable property, but sometimes the presistable
// of the feed op output is true, so we have to identify it.
std::vector<std::string> feed_outputs;
for (const auto &node : nodes) {
if (!node->IsOp()) continue;
std::string op_type = node->Op()->Type();
if (op_type == "feed" || op_type == "fetch") {
std::vector<std::string> output_names = node->Op()->OutputArgumentNames();
std::copy(output_names.begin(), output_names.end(),
std::back_inserter(feed_outputs));
} }
} }
std::vector<std::string> parameters; // the following code will NOT run in the following situations:
for (const auto &node : nodes) { // 1. calibration mode (generating the trt int8 calibration table data)
if (!node->IsVar()) continue; // 2. the serialized trt engine info has already been loaded.
if (node->Var()->Persistable() && LOG(INFO) << "Prepare TRT engine (Optimize model structure, Select OP "
std::find(feed_outputs.begin(), feed_outputs.end(), node->Name()) == "kernel etc). This process may cost a lot of time.";
feed_outputs.end()) { std::unique_ptr<tensorrt::TensorRTEngine> trt_engine(
parameters.push_back(node->Name()); new tensorrt::TensorRTEngine(
} Get<int>("max_batch_size"), Get<int>("workspace_size"), enable_int8,
} calibrator.get(), Get<int>("gpu_device_id")));
return parameters; auto *scope = param_scope();
} framework::BlockDesc block_desc_temp(nullptr, block_desc.Proto());
std::unordered_set<std::string> param_set(params.begin(), params.end());
void RenameAndGetOutputs( inference::Singleton<inference::tensorrt::OpConverter>::Global()
const std::vector<framework::ir::Node *> &subgraph_nodes, .ConvertBlockToTRTEngine(
framework::BlockDesc *block_desc, &block_desc_temp, *scope,
const std::set<std::string> &input_names_with_id, std::vector<std::string>(input_names.begin(), input_names.end()),
std::set<std::string> *output_names_with_id, param_set, output_mapping, trt_engine.get());
std::set<std::string> *output_names, nvinfer1::IHostMemory *serialized_engine_data = trt_engine->Serialize();
std::unordered_map<std::string, std::string> *output_name_map) { trt_engine_serialized_data =
//// In the normal case, the paddle-trt exists bug when runing the googlenet. std::string((const char *)serialized_engine_data->data(),
// When there are more than two convolutions of 1 * 1 with the same input, the serialized_engine_data->size());
// paddle-tensorrt will do the merging optimization, which fuse those conv
// into one conv, and then trigger bug. So, We should use strategy to avoid if (need_serialize) {
// this optimization for the time being. This bug will be fixed in the future. SaveTrtEngineSerializedDataToFile(
std::unordered_map<std::string /*name*/, int /*ITensor_quote_num*/> GetTrtEngineSerializedPath(Get<std::string>("model_opt_cache_dir"),
same_hierarchy_conv2d_num_map; engine_key),
trt_engine_serialized_data);
for (size_t index = 0; index < block_desc->OpSize(); ++index) {
framework::proto::OpDesc *op = block_desc->Op(index)->Proto();
framework::OpDesc op_desc(*op, nullptr);
auto correspond_node = subgraph_nodes[index];
PADDLE_ENFORCE_EQ(correspond_node->Name(), op->type());
std::unordered_map<std::string, size_t> var2id;
std::unordered_map<std::string, framework::ir::Node *> in_vars;
for (auto *in_var : correspond_node->inputs) {
var2id[in_var->Name()] = in_var->id();
in_vars[in_var->Name()] = in_var;
}
// rename for the input variables of op inside subgraph
for (int i = 0; i < op->inputs_size(); i++) {
// one input
auto *in_var = op->mutable_inputs(i);
std::vector<std::string> replaced_names;
for (int k = 0; k < in_var->arguments_size(); k++) { // all the arguments
std::string arg_value = in_var->arguments(k);
std::string arg_value_with_id =
arg_value + std::to_string(var2id[arg_value]);
if (input_names_with_id.count(arg_value_with_id)) {
replaced_names.push_back(arg_value);
} else {
replaced_names.push_back(arg_value_with_id);
}
}
in_var->clear_arguments();
for (size_t k = 0; k < replaced_names.size(); k++) {
in_var->add_arguments(replaced_names[k]);
}
}
var2id.clear();
for (auto out_var : correspond_node->outputs) {
var2id[out_var->Name()] = out_var->id();
}
if (op_desc.Type() == "conv2d") {
auto input_var_name = op_desc.Input("Input").front();
auto filter_var_name = op_desc.Input("Filter").front();
auto out_var_name = op_desc.Output("Output").front();
auto filter_shape = in_vars[filter_var_name]->Var()->GetShape();
const std::vector<int> strides =
boost::get<std::vector<int>>(op_desc.GetAttr("strides"));
const std::vector<int> paddings =
boost::get<std::vector<int>>(op_desc.GetAttr("paddings"));
if (same_hierarchy_conv2d_num_map[input_var_name] > 0) {
(*output_names_with_id)
.insert(out_var_name + std::to_string(var2id[out_var_name]));
(*output_names).insert(out_var_name);
} else if (filter_shape[2] == 1 && filter_shape[3] == 1 &&
strides[0] == 1 && strides[1] == 1 && paddings[0] == 0 &&
paddings[1] == 0) {
same_hierarchy_conv2d_num_map[input_var_name] += 1;
}
}
// rename for the output variables of op inside subgraph
for (int i = 0; i < op->outputs_size(); i++) {
framework::proto::OpDesc_Var *out_var = op->mutable_outputs(i);
std::vector<std::string> replaced_names;
for (int k = 0; k < out_var->arguments_size(); k++) {
std::string arg_value = out_var->arguments(k);
std::string arg_value_with_id =
arg_value + std::to_string(var2id[arg_value]);
if (output_names_with_id->count(arg_value_with_id)) {
(*output_name_map)[arg_value] = arg_value_with_id;
}
replaced_names.push_back(arg_value_with_id);
}
out_var->clear_arguments();
for (size_t k = 0; k < replaced_names.size(); k++) {
out_var->add_arguments(replaced_names[k]);
}
}
} }
SetAttr(op_desc->Proto(), "engine_serialized_data",
trt_engine_serialized_data);
} }
} // namespace analysis } // namespace analysis
......
...@@ -20,6 +20,7 @@ ...@@ -20,6 +20,7 @@
#include <vector> #include <vector>
#include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/inference/analysis/ir_passes/subgraph_util.h"
namespace paddle { namespace paddle {
namespace inference { namespace inference {
......
...@@ -27,6 +27,10 @@ if(WITH_GPU AND TENSORRT_FOUND) ...@@ -27,6 +27,10 @@ if(WITH_GPU AND TENSORRT_FOUND)
set(inference_deps ${inference_deps} tensorrt_engine tensorrt_converter) set(inference_deps ${inference_deps} tensorrt_engine tensorrt_converter)
endif() endif()
if (ANAKIN_FOUND)
set(inference_deps ${inference_deps} anakin_op_converter anakin_engine)
endif()
add_subdirectory(details) add_subdirectory(details)
cc_library(analysis_config SRCS analysis_config.cc DEPS lod_tensor paddle_pass_builder) cc_library(analysis_config SRCS analysis_config.cc DEPS lod_tensor paddle_pass_builder)
......
...@@ -21,6 +21,7 @@ ...@@ -21,6 +21,7 @@
#include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/gpu_info.h"
namespace paddle { namespace paddle {
extern const std::vector<std::string> kAnakinSubgraphPasses;
PassStrategy *AnalysisConfig::pass_builder() const { PassStrategy *AnalysisConfig::pass_builder() const {
if (!pass_builder_.get()) { if (!pass_builder_.get()) {
...@@ -108,6 +109,10 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { ...@@ -108,6 +109,10 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
CP_MEMBER(use_mkldnn_); CP_MEMBER(use_mkldnn_);
CP_MEMBER(mkldnn_enabled_op_types_); CP_MEMBER(mkldnn_enabled_op_types_);
CP_MEMBER(use_anakin_);
CP_MEMBER(anakin_max_batchsize_);
CP_MEMBER(anakin_max_input_shape_);
// Ir related. // Ir related.
CP_MEMBER(enable_ir_optim_); CP_MEMBER(enable_ir_optim_);
CP_MEMBER(use_feed_fetch_ops_); CP_MEMBER(use_feed_fetch_ops_);
...@@ -230,6 +235,20 @@ void AnalysisConfig::Update() { ...@@ -230,6 +235,20 @@ void AnalysisConfig::Update() {
} }
} }
if (use_anakin_) {
PADDLE_ENFORCE(!use_tensorrt_,
"Anakin sub-graph and TensorRT sub-graph are not allowed to "
"run at the same time!");
PADDLE_ENFORCE(
use_gpu_,
"Anakin sub-graph engine need gpu, please use the EnableGpu API.");
pass_builder()->ClearPasses();
for (const auto &pass : kAnakinSubgraphPasses) {
pass_builder()->AppendPass(pass);
}
}
if (ir_debug_) { if (ir_debug_) {
pass_builder()->TurnOnDebug(); pass_builder()->TurnOnDebug();
} }
...@@ -266,7 +285,7 @@ std::string AnalysisConfig::SerializeInfoCache() { ...@@ -266,7 +285,7 @@ std::string AnalysisConfig::SerializeInfoCache() {
ss << specify_input_name_; ss << specify_input_name_;
ss << cpu_math_library_num_threads_; ss << cpu_math_library_num_threads_;
ss << use_anakin_;
return ss.str(); return ss.str();
} }
...@@ -316,6 +335,11 @@ void AnalysisConfig::SetModelBuffer(const char *prog_buffer, ...@@ -316,6 +335,11 @@ void AnalysisConfig::SetModelBuffer(const char *prog_buffer,
Update(); Update();
} }
void AnalysisConfig::SetEngineOptInfo(
std::map<std::string, std::string> engine_opt_info) {
engine_opt_info_ = engine_opt_info;
}
NativeConfig AnalysisConfig::ToNativeConfig() const { NativeConfig AnalysisConfig::ToNativeConfig() const {
NativeConfig config; NativeConfig config;
config.model_dir = model_dir_; config.model_dir = model_dir_;
...@@ -332,5 +356,12 @@ void AnalysisConfig::SwitchIrDebug(int x) { ...@@ -332,5 +356,12 @@ void AnalysisConfig::SwitchIrDebug(int x) {
ir_debug_ = x; ir_debug_ = x;
Update(); Update();
} }
void AnalysisConfig::EnableAnakinEngine(
int max_batch_size,
std::map<std::string, std::vector<int>> max_input_shape) {
anakin_max_batchsize_ = max_batch_size;
anakin_max_input_shape_ = max_input_shape;
use_anakin_ = true;
Update();
}
} // namespace paddle } // namespace paddle
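A hypothetical caller-side sketch (API names other than EnableAnakinEngine are assumed from the existing public config interface): enabling the Anakin sub-graph engine through the entry point added above:

    AnalysisConfig config;
    config.SetModel("/path/to/model");     // assumed model location
    config.EnableUseGpu(100 /* MB */, 0);  // the Anakin engine requires GPU
    config.EnableAnakinEngine(1 /* max_batch_size */,
                              {{"input_0", {1, 3, 224, 224}}});
    auto predictor = CreatePaddlePredictor(config);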
...@@ -40,7 +40,10 @@ ...@@ -40,7 +40,10 @@
#if PADDLE_WITH_TENSORRT #if PADDLE_WITH_TENSORRT
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h" #include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h"
#endif
#if PADDLE_WITH_ANAKIN
#include "paddle/fluid/inference/anakin/convert/op_converter.h"
#endif #endif
DECLARE_bool(profile); DECLARE_bool(profile);
...@@ -349,7 +352,10 @@ void AnalysisPredictor::OptimizeInferenceProgram() { ...@@ -349,7 +352,10 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
argument_.SetStaticMemoryOptimForceUpdate( argument_.SetStaticMemoryOptimForceUpdate(
config_.static_memory_optim_force_update_); config_.static_memory_optim_force_update_);
argument_.SetModelFromMemory(config_.model_from_memory_); argument_.SetModelFromMemory(config_.model_from_memory_);
argument_.SetEngineOptInfo(config_.engine_opt_info_);
// Analyze inference_program // Analyze inference_program
argument_.SetUseAnakin(config_.anakin_engine_enabled());
argument_.SetPredictorID(predictor_id_);
if (!config_.model_dir().empty()) { if (!config_.model_dir().empty()) {
argument_.SetModelDir(config_.model_dir()); argument_.SetModelDir(config_.model_dir());
} else { } else {
...@@ -373,6 +379,12 @@ void AnalysisPredictor::OptimizeInferenceProgram() { ...@@ -373,6 +379,12 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
argument_.SetTensorRtUseStaticEngine(config_.trt_use_static_engine_); argument_.SetTensorRtUseStaticEngine(config_.trt_use_static_engine_);
} }
if (config_.use_gpu() && config_.anakin_engine_enabled()) {
argument_.SetAnakinMaxBatchSize(config_.anakin_max_batchsize_);
argument_.SetAnakinMaxInputShape(config_.anakin_max_input_shape_);
LOG(INFO) << "Anakin subgraph engine is enabled";
}
if (config_.use_mkldnn_) { if (config_.use_mkldnn_) {
LOG(INFO) << "MKLDNN is enabled"; LOG(INFO) << "MKLDNN is enabled";
argument_.SetMKLDNNEnabledOpTypes(config_.mkldnn_enabled_op_types_); argument_.SetMKLDNNEnabledOpTypes(config_.mkldnn_enabled_op_types_);
...@@ -402,7 +414,7 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor< ...@@ -402,7 +414,7 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
VLOG(3) << "create AnalysisConfig"; VLOG(3) << "create AnalysisConfig";
if (config.use_gpu()) { if (config.use_gpu()) {
// 1. GPU memory // 1. GPU memory
PADDLE_ENFORCE_GT(config.memory_pool_init_size_mb(), 0.f); PADDLE_ENFORCE_GE(config.memory_pool_init_size_mb(), 0.f);
PADDLE_ENFORCE_GE(config.gpu_device_id(), 0, "Invalid device id %d", PADDLE_ENFORCE_GE(config.gpu_device_id(), 0, "Invalid device id %d",
config.gpu_device_id()); config.gpu_device_id());
std::vector<std::string> flags; std::vector<std::string> flags;
...@@ -805,3 +817,27 @@ USE_TRT_CONVERTER(prelu); ...@@ -805,3 +817,27 @@ USE_TRT_CONVERTER(prelu);
USE_TRT_CONVERTER(conv2d_transpose); USE_TRT_CONVERTER(conv2d_transpose);
USE_TRT_CONVERTER(leaky_relu); USE_TRT_CONVERTER(leaky_relu);
#endif #endif
#if PADDLE_WITH_ANAKIN
USE_ANAKIN_CONVERTER(mul);
USE_ANAKIN_CONVERTER(fc);
USE_ANAKIN_CONVERTER(conv2d);
USE_ANAKIN_CONVERTER(conv2d_fusion);
USE_ANAKIN_CONVERTER(concat);
USE_ANAKIN_CONVERTER(split);
USE_ANAKIN_CONVERTER(relu);
USE_ANAKIN_CONVERTER(sigmoid);
USE_ANAKIN_CONVERTER(tanh);
USE_ANAKIN_CONVERTER(pool2d);
USE_ANAKIN_CONVERTER(elementwise_add);
USE_ANAKIN_CONVERTER(elementwise_mul);
USE_ANAKIN_CONVERTER(batch_norm);
USE_ANAKIN_CONVERTER(flatten);
USE_ANAKIN_CONVERTER(reshape);
USE_ANAKIN_CONVERTER(transpose);
USE_ANAKIN_CONVERTER(softmax);
USE_ANAKIN_CONVERTER(detection_out);
USE_ANAKIN_CONVERTER(density_prior_box);
USE_ANAKIN_CONVERTER(dropout);
USE_ANAKIN_CONVERTER(sum);
#endif
...@@ -45,7 +45,9 @@ using framework::NaiveExecutor; ...@@ -45,7 +45,9 @@ using framework::NaiveExecutor;
*/ */
class AnalysisPredictor : public PaddlePredictor { class AnalysisPredictor : public PaddlePredictor {
public: public:
explicit AnalysisPredictor(const AnalysisConfig &config) : config_(config) {} explicit AnalysisPredictor(const AnalysisConfig &config) : config_(config) {
predictor_id_ = inference::GetUniqueId();
}
~AnalysisPredictor(); ~AnalysisPredictor();
bool Init(const std::shared_ptr<framework::Scope> &parent_scope, bool Init(const std::shared_ptr<framework::Scope> &parent_scope,
...@@ -152,6 +154,7 @@ class AnalysisPredictor : public PaddlePredictor { ...@@ -152,6 +154,7 @@ class AnalysisPredictor : public PaddlePredictor {
const size_t max_shape_collect_count_{1000}; const size_t max_shape_collect_count_{1000};
int need_collect_var_shapes_{-1}; // -1 for default, 0 for false, 1 for true. int need_collect_var_shapes_{-1}; // -1 for default, 0 for false, 1 for true.
std::vector<std::map<std::string, std::vector<int>>> batch_var_shapes_; std::vector<std::map<std::string, std::vector<int>>> batch_var_shapes_;
int predictor_id_;
private: private:
// Some status here that help to determine the status inside the predictor. // Some status here that help to determine the status inside the predictor.
......
...@@ -74,6 +74,21 @@ T *ZeroCopyTensor::data(PaddlePlace *place, int *size) const { ...@@ -74,6 +74,21 @@ T *ZeroCopyTensor::data(PaddlePlace *place, int *size) const {
return res; return res;
} }
PaddleDType ZeroCopyTensor::type() const {
EAGER_GET_TENSOR;
auto type = tensor->type();
if (type == framework::proto::VarType::FP32) {
return PaddleDType::FLOAT32;
} else if (type == framework::proto::VarType::INT64) {
return PaddleDType::INT64;
} else if (type == framework::proto::VarType::INT32) {
return PaddleDType::INT32;
} else {
LOG(ERROR) << "unknown type, only support float32 and int64 now.";
}
return PaddleDType::FLOAT32;
}
template <typename T> template <typename T>
void ZeroCopyTensor::copy_from_cpu(const T *data) { void ZeroCopyTensor::copy_from_cpu(const T *data) {
EAGER_GET_TENSOR; EAGER_GET_TENSOR;
...@@ -119,6 +134,7 @@ void ZeroCopyTensor::copy_to_cpu(T *data) { ...@@ -119,6 +134,7 @@ void ZeroCopyTensor::copy_to_cpu(T *data) {
static_cast<const platform::CUDADeviceContext *>(pool.Get(gpu_place)); static_cast<const platform::CUDADeviceContext *>(pool.Get(gpu_place));
memory::Copy(platform::CPUPlace(), static_cast<void *>(data), gpu_place, memory::Copy(platform::CPUPlace(), static_cast<void *>(data), gpu_place,
t_data, ele_num * sizeof(T), dev_ctx->stream()); t_data, ele_num * sizeof(T), dev_ctx->stream());
cudaDeviceSynchronize();
#else #else
PADDLE_THROW("Not compile with CUDA, should not reach here."); PADDLE_THROW("Not compile with CUDA, should not reach here.");
#endif #endif
......
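An illustrative sketch of how a caller might use the new type() accessor (the output name and surrounding setup are assumptions, not part of this commit):

    auto output = predictor->GetOutputTensor("fetch_out");  // hypothetical name
    if (output->type() == PaddleDType::FLOAT32) {
      std::vector<float> result(1000);  // size would come from output->shape()
      output->copy_to_cpu(result.data());
    }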
...@@ -14,9 +14,11 @@ ...@@ -14,9 +14,11 @@
#pragma once #pragma once
#include <cassert> #include <cassert>
#include <map>
#include <memory> #include <memory>
#include <string> #include <string>
#include <unordered_set> #include <unordered_set>
#include <utility>
#include <vector> #include <vector>
/*! \file */ /*! \file */
...@@ -136,10 +138,20 @@ struct AnalysisConfig { ...@@ -136,10 +138,20 @@ struct AnalysisConfig {
void EnableTensorRtEngine(int workspace_size = 1 << 20, void EnableTensorRtEngine(int workspace_size = 1 << 20,
int max_batch_size = 1, int min_subgraph_size = 3, int max_batch_size = 1, int min_subgraph_size = 3,
Precision precision = Precision::kFloat32, Precision precision = Precision::kFloat32,
bool use_static = true); bool use_static = false);
/** A boolean state telling whether the TensorRT engine is used. /** A boolean state telling whether the TensorRT engine is used.
*/ */
bool tensorrt_engine_enabled() const { return use_tensorrt_; } bool tensorrt_engine_enabled() const { return use_tensorrt_; }
/**
* \brief Turn on the usage of Anakin sub-graph engine.
*/
void EnableAnakinEngine(
int max_batch_size = 1,
std::map<std::string, std::vector<int>> max_input_shape = {});
/** A boolean state indicating whether the Anakin sub-graph engine is used.
*/
bool anakin_engine_enabled() const { return use_anakin_; }
/** \brief Control whether to debug IR graph analysis phase. /** \brief Control whether to debug IR graph analysis phase.
* *
...@@ -185,6 +197,7 @@ struct AnalysisConfig { ...@@ -185,6 +197,7 @@ struct AnalysisConfig {
/** A boolean state telling whether the model is set from the CPU memory. /** A boolean state telling whether the model is set from the CPU memory.
*/ */
bool model_from_memory() const { return model_from_memory_; } bool model_from_memory() const { return model_from_memory_; }
void SetEngineOptInfo(std::map<std::string, std::string> engine_opt_info);
/** Turn on memory optimize /** Turn on memory optimize
* NOTE still in development, will release latter. * NOTE still in development, will release latter.
...@@ -258,6 +271,10 @@ struct AnalysisConfig { ...@@ -258,6 +271,10 @@ struct AnalysisConfig {
std::string serialized_info_cache_; std::string serialized_info_cache_;
mutable std::unique_ptr<PassStrategy> pass_builder_; mutable std::unique_ptr<PassStrategy> pass_builder_;
bool use_anakin_{false};
int anakin_max_batchsize_;
std::map<std::string, std::vector<int>> anakin_max_input_shape_;
std::map<std::string, std::string> engine_opt_info_;
}; };
} // namespace paddle } // namespace paddle
...@@ -177,6 +177,8 @@ class ZeroCopyTensor { ...@@ -177,6 +177,8 @@ class ZeroCopyTensor {
device_ = device; device_ = device;
} }
PaddleDType type() const;
protected: protected:
explicit ZeroCopyTensor(void* scope) : scope_{scope} {} explicit ZeroCopyTensor(void* scope) : scope_{scope} {}
void SetName(const std::string& name) { name_ = name; } void SetName(const std::string& name) { name_ = name; }
...@@ -191,6 +193,7 @@ class ZeroCopyTensor { ...@@ -191,6 +193,7 @@ class ZeroCopyTensor {
// performance. // performance.
mutable void* tensor_{nullptr}; mutable void* tensor_{nullptr};
PaddlePlace place_; PaddlePlace place_;
PaddleDType dtype_;
int device_; int device_;
}; };
......
...@@ -68,10 +68,26 @@ void GpuPassStrategy::EnableMKLDNN() { ...@@ -68,10 +68,26 @@ void GpuPassStrategy::EnableMKLDNN() {
LOG(ERROR) << "GPU not support MKLDNN yet"; LOG(ERROR) << "GPU not support MKLDNN yet";
} }
// The following passes work for the Anakin sub-graph engine.
const std::vector<std::string> kAnakinSubgraphPasses({
"infer_clean_graph_pass", //
"simplify_anakin_detection_pattern_pass5", //
"simplify_anakin_detection_pattern_pass4", //
"simplify_anakin_detection_pattern_pass3", //
"simplify_anakin_detection_pattern_pass2", //
"anakin_fillconstant_elementwisemul_fuse", //
"fc_fuse_pass", //
"conv_elementwise_add_fuse_pass", //
"conv_bn_fuse_pass", //
"conv_elementwise_add_fuse_pass", //
"fc_gru_fuse_pass", //
"anakin_subgraph_pass",
});
GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) { GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) {
passes_.assign({ passes_.assign({
"infer_clean_graph_pass", // "infer_clean_graph_pass", //
"identity_scale_op_clean_pass", // // "identity_scale_op_clean_pass", //
"conv_affine_channel_fuse_pass", // "conv_affine_channel_fuse_pass", //
"conv_eltwiseadd_affine_channel_fuse_pass", // "conv_eltwiseadd_affine_channel_fuse_pass", //
"conv_bn_fuse_pass", // "conv_bn_fuse_pass", //
...@@ -84,7 +100,7 @@ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) { ...@@ -84,7 +100,7 @@ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) {
#endif #endif
}); });
for (int i = 6; i >= 3; i--) { for (int i = 6; i >= 2; i--) {
passes_.push_back("transpose_flatten" + std::to_string(i) + passes_.push_back("transpose_flatten" + std::to_string(i) +
"_concat_fuse_pass"); "_concat_fuse_pass");
} }
...@@ -124,4 +140,5 @@ CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) { ...@@ -124,4 +140,5 @@ CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) {
}); });
use_gpu_ = false; use_gpu_ = false;
} }
void PaddlePassBuilder::ClearPasses() { passes_.clear(); }
} // namespace paddle } // namespace paddle
...@@ -45,6 +45,7 @@ class PaddlePassBuilder { ...@@ -45,6 +45,7 @@ class PaddlePassBuilder {
/** Delete all the passes that has type `pass_type`. */ /** Delete all the passes that has type `pass_type`. */
void DeletePass(const std::string &pass_type); void DeletePass(const std::string &pass_type);
void ClearPasses();
/** Append an analysis pass. */ /** Append an analysis pass. */
void AppendAnalysisPass(const std::string &pass); void AppendAnalysisPass(const std::string &pass);
...@@ -157,4 +158,6 @@ class GpuPassStrategy : public PassStrategy { ...@@ -157,4 +158,6 @@ class GpuPassStrategy : public PassStrategy {
virtual ~GpuPassStrategy() = default; virtual ~GpuPassStrategy() = default;
}; };
extern const std::vector<std::string> kAnakinSubgraphPasses;
} // namespace paddle } // namespace paddle
...@@ -45,7 +45,7 @@ class EngineIOConverter { ...@@ -45,7 +45,7 @@ class EngineIOConverter {
static void ConvertInput(const std::string& op_type, const LoDTensor& in, static void ConvertInput(const std::string& op_type, const LoDTensor& in,
void* out, size_t max_size, cudaStream_t* stream) { void* out, size_t max_size, cudaStream_t* stream) {
PADDLE_ENFORCE(stream != nullptr); PADDLE_ENFORCE(stream != nullptr);
auto* converter = Registry<EngineIOConverter>::Lookup( auto* converter = Registry<EngineIOConverter>::Global().Lookup(
op_type, "default" /* default_type */); op_type, "default" /* default_type */);
PADDLE_ENFORCE_NOT_NULL(converter); PADDLE_ENFORCE_NOT_NULL(converter);
converter->SetStream(stream); converter->SetStream(stream);
...@@ -56,7 +56,7 @@ class EngineIOConverter { ...@@ -56,7 +56,7 @@ class EngineIOConverter {
LoDTensor* out, size_t max_size, LoDTensor* out, size_t max_size,
cudaStream_t* stream) { cudaStream_t* stream) {
PADDLE_ENFORCE(stream != nullptr); PADDLE_ENFORCE(stream != nullptr);
auto* converter = Registry<EngineIOConverter>::Lookup( auto* converter = Registry<EngineIOConverter>::Global().Lookup(
op_type, "default" /* default_type */); op_type, "default" /* default_type */);
PADDLE_ENFORCE_NOT_NULL(converter); PADDLE_ENFORCE_NOT_NULL(converter);
converter->SetStream(stream); converter->SetStream(stream);
...@@ -69,12 +69,12 @@ class EngineIOConverter { ...@@ -69,12 +69,12 @@ class EngineIOConverter {
cudaStream_t* stream_{nullptr}; cudaStream_t* stream_{nullptr};
}; };
#define REGISTER_TENSORRT_IO_CONVERTER(op_type__, Converter__) \ #define REGISTER_TENSORRT_IO_CONVERTER(op_type__, Converter__) \
struct trt_io_##op_type__##_converter { \ struct trt_io_##op_type__##_converter { \
trt_io_##op_type__##_converter() { \ trt_io_##op_type__##_converter() { \
Registry<EngineIOConverter>::Register<Converter__>(#op_type__); \ Registry<EngineIOConverter>::Global().Register<Converter__>(#op_type__); \
} \ } \
}; \ }; \
trt_io_##op_type__##_converter trt_io_##op_type__##_converter__; trt_io_##op_type__##_converter trt_io_##op_type__##_converter__;
} // namespace tensorrt } // namespace tensorrt
......
...@@ -86,7 +86,7 @@ class OpConverter { ...@@ -86,7 +86,7 @@ class OpConverter {
PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1UL); PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1UL);
std::string Y = op_desc.Input("Y")[0]; std::string Y = op_desc.Input("Y")[0];
if (parameters.count(Y)) { if (parameters.count(Y)) {
it = Registry<OpConverter>::Lookup("fc"); it = Registry<OpConverter>::Global().Lookup("fc");
} }
} }
if (op_desc.Type().find("elementwise") != std::string::npos) { if (op_desc.Type().find("elementwise") != std::string::npos) {
...@@ -103,28 +103,28 @@ class OpConverter { ...@@ -103,28 +103,28 @@ class OpConverter {
if (parameters.count(Y)) { if (parameters.count(Y)) {
PADDLE_ENFORCE(add_weight_op_set.count(op_type) > 0, PADDLE_ENFORCE(add_weight_op_set.count(op_type) > 0,
"Unsupported elementwise type" + op_type); "Unsupported elementwise type" + op_type);
it = it = Registry<OpConverter>::Global().Lookup("elementwise_" + op_type +
Registry<OpConverter>::Lookup("elementwise_" + op_type + "_weight"); "_weight");
PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]", PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]",
op_desc.Type()); op_desc.Type());
} else { } else {
PADDLE_ENFORCE(add_tensor_op_set.count(op_type) > 0, PADDLE_ENFORCE(add_tensor_op_set.count(op_type) > 0,
"Unsupported elementwise type" + op_type); "Unsupported elementwise type" + op_type);
it = it = Registry<OpConverter>::Global().Lookup("elementwise_" + op_type +
Registry<OpConverter>::Lookup("elementwise_" + op_type + "_tensor"); "_tensor");
} }
PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]", PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]",
op_desc.Type()); op_desc.Type());
} }
if (op_desc.Type() == "depthwise_conv2d") { if (op_desc.Type() == "depthwise_conv2d") {
it = Registry<OpConverter>::Lookup("conv2d"); it = Registry<OpConverter>::Global().Lookup("conv2d");
PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]", PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]",
op_desc.Type()); op_desc.Type());
} }
if (!it) { if (!it) {
it = Registry<OpConverter>::Lookup(op_desc.Type()); it = Registry<OpConverter>::Global().Lookup(op_desc.Type());
} }
PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]", PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]",
op_desc.Type()); op_desc.Type());
...@@ -198,9 +198,9 @@ class OpConverter { ...@@ -198,9 +198,9 @@ class OpConverter {
#define REGISTER_TRT_OP_CONVERTER(op_type__, Converter__) \ #define REGISTER_TRT_OP_CONVERTER(op_type__, Converter__) \
struct trt_##op_type__##_converter : public ::paddle::framework::Registrar { \ struct trt_##op_type__##_converter : public ::paddle::framework::Registrar { \
trt_##op_type__##_converter() { \ trt_##op_type__##_converter() { \
::paddle::inference:: \ ::paddle::inference::Registry< \
Registry<paddle::inference::tensorrt::OpConverter>::Register< \ paddle::inference::tensorrt::OpConverter>::Global() \
::paddle::inference::tensorrt::Converter__>(#op_type__); \ .Register<::paddle::inference::tensorrt::Converter__>(#op_type__); \
} \ } \
}; \ }; \
trt_##op_type__##_converter trt_##op_type__##_converter__; \ trt_##op_type__##_converter trt_##op_type__##_converter__; \
......
...@@ -214,23 +214,28 @@ TEST(Analyzer_Transformer, fuse_statis) { ...@@ -214,23 +214,28 @@ TEST(Analyzer_Transformer, fuse_statis) {
} }
// Compare result of NativeConfig and AnalysisConfig // Compare result of NativeConfig and AnalysisConfig
void compare(bool use_mkldnn = false) { // void compare(bool use_mkldnn = false) {
AnalysisConfig cfg; // AnalysisConfig cfg;
SetConfig(&cfg); // SetConfig(&cfg);
if (use_mkldnn) { // if (use_mkldnn) {
cfg.EnableMKLDNN(); // cfg.EnableMKLDNN();
} // }
//
std::vector<std::vector<PaddleTensor>> input_slots_all; // std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all); // SetInput(&input_slots_all);
CompareNativeAndAnalysis( // CompareNativeAndAnalysis(
reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all); // reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
} // input_slots_all);
// }
TEST(Analyzer_Transformer, compare) { compare(); }
#ifdef PADDLE_WITH_MKLDNN // TODO(yihuaxu):
TEST(Analyzer_Transformer, compare_mkldnn) { compare(true /* use_mkldnn */); } // Disable compare and compare_mkldnn temporarily; see
#endif // https://github.com/paddlePaddle/Paddle/issues/16316 for details.
// TEST(Analyzer_Transformer, compare) { compare(); }
// #ifdef PADDLE_WITH_MKLDNN
// TEST(Analyzer_Transformer, compare_mkldnn) { compare(true /* use_mkldnn */);
// }
// #endif
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
...@@ -45,13 +45,13 @@ struct Registry { ...@@ -45,13 +45,13 @@ struct Registry {
} }
template <typename ItemChild> template <typename ItemChild>
static void Register(const std::string& name) { void Register(const std::string& name) {
PADDLE_ENFORCE_EQ(items_.count(name), 0); PADDLE_ENFORCE_EQ(items_.count(name), 0);
items_[name] = new ItemChild; items_[name] = new ItemChild;
} }
static ItemParent* Lookup(const std::string& name, ItemParent* Lookup(const std::string& name,
const std::string& default_name = "") { const std::string& default_name = "") {
auto it = items_.find(name); auto it = items_.find(name);
if (it == items_.end()) { if (it == items_.end()) {
if (default_name == "") if (default_name == "")
...@@ -70,11 +70,8 @@ struct Registry { ...@@ -70,11 +70,8 @@ struct Registry {
private: private:
Registry() = default; Registry() = default;
static std::unordered_map<std::string, ItemParent*> items_; std::unordered_map<std::string, ItemParent*> items_;
}; };
template <typename ItemParent>
std::unordered_map<std::string, ItemParent*> Registry<ItemParent>::items_;
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
...@@ -34,6 +34,10 @@ if (WITH_GPU AND TENSORRT_FOUND) ...@@ -34,6 +34,10 @@ if (WITH_GPU AND TENSORRT_FOUND)
add_subdirectory(tensorrt) add_subdirectory(tensorrt)
endif() endif()
if (ANAKIN_FOUND)
add_subdirectory(anakin)
endif()
SET(OP_HEADER_DEPS xxhash) SET(OP_HEADER_DEPS xxhash)
if (WITH_GPU) if (WITH_GPU)
SET(OP_HEADER_DEPS ${OP_HEADER_DEPS} cub) SET(OP_HEADER_DEPS ${OP_HEADER_DEPS} cub)
......
op_library(anakin_engine_op DEPS anakin_engine anakin_op_converter)
# file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(anakin_engine);\n")
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_CUDA
#include <string>
#include <vector>
#include "paddle/fluid/operators/anakin/anakin_engine_op.h"
namespace paddle {
namespace operators {
class AnakinEngineOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("Xs", "A list of inputs.").AsDuplicable();
AddOutput("Ys", "A list of outputs").AsDuplicable();
AddAttr<std::string>("subgraph", "the subgraph.");
AddAttr<std::string>(
"engine_key",
"The engine_key here is used to distinguish different TRT Engines");
AddAttr<framework::BlockDesc *>("sub_block", "the trt block");
AddComment("Anakin engine operator.");
}
};
class AnakinEngineInferVarType : public framework::VarTypeInference {
public:
void operator()(framework::InferVarTypeContext *ctx) const override {}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(anakin_engine, ops::AnakinEngineOp, ops::AnakinEngineOpMaker,
ops::AnakinEngineOpMaker);
#endif // PADDLE_WITH_CUDA
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_CUDA
#include <fstream>
#include <map>
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/inference/anakin/convert/op_converter.h"
#include "paddle/fluid/inference/anakin/engine.h"
#include "paddle/fluid/inference/analysis/helper.h"
namespace paddle {
namespace operators {
using FluidDT = framework::proto::VarType_Type;
using inference::Singleton;
using anakin::graph::GraphGlobalMem;
using anakin::AK_FLOAT;
using anakin::Precision;
using anakin::saber::NV;
using anakin::saber::X86;
using anakin::saber::Shape;
using anakin::PBlock;
using anakin::PTuple;
using inference::anakin::AnakinEngine;
class AnakinEngineOp : public framework::OperatorBase {
using AnakinNvEngineT = AnakinEngine<NV, Precision::FP32>;
private:
std::vector<std::string> input_names_;
std::unordered_set<std::string> param_names_;
mutable AnakinNvEngineT *anakin_engine_;
std::string engine_key_;
std::string engine_serialized_data_;
public:
AnakinEngineOp(const std::string &type,
const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: framework::OperatorBase(type, inputs, outputs, attrs) {
input_names_ = Inputs("Xs");
engine_key_ = Attr<std::string>("engine_key");
auto params = Attr<std::vector<std::string>>("parameters");
for (const auto &param : params) {
param_names_.insert(param);
}
anakin_engine_ = nullptr;
}
protected:
void RunImpl(const framework::Scope &scope,
const platform::Place &dev_place) const override {
RunAnakin(scope, dev_place);
}
void RunAnakin(const framework::Scope &scope,
const platform::Place &dev_place) const {
auto *engine = GetEngine(scope, dev_place);
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(dev_place);
auto stream =
reinterpret_cast<const platform::CUDADeviceContext &>(dev_ctx).stream();
PADDLE_ENFORCE(!input_names_.empty(), "should pass more than one inputs");
std::vector<std::string> output_maps =
Attr<std::vector<std::string>>("output_name_mapping");
std::map<std::string, framework::LoDTensor *> inputs;
// Convert input tensor from fluid to engine.
for (const auto &x : Inputs("Xs")) {
if (param_names_.count(x)) continue;
auto &t =
inference::analysis::GetFromScope<framework::LoDTensor>(scope, x);
inputs.insert({x, &t});
}
std::map<std::string, framework::LoDTensor *> outputs;
int output_index = 0;
for (const auto &y : Outputs("Ys")) {
auto *fluid_v = scope.FindVar(y);
PADDLE_ENFORCE_NOT_NULL(fluid_v, "no output variable called %s", y);
auto *fluid_t = fluid_v->GetMutable<framework::LoDTensor>();
outputs.insert({output_maps[output_index], fluid_t});
output_index += 1;
}
engine->Execute(inputs, outputs, stream);
}
AnakinNvEngineT *GetEngine(const framework::Scope &scope,
const platform::Place &dev_place) const {
if (anakin_engine_ == nullptr) {
anakin_engine_ =
inference::Singleton<inference::anakin::AnakinEngineManager>::Global()
.Get(engine_key_);
}
return anakin_engine_;
}
void Prepare(const framework::Scope &scope, const platform::Place &dev_place,
AnakinNvEngineT *engine) const {
LOG(INFO) << "Prepare Anakin engine (Optimize model structure, Select OP "
"kernel etc). This process may cost a lot of time.";
framework::proto::BlockDesc block_desc;
block_desc.ParseFromString(Attr<std::string>("subgraph"));
std::vector<std::string> output_maps =
Attr<std::vector<std::string>>("output_name_mapping");
inference::Singleton<inference::anakin::AnakinOpConverter>::Global()
.ConvertBlock(block_desc, param_names_, scope, engine);
engine->Freeze();
for (const auto &x : Inputs("Xs")) {
if (param_names_.count(x)) continue;
auto &t =
inference::analysis::GetFromScope<framework::LoDTensor>(scope, x);
auto t_shape = framework::vectorize2int(t.dims());
// all input shape should be 4 dims
if (t_shape.size() == 2) {
t_shape.push_back(1);
t_shape.push_back(1);
}
engine->SetInputShape(x, t_shape);
}
engine->Optimize();
engine->InitGraph();
}
};
} // namespace operators
} // namespace paddle
#endif // PADDLE_WITH_CUDA
...@@ -586,14 +586,10 @@ std::unique_ptr<framework::OpDesc> BatchNormGradMaker::Apply() const { ...@@ -586,14 +586,10 @@ std::unique_ptr<framework::OpDesc> BatchNormGradMaker::Apply() const {
return std::unique_ptr<framework::OpDesc>(op); return std::unique_ptr<framework::OpDesc>(op);
} }
class BatchNormInplaceInToOut : public framework::InplaceInToOut { class BatchNormInplaceInToOut : public framework::InplaceOpInference {
public: public:
using InplaceInToOut::InplaceInToOut; std::unordered_map<std::string, std::string> operator()(
const framework::OpDesc &op_desc) const override {
protected:
std::unordered_map<std::string, std::string> Apply(
const framework::OpDesc &op_desc,
framework::BlockDesc *block) const override {
std::unordered_map<std::string, std::string> inplace_in_to_out = { std::unordered_map<std::string, std::string> inplace_in_to_out = {
{"Mean", "MeanOut"}, {"Variance", "VarianceOut"}, {"X", "Y"}, {"Mean", "MeanOut"}, {"Variance", "VarianceOut"}, {"X", "Y"},
}; };
...@@ -601,14 +597,10 @@ class BatchNormInplaceInToOut : public framework::InplaceInToOut { ...@@ -601,14 +597,10 @@ class BatchNormInplaceInToOut : public framework::InplaceInToOut {
} }
}; };
class BatchNormGradInplaceInToOut : public framework::InplaceInToOut { class BatchNormGradInplaceInToOut : public framework::InplaceOpInference {
public: public:
using InplaceInToOut::InplaceInToOut; std::unordered_map<std::string, std::string> operator()(
const framework::OpDesc &op_desc) const override {
protected:
std::unordered_map<std::string, std::string> Apply(
const framework::OpDesc &op_desc,
framework::BlockDesc *block) const override {
std::unordered_map<std::string, std::string> inplace_in_to_out = { std::unordered_map<std::string, std::string> inplace_in_to_out = {
// Scale, Bias, SavedMean, SavedVariance shape is [batch_size, C] // Scale, Bias, SavedMean, SavedVariance shape is [batch_size, C]
{framework::GradVarName("Y"), framework::GradVarName("X")}, {framework::GradVarName("Y"), framework::GradVarName("X")},
......
...@@ -44,6 +44,7 @@ class WhileOp : public framework::OperatorBase { ...@@ -44,6 +44,7 @@ class WhileOp : public framework::OperatorBase {
void RunImpl(const framework::Scope &scope, void RunImpl(const framework::Scope &scope,
const platform::Place &dev_place) const override { const platform::Place &dev_place) const override {
PADDLE_ENFORCE_NOT_NULL(scope.FindVar(Input(kCondition))); PADDLE_ENFORCE_NOT_NULL(scope.FindVar(Input(kCondition)));
auto &cond = scope.FindVar(Input(kCondition))->Get<LoDTensor>(); auto &cond = scope.FindVar(Input(kCondition))->Get<LoDTensor>();
PADDLE_ENFORCE_EQ(cond.dims(), paddle::framework::make_ddim({1})); PADDLE_ENFORCE_EQ(cond.dims(), paddle::framework::make_ddim({1}));
...@@ -63,13 +64,34 @@ class WhileOp : public framework::OperatorBase { ...@@ -63,13 +64,34 @@ class WhileOp : public framework::OperatorBase {
VLOG(2) << GetSkipEagerDeletionVarsDebugString(skip_vars); VLOG(2) << GetSkipEagerDeletionVarsDebugString(skip_vars);
auto ctx = executor.Prepare(*program, block->ID(), skip_vars); auto ctx = executor.Prepare(*program, block->ID(), skip_vars);
while (cond.data<bool>()[0]) { if (!is_test) {
while (cond.data<bool>()[0]) {
auto &current_scope = scope.NewScope();
step_scopes->push_back(&current_scope);
executor.RunPreparedContext(ctx.get(), &current_scope, false, true,
true);
}
} else {
auto &current_scope = scope.NewScope(); auto &current_scope = scope.NewScope();
step_scopes->push_back(&current_scope); executor.CreateVariables(*program, &current_scope, block->ID());
executor.RunPreparedContext(ctx.get(), &current_scope, false, true, true); while (cond.data<bool>()[0]) {
if (is_test) { for (auto &name : current_scope.LocalVarNames()) {
scope.DeleteScope(&current_scope); auto *var = current_scope.Var(name);
if (var->IsType<framework::LoDTensor>()) {
// Clear all lod information for all lod_tensors.
auto *t = var->GetMutable<framework::LoDTensor>();
framework::LoD empty_lod;
t->set_lod(empty_lod);
} else if (var->IsType<framework::LoDTensorArray>()) {
// Clear elements of all tensor arrays.
auto *t = var->GetMutable<framework::LoDTensorArray>();
t->clear();
}
}
executor.RunPreparedContext(ctx.get(), &current_scope, false, false,
false);
} }
scope.DeleteScope(&current_scope);
} }
} }
}; };
......
...@@ -252,34 +252,23 @@ class ElemwiseGradKernel : public framework::OpKernel<T> { ...@@ -252,34 +252,23 @@ class ElemwiseGradKernel : public framework::OpKernel<T> {
} }
}; };
class ElementwiseOpInplace : public framework::InplaceInToOut { class ElementwiseOpInplace : public framework::InplaceOpInference {
public: public:
using framework::InplaceInToOut::InplaceInToOut; std::unordered_map<std::string, std::string> operator()(
const framework::OpDesc &op_desc) const override {
protected:
std::unordered_map<std::string, std::string> Apply(
const framework::OpDesc &op_desc,
framework::BlockDesc *block) const override {
return std::unordered_map<std::string, std::string>{ return std::unordered_map<std::string, std::string>{
{"X", "Out"}, {"X", "Out"},
}; };
} }
}; };
class ElementwiseGradOpInplace : public framework::InplaceInToOut { class ElementwiseGradOpInplace : public framework::InplaceOpInference {
public: public:
using framework::InplaceInToOut::InplaceInToOut; std::unordered_map<std::string, std::string> operator()(
const framework::OpDesc &op_desc) const override {
protected: return std::unordered_map<std::string, std::string>{
std::unordered_map<std::string, std::string> Apply( {framework::GradVarName("Out"), framework::GradVarName("X")},
const framework::OpDesc &op_desc, };
framework::BlockDesc *block) const override {
std::unordered_map<std::string, std::string> ret;
if (block->HasVar(framework::GradVarName("X")) &&
block->HasVar(framework::GradVarName("Out"))) {
ret[framework::GradVarName("Out")] = framework::GradVarName("X");
}
return ret;
} }
}; };
......
...@@ -267,14 +267,10 @@ class Flatten2GradOp : public framework::OperatorBase { ...@@ -267,14 +267,10 @@ class Flatten2GradOp : public framework::OperatorBase {
} }
}; };
class FlattenOpInplaceInToOut : public framework::InplaceInToOut { class FlattenOpInplaceInToOut : public framework::InplaceOpInference {
public: public:
using InplaceInToOut::InplaceInToOut; std::unordered_map<std::string, std::string> operator()(
const framework::OpDesc &op_desc) const override {
protected:
std::unordered_map<std::string, std::string> Apply(
const framework::OpDesc &op_desc,
framework::BlockDesc *block) const override {
std::unordered_map<std::string, std::string> inplace_in_to_out = { std::unordered_map<std::string, std::string> inplace_in_to_out = {
{"X", "Out"}, {"X", "Out"},
}; };
...@@ -282,13 +278,10 @@ class FlattenOpInplaceInToOut : public framework::InplaceInToOut { ...@@ -282,13 +278,10 @@ class FlattenOpInplaceInToOut : public framework::InplaceInToOut {
} }
}; };
class FlattenGradInplaceinToOut : public framework::InplaceInToOut { class FlattenGradInplaceinToOut : public framework::InplaceOpInference {
using InplaceInToOut::InplaceInToOut; public:
std::unordered_map<std::string, std::string> operator()(
protected: const framework::OpDesc &op_desc) const override {
std::unordered_map<std::string, std::string> Apply(
const framework::OpDesc &op_desc,
framework::BlockDesc *block) const override {
std::unordered_map<std::string, std::string> inplace_in_to_out = { std::unordered_map<std::string, std::string> inplace_in_to_out = {
{framework::GradVarName("Out"), framework::GradVarName("X")}, {framework::GradVarName("Out"), framework::GradVarName("X")},
}; };
......
...@@ -64,6 +64,7 @@ void GPUGather(const platform::DeviceContext& ctx, const Tensor& src, ...@@ -64,6 +64,7 @@ void GPUGather(const platform::DeviceContext& ctx, const Tensor& src,
for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i]; for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i];
const T* p_src = src.data<T>(); const T* p_src = src.data<T>();
// why must be int?
const int* p_index = index.data<int>(); const int* p_index = index.data<int>();
T* p_output = output->data<T>(); T* p_output = output->data<T>();
......
...@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/group_norm_op.h" #include "paddle/fluid/operators/group_norm_op.h"
#include <memory>
#include <string> #include <string>
#include <unordered_map>
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -170,26 +172,18 @@ class GroupNormGradMaker : public framework::SingleGradOpDescMaker { ...@@ -170,26 +172,18 @@ class GroupNormGradMaker : public framework::SingleGradOpDescMaker {
} }
}; };
class GroupNormInplaceInToOut : public framework::InplaceInToOut { class GroupNormInplaceInToOut : public framework::InplaceOpInference {
public: public:
using InplaceInToOut::InplaceInToOut; std::unordered_map<std::string, std::string> operator()(
const framework::OpDesc &op_desc) const override {
protected:
std::unordered_map<std::string, std::string> Apply(
const framework::OpDesc &op_desc,
framework::BlockDesc *block) const override {
return {{"X", "Y"}}; return {{"X", "Y"}};
} }
}; };
class GroupNormGradInplaceInToOut : public framework::InplaceInToOut { class GroupNormGradInplaceInToOut : public framework::InplaceOpInference {
public: public:
using InplaceInToOut::InplaceInToOut; std::unordered_map<std::string, std::string> operator()(
const framework::OpDesc &op_desc) const override {
protected:
std::unordered_map<std::string, std::string> Apply(
const framework::OpDesc &op_desc,
framework::BlockDesc *block) const override {
return {{framework::GradVarName("Y"), framework::GradVarName("X")}}; return {{framework::GradVarName("Y"), framework::GradVarName("X")}};
} }
}; };
......
# JIT Kernel # JIT Kernel
JIT (Just In Time) Kernel contains actually generated code and some other implementations with the same logic. JIT (Just In Time) Kernel contains actually generated code and some other implementations with the same logic.
Each implementations has its own condition to use, defined in `UseMe`. Each implementation has its own condition to use, defined in `CanBeUsed`.
They are combined together to get the best performance of one single independent function. They are combined together to get the best performance of one single independent function.
They could be some very simple functions like vector multiply, or some complicated functions like LSTM. They could be some very simple functions like vector multiply, or some complicated functions like LSTM.
And they can be composed with some other existing jit kernels to build up a complex function. And they can be composed with some other existing jit kernels to build up a complex function.
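To illustrate the selection idea only (the names below are made up for this sketch and are not Paddle's real classes), each candidate implementation can carry a usability condition, and the dispatcher runs the first applicable candidate, assuming candidates are ordered from fastest to slowest:
```cpp
#include <functional>
#include <vector>

// Illustrative only: each candidate pairs a usability check with a kernel body.
struct VScalCandidate {
  std::function<bool(int)> can_be_used;  // condition on the attribute (the length here)
  std::function<void(float, const float*, float*, int)> kernel;
};

// Run the first candidate whose condition holds for the given length.
void RunVScal(const std::vector<VScalCandidate>& candidates, float a,
              const float* x, float* y, int n) {
  for (const auto& c : candidates) {
    if (c.can_be_used(n)) {
      c.kernel(a, x, y, n);
      return;
    }
  }
}
```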
...@@ -42,35 +42,62 @@ All basical definations of jit kernels are addressed in `paddle/fluid/operators/ ...@@ -42,35 +42,62 @@ All basical definations of jit kernels are addressed in `paddle/fluid/operators/
## How to use ## How to use
One simple function `jit::Get`, which is very easy to use, is supported to get the kernel. We present these methods to get the functions:
It can automatically return the expected function with best performance under the given attributes. - `GetAllCandidateFuncs`. It returns all the supported implementations. All of the implementations produce the same result. You can run some runtime benchmarks to choose which one should actually be used.
All kernels are included in `paddle/fluid/operators/jit/kernels.h`, you can only include this one header to get all the registered kernels. - `GetDefaultBestFunc`. It returns only one default function pointer, which is tuned offline with some general configurations and attributes. This should cover most situations.
- `KernelFuncs::Cache()`. It gets the default best function and caches it, so the next call with the same attribute reuses the cached one.
- `GetReferFunc`. It only gets the reference implementation on CPU, and all the other implementations have the same logic as this reference code.
And here are some examples:
Get from cache:
```cpp
using T = float;
jit::seq_pool_attr_t attr(width, jit::SeqPoolType::kSum);
auto seqpool_func = jit::KernelFuncs<jit::SeqPoolTuple<T>, platform::CPUPlace>::Cache().At(attr);
seqpool_func(src_data, dst_data, &attr);
```
Get all implementations and run once:
```cpp
using T = float;
jit::seq_pool_attr_t attr(width, jit::SeqPoolType::kSum);
auto funcs = jit::GetAllCandidateFuncsWithTypes<jit::SeqPoolTuple<T>, platform::CPUPlace>(attr);
for (auto f : funcs) {
LOG(INFO) << "Kernel implementation type: " << f.first;
f.second(src_data, dst_data, &attr);
}
```
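Get the default best function (a sketch; it assumes `jit::GetDefaultBestFunc` takes the same template parameters and attribute argument as `GetAllCandidateFuncsWithTypes` above):
```cpp
using T = float;
jit::seq_pool_attr_t attr(width, jit::SeqPoolType::kSum);
// One function pointer chosen from the offline-tuned defaults.
auto best_func = jit::GetDefaultBestFunc<jit::SeqPoolTuple<T>, platform::CPUPlace>(attr);
best_func(src_data, dst_data, &attr);
```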
All kernels are included in `paddle/fluid/operators/jit/kernels.h`, which is automatically generated at compile time; you only need to include this one header to get all the registered kernels.
## Solid Test ## Solid Test
- Unit Test - Unit Test
All functions should be compared with the corresponding reference functions, including data types `float` and `double`. All functions should be compared with the corresponding reference functions, including data types `float` and `double`.
- Benchmark - Benchmark
All functions should be tested, and make sure the `jit::Get` function obtains the best performance with all attributes. All functions should be tested, and make sure the `jit::GetDefaultBestFunc` function obtains the best performance with all attributes.
# How to add new kernel # How to add new kernel
## Required ## Required
1. Add `your_key` at `KernelType`. 1. Add `your_key` at `KernelType`.
2. Add reference function of `your_key`. 2. Add your new `KernelTuple`, which must include `your_key`. It should be a combination of the data type, attribute type and function type. You can refer to `SeqPoolTuple` (see the sketch after this list).
3. Add reference function of `your_key`.
Note: Note:
- this should run on CPU and must not depend on any third-party library. - this should run on CPU and must not depend on any third-party library.
- Add `USE_JITKERNEL_REFER(your_key)` in `refer/CmakeLists.txt` to make sure this code can be used. - Add `USE_JITKERNEL_REFER(your_key)` in `refer/CmakeLists.txt` to make sure this code can be used.
3. Add unit test in `test.cc`, and verify at least `float` and `double`. 4. Add unit test in `test.cc`, and verify at least `float` and `double`.
Test more data types for some special functions if necessary, for example `int8`. Test more data types for some special functions if necessary, for example `int8`.
4. Add functions in `benchmark.cc` to test all functions of the same `KernelType`. Make sure `jit::Get` always gets the best one. 5. Add functions in `benchmark.cc` to test all functions of the same `KernelType`. Make sure `GetDefaultBestFunc` always gets the best one.
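A rough sketch of steps 1-3 (hypothetical names only; the real declarations live under `paddle/fluid/operators/jit/` and may differ in detail):
```cpp
// Sketch only: not the actual Paddle declarations.
enum KernelType { kNone = 0, kMyOp /* 1. "your_key" added to KernelType */ };

// 2. The KernelTuple bundles the data type, the attribute type and the
//    function signature for this key.
template <typename T>
struct MyOpTuple {
  static constexpr KernelType kernel_type = kMyOp;
  typedef T data_type;
  typedef int attr_type;  // e.g. the processed length
  typedef void (*func_type)(const T* x, T* y, int n);
};

// 3. The reference implementation: plain CPU code with no third-party deps.
template <typename T>
void MyOpRefer(const T* x, T* y, int n) {
  for (int i = 0; i < n; ++i) {
    y[i] = x[i] + x[i];
  }
}
```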
## Optional ## Optional
Add more implementations of `your_key` for performance enhancement. Add more implementations of `your_key` for performance enhancement.
1. Add functions based on generated code in `gen`. It should be derived from `JitCode` and should have a corresponding creator from `JitCodeCreator` which will be registered on the `your_key`. 1. Add functions based on generated code in `gen`. It should be derived from `JitCode` and should have a corresponding creator from `JitCodeCreator` which will be registered on the `your_key`.
Note: Add new `KernelTuples` if necessary,your can refer to `XYZNTuples`. 2. If a new attribute type is added, you should specialize `JitCodeKey` for this type.
Specialie method `JitCodeKey` when add new attribute type。 3. Add more functions in `more`; you can use any third party you wish, like mkl, mkldnn or intrinsic code, to reach the best performance.
2. Add more functions in `more`,you can use any third party you wish, like mkl, mkldnn or intrinsic code to reach the best performance.
# JIT Kernel # JIT Kernel
Combine function templates with JIT to generate the needed kernel functions. Combine function templates with JIT to generate the needed kernel functions.
The kernels here are operator units at a smaller granularity than the kernels inside Operators, and they focus more on performance on different hardware. There can be implementations based on several third-party libraries, and each implementation has its own `UseMe` function that decides under which conditions it can be called. The kernels here are operator units at a smaller granularity than the kernels inside Operators, and they focus more on performance on different hardware. There can be implementations based on several third-party libraries, and each implementation has its own `CanBeUsed` function that decides under which conditions it can be called.
The functions implemented here can be very fine-grained, such as Vector MUL, or a complex piece of logic such as LSTM. Complex logic can also be assembled from its own lower-level functions. The functions implemented here can be very fine-grained, such as Vector MUL, or a complex piece of logic such as LSTM. Complex logic can also be assembled from its own lower-level functions.
Currently only high-performance computation on CPU is supported.
...@@ -39,27 +39,55 @@ PaddlePaddle/Paddle/paddle/fluid/ ...@@ -39,27 +39,55 @@ PaddlePaddle/Paddle/paddle/fluid/
## Dynamic retrieval ## Dynamic retrieval
A `jit::Get` method is provided, which retrieves a kernel by its type; each implementation has its own applicable range, and the needed kernel function is selected dynamically according to that range and the current conditions. - A `GetAllCandidateFuncs` method is provided, which, given the kernel type, returns all function implementations that meet the requirements. All implementations guarantee the same result but differ in speed; you can benchmark them at runtime for the specific input attribute sizes and pick the best function manually.
- A `GetDefaultBestFunc` method is provided, which returns one default best function implementation. It is the result of offline tuning with some general configurations and attributes, and covers the best result in most cases.
- A `KernelFuncs::Cache()` method is provided, which returns the default best function and caches the function pointer; when the same attribute appears again, the previously cached pointer is returned directly, otherwise a new one is created for that attribute.
- A `GetReferFunc` method is provided, which returns the kernel's most primitive logic function. It is independent of the kernel's input size and attributes, and there is one and only one implementation, on CPU. It characterizes the kernel's original logic, and all other implementations keep their logic consistent with it.
### Examples
Calling any kernel only requires including the header `"paddle/fluid/operators/jit/kernels.h"`, which is automatically generated at compile time.
Get the default best function directly from the cache.
```cpp
using T = float;
jit::seq_pool_attr_t attr(width, jit::SeqPoolType::kSum);
auto seqpool_func = jit::KernelFuncs<jit::SeqPoolTuple<T>, platform::CPUPlace>::Cache().At(attr);
seqpool_func(src_data, dst_data, &attr);
```
Run all implementations once and print the implementation type.
```cpp
using T = float;
jit::seq_pool_attr_t attr(width, jit::SeqPoolType::kSum);
auto funcs = jit::GetAllCandidateFuncsWithTypes<jit::SeqPoolTuple<T>, platform::CPUPlace>(attr);
for (auto f : funcs) {
LOG(INFO) << "Kernel implementation type: " << f.first;
f.second(src_data, dst_data, &attr);
}
```
## Tests ## Tests
- Logic tests - Logic tests
All implementations must be compared against the refer code and must meet the precision requirements, for both the float and double data types. All implementations must be compared against the refer code and must meet the precision requirements, for both the float and double data types.
- Performance tests - Performance tests
Compare the performance of all implementations, and also against the final `jit::Get` method; the performance obtained by that method must be the best under all conditions. Compare the performance of all implementations, and also against the final `jit::GetDefaultBestFunc` method; the performance obtained by that method must be the best under all conditions.
# How to add a new operator # How to add a new operator
- Add `your_key` to `KernelType`. 1. Add `your_key` to `KernelType`.
- Implement the Reference logic; this must be a CPU implementation and must not depend on any third-party library. After implementing it, add `USE_JITKERNEL_REFER(your_key)` in `refer/CmakeLists.txt` to enable this kernel. 2. Implement the Reference logic; this must be a CPU implementation and must not depend on any third-party library. After implementing it, add `USE_JITKERNEL_REFER(your_key)` in `refer/CmakeLists.txt` to enable this kernel.
- (optional) Implement more algorithms under the `more` directory; they may depend on third-party libraries such as mkl, intrinsic code or mkldnn. 3. (optional) Implement more algorithms under the `more` directory; they may depend on third-party libraries such as mkl, intrinsic code or mkldnn.
- (optional) Implement Xbyak-based code generation under the `gen` directory. The jitcode needs to implement its own `JitCodeCreator` and register it on the same `KernelType` as the refer implementation. 4. (optional) Implement Xbyak-based code generation under the `gen` directory. The jitcode needs to implement its own `JitCodeCreator` and register it on the same `KernelType` as the refer implementation.
- Add new `KernelTuples` when necessary; you can refer to `XYZNTuples`. A newly added Attr type needs a specialization of the `JitCodeKey` method. 5. Add a new `KernelTuple`, which must correspond one-to-one with a `KernelType`; it is a bundle of all the types involved, including the data type, the attribute type and the returned function type. You can refer to `SeqPoolTuple`. A newly added Attr type needs a specialization of the `JitCodeKey` method.
- Add unit tests in `test.cc`; at least the `float` and `double` data types must be tested, and additional data types, for example the `int8` related functions, should be covered when necessary. 6. Add unit tests in `test.cc`; at least the `float` and `double` data types must be tested, and additional data types, for example the `int8` related functions, should be covered when necessary.
- Add the corresponding performance comparison in `benchmark.cc`; all implementations of the same kernel must be compared, and make sure the implementation obtained by `jit::Get` is always the fastest. 7. Add the corresponding performance comparison in `benchmark.cc`; all implementations of the same kernel must be compared, and make sure the implementation obtained by `GetDefaultBestFunc` is always the fastest.
# Advantages # Advantages
- A unified Get method with a simple interface. - Convenient interfaces and flexible invocation.
- The same logic can have multiple implementations, which may depend on several third-party libraries without affecting each other. - The same logic can have multiple implementations, which may depend on several third-party libraries without affecting each other.
- The directory structure is clear, avoiding the poor readability caused by piling many macro definitions into one file. - The directory structure is clear, avoiding the poor readability caused by piling many macro definitions into one file.
- Optimization is convenient: a specific attribute can be optimized directly without affecting the performance under other attributes. - Optimization is convenient: a specific attribute can be optimized directly without affecting the performance under other attributes.
......
...@@ -88,4 +88,5 @@ REGISTER_OP_CPU_KERNEL( ...@@ -88,4 +88,5 @@ REGISTER_OP_CPU_KERNEL(
ops::LoadCombineOpKernel<paddle::platform::CPUDeviceContext, float>, ops::LoadCombineOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::LoadCombineOpKernel<paddle::platform::CPUDeviceContext, double>, ops::LoadCombineOpKernel<paddle::platform::CPUDeviceContext, double>,
ops::LoadCombineOpKernel<paddle::platform::CPUDeviceContext, int>, ops::LoadCombineOpKernel<paddle::platform::CPUDeviceContext, int>,
ops::LoadCombineOpKernel<paddle::platform::CPUDeviceContext, int8_t>,
ops::LoadCombineOpKernel<paddle::platform::CPUDeviceContext, int64_t>); ops::LoadCombineOpKernel<paddle::platform::CPUDeviceContext, int64_t>);
...@@ -64,4 +64,5 @@ REGISTER_OP_CPU_KERNEL( ...@@ -64,4 +64,5 @@ REGISTER_OP_CPU_KERNEL(
load, ops::LoadOpKernel<paddle::platform::CPUDeviceContext, float>, load, ops::LoadOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::LoadOpKernel<paddle::platform::CPUDeviceContext, double>, ops::LoadOpKernel<paddle::platform::CPUDeviceContext, double>,
ops::LoadOpKernel<paddle::platform::CPUDeviceContext, int>, ops::LoadOpKernel<paddle::platform::CPUDeviceContext, int>,
ops::LoadOpKernel<paddle::platform::CPUDeviceContext, int8_t>,
ops::LoadOpKernel<paddle::platform::CPUDeviceContext, int64_t>); ops::LoadOpKernel<paddle::platform::CPUDeviceContext, int64_t>);
...@@ -33,7 +33,7 @@ class LookupTableOp : public framework::OperatorWithKernel { ...@@ -33,7 +33,7 @@ class LookupTableOp : public framework::OperatorWithKernel {
auto table_dims = ctx->GetInputDim("W"); auto table_dims = ctx->GetInputDim("W");
auto ids_dims = ctx->GetInputDim("Ids"); auto ids_dims = ctx->GetInputDim("Ids");
int ids_rank = ids_dims.size(); int ids_rank = ids_dims.size();
VLOG(5) << "ids rank is " << ids_rank << std::endl;
PADDLE_ENFORCE_EQ(table_dims.size(), 2); PADDLE_ENFORCE_EQ(table_dims.size(), 2);
PADDLE_ENFORCE_EQ(ids_dims[ids_rank - 1], 1, PADDLE_ENFORCE_EQ(ids_dims[ids_rank - 1], 1,
"The last dimension of the 'Ids' tensor must be 1."); "The last dimension of the 'Ids' tensor must be 1.");
......
...@@ -322,14 +322,10 @@ class Reshape2GradOp : public framework::OperatorWithKernel { ...@@ -322,14 +322,10 @@ class Reshape2GradOp : public framework::OperatorWithKernel {
} }
}; };
class ReshapeOpInplaceInToOut : public framework::InplaceInToOut { class ReshapeOpInplaceInToOut : public framework::InplaceOpInference {
public: public:
using InplaceInToOut::InplaceInToOut; std::unordered_map<std::string, std::string> operator()(
const framework::OpDesc &op_desc) const override {
protected:
std::unordered_map<std::string, std::string> Apply(
const framework::OpDesc &op_desc,
framework::BlockDesc *block) const override {
std::unordered_map<std::string, std::string> inplace_in_to_out = { std::unordered_map<std::string, std::string> inplace_in_to_out = {
{"X", "Out"}, {"X", "Out"},
}; };
...@@ -337,13 +333,10 @@ class ReshapeOpInplaceInToOut : public framework::InplaceInToOut { ...@@ -337,13 +333,10 @@ class ReshapeOpInplaceInToOut : public framework::InplaceInToOut {
} }
}; };
class ReshapeGradInplaceInToOut : public framework::InplaceInToOut { class ReshapeGradInplaceInToOut : public framework::InplaceOpInference {
using InplaceInToOut::InplaceInToOut; public:
std::unordered_map<std::string, std::string> operator()(
protected: const framework::OpDesc &op_desc) const override {
std::unordered_map<std::string, std::string> Apply(
const framework::OpDesc &op_desc,
framework::BlockDesc *block) const override {
std::unordered_map<std::string, std::string> inplace_in_to_out = { std::unordered_map<std::string, std::string> inplace_in_to_out = {
{framework::GradVarName("Out"), framework::GradVarName("X")}, {framework::GradVarName("Out"), framework::GradVarName("X")},
}; };
......
...@@ -19,11 +19,27 @@ limitations under the License. */ ...@@ -19,11 +19,27 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace operators { namespace operators {
using Tensor = framework::Tensor;
class SaveCombineOp : public framework::OperatorWithKernel { class SaveCombineOp : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {} void InferShape(framework::InferShapeContext* ctx) const override {}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(framework::proto::VarType::FP32,
ctx.GetPlace());
}
// TODO(lujun): The override here is just to bypass transform
// in operator impl, which is not elegant enough.
framework::OpKernelType GetKernelTypeForVar(
const std::string& var_name, const Tensor& tensor,
const framework::OpKernelType& expected_kernel_type) const override {
return expected_kernel_type;
}
}; };
class SaveCombineOpProtoMaker : public framework::OpProtoAndCheckerMaker { class SaveCombineOpProtoMaker : public framework::OpProtoAndCheckerMaker {
...@@ -54,7 +70,7 @@ to a file on disk. ...@@ -54,7 +70,7 @@ to a file on disk.
"(string)" "(string)"
"The \"file_path\" where the LoDTensor variables will be saved.") "The \"file_path\" where the LoDTensor variables will be saved.")
.AddCustomChecker( .AddCustomChecker(
[](const std::string &path) { return !path.empty(); }); [](const std::string& path) { return !path.empty(); });
} }
}; };
...@@ -70,5 +86,4 @@ REGISTER_OP_CPU_KERNEL( ...@@ -70,5 +86,4 @@ REGISTER_OP_CPU_KERNEL(
save_combine, save_combine,
ops::SaveCombineOpKernel<paddle::platform::CPUDeviceContext, float>, ops::SaveCombineOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::SaveCombineOpKernel<paddle::platform::CPUDeviceContext, double>, ops::SaveCombineOpKernel<paddle::platform::CPUDeviceContext, double>,
ops::SaveCombineOpKernel<paddle::platform::CPUDeviceContext, int>, ops::SaveCombineOpKernel<paddle::platform::CPUDeviceContext, int>);
ops::SaveCombineOpKernel<paddle::platform::CPUDeviceContext, int64_t>);
...@@ -20,6 +20,4 @@ REGISTER_OP_CUDA_KERNEL( ...@@ -20,6 +20,4 @@ REGISTER_OP_CUDA_KERNEL(
save_combine, save_combine,
ops::SaveCombineOpKernel<paddle::platform::CUDADeviceContext, float>, ops::SaveCombineOpKernel<paddle::platform::CUDADeviceContext, float>,
ops::SaveCombineOpKernel<paddle::platform::CUDADeviceContext, double>, ops::SaveCombineOpKernel<paddle::platform::CUDADeviceContext, double>,
ops::SaveCombineOpKernel<paddle::platform::CUDADeviceContext, int>, ops::SaveCombineOpKernel<paddle::platform::CUDADeviceContext, int>);
ops::SaveCombineOpKernel<paddle::platform::CUDADeviceContext, int8_t>,
ops::SaveCombineOpKernel<paddle::platform::CUDADeviceContext, int64_t>);
...@@ -14,7 +14,9 @@ limitations under the License. */ ...@@ -14,7 +14,9 @@ limitations under the License. */
#include "paddle/fluid/operators/softmax_op.h" #include "paddle/fluid/operators/softmax_op.h"
#include <memory>
#include <string> #include <string>
#include <unordered_map>
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cudnn_helper.h" #include "paddle/fluid/platform/cudnn_helper.h"
...@@ -199,14 +201,10 @@ class SoftmaxOpGradMaker : public framework::SingleGradOpDescMaker { ...@@ -199,14 +201,10 @@ class SoftmaxOpGradMaker : public framework::SingleGradOpDescMaker {
} }
}; };
class SoftmaxInplaceInToOut : public framework::InplaceInToOut { class SoftmaxInplaceInToOut : public framework::InplaceOpInference {
public: public:
using framework::InplaceInToOut::InplaceInToOut; std::unordered_map<std::string, std::string> operator()(
const framework::OpDesc& op_desc) const override {
protected:
std::unordered_map<std::string, std::string> Apply(
const framework::OpDesc& op_desc,
framework::BlockDesc* block) const override {
return std::unordered_map<std::string, std::string>{ return std::unordered_map<std::string, std::string>{
{"X", "Out"}, {"X", "Out"},
}; };
......
...@@ -212,6 +212,7 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface { ...@@ -212,6 +212,7 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface {
CudnnHolder::CudnnHolder(const cudaStream_t* stream, const CUDAPlace& place) CudnnHolder::CudnnHolder(const cudaStream_t* stream, const CUDAPlace& place)
: workspace_(nullptr), stream_(stream), place_(place) { : workspace_(nullptr), stream_(stream), place_(place) {
PADDLE_ENFORCE(cudaSetDevice(place_.device));
PADDLE_ENFORCE(dynload::cudnnCreate(&cudnn_handle_)); PADDLE_ENFORCE(dynload::cudnnCreate(&cudnn_handle_));
PADDLE_ENFORCE(dynload::cudnnSetStream(cudnn_handle_, *stream_)); PADDLE_ENFORCE(dynload::cudnnSetStream(cudnn_handle_, *stream_));
} }
...@@ -252,10 +253,6 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) ...@@ -252,10 +253,6 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place)
#endif #endif
} }
if (dynload::HasCUDNN()) {
cudnn_holder_.reset(new CudnnHolder(&stream_, place));
}
driver_version_ = GetCUDADriverVersion(place_.device); driver_version_ = GetCUDADriverVersion(place_.device);
runtime_version_ = GetCUDARuntimeVersion(place_.device); runtime_version_ = GetCUDARuntimeVersion(place_.device);
...@@ -348,12 +345,21 @@ bool CUDADeviceContext::tensor_core_available() const { ...@@ -348,12 +345,21 @@ bool CUDADeviceContext::tensor_core_available() const {
return cublas_tensor_core_handle_ != nullptr; return cublas_tensor_core_handle_ != nullptr;
} }
CudnnHolder* CUDADeviceContext::cudnn_holder() const {
std::call_once(init_cudnn_, [&]() {
if (dynload::HasCUDNN()) {
cudnn_holder_.reset(new CudnnHolder(&stream_, place_));
}
});
return cudnn_holder_.get();
}
cudnnHandle_t CUDADeviceContext::cudnn_handle() const { cudnnHandle_t CUDADeviceContext::cudnn_handle() const {
return cudnn_holder_->cudnn_handle(); return cudnn_holder()->cudnn_handle();
} }
CudnnWorkspaceHandle CUDADeviceContext::cudnn_workspace_handle() const { CudnnWorkspaceHandle CUDADeviceContext::cudnn_workspace_handle() const {
return CudnnWorkspaceHandle(cudnn_holder_.get()); return CudnnWorkspaceHandle(cudnn_holder());
} }
cudaStream_t CUDADeviceContext::stream() const { return stream_; } cudaStream_t CUDADeviceContext::stream() const { return stream_; }
......
...@@ -292,9 +292,11 @@ class CUDADeviceContext : public DeviceContext { ...@@ -292,9 +292,11 @@ class CUDADeviceContext : public DeviceContext {
private: private:
CUDAPlace place_; CUDAPlace place_;
mutable std::once_flag init_cudnn_;
std::unique_ptr<Eigen::GpuDevice> eigen_device_; std::unique_ptr<Eigen::GpuDevice> eigen_device_;
std::unique_ptr<EigenCudaStreamDevice> eigen_stream_; std::unique_ptr<EigenCudaStreamDevice> eigen_stream_;
std::unique_ptr<CudnnHolder> cudnn_holder_; mutable std::unique_ptr<CudnnHolder> cudnn_holder_;
cudaStream_t stream_; cudaStream_t stream_;
std::unique_ptr<CublasHandleHolder> cublas_handle_; std::unique_ptr<CublasHandleHolder> cublas_handle_;
...@@ -317,6 +319,7 @@ class CUDADeviceContext : public DeviceContext { ...@@ -317,6 +319,7 @@ class CUDADeviceContext : public DeviceContext {
// StreamCallbackManager is thread-safe // StreamCallbackManager is thread-safe
std::unique_ptr<StreamCallbackManager> callback_manager_; std::unique_ptr<StreamCallbackManager> callback_manager_;
CudnnHolder* cudnn_holder() const;
DISABLE_COPY_AND_ASSIGN(CUDADeviceContext); DISABLE_COPY_AND_ASSIGN(CUDADeviceContext);
}; };
......
...@@ -242,6 +242,7 @@ PYBIND11_MODULE(core, m) { ...@@ -242,6 +242,7 @@ PYBIND11_MODULE(core, m) {
self.forward_id_ = forward_id; self.forward_id_ = forward_id;
}, },
py::return_value_policy::reference) py::return_value_policy::reference)
.def_property_readonly("type", &imperative::OpBase::Type)
.def_property( .def_property(
"backward_id", "backward_id",
[](const imperative::OpBase &self) { return self.backward_id_; }, [](const imperative::OpBase &self) { return self.backward_id_; },
...@@ -355,7 +356,8 @@ PYBIND11_MODULE(core, m) { ...@@ -355,7 +356,8 @@ PYBIND11_MODULE(core, m) {
.def("_set_double_element", TensorSetElement<double>) .def("_set_double_element", TensorSetElement<double>)
.def("_get_double_element", TensorGetElement<double>) .def("_get_double_element", TensorGetElement<double>)
.def("_place", [](Tensor &self) { return self.place(); }) .def("_place", [](Tensor &self) { return self.place(); })
.def("_dtype", [](Tensor &self) { return self.type(); }); .def("_dtype", [](Tensor &self) { return self.type(); })
.def("__getitem__", PySliceTensor, py::return_value_policy::reference);
py::class_<LoDTensor, Tensor>(m, "LoDTensor", R"DOC( py::class_<LoDTensor, Tensor>(m, "LoDTensor", R"DOC(
LoDTensor is a Tensor with optional LoD information. LoDTensor is a Tensor with optional LoD information.
...@@ -507,6 +509,13 @@ PYBIND11_MODULE(core, m) { ...@@ -507,6 +509,13 @@ PYBIND11_MODULE(core, m) {
Returns: Returns:
out (bool): whether the lod is valid. out (bool): whether the lod is valid.
)DOC")
.def("__getitem__", PySliceTensor, py::return_value_policy::reference,
R"DOC(
Slice the original Tensor, and remove the LoD information.
Returns:
out (Tensor): new Tensor(NOT LoDTensor).
)DOC"); )DOC");
py::class_<SelectedRows>(m, "SelectedRows") py::class_<SelectedRows>(m, "SelectedRows")
...@@ -779,7 +788,11 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -779,7 +788,11 @@ All parameter, weight, gradient are variables in Paddle.
#if (defined(PADDLE_WITH_CUDA) && !defined(_WIN32)) #if (defined(PADDLE_WITH_CUDA) && !defined(_WIN32))
py::class_<platform::Communicator>(m, "Communicator").def(py::init<>()); py::class_<platform::Communicator>(m, "Communicator").def(py::init<>());
#endif #endif
py::class_<platform::CUDAPlace>(m, "CUDAPlace") py::class_<platform::CUDAPlace>(m, "CUDAPlace", R"DOC(
CUDAPlace is a descriptor of a device. It represents a GPU, and each CUDAPlace
has a dev_id to indicate which card it refers to.
The memory of CUDAPlaces with different dev_id values is not mutually accessible.
)DOC")
.def("__init__", .def("__init__",
[](platform::CUDAPlace &self, int dev_id) { [](platform::CUDAPlace &self, int dev_id) {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
...@@ -800,7 +813,10 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -800,7 +813,10 @@ All parameter, weight, gradient are variables in Paddle.
&IsSamePlace<platform::CUDAPlace, platform::CUDAPinnedPlace>) &IsSamePlace<platform::CUDAPlace, platform::CUDAPinnedPlace>)
.def("__str__", string::to_string<const platform::CUDAPlace &>); .def("__str__", string::to_string<const platform::CUDAPlace &>);
py::class_<paddle::platform::CPUPlace>(m, "CPUPlace") py::class_<paddle::platform::CPUPlace>(m, "CPUPlace", R"DOC(
CPUPlace is a descriptor of a device. It represents a CPU, and the memory
it describes can be accessed by the CPU.
)DOC")
.def(py::init<>()) .def(py::init<>())
.def("_type", &PlaceIndex<platform::CPUPlace>) .def("_type", &PlaceIndex<platform::CPUPlace>)
.def("_equals", &IsSamePlace<platform::CPUPlace, platform::Place>) .def("_equals", &IsSamePlace<platform::CPUPlace, platform::Place>)
...@@ -810,7 +826,10 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -810,7 +826,10 @@ All parameter, weight, gradient are variables in Paddle.
&IsSamePlace<platform::CPUPlace, platform::CUDAPinnedPlace>) &IsSamePlace<platform::CPUPlace, platform::CUDAPinnedPlace>)
.def("__str__", string::to_string<const platform::CPUPlace &>); .def("__str__", string::to_string<const platform::CPUPlace &>);
py::class_<paddle::platform::CUDAPinnedPlace>(m, "CUDAPinnedPlace") py::class_<paddle::platform::CUDAPinnedPlace>(m, "CUDAPinnedPlace", R"DOC(
CUDAPinnedPlace is a descriptor of a device. The memory of CUDAPinnedPlace
can be accessed by both the GPU and the CPU.
)DOC")
.def("__init__", .def("__init__",
[](platform::CUDAPinnedPlace &self) { [](platform::CUDAPinnedPlace &self) {
#ifndef PADDLE_WITH_CUDA #ifndef PADDLE_WITH_CUDA
......
...@@ -14,16 +14,22 @@ limitations under the License. */ ...@@ -14,16 +14,22 @@ limitations under the License. */
#pragma once #pragma once
#include <Python.h> #include <Python.h>
#include <algorithm>
#include <memory>
#include <string> #include <string>
#include <tuple> #include <tuple>
#include <vector> #include <vector>
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/operators/math/concat_and_split.h"
#include "paddle/fluid/operators/strided_memcpy.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/float16.h"
#include "pybind11/numpy.h" #include "pybind11/numpy.h"
#include "pybind11/pybind11.h" #include "pybind11/pybind11.h"
namespace py = pybind11;
namespace paddle { namespace paddle {
namespace pybind { namespace pybind {
namespace details { namespace details {
...@@ -191,6 +197,253 @@ inline void PyCPUTensorSetFromArray( ...@@ -191,6 +197,253 @@ inline void PyCPUTensorSetFromArray(
std::memcpy(dst, array.data(), sizeof(uint16_t) * array.size()); std::memcpy(dst, array.data(), sizeof(uint16_t) * array.size());
} }
template <typename T, size_t D>
void _sliceCompute(const framework::Tensor *in, framework::Tensor *out,
const platform::CPUDeviceContext &ctx,
const std::vector<int> &axes,
const std::vector<int> &starts) {
auto &eigen_place = *ctx.eigen_device();
auto place = in->place();
auto out_dims = out->dims();
auto in_dims = in->dims();
auto offsets = Eigen::array<int, D>();
auto extents = Eigen::array<int, D>();
for (size_t i = 0; i < D; ++i) {
offsets[i] = 0;
extents[i] = out_dims[i];
}
int start;
for (size_t i = 0; i < axes.size(); ++i) {
start = starts[i];
if (start < 0) {
start = (start + in_dims[axes[i]]);
}
start = std::max(start, 0);
offsets[axes[i]] = start;
}
auto in_t =
framework::EigenTensor<T, D, Eigen::RowMajor, Eigen::DenseIndex>::From(
*in);
auto out_t =
framework::EigenTensor<T, D, Eigen::RowMajor, Eigen::DenseIndex>::From(
*out);
out_t.device(eigen_place) = in_t.slice(offsets, extents);
}
template <typename T>
void _concatCompute(const std::vector<paddle::framework::Tensor> &ins,
paddle::framework::Tensor *out,
const platform::CPUDeviceContext &ctx, int64_t axis) {
if (axis == 0 && ins.size() < 10) {
size_t output_offset = 0;
for (auto &in : ins) {
auto in_stride = framework::stride_numel(in.dims());
auto out_stride = framework::stride_numel(out->dims());
paddle::operators::StridedNumelCopyWithAxis<T>(
ctx, axis, out->data<T>() + output_offset, out_stride, in.data<T>(),
in_stride, in_stride[axis]);
output_offset += in_stride[axis];
}
} else {
paddle::operators::math::ConcatFunctor<platform::CPUDeviceContext, T>
concat_functor;
concat_functor(ctx, ins, static_cast<int>(axis), out);
}
}
void _getSliceinfo(const framework::Tensor &self, py::object obj,
const int64_t dim, int64_t *pstart, int64_t *pstop,
int64_t *pstep, int64_t *pslicelength) {
auto &start = *pstart;
auto &stop = *pstop;
auto &step = *pstep;
auto &slicelength = *pslicelength;
const framework::DDim &srcDDim = self.dims();
if (dim < 0 || dim >= srcDDim.size()) {
throw py::index_error();
}
if (py::isinstance<py::slice>(obj)) {
size_t lstart, lstop, lstep, lslicelength;
py::slice s = static_cast<py::slice>(obj);
if (!s.compute(srcDDim[dim], &lstart, &lstop, &lstep, &lslicelength)) {
throw py::index_error();
}
start = static_cast<int64_t>(lstart);
stop = static_cast<int64_t>(lstop);
step = static_cast<int64_t>(lstep);
slicelength = static_cast<int64_t>(lslicelength);
} else if (py::isinstance<py::int_>(obj)) {
start = static_cast<int64_t>(static_cast<py::int_>(obj));
if (std::abs(start) >= srcDDim[dim]) {
throw py::index_error();
}
start = (start >= 0) ? start : srcDDim[dim] - start;
stop = start + 1;
step = 1;
slicelength = 1;
} else {
throw py::index_error();
}
}
inline framework::Tensor *_getTensor(const framework::Tensor &self,
const framework::DDim &ddim) {
framework::Tensor *output = new framework::Tensor();
output->Resize(ddim);
auto place = self.place();
if (platform::is_cpu_place(place)) {
output->mutable_data(boost::get<platform::CPUPlace>(place), self.type());
#ifdef PADDLE_WITH_CUDA
} else {
if (platform::is_cuda_pinned_place(place)) {
output->mutable_data(boost::get<platform::CUDAPinnedPlace>(place),
self.type());
} else if ((platform::is_gpu_place(place))) {
output->mutable_data(boost::get<platform::CUDAPlace>(place), self.type());
}
#endif
}
return output;
}
template <typename T>
void _sliceDapper(const framework::Tensor *in, framework::Tensor *out,
const platform::CPUDeviceContext &ctx,
const std::vector<int> &axes, const std::vector<int> &starts,
int size) {
switch (size) {
case 1:
_sliceCompute<T, 1>(in, out, ctx, axes, starts);
break;
case 2:
_sliceCompute<T, 2>(in, out, ctx, axes, starts);
break;
case 3:
_sliceCompute<T, 3>(in, out, ctx, axes, starts);
break;
case 4:
_sliceCompute<T, 4>(in, out, ctx, axes, starts);
break;
case 5:
_sliceCompute<T, 5>(in, out, ctx, axes, starts);
break;
case 6:
_sliceCompute<T, 6>(in, out, ctx, axes, starts);
break;
case 7:
_sliceCompute<T, 7>(in, out, ctx, axes, starts);
break;
case 8:
_sliceCompute<T, 8>(in, out, ctx, axes, starts);
break;
case 9:
_sliceCompute<T, 9>(in, out, ctx, axes, starts);
break;
default:
PADDLE_THROW("dim size not expected, current is %d", size);
break;
}
}
template <typename T>
inline framework::Tensor *_sliceWrapper(const framework::Tensor &self,
const platform::CPUDeviceContext &ctx,
py::object obj, int dim, int64_t start,
int64_t slicelength) {
framework::DDim dstDDim = self.dims();
dstDDim[dim] = static_cast<int64_t>(slicelength);
std::vector<int> axes({dim});
std::vector<int> starts({static_cast<int>(start)});
framework::Tensor *output = _getTensor(self, dstDDim);
_sliceDapper<T>(&self, output, ctx, axes, starts, dstDDim.size());
return output;
}
template <typename T>
inline framework::Tensor *_sliceAndConcat(const framework::Tensor &self,
py::object obj, int dim) {
platform::CPUDeviceContext ctx;
int64_t start, stop, step, slicelength;
_getSliceinfo(self, obj, dim, &start, &stop, &step, &slicelength);
if (step == 1 || slicelength == 1) {
return _sliceWrapper<T>(self, ctx, obj, dim, start, slicelength);
} else {
std::vector<framework::Tensor> ins;
for (auto i = 0; i < slicelength; ++i, start += step) {
ins.emplace_back(*_sliceWrapper<T>(self, ctx, obj, dim, start, 1));
}
// do the concat operation
framework::DDim dstDDim = self.dims();
dstDDim[dim] = static_cast<int64_t>(slicelength);
framework::Tensor *output1 = _getTensor(self, dstDDim);
_concatCompute<T>(ins, output1, ctx, dim);
return output1;
}
}
inline framework::Tensor *_sliceTensor(const framework::Tensor &self,
py::object obj, int dim) {
auto src_type = self.type();
switch (src_type) {
case framework::proto::VarType::FP16:
return _sliceAndConcat<paddle::platform::float16>(self, obj, dim);
case framework::proto::VarType::FP32:
return _sliceAndConcat<float>(self, obj, dim);
case framework::proto::VarType::FP64:
return _sliceAndConcat<double>(self, obj, dim);
case framework::proto::VarType::INT32:
return _sliceAndConcat<int>(self, obj, dim);
case framework::proto::VarType::INT64:
return _sliceAndConcat<int64_t>(self, obj, dim);
case framework::proto::VarType::BOOL:
return _sliceAndConcat<bool>(self, obj, dim);
case framework::proto::VarType::INT16:
return _sliceAndConcat<bool>(self, obj, dim);
case framework::proto::VarType::UINT8:
return _sliceAndConcat<bool>(self, obj, dim);
default:
PADDLE_THROW("Not support type %d", src_type);
}
}
inline framework::Tensor *_pySliceTensor(const framework::Tensor &self,
py::object obj) {
if (py::isinstance<py::tuple>(obj)) {
py::list l = static_cast<py::list>(obj);
std::unique_ptr<framework::Tensor> target;
framework::Tensor *src = const_cast<framework::Tensor *>(&self);
for (auto i = 0; i < static_cast<int>(l.size()); ++i) {
src = _sliceTensor(*src, l[i], i);
if (i + 1 == static_cast<int>(l.size())) {
return src;
} else {
target.reset(src);
}
}
return nullptr;
} else {
return _sliceTensor(self, obj, 0);
}
}
inline framework::Tensor *PySliceTensor(const framework::Tensor &self,
py::object obj) {
if (platform::is_gpu_place(self.place())) {
std::unique_ptr<framework::Tensor> holder;
framework::Tensor src;
framework::TensorCopySync(self, platform::CPUPlace(), &src);
framework::Tensor *output = _pySliceTensor(src, obj);
holder.reset(output);
framework::Tensor *dst = _getTensor(*output, output->dims());
framework::TensorCopySync(*output, self.place(), dst);
return dst;
} else {
return _pySliceTensor(self, obj);
}
}
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
template <typename T> template <typename T>
void PyCUDATensorSetFromArray( void PyCUDATensorSetFromArray(
......
...@@ -15,7 +15,7 @@ ...@@ -15,7 +15,7 @@
WMT14 dataset. WMT14 dataset.
The original WMT14 dataset is too large, so a small set of data is The original WMT14 dataset is too large, so a small set of data is
provided. This module will download the dataset from provided. This module will download the dataset from
http://paddlepaddle.cdn.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz and http://paddlepaddle.bj.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz and
parse training set and test set into paddle reader creators. parse training set and test set into paddle reader creators.
""" """
......
...@@ -59,13 +59,14 @@ from .parallel_executor import * ...@@ -59,13 +59,14 @@ from .parallel_executor import *
from . import compiler from . import compiler
from .compiler import * from .compiler import *
from paddle.fluid.layers.math_op_patch import monkey_patch_variable from paddle.fluid.layers.math_op_patch import monkey_patch_variable
from . import install_check
Tensor = LoDTensor Tensor = LoDTensor
__all__ = framework.__all__ + executor.__all__ + \ __all__ = framework.__all__ + executor.__all__ + \
trainer.__all__ + inferencer.__all__ + transpiler.__all__ + \ trainer.__all__ + inferencer.__all__ + transpiler.__all__ + \
parallel_executor.__all__ + lod_tensor.__all__ + \ parallel_executor.__all__ + lod_tensor.__all__ + \
data_feed_desc.__all__ + async_executor.__all__ + compiler.__all__ + [ data_feed_desc.__all__ + async_executor.__all__ + compiler.__all__ + [
'io', 'io',
'initializer', 'initializer',
'layers', 'layers',
...@@ -91,6 +92,7 @@ __all__ = framework.__all__ + executor.__all__ + \ ...@@ -91,6 +92,7 @@ __all__ = framework.__all__ + executor.__all__ + \
'unique_name', 'unique_name',
'recordio_writer', 'recordio_writer',
'Scope', 'Scope',
'install_check',
] ]
......
...@@ -65,7 +65,7 @@ Please note that [full ImageNet validation dataset](http://www.image-net.org/cha ...@@ -65,7 +65,7 @@ Please note that [full ImageNet validation dataset](http://www.image-net.org/cha
Notes: Notes:
* The accuracy measurement requires the model with `label`. * The accuracy measurement requires the model with `label`.
* The INT8 theoretical speedup is 4X on Intel® Xeon® Cascadelake Server (please refer to `providing a theoretical peak compute gain of 4x int8 OPS over fp32 OPS` in [Reference](https://software.intel.com/en-us/articles/lower-numerical-precision-deep-learning-inference-and-training "Reference")). However, the actual test results at the model level will be less than 4X, and in general the average is about 2X. In addition, the calculation library optimization of batch size 1 is not as good as the large batch size. * The INT8 theoretical speedup is 4X on Intel® Xeon® Cascadelake Server (please refer to `The theoretical peak compute gains are 4x int8 OPS over fp32 OPS.` in [Reference](https://software.intel.com/en-us/articles/lower-numerical-precision-deep-learning-inference-and-training "Reference")). Therefore, op-level gain is 4X and topology-level is smaller.
## 4. How to reproduce the results ## 4. How to reproduce the results
* Small dataset (Single core) * Small dataset (Single core)
......
...@@ -14,15 +14,10 @@ ...@@ -14,15 +14,10 @@
import collections import collections
import numpy as np import numpy as np
import six
from ..... import compat as cpt from ..... import compat as cpt
from .... import core from .... import core
from .... import Executor
from ....framework import IrGraph from ....framework import IrGraph
from ....framework import IrNode from ....framework import IrNode
from ....framework import Program
from ....initializer import Constant
from ....initializer import NumpyArrayInitializer
from .... import unique_name from .... import unique_name
__all__ = [ __all__ = [
...@@ -107,7 +102,6 @@ class QuantizationTransformPass(object): ...@@ -107,7 +102,6 @@ class QuantizationTransformPass(object):
self._window_size = window_size self._window_size = window_size
self._moving_rate = moving_rate self._moving_rate = moving_rate
self._need_initialized = collections.OrderedDict()
self._quantizable_ops = ['conv2d', 'depthwise_conv2d', 'mul'] self._quantizable_ops = ['conv2d', 'depthwise_conv2d', 'mul']
self._conv_ops = ['conv2d', 'depthwise_conv2d'] self._conv_ops = ['conv2d', 'depthwise_conv2d']
self._quantizable_grad_ops = [ self._quantizable_grad_ops = [
...@@ -127,7 +121,8 @@ class QuantizationTransformPass(object): ...@@ -127,7 +121,8 @@ class QuantizationTransformPass(object):
""" """
assert isinstance(graph, assert isinstance(graph,
IrGraph), 'graph must be the instance of IrGraph.' IrGraph), 'graph must be the instance of IrGraph.'
self._need_initialized.clear() #sequential_execution = core.get_pass('sequential_execution_pass')
#sequential_execution.apply(graph.graph)
self._is_test = graph.is_test() self._is_test = graph.is_test()
# marked the variable which has been dequantized. # marked the variable which has been dequantized.
dequantized_vars = collections.OrderedDict() dequantized_vars = collections.OrderedDict()
...@@ -135,6 +130,8 @@ class QuantizationTransformPass(object): ...@@ -135,6 +130,8 @@ class QuantizationTransformPass(object):
def _transform_forward(graph, op): def _transform_forward(graph, op):
for var_node in op.inputs: for var_node in op.inputs:
if var_node.name() not in op.input_arg_names():
continue
if var_node.name() in dequantized_vars: if var_node.name() in dequantized_vars:
dequant_var_node = dequantized_vars[var_node.name()] dequant_var_node = dequantized_vars[var_node.name()]
else: else:
...@@ -168,6 +165,8 @@ class QuantizationTransformPass(object): ...@@ -168,6 +165,8 @@ class QuantizationTransformPass(object):
def _transform_backward(graph, op): def _transform_backward(graph, op):
no_dequanted_input_vars = True no_dequanted_input_vars = True
for var_node in op.inputs: for var_node in op.inputs:
if var_node.name() not in op.input_arg_names():
continue
if var_node.name() in dequantized_vars: if var_node.name() in dequantized_vars:
dequant_var_node = dequantized_vars[var_node.name()] dequant_var_node = dequantized_vars[var_node.name()]
graph.update_input_link(var_node, dequant_var_node, op) graph.update_input_link(var_node, dequant_var_node, op)
...@@ -188,25 +187,7 @@ class QuantizationTransformPass(object): ...@@ -188,25 +187,7 @@ class QuantizationTransformPass(object):
for op in ops: for op in ops:
if op.name() in self._quantizable_grad_ops: if op.name() in self._quantizable_grad_ops:
_transform_backward(graph, op) _transform_backward(graph, op)
graph.resolve_hazard()
if len(self._need_initialized) > 0:
assert self._scope is not None, \
'The scope cannot be set None when activation_quantize_type equals to range_abs_max.'
assert self._place is not None, \
'The place cannot be set None when activation_quantize_type equals to range_abs_max.'
init_program = Program()
for var_desc, initializer in six.iteritems(self._need_initialized):
var = init_program.global_block().create_var(
name=var_desc.name(),
shape=var_desc.shape(),
dtype=var_desc.dtype(),
type=var_desc.type(),
lod_level=var_desc.lod_level(),
persistable=var_desc.persistable())
initializer(var, init_program.global_block())
exe = Executor(self._place)
exe.run(program=init_program, scope=self._scope)
return graph return graph
def _create_global_step(self, graph): def _create_global_step(self, graph):
...@@ -222,8 +203,9 @@ class QuantizationTransformPass(object): ...@@ -222,8 +203,9 @@ class QuantizationTransformPass(object):
var_type=core.VarDesc.VarType.LOD_TENSOR, var_type=core.VarDesc.VarType.LOD_TENSOR,
shape=[1], shape=[1],
var_dtype=core.VarDesc.VarType.INT64) var_dtype=core.VarDesc.VarType.INT64)
self._need_initialized[global_step_in.var()] = \ self._init_var_node(
Constant(value=0, force_cpu=True) global_step_in, np.zeros(
[1], dtype='int64'))
global_step_out = graph.create_var_node_from_desc( global_step_out = graph.create_var_node_from_desc(
global_step_in.var()) global_step_in.var())
# The attribute of `op_role` is needed by ParallelExecutor. # The attribute of `op_role` is needed by ParallelExecutor.
...@@ -300,7 +282,9 @@ class QuantizationTransformPass(object): ...@@ -300,7 +282,9 @@ class QuantizationTransformPass(object):
var_type=core.VarDesc.VarType.LOD_TENSOR, var_type=core.VarDesc.VarType.LOD_TENSOR,
shape=[1], shape=[1],
var_dtype=var_node.dtype()) var_dtype=var_node.dtype())
self._need_initialized[scale_in_node.var()] = Constant(value=0.001) data_type = 'float64' if var_node.dtype(
) == core.VarDesc.VarType.FP64 else 'float32'
self._init_var_node(scale_in_node, np.array([0.001], dtype=data_type))
scale_out_node = graph.create_var_node_from_desc(scale_in_node.var()) scale_out_node = graph.create_var_node_from_desc(scale_in_node.var())
inputs = {'X': var_node, 'InScale': scale_in_node} inputs = {'X': var_node, 'InScale': scale_in_node}
...@@ -313,7 +297,11 @@ class QuantizationTransformPass(object): ...@@ -313,7 +297,11 @@ class QuantizationTransformPass(object):
var_type=core.VarDesc.VarType.LOD_TENSOR, var_type=core.VarDesc.VarType.LOD_TENSOR,
shape=[self._window_size], shape=[self._window_size],
var_dtype=var_node.dtype()) var_dtype=var_node.dtype())
self._need_initialized[scales_node.var()] = Constant(value=0) data_type = 'float64' if var_node.dtype(
) == core.VarDesc.VarType.FP64 else 'float32'
self._init_var_node(
scales_node, np.zeros(
[self._window_size], dtype=data_type))
inputs['Iter'] = self._global_step inputs['Iter'] = self._global_step
outputs['OutScales'] = scales_node outputs['OutScales'] = scales_node
attrs = { attrs = {
...@@ -353,7 +341,9 @@ class QuantizationTransformPass(object): ...@@ -353,7 +341,9 @@ class QuantizationTransformPass(object):
var_type=core.VarDesc.VarType.LOD_TENSOR, var_type=core.VarDesc.VarType.LOD_TENSOR,
shape=[1], shape=[1],
var_dtype=var_node.dtype()) var_dtype=var_node.dtype())
self._need_initialized[scale_in_node.var()] = Constant(value=0.001) data_type = 'float64' if var_node.dtype(
) == core.VarDesc.VarType.FP64 else 'float32'
self._init_var_node(scale_in_node, np.array([0.001], dtype=data_type))
scale_out_node = graph.create_var_node_from_desc(scale_in_node.var()) scale_out_node = graph.create_var_node_from_desc(scale_in_node.var())
ins = {'X': var_node, 'InScale': scale_in_node} ins = {'X': var_node, 'InScale': scale_in_node}
...@@ -364,13 +354,15 @@ class QuantizationTransformPass(object): ...@@ -364,13 +354,15 @@ class QuantizationTransformPass(object):
var_type=core.VarDesc.VarType.LOD_TENSOR, var_type=core.VarDesc.VarType.LOD_TENSOR,
var_dtype=var_node.dtype(), var_dtype=var_node.dtype(),
shape=[1]) shape=[1])
self._need_initialized[state_in_node.var()] = Constant(value=1) data_type = 'float64' if var_node.dtype(
) == core.VarDesc.VarType.FP64 else 'float32'
self._init_var_node(state_in_node, np.ones([1], dtype=data_type))
accum_in_node = graph.create_persistable_node( accum_in_node = graph.create_persistable_node(
name=unique_name.generate('accum'), name=unique_name.generate('accum'),
var_type=core.VarDesc.VarType.LOD_TENSOR, var_type=core.VarDesc.VarType.LOD_TENSOR,
var_dtype=var_node.dtype(), var_dtype=var_node.dtype(),
shape=[1]) shape=[1])
self._need_initialized[accum_in_node.var()] = Constant(value=1) self._init_var_node(accum_in_node, np.ones([1], dtype=data_type))
state_out_node = graph.create_var_node_from_desc(state_in_node.var( state_out_node = graph.create_var_node_from_desc(state_in_node.var(
)) ))
accum_out_node = graph.create_var_node_from_desc(accum_in_node.var( accum_out_node = graph.create_var_node_from_desc(accum_in_node.var(
...@@ -490,6 +482,16 @@ class QuantizationTransformPass(object): ...@@ -490,6 +482,16 @@ class QuantizationTransformPass(object):
graph.link_to(dequant_op_node, dequant_var_node) graph.link_to(dequant_op_node, dequant_var_node)
return dequant_var_node return dequant_var_node
def _init_var_node(self, var_node, value):
assert isinstance(
value, np.ndarray), 'The type of value should be numpy array.'
assert self._scope is not None, \
'The scope cannot be set None when activation_quantize_type equals to range_abs_max.'
assert self._place is not None, \
'The place cannot be set None when activation_quantize_type equals to range_abs_max.'
tensor = self._scope.var(var_node.name()).get_tensor()
tensor.set(value, self._place)
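For readers comparing this with the removed Executor-based initialization, the new helper amounts to writing the initial value straight into the scope. A minimal standalone sketch of that pattern (the variable name, value and place below are illustrative placeholders, not taken from this pass):

import numpy as np
import paddle.fluid as fluid

scope = fluid.global_scope()
place = fluid.CPUPlace()
# Create (or fetch) the variable in the scope and set its tensor directly,
# instead of building an init Program and running it with an Executor.
tensor = scope.var('hypothetical_scale_var').get_tensor()
tensor.set(np.array([0.001], dtype='float32'), place)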
def _quantized_var_name(self, var_name): def _quantized_var_name(self, var_name):
""" """
Return quantized variable name for the input `var_name`. Return quantized variable name for the input `var_name`.
...@@ -592,7 +594,8 @@ class QuantizationFreezePass(object): ...@@ -592,7 +594,8 @@ class QuantizationFreezePass(object):
self._weight_bits) self._weight_bits)
self._restore_var(input_arg_name, quantized_param_v) self._restore_var(input_arg_name, quantized_param_v)
else: else:
scale_v = graph.var_node(op_node.output('OutScale')[0]) scale_v = self._to_node(op_node.outputs,
op_node.output('OutScale')[0])
self._var_scale_map[input_arg_name] = scale_v self._var_scale_map[input_arg_name] = scale_v
ops = graph.all_op_nodes() ops = graph.all_op_nodes()
...@@ -613,32 +616,35 @@ class QuantizationFreezePass(object): ...@@ -613,32 +616,35 @@ class QuantizationFreezePass(object):
for op_node in ops: for op_node in ops:
# insert dequant_op after fc/conv, need to rename inputs of the followed ops # insert dequant_op after fc/conv, need to rename inputs of the followed ops
for var_node in op_node.inputs: for var_node in op_node.inputs:
name = var_node.name() if var_node.node in self._op_output_rename_map:
if name in self._op_output_rename_map: old_in = var_node
old_in = graph.var_node(name) new_in = self._op_output_rename_map[var_node.node]
new_in = self._op_output_rename_map[name]
graph.update_input_link(old_in, new_in, op_node) graph.update_input_link(old_in, new_in, op_node)
# remove the unused var node in the graph # remove the unused var node in the graph
self._remove_unused_var_nodes(graph) self._remove_unused_var_nodes(graph)
graph.resolve_hazard()
return graph return graph
def _remove_fake_quant_and_dequant_op(self, graph, op_node): def _remove_fake_quant_and_dequant_op(self, graph, op_node):
k = op_node.output('Out')[0] k = self._to_node(op_node.outputs, op_node.output('Out')[0])
v = op_node.input('X')[0] v = self._to_node(op_node.inputs, op_node.input('X')[0])
if v not in self._op_input_rename_map: if v.node not in self._op_input_rename_map:
self._op_input_rename_map[k] = v self._op_input_rename_map[k.node] = v
else: else:
self._op_input_rename_map[k] = self._op_input_rename_map[v] self._op_input_rename_map[k.node] = self._op_input_rename_map[
v.node]
graph.safe_remove_nodes(op_node) graph.safe_remove_nodes(op_node)
def _insert_post_channel_dequant_op(self, graph, op_node): def _insert_post_channel_dequant_op(self, graph, op_node):
persistable_vars = [p.name() for p in graph.all_persistable_nodes()] persistable_vars = [p.name() for p in graph.all_persistable_nodes()]
for var_node in op_node.inputs: for var_node in op_node.inputs:
name = var_node.name() name = var_node.name()
if name in self._op_input_rename_map: if name not in op_node.input_arg_names():
old_in = graph.var_node(name) continue
new_in = graph.var_node(self._op_input_rename_map[name]) if var_node.node in self._op_input_rename_map:
old_in = var_node
new_in = self._op_input_rename_map[var_node.node]
new_in.clear_outputs() new_in.clear_outputs()
graph.update_input_link(old_in, new_in, op_node) graph.update_input_link(old_in, new_in, op_node)
original_var_name = self._original_var_name(name) original_var_name = self._original_var_name(name)
...@@ -653,28 +659,20 @@ class QuantizationFreezePass(object): ...@@ -653,28 +659,20 @@ class QuantizationFreezePass(object):
assert isinstance(scale_v, IrNode) assert isinstance(scale_v, IrNode)
scale_var_node = self._var_scale_map[original_var_name] scale_var_node = self._var_scale_map[original_var_name]
if len(op_node.outputs) != 1: if len(op_node.output_arg_names()) != 1:
raise ValueError("Only support one output, but op %s has" raise ValueError("Only support one output, but op %s has"
" more than one output." % (op_node.name())) " more than one output." % (op_node.name()))
output_var_node = op_node.outputs[0] output_var_node = self._to_node(op_node.outputs,
op_node.output_arg_names()[0])
weight_scale_node = graph.create_persistable_node( weight_scale_node = graph.create_persistable_node(
name=unique_name.generate('channel_scale'), name=unique_name.generate('channel_scale'),
var_type=core.VarDesc.VarType.LOD_TENSOR, var_type=core.VarDesc.VarType.LOD_TENSOR,
shape=[channel_scale.shape[0]], shape=[channel_scale.shape[0]],
var_dtype=output_var_node.dtype()) var_dtype=output_var_node.dtype())
init_program = Program() data_type = 'float64' if output_var_node.dtype(
weight_scale_var = init_program.global_block().create_var( ) == core.VarDesc.VarType.FP64 else 'float32'
name=weight_scale_node.name(), self._init_var_node(weight_scale_node, channel_scale.astype(data_type))
shape=weight_scale_node.shape(),
dtype=weight_scale_node.dtype(),
type=weight_scale_node.type(),
lod_level=weight_scale_node.var().lod_level(),
persistable=weight_scale_node.persistable())
initializer = NumpyArrayInitializer(value=channel_scale)
initializer(weight_scale_var, init_program.global_block())
exe = Executor(self._place)
exe.run(program=init_program, scope=self._scope)
dequant_var_node = graph.create_var_node( dequant_var_node = graph.create_var_node(
name=self._dequantized_var_name(output_var_node.name()), name=self._dequantized_var_name(output_var_node.name()),
var_type=output_var_node.type(), var_type=output_var_node.type(),
...@@ -695,16 +693,18 @@ class QuantizationFreezePass(object): ...@@ -695,16 +693,18 @@ class QuantizationFreezePass(object):
graph.link_to(scale_var_node, dequant_op_node) graph.link_to(scale_var_node, dequant_op_node)
graph.link_to(weight_scale_node, dequant_op_node) graph.link_to(weight_scale_node, dequant_op_node)
graph.link_to(dequant_op_node, dequant_var_node) graph.link_to(dequant_op_node, dequant_var_node)
self._op_output_rename_map[output_var_node.name()] = dequant_var_node self._op_output_rename_map[output_var_node.node] = dequant_var_node
return dequant_var_node return dequant_var_node
def _insert_post_dequant_op(self, graph, op_node): def _insert_post_dequant_op(self, graph, op_node):
persistable_vars = [p.name() for p in graph.all_persistable_nodes()] persistable_vars = [p.name() for p in graph.all_persistable_nodes()]
for var_node in op_node.inputs: for var_node in op_node.inputs:
name = var_node.name() name = var_node.name()
if name in self._op_input_rename_map: if name not in op_node.input_arg_names():
old_in = graph.var_node(name) continue
new_in = graph.var_node(self._op_input_rename_map[name]) if var_node.node in self._op_input_rename_map:
old_in = var_node
new_in = self._op_input_rename_map[var_node.node]
new_in.clear_outputs() new_in.clear_outputs()
graph.update_input_link(old_in, new_in, op_node) graph.update_input_link(old_in, new_in, op_node)
original_var_name = self._original_var_name(name) original_var_name = self._original_var_name(name)
...@@ -720,11 +720,12 @@ class QuantizationFreezePass(object): ...@@ -720,11 +720,12 @@ class QuantizationFreezePass(object):
assert isinstance(scale_v, IrNode) assert isinstance(scale_v, IrNode)
scale_var_node = self._var_scale_map[original_var_name] scale_var_node = self._var_scale_map[original_var_name]
if len(op_node.outputs) != 1: if len(op_node.output_arg_names()) != 1:
raise ValueError("Only support one output, but op %s has" raise ValueError("Only support one output, but op %s has"
" more than one output." % (op_node.name())) " more than one output." % (op_node.name()))
output_var_node = op_node.outputs[0] output_var_node = self._to_node(op_node.outputs,
op_node.output_arg_names()[0])
dequant_var_node = graph.create_var_node( dequant_var_node = graph.create_var_node(
name=self._dequantized_var_name(output_var_node.name()), name=self._dequantized_var_name(output_var_node.name()),
var_type=output_var_node.type(), var_type=output_var_node.type(),
...@@ -742,9 +743,27 @@ class QuantizationFreezePass(object): ...@@ -742,9 +743,27 @@ class QuantizationFreezePass(object):
graph.link_to(output_var_node, dequant_op_node) graph.link_to(output_var_node, dequant_op_node)
graph.link_to(scale_var_node, dequant_op_node) graph.link_to(scale_var_node, dequant_op_node)
graph.link_to(dequant_op_node, dequant_var_node) graph.link_to(dequant_op_node, dequant_var_node)
self._op_output_rename_map[output_var_node.name()] = dequant_var_node self._op_output_rename_map[output_var_node.node] = dequant_var_node
return dequant_var_node return dequant_var_node
def _init_var_node(self, var_node, value):
assert isinstance(
value, np.ndarray), 'The type of value should be numpy array.'
assert self._scope is not None, \
'The scope cannot be set None when activation_quantize_type equals to range_abs_max.'
assert self._place is not None, \
'The place cannot be set None when activation_quantize_type equals to range_abs_max.'
tensor = self._scope.var(var_node.name()).get_tensor()
tensor.set(value, self._place)
def _to_node(self, nodes, node_name):
target_node = None
for n in nodes:
if n.name() == node_name:
target_node = n
assert target_node is not None, "Cannot find the target node in the given set."
return target_node
def _load_var(self, name): def _load_var(self, name):
return np.array(self._scope.find_var(name).get_tensor()) return np.array(self._scope.find_var(name).get_tensor())
...@@ -848,6 +867,7 @@ class ConvertToInt8Pass(object): ...@@ -848,6 +867,7 @@ class ConvertToInt8Pass(object):
# remove the unused var node in the graph # remove the unused var node in the graph
self._remove_unused_var_nodes(graph) self._remove_unused_var_nodes(graph)
graph.resolve_hazard()
return graph return graph
def _convert_to_int8(self, graph, var_node): def _convert_to_int8(self, graph, var_node):
...@@ -930,5 +950,5 @@ class TransformForMobilePass(object): ...@@ -930,5 +950,5 @@ class TransformForMobilePass(object):
for output_node in op_node.outputs: for output_node in op_node.outputs:
graph.link_to(dequant_node, output_node) graph.link_to(dequant_node, output_node)
graph.safe_remove_nodes(op_node) graph.safe_remove_nodes(op_node)
graph.resolve_hazard()
return graph return graph
...@@ -86,7 +86,11 @@ class TestGraphWrapper(unittest.TestCase): ...@@ -86,7 +86,11 @@ class TestGraphWrapper(unittest.TestCase):
def test_all_vars(self): def test_all_vars(self):
self.build_program() self.build_program()
self.assertEquals(len(self.train_graph.vars()), 90) # self.assertEquals(len(self.train_graph.vars()), 90)
# activation inplace has been disabled in python side
# which may produce more variables in the program_desc
# update 90 => 94
self.assertEquals(len(self.train_graph.vars()), 94)
def test_numel_params(self): def test_numel_params(self):
self.build_program() self.build_program()
......
...@@ -627,6 +627,183 @@ class Variable(object): ...@@ -627,6 +627,183 @@ class Variable(object):
""" """
self.error_clip = error_clip self.error_clip = error_clip
def _slice_indices(self, slice, length):
"""
Reference implementation for the slice.indices method.
"""
# Compute step and length as integers.
step = 1 if slice.step is None else slice.step
# Raise ValueError for negative length or zero step.
if length < 0:
raise ValueError("length should not be negative")
if step == 0:
raise ValueError("slice step cannot be zero")
# Find lower and upper bounds for start and stop.
lower = -1 if step < 0 else 0
upper = length - 1 if step < 0 else length
# Compute start.
if slice.start is None:
start = upper if step < 0 else lower
else:
start = slice.start
start = max(start + length, lower) if start < 0 else min(start,
upper)
# Compute stop.
if slice.stop is None:
stop = lower if step < 0 else upper
else:
stop = slice.stop
stop = max(stop + length, lower) if stop < 0 else min(stop, upper)
return start, stop, step
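Since `_slice_indices` is documented as a reference implementation of `slice.indices`, a quick way to see what it returns is to compare against the built-in for a dimension of length 5 (a standalone sketch, independent of Paddle):

# Each triple is the normalized (start, stop, step), matching slice.indices(5).
length = 5
print(slice(None, None, None).indices(length))  # (0, 5, 1)   whole dimension
print(slice(-3, None, None).indices(length))    # (2, 5, 1)   last three elements
print(slice(None, None, -1).indices(length))    # (4, -1, -1) reversed traversal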
def _detectEllipsis(self, item):
has_ellipsis = False
start = 0
end = len(self.shape)
for index, o in enumerate(item):
if o is Ellipsis:
if has_ellipsis:
raise ValueError("Index can have one ellipsis only.")
has_ellipsis = True
start = index
else:
if has_ellipsis:
end = index
return has_ellipsis, start, end
def _reconstructSliceinfo(self, item):
has_ellipsis, start, end = self._detectEllipsis(item)
if has_ellipsis:
newitem = []
for i in range(start):
newitem.append(item[i])
for i in range(start, end):
newitem.append(slice(None, None, None))
for i in range(end, len(item)):
newitem.append(item[i])
return newitem
else:
return None
def _detectContinuesSlice(self, item):
starts = []
ends = []
for index, o in enumerate(item):
if isinstance(o, int):
start = int(o)
if (start > 0 and start >= self.shape[index]) \
    or (start < 0 and (start + self.shape[index]) < 0):
raise IndexError("invalid index")
start = max(start + self.shape[index], 0) if start < 0 else min(
start, self.shape[index])
starts.append(start)
ends.append(start + 1)
elif isinstance(o, slice):
start, stop, step = self._slice_indices(o, self.shape[index])
if step == 1 or step == -1:
starts.append(start)
ends.append(stop)
else:
return False, None
else:
raise IndexError("Valid index accept int or slice or ellipsis")
return True, [starts, ends]
def _cloneVar(self, copy=False):
if not copy:
return self.block.create_var(
name=unique_name.generate(".".join(self.name)),
dtype=self.dtype,
persistable=self.persistable,
stop_gradient=self._stop_gradient, )
else:
return self
def _sliceVar(self, axes, starts, ends):
new_var = self._cloneVar()
self.block.append_op(
type="slice",
inputs={'Input': [self]},
outputs={'Out': [new_var]},
attrs={'axes': axes,
'starts': starts,
'ends': ends})
return new_var
def _concatVar(self, inputs, axis):
new_var = self._cloneVar()
self.block.append_op(
type="concat",
inputs={'X': inputs},
outputs={'Out': [new_var]},
attrs={'axis': axis, })
return new_var
def _sliceAndConcatVar(self, item, axis):
if isinstance(item, slice):
if self.shape[axis] < 0:
return self._cloneVar(True)
start, stop, step = self._slice_indices(item, self.shape[axis])
if step == 1:
return self._sliceVar([axis], [start], [stop])
else:
vars = []
if step > 0:
while start < stop:
vars.append(
self._sliceVar([axis], [start], [start + 1]))
start += step
else:
while start > stop:
vars.append(
self._sliceVar([axis], [start], [start + 1]))
start += step
return self._concatVar(vars, axis)
elif isinstance(item, int):
if self.shape[axis] < 0:
return self._cloneVar(True)
index = int(item)
if (index > 0 and index >= self.shape[axis])\
or (index < 0 and (index + self.shape[axis]) < 0):
raise IndexError("invalid index")
return self._sliceVar([axis], [index], [index + 1])
else:
raise IndexError("Valid index accept int or slice or tuple")
def __getitem__(self, item):
"""
Slice the variable.
Args:
item(int/slice/tuple) : the index.
Returns:
Sliced variable
"""
new_var = None
if isinstance(item, tuple):
if len(item) > len(self.shape):
raise IndexError("Too many indexes")
newitem = self._reconstructSliceinfo(item) or item
check, info = self._detectContinuesSlice(newitem)
if check:
starts = info[0]
ends = info[1]
axes = [i for i in range(len(starts))]
return self._sliceVar(axes, starts, ends)
else:
new_var = self
for index, o in enumerate(newitem):
new_var = new_var._sliceAndConcatVar(o, index)
else:
new_var = self._sliceAndConcatVar(item, 0)
return new_var
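A rough usage sketch of the new slicing support (assuming the usual `fluid.layers.data` entry point; the variable name and shape here are illustrative only):

import paddle.fluid as fluid

x = fluid.layers.data(
    name='x', shape=[3, 32, 32], dtype='float32', append_batch_size=False)
first_channel = x[0]       # an int index becomes a single slice op on axis 0
crop = x[:, 8:16, 8:16]    # contiguous int/slice indices collapse into one slice op
strided = x[:, ::2]        # a non-unit step falls back to per-index slice ops plus concat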
def get_all_op_protos(): def get_all_op_protos():
""" """
...@@ -744,7 +921,7 @@ class Operator(object): ...@@ -744,7 +921,7 @@ class Operator(object):
if _in_imperative_mode(): if _in_imperative_mode():
if type is None: if type is None:
raise ValueError( raise ValueError(
"`type` to initilized an Operator can not be None.") "`type` to initialized an Operator can not be None.")
self.iop = core.OpBase(type) self.iop = core.OpBase(type)
# TODO(minqiyang): remove these lines after we take apart all # TODO(minqiyang): remove these lines after we take apart all
...@@ -906,7 +1083,10 @@ class Operator(object): ...@@ -906,7 +1083,10 @@ class Operator(object):
@property @property
def type(self): def type(self):
return self.desc.type() if _in_imperative_mode():
return self.iop.type
else:
return self.desc.type()
def input(self, name): def input(self, name):
""" """
...@@ -2052,6 +2232,28 @@ class IrOpNode(IrNode): ...@@ -2052,6 +2232,28 @@ class IrOpNode(IrNode):
else: else:
desc._set_attr(name, val) desc._set_attr(name, val)
def input_arg_names(self):
"""
Return input arguments' names of this op node.
Returns:
list(str): input arguments' names of this op node.
"""
assert self.node.op() is not None, \
"The node operator description cannot be None."
return self.node.op().input_arg_names()
def output_arg_names(self):
"""
Return output arguments' names of this op node.
Returns:
list(str): output arguments' names of this op node.
"""
assert self.node.op() is not None, \
"The node operator description cannot be None."
return self.node.op().output_arg_names()
@property @property
def inputs(self): def inputs(self):
""" """
...@@ -2142,31 +2344,38 @@ class IrGraph(object): ...@@ -2142,31 +2344,38 @@ class IrGraph(object):
""" """
return {IrOpNode(node) for node in self.graph.nodes() if node.is_op()} return {IrOpNode(node) for node in self.graph.nodes() if node.is_op()}
def var_node(self, name): def _find_var_node(self, key):
""" """
Get a variable node by name from the graph. Get a variable node by the `key` from this graph. The key
can be a node name or a node id.
Warns:
Several nodes may share the same name, so be cautious
when using this method to find the target var node by its name.
Args: Args:
name(str): the name of the variable node. key(str|int): The str type denotes that the target variable node's name.
And the int type denotes that the target variable node's id.
Raises: Raises:
ValueError: The If input's type is not str, or this graph ValueError: If this graph doesn't have a variable with the giving name or id.
doesn't have a variable with the giving name.
Returns: Returns:
IrVarNode: the variable node with the giving name. IrVarNode: the variable node with the giving name or id.
""" """
if not isinstance(name, six.string_types):
raise TypeError(
"var require string as parameter, but get %s instead." %
(type(name)))
target_var_node = None target_var_node = None
var_nodes = self.all_var_nodes() var_nodes = self.all_var_nodes()
for var_node in var_nodes: if isinstance(key, six.string_types):
if var_node.name() == name: for var_node in var_nodes:
target_var_node = var_node if var_node.name() == key:
target_var_node = var_node
elif isinstance(key, int):
for var_node in var_nodes:
if var_node.id() == key:
target_var_node = var_node
if target_var_node is None: if target_var_node is None:
raise ValueError("var_node %s not in this graph" % name) raise ValueError("var_node %s not in this graph" % key)
return target_var_node return target_var_node
def create_persistable_node(self, name, var_type, shape, var_dtype): def create_persistable_node(self, name, var_type, shape, var_dtype):
...@@ -2312,6 +2521,34 @@ class IrGraph(object): ...@@ -2312,6 +2521,34 @@ class IrGraph(object):
original_nodes = {n.node for n in remove_nodes} original_nodes = {n.node for n in remove_nodes}
core.graph_safe_remove_nodes(self.graph, original_nodes) core.graph_safe_remove_nodes(self.graph, original_nodes)
def resolve_hazard(self):
def _to_node(nodes, node_name):
target_node = None
for n in nodes:
if n.name() == node_name:
target_node = n
assert target_node is not None, "Cannot find the target node in the given set."
return target_node
ordered_nodes = core.topology_sort(self.graph)
var_nodes = dict()
for node in ordered_nodes:
if node.is_op() and node.op() is not None:
for each_var_name in node.op().input_arg_names():
if each_var_name not in var_nodes:
var_nodes[each_var_name] = [
_to_node(node.inputs, each_var_name)
]
for each_var_name in node.op().output_arg_names():
if each_var_name not in var_nodes:
var_nodes[each_var_name] = [
_to_node(node.outputs, each_var_name)
]
else:
var_nodes[each_var_name].append(
_to_node(node.outputs, each_var_name))
self.graph.resolve_hazard(var_nodes)
def has_circle(self): def has_circle(self):
""" """
Check if the graph has a circle. Check if the graph has a circle.
......
...@@ -44,7 +44,7 @@ def guard(place=None): ...@@ -44,7 +44,7 @@ def guard(place=None):
yield yield
def to_variable(value, block=None): def to_variable(value, block=None, name=None):
if isinstance(value, np.ndarray): if isinstance(value, np.ndarray):
assert enabled(), "to_variable could only be called in imperative mode" assert enabled(), "to_variable could only be called in imperative mode"
...@@ -53,9 +53,10 @@ def to_variable(value, block=None): ...@@ -53,9 +53,10 @@ def to_variable(value, block=None):
py_var = framework.Variable( py_var = framework.Variable(
block, block,
type=core.VarDesc.VarType.LOD_TENSOR, type=core.VarDesc.VarType.LOD_TENSOR,
name=None, name=name,
shape=value.shape, shape=value.shape,
dtype=value.dtype) dtype=value.dtype,
stop_gradient=True)
var = py_var._ivar.value() var = py_var._ivar.value()
tensor = var.get_tensor() tensor = var.get_tensor()
tensor.set(value, framework._current_expected_place()) tensor.set(value, framework._current_expected_place())
......
...@@ -105,6 +105,7 @@ class LayerObjectHelper(LayerHelperBase): ...@@ -105,6 +105,7 @@ class LayerObjectHelper(LayerHelperBase):
Returns dtype of the input Returns dtype of the input
""" """
inputs_in = inputs_in if (inputs_in is not None) else []
inputs = self._multiple_input(inputs_in) inputs = self._multiple_input(inputs_in)
dtype = None dtype = None
for each in inputs: for each in inputs:
...@@ -191,13 +192,7 @@ class LayerObjectHelper(LayerHelperBase): ...@@ -191,13 +192,7 @@ class LayerObjectHelper(LayerHelperBase):
act['use_mkldnn'] = use_mkl_dnn act['use_mkldnn'] = use_mkl_dnn
act_type = act.pop('type') act_type = act.pop('type')
tmp = input_var tmp = self.create_variable_for_type_inference(dtype=input_var.dtype)
# NOTE(dzhwinter): some activation support inplace compution.
# NOTE(minqiyang): currently, we don't support inplace in imperative mode
if not _in_imperative_mode() and core.IsInplace(act_type):
tmp = input_var
else:
tmp = self.create_variable_for_type_inference(dtype=input_var.dtype)
self.append_op( self.append_op(
type=act_type, type=act_type,
inputs={"X": [input_var]}, inputs={"X": [input_var]},
......
...@@ -17,10 +17,12 @@ import contextlib ...@@ -17,10 +17,12 @@ import contextlib
import sys import sys
import numpy as np import numpy as np
import collections import collections
import six
from .. import unique_name from .. import unique_name
from paddle.fluid import core from paddle.fluid import core
from .layer_object_helper import LayerObjectHelper from .layer_object_helper import LayerObjectHelper
from paddle.fluid import framework from paddle.fluid import framework
from ..param_attr import ParamAttr
__all__ = ['Layer', 'PyLayer'] __all__ = ['Layer', 'PyLayer']
...@@ -72,6 +74,10 @@ class Layer(core.Layer): ...@@ -72,6 +74,10 @@ class Layer(core.Layer):
Returns created parameter Variable. Returns created parameter Variable.
""" """
if isinstance(attr, ParamAttr) and (attr.name is not None):
attr.name = ".".join([self._full_name, attr.name])
elif isinstance(attr, six.string_types):
attr = ".".join([self._full_name, attr])
return self._helper.create_parameter(attr, shape, dtype, is_bias, return self._helper.create_parameter(attr, shape, dtype, is_bias,
default_initializer) default_initializer)
...@@ -164,6 +170,7 @@ class Layer(core.Layer): ...@@ -164,6 +170,7 @@ class Layer(core.Layer):
the sublayer passed in. the sublayer passed in.
""" """
assert isinstance(sublayer, core.Layer) assert isinstance(sublayer, core.Layer)
self._sub_layers[name] = sublayer self._sub_layers[name] = sublayer
return sublayer return sublayer
......
...@@ -15,15 +15,20 @@ ...@@ -15,15 +15,20 @@
from __future__ import print_function from __future__ import print_function
from six.moves import reduce from six.moves import reduce
import numpy as np
from .. import core from .. import core
from ..layers import utils from ..layers import utils
from . import layers from . import layers
from ..framework import Variable, OpProtoHolder from ..framework import Variable, OpProtoHolder
from ..layers import layer_function_generator
from ..param_attr import ParamAttr from ..param_attr import ParamAttr
from ..initializer import Normal, Constant from ..initializer import Normal, Constant, NumpyArrayInitializer
__all__ = ['Conv2D', 'Pool2D', 'FC', 'BatchNorm', 'Embedding', 'GRUUnit'] __all__ = [
'Conv2D', 'Pool2D', 'FC', 'BatchNorm', 'Embedding', 'GRUUnit', 'LayerNorm',
'NCE', 'PRelu', 'BilinearTensorProduct', 'Conv2DTranspose', 'SequenceConv'
]
class Conv2D(layers.Layer): class Conv2D(layers.Layer):
...@@ -438,7 +443,6 @@ class Embedding(layers.Layer): ...@@ -438,7 +443,6 @@ class Embedding(layers.Layer):
self._size = size self._size = size
self._is_sparse = is_sparse self._is_sparse = is_sparse
self._is_distributed = is_distributed self._is_distributed = is_distributed
self._padding_idx = -1 if padding_idx is None else padding_idx if padding_idx >= 0 else ( self._padding_idx = -1 if padding_idx is None else padding_idx if padding_idx >= 0 else (
size[0] + padding_idx) size[0] + padding_idx)
...@@ -471,6 +475,131 @@ class Embedding(layers.Layer): ...@@ -471,6 +475,131 @@ class Embedding(layers.Layer):
return out return out
class LayerNorm(layers.Layer):
def __init__(self,
name_scope,
scale=True,
shift=True,
begin_norm_axis=1,
epsilon=1e-05,
param_attr=None,
bias_attr=None,
act=None):
"""
${comment}
The formula is as follows:
.. math::
\\mu & = \\frac{1}{H}\\sum_{i=1}^{H} a_i
\\sigma & = \\sqrt{\\frac{1}{H}\sum_{i=1}^{H}(a_i - \\mu)^2}
h & = f(\\frac{g}{\\sigma}(a - \\mu) + b)
* :math:`a`: the vector representation of the summed inputs to the neurons
in that layer.
* :math:`H`: the number of hidden units in a layers
* :math:`g`: the trainable scale parameter.
* :math:`b`: the trainable bias parameter.
Args:
input(Variable): The input tensor variable.
scale(bool): Whether to learn the adaptive gain :math:`g` after
normalization. Default True.
shift(bool): Whether to learn the adaptive bias :math:`b` after
normalization. Default True.
begin_norm_axis(int): The normalization will be performed along
dimensions from :attr:`begin_norm_axis` to :attr:`rank(input)`.
Default 1.
epsilon(float): The small value added to the variance to prevent
division by zero. Default 1e-05.
param_attr(ParamAttr|None): The parameter attribute for the learnable
gain :math:`g`. If :attr:`scale` is False, :attr:`param_attr` is
omitted. If :attr:`scale` is True and :attr:`param_attr` is None,
a default :code:`ParamAttr` would be added as scale. The
:attr:`param_attr` is initialized as 1 if it is added. Default None.
bias_attr(ParamAttr|None): The parameter attribute for the learnable
bias :math:`b`. If :attr:`shift` is False, :attr:`bias_attr` is
omitted. If :attr:`shift` is True and :attr:`param_attr` is None,
a default :code:`ParamAttr` would be added as bias. The
:attr:`bias_attr` is initialized as 0 if it is added. Default None.
act(str): Activation to be applied to the output of layer normalization.
Default None.
Returns:
${y_comment}
Examples:
>>> data = fluid.layers.data(name='data', shape=[3, 32, 32],
>>> dtype='float32')
>>> x = fluid.layers.layer_norm(input=data, begin_norm_axis=1)
"""
super(LayerNorm, self).__init__(name_scope)
self._scale = scale
self._shift = shift
self._begin_norm_axis = begin_norm_axis
self._epsilon = epsilon
self._param_attr = param_attr
self._bias_attr = bias_attr
self._act = act
def _build_once(self, input):
self._dtype = self._helper.input_dtype(input)
input_shape = input.shape
param_shape = [
reduce(lambda x, y: x * y, input_shape[self._begin_norm_axis:])
]
if self._scale:
self._scale_w = self.create_parameter(
attr=self._param_attr,
shape=param_shape,
dtype=self._dtype,
default_initializer=Constant(1.0))
if self._shift:
assert self._bias_attr is not False
self._bias_w = self.create_parameter(
attr=self._bias_attr,
shape=param_shape,
dtype=self._dtype,
is_bias=True)
def forward(self, input):
inputs = dict()
inputs['X'] = input
if self._scale:
inputs['Scale'] = self._scale_w
if self._shift:
inputs['Bias'] = self._bias_w
# create output
mean_out = self._helper.create_variable_for_type_inference(
dtype=self._dtype, stop_gradient=True)
variance_out = self._helper.create_variable_for_type_inference(
dtype=self._dtype, stop_gradient=True)
layer_norm_out = self._helper.create_variable_for_type_inference(
self._dtype)
self._helper.append_op(
type="layer_norm",
inputs=inputs,
outputs={
"Y": layer_norm_out,
"Mean": mean_out,
"Variance": variance_out,
},
attrs={
"epsilon": self._epsilon,
"begin_norm_axis": self._begin_norm_axis
})
return self._helper.append_activation(layer_norm_out)
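The docstring example above still shows the functional `fluid.layers.layer_norm`; as a rough sketch, the new imperative class would presumably be used like this (assuming `guard`, `to_variable` and the class are re-exported under `fluid.imperative` as the updated `__all__` suggests, and with a made-up input shape):

import numpy as np
import paddle.fluid as fluid

with fluid.imperative.guard():
    x = fluid.imperative.to_variable(
        np.random.random((3, 32, 32)).astype('float32'))
    layer_norm = fluid.imperative.nn.LayerNorm('layer_norm', begin_norm_axis=1)
    out = layer_norm(x)  # normalized over the trailing 32 * 32 elements of each sample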
class GRUUnit(layers.Layer): class GRUUnit(layers.Layer):
""" """
**GRU unit layer** **GRU unit layer**
...@@ -603,3 +732,668 @@ class GRUUnit(layers.Layer): ...@@ -603,3 +732,668 @@ class GRUUnit(layers.Layer):
}) })
return updated_hidden, reset_hidden_pre, gate return updated_hidden, reset_hidden_pre, gate
class NCE(layers.Layer):
"""
${comment}
Args:
input (Variable): input variable.
label (Variable): label.
num_total_classes (int):${num_total_classes_comment}
sample_weight (Variable|None): A Variable of shape [batch_size, 1]
storing a weight for each sample. The default weight for each
sample is 1.0.
param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
of nce. If it is set to None or one attribute of ParamAttr, nce
will create ParamAttr as param_attr. If the Initializer of the param_attr
is not set, the parameter is initialized with Xavier. Default: None.
bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of nce.
If it is set to False, no bias will be added to the output units.
If it is set to None or one attribute of ParamAttr, nce
will create ParamAttr as bias_attr. If the Initializer of the bias_attr
is not set, the bias is initialized zero. Default: None.
num_neg_samples (int): ${num_neg_samples_comment}
name (str|None): A name for this layer(optional). If set None, the layer
will be named automatically. Default: None.
sampler (str): The sampler used to sample classes from negative classes.
It can be 'uniform', 'log_uniform' or 'custom_dist'.
default: 'uniform'.
custom_dist (float[]): A float[] with size=num_total_classes.
It is used when sampler is set to 'custom_dist'.
custom_dist[i] is the probability of the i-th class being sampled.
default: None.
seed (int): The seed used in sampler. default: 0.
is_sparse(bool): The flag indicating whether to use sparse update, the weight@GRAD and bias@GRAD will be changed to SelectedRows.
Returns:
Variable: The output nce loss.
Examples:
.. code-block:: python
window_size = 5
words = []
for i in xrange(window_size):
words.append(layers.data(
name='word_{0}'.format(i), shape=[1], dtype='int64'))
dict_size = 10000
label_word = int(window_size / 2) + 1
embs = []
for i in xrange(window_size):
if i == label_word:
continue
emb = layers.embedding(input=words[i], size=[dict_size, 32],
param_attr='emb.w', is_sparse=True)
embs.append(emb)
embs = layers.concat(input=embs, axis=1)
loss = layers.nce(input=embs, label=words[label_word],
num_total_classes=dict_size, param_attr='nce.w',
bias_attr='nce.b')
#or use custom distribution
dist = fluid.layers.assign(input=np.array([0.05,0.5,0.1,0.3,0.05]).astype("float32"))
loss = layers.nce(input=embs, label=words[label_word],
num_total_classes=5, param_attr='nce.w',
bias_attr='nce.b',
num_neg_samples=3,
sampler="custom_dist",
custom_dist=dist)
"""
def __init__(self,
name_scope,
num_total_classes,
param_attr=None,
bias_attr=None,
num_neg_samples=None,
sampler="uniform",
custom_dist=None,
seed=0,
is_sparse=False):
super(NCE, self).__init__(name_scope)
self._param_attr = param_attr
self._bias_attr = bias_attr
self._num_total_classes = num_total_classes
self._inputs = dict()
if sampler == "uniform":
sampler = 0
elif sampler == "log_uniform":
sampler = 1
elif sampler == "custom_dist":
assert custom_dist is not None
# assert isinstance(custom_dist, Variable)
custom_dist_len = len(custom_dist)
alias_probs_ = [0] * custom_dist_len
alias_ = [0] * custom_dist_len
bigs = []
littles = []
for i in range(custom_dist_len):
normal_prob = custom_dist[i] * custom_dist_len
if normal_prob - 1.0 > 0:
bigs.append((i, normal_prob))
elif 1.0 - normal_prob > 0:
littles.append((i, normal_prob))
else:
alias_probs_[i] = normal_prob
alias_[i] = -1
while len(bigs) and len(littles):
big = bigs.pop(0)
little = littles.pop(0)
big_idx = big[0]
big_prob = big[1]
alias_probs_[little[0]] = little[1]
alias_[little[0]] = big_idx
big_left = big[1] + little[1] - 1
if big_left - 1.0 > 0:
bigs.append((big_idx, big_left))
elif 1.0 - big_left > 0:
littles.append((big_idx, big_left))
else:
alias_probs_[big_idx] = big_left
alias_[big_idx] = -1
if len(bigs):
big = bigs.pop(0)
alias_probs_[big[0]] = 1.0
alias_[big[0]] = -1
if len(littles):
little = littles.pop(0)
alias_probs_[little[0]] = 1.0
alias_[little[0]] = -1
def _init_by_numpy_array(numpy_array):
ret = self.create_parameter(
attr=ParamAttr(),
shape=numpy_array.shape,
dtype=numpy_array.dtype,
default_initializer=NumpyArrayInitializer(numpy_array))
ret.stop_gradient = True
return ret
self._inputs['CustomDistProbs'] = _init_by_numpy_array(
np.array(custom_dist).astype('float32'))
self._inputs['CustomDistAlias'] = _init_by_numpy_array(
np.array(alias_).astype('int32'))
self._inputs['CustomDistAliasProbs'] = _init_by_numpy_array(
np.array(alias_probs_).astype('float32'))
sampler = 2
else:
raise Exception("Unsupported sampler type.")
if num_neg_samples is None:
num_neg_samples = 10
else:
num_neg_samples = int(num_neg_samples)
self._num_neg_samples = num_neg_samples
remote_prefetch = is_sparse
print(
"With sparse mode, if your models has only small parameter prefetch may cause speed down"
)
self._attrs = {
'num_total_classes': int(num_total_classes),
'num_neg_samples': num_neg_samples,
'seed': seed,
'sampler': sampler,
'is_sparse': is_sparse,
'remote_prefetch': remote_prefetch
}
def _build_once(self, input, label, sample_weight=None):
assert isinstance(input, Variable)
assert isinstance(label, Variable)
dim = input.shape[1]
num_true_class = label.shape[1]
self._w = self.create_parameter(
attr=self._param_attr,
shape=[self._num_total_classes, dim],
is_bias=False,
dtype=input.dtype)
if self._bias_attr:
self._b = self.create_parameter(
attr=self._bias_attr,
shape=[self._num_total_classes, 1],
is_bias=True,
dtype=input.dtype)
self._inputs['Bias'] = self._b
self._inputs['Weight'] = self._w
def forward(self, input, label, sample_weight=None):
assert isinstance(input, Variable)
assert isinstance(label, Variable)
self._inputs['Input'] = input
self._inputs['Label'] = label
self._inputs['SampleWeight'] = sample_weight if sample_weight is not None else []
cost = self._helper.create_variable_for_type_inference(
dtype=input.dtype)
sample_logits = self._helper.create_variable_for_type_inference(
dtype=input.dtype)
sample_labels = self._helper.create_variable_for_type_inference(
dtype=label.dtype)
self._helper.append_op(
type='nce',
inputs=self._inputs,
outputs={
'Cost': cost,
'SampleLogits': sample_logits,
'SampleLabels': sample_labels
},
attrs=self._attrs)
return cost / (self._num_neg_samples + 1)
class PRelu(layers.Layer):
"""
Equation:
.. math::
y = \max(0, x) + \\alpha * \min(0, x)
Args:
x (Variable): The input tensor.
param_attr(ParamAttr|None): The parameter attribute for the learnable
weight (alpha).
mode (string): The mode for weight sharing. It supports all, channel
and element. all: all elements share same weight
channel: elements in a channel share same weight
element: each element has a weight
name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically.
Returns:
Variable: The output tensor with the same shape as input.
Examples:
.. code-block:: python
x = fluid.layers.data(name="x", shape=[10,10], dtype="float32")
mode = 'channel'
output = fluid.layers.prelu(x,mode)
"""
def __init__(self, name_scope, mode, param_attr=None):
super(PRelu, self).__init__(name_scope)
self._mode = mode
self._param_attr = param_attr
if self._mode not in ['all', 'channel', 'element']:
raise ValueError('mode should be one of all, channel, element.')
self._alpha_shape = [1]
def _build_once(self, input):
if self._mode == 'channel':
self._alpha_shape = [1, input.shape[1], 1, 1]
elif self._mode == 'element':
self._alpha_shape = input.shape
self._dtype = self._helper.input_dtype(input)
self._alpha = self.create_parameter(
attr=self._param_attr,
shape=self._alpha_shape,
dtype='float32',
is_bias=False,
default_initializer=Constant(1.0))
def forward(self, input):
out = self._helper.create_variable_for_type_inference(self._dtype)
self._helper.append_op(
type="prelu",
inputs={"X": input,
'Alpha': self._alpha},
attrs={"mode": self._mode},
outputs={"Out": out})
return out
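Likewise, the docstring example above uses the functional `fluid.layers.prelu`; a hedged sketch of the imperative class (the class path and tensor shape are illustrative assumptions):

import numpy as np
import paddle.fluid as fluid

with fluid.imperative.guard():
    x = fluid.imperative.to_variable(
        np.random.random((2, 4, 8, 8)).astype('float32'))
    prelu = fluid.imperative.nn.PRelu('prelu', mode='channel')
    y = prelu(x)  # one learnable alpha per channel, i.e. alpha shape [1, 4, 1, 1]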
class BilinearTensorProduct(layers.Layer):
"""
**Add Bilinear Tensor Product Layer**
This layer performs bilinear tensor product on two inputs.
For example:
.. math::
out_{i} = x * W_{i} * {y^\mathrm{T}}, i=0,1,...,size-1
In this formula:
- :math:`x`: the first input contains M elements, shape is [batch_size, M].
- :math:`y`: the second input contains N elements, shape is [batch_size, N].
- :math:`W_{i}`: the i-th learned weight, shape is [M, N]
- :math:`out_{i}`: the i-th element of out, shape is [batch_size, size].
- :math:`y^\mathrm{T}`: the transpose of :math:`y_{2}`.
Args:
x (Variable): 2-D input tensor with shape [batch_size, M]
y (Variable): 2-D input tensor with shape [batch_size, N]
size (int): The dimension of this layer.
act (str, default None): Activation to be applied to the output of this layer.
name (str, default None): The name of this layer.
param_attr (ParamAttr, default None): The parameter attribute for the learnable w.
parameters/weights of this layer.
bias_attr (ParamAttr, default None): The parameter attribute for the bias
of this layer. If it is set to False, no bias will be added to the output units.
If it is set to None, the bias is initialized zero. Default: None.
Returns:
Variable: A 2-D Tensor of shape [batch_size, size].
Examples:
.. code-block:: python
tensor = bilinear_tensor_product(x=layer1, y=layer2, size=1000)
"""
def __init__(self,
name_scope,
size,
name=None,
act=None,
param_attr=None,
bias_attr=None):
super(BilinearTensorProduct, self).__init__(name_scope)
self._param_attr = param_attr
self._bias_attr = bias_attr
self._act = act
self._size = size
self._name = name
self._inputs = dict()
def _build_once(self, x, y):
self._dtype = self._helper.input_dtype(x)
param_shape = [self._size, x.shape[1], y.shape[1]]
self._w = self.create_parameter(
attr=self._param_attr,
shape=param_shape,
dtype=self._dtype,
is_bias=False)
if self._bias_attr:
bias_size = [1, self._size]
bias = self.create_parameter(
attr=self._bias_attr,
shape=bias_size,
dtype=self._dtype,
is_bias=True)
self._inputs["Bias"] = bias
def forward(self, x, y):
self._inputs = {"X": x, "Y": y, "Weight": self._w}
if self._name is not None:
out = self._helper.create_variable(
name=".".join([self.full_name(), self._name]),
dtype=self._dtype,
persistable=False)
else:
out = self._helper.create_variable(
dtype=self._dtype, persistable=False)
self._helper.append_op(
type="bilinear_tensor_product",
inputs=self._inputs,
outputs={"Out": out})
# add activation
return self._helper.append_activation(out)
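A corresponding sketch for the imperative BilinearTensorProduct (again, the class path and shapes are illustrative assumptions):

import numpy as np
import paddle.fluid as fluid

with fluid.imperative.guard():
    x = fluid.imperative.to_variable(np.random.random((4, 16)).astype('float32'))
    y = fluid.imperative.to_variable(np.random.random((4, 8)).astype('float32'))
    btp = fluid.imperative.nn.BilinearTensorProduct('btp', size=6)
    out = btp(x, y)  # weight shape [6, 16, 8]; output shape [4, 6]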
class Conv2DTranspose(layers.Layer):
"""
**Convolution2D transpose layer**
The convolution2D transpose layer calculates the output based on the input,
filter, and dilations, strides, paddings. Input(Input) and output(Output)
are in NCHW format. Where N is batch size, C is the number of channels,
H is the height of the feature, and W is the width of the feature.
Parameters(dilations, strides, paddings) are two elements. These two elements
represent height and width, respectively. For details of the convolution transpose
layer, please refer to the following explanation and references
`therein <http://www.matthewzeiler.com/wp-content/uploads/2017/07/cvpr2010.pdf>`_.
If bias attribution and activation type are provided, bias is added to
the output of the convolution, and the corresponding activation function
is applied to the final result.
For each input :math:`X`, the equation is:
.. math::
Out = \sigma (W \\ast X + b)
Where:
* :math:`X`: Input value, a tensor with NCHW format.
* :math:`W`: Filter value, a tensor with MCHW format.
* :math:`\\ast`: Convolution operation.
* :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
* :math:`\\sigma`: Activation function.
* :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
Example:
- Input:
Input shape: :math:`(N, C_{in}, H_{in}, W_{in})`
Filter shape: :math:`(C_{in}, C_{out}, H_f, W_f)`
- Output:
Output shape: :math:`(N, C_{out}, H_{out}, W_{out})`
Where
.. math::
H^\prime_{out} &= (H_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (H_f - 1) + 1 \\\\
W^\prime_{out} &= (W_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (W_f - 1) + 1 \\\\
H_{out} &\in [ H^\prime_{out}, H^\prime_{out} + strides[0] ) \\\\
W_{out} &\in [ W^\prime_{out}, W^\prime_{out} + strides[1] )
Args:
input(Variable): The input image with [N, C, H, W] format.
num_filters(int): The number of the filter. It is as same as the output
image channel.
output_size(int|tuple|None): The output image size. If output size is a
tuple, it must contain two integers, (image_H, image_W). None if use
filter_size, padding, and stride to calculate output_size.
if output_size and filter_size are specified at the same time, they
should follow the formula above.
filter_size(int|tuple|None): The filter size. If filter_size is a tuple,
it must contain two integers, (filter_size_H, filter_size_W).
Otherwise, the filter will be a square. None if use output size to
calculate filter_size.
padding(int|tuple): The padding size. If padding is a tuple, it must
contain two integers, (padding_H, padding_W). Otherwise, the
padding_H = padding_W = padding. Default: padding = 0.
stride(int|tuple): The stride size. If stride is a tuple, it must
contain two integers, (stride_H, stride_W). Otherwise, the
stride_H = stride_W = stride. Default: stride = 1.
dilation(int|tuple): The dilation size. If dilation is a tuple, it must
contain two integers, (dilation_H, dilation_W). Otherwise, the
dilation_H = dilation_W = dilation. Default: dilation = 1.
groups(int): The groups number of the Conv2d transpose layer. Inspired by
grouped convolution in Alex Krizhevsky's Deep CNN paper, in which
when group=2, the first half of the filters is only connected to the
first half of the input channels, while the second half of the
filters is only connected to the second half of the input channels.
Default: groups = 1.
param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
of conv2d_transpose. If it is set to None or one attribute of ParamAttr, conv2d_transpose
will create ParamAttr as param_attr. If the Initializer of the param_attr
is not set, the parameter is initialized with Xavier. Default: None.
bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of conv2d_transpose.
If it is set to False, no bias will be added to the output units.
If it is set to None or one attribute of ParamAttr, conv2d_transpose
will create ParamAttr as bias_attr. If the Initializer of the bias_attr
is not set, the bias is initialized zero. Default: None.
use_cudnn(bool): Use cudnn kernel or not, it is valid only when the cudnn
library is installed. Default: True.
act (str): Activation type, if it is set to None, activation is not appended.
Default: None.
name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically. Default: None.
Returns:
Variable: The tensor variable storing the convolution transpose result.
Raises:
ValueError: If the shapes of input, filter_size, stride, padding and
groups mismatch.
Examples:
.. code-block:: python
data = fluid.layers.data(name='data', shape=[3, 32, 32], dtype='float32')
conv2d_transpose = fluid.layers.conv2d_transpose(input=data, num_filters=2, filter_size=3)
"""
def __init__(self,
name_scope,
num_filters,
output_size=None,
filter_size=None,
padding=0,
stride=1,
dilation=1,
groups=None,
param_attr=None,
bias_attr=None,
use_cudnn=True,
act=None):
super(Conv2DTranspose, self).__init__(name_scope)
assert param_attr is not False, "param_attr should not be False in conv2d_transpose."
self._param_attr = param_attr
self._bias_attr = bias_attr
self._groups = groups
self._num_filters = num_filters
self._use_cudnn = use_cudnn
self._padding = padding
self._stride = stride
self._dilation = dilation
self._filter_size = filter_size
self._output_size = output_size
self._op_type = 'conv2d_transpose'
def _build_once(self, input):
input_channel = input.shape[1]
if (input_channel == self._groups and
self._num_filters == input_channel and not self._use_cudnn):
self._op_type = 'depthwise_conv2d_transpose'
if not isinstance(input, Variable):
raise TypeError("Input of conv2d_transpose must be Variable")
self._padding = utils.convert_to_list(self._padding, 2, 'padding')
self._stride = utils.convert_to_list(self._stride, 2, 'stride')
self._dilation = utils.convert_to_list(self._dilation, 2, 'dilation')
if not isinstance(self._use_cudnn, bool):
raise ValueError("use_cudnn should be True or False")
if self._filter_size is None:
if self._output_size is None:
raise ValueError(
"output_size must be set when filter_size is None")
if isinstance(self._output_size, int):
self._output_size = [self._output_size, self._output_size]
h_in = input.shape[2]
w_in = input.shape[3]
filter_size_h = (self._output_size[0] -
(h_in - 1) * self._stride[0] + 2 * self._padding[0]
- 1) // self._dilation[0] + 1
filter_size_w = (self._output_size[1] -
(w_in - 1) * self._stride[1] + 2 * self._padding[1]
- 1) // self._dilation[1] + 1
self._filter_size = [filter_size_h, filter_size_w]
else:
self._filter_size = utils.convert_to_list(
self._filter_size, 2, 'conv2d_transpose.filter_size')
if self._output_size is None:
self._output_size = []
elif isinstance(self._output_size, list) or isinstance(
self._output_size, int):
self._output_size = utils.convert_to_list(self._output_size, 2,
'output_size')
else:
raise ValueError("output_size should be list or int")
self._padding = utils.convert_to_list(self._padding, 2, 'padding')
self._groups = 1 if self._groups is None else self._groups
filter_shape = [input_channel, self._num_filters // self._groups
] + self._filter_size
self._img_filter = self.create_parameter(
dtype=input.dtype, shape=filter_shape, attr=self._param_attr)
def forward(self, input):
pre_bias = self._helper.create_variable_for_type_inference(
dtype=input.dtype)
self._helper.append_op(
type=self._op_type,
inputs={'Input': [input],
'Filter': [self._img_filter]},
outputs={'Output': pre_bias},
attrs={
'output_size': self._output_size,
'strides': self._stride,
'paddings': self._padding,
'dilations': self._dilation,
'groups': self._groups,
'use_cudnn': self._use_cudnn
})
pre_act = self._helper.append_bias_op(pre_bias, dim_start=1, dim_end=2)
out = self._helper.append_activation(pre_act)
return out
class SequenceConv(layers.Layer):
"""
This function creates the op for sequence_conv, using the inputs and
other convolutional configurations for the filters and stride as given
in the input parameters to the function.
Args:
input (Variable): ${x_comment}
num_filters (int): number of filters.
filter_size (int): the filter size (H and W).
filter_stride (int): stride of the filter.
padding (bool): if True, add paddings.
bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of sequence_conv.
If it is set to False, no bias will be added to the output units.
If it is set to None or one attribute of ParamAttr, sequence_conv
will create ParamAttr as bias_attr. If the Initializer of the bias_attr
is not set, the bias is initialized zero. Default: None.
param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
of sequence_conv. If it is set to None or one attribute of ParamAttr, sequence_conv
will create ParamAttr as param_attr. If the Initializer of the param_attr
is not set, the parameter is initialized with Xavier. Default: None.
act (str): Activation type, if it is set to None, activation is not appended.
Default: None.
name (str|None): A name for this layer(optional). If set None, the layer
will be named automatically. Default: None.
Returns:
Variable: output of sequence_conv
"""
def __init__(self,
name_scope,
num_filters,
filter_size=3,
filter_stride=1,
padding=None,
bias_attr=None,
param_attr=None,
act=None):
super(SequenceConv, self).__init__(name_scope)
self._num_filters = num_filters
self._filter_size = filter_size
self._filter_stride = filter_stride
self._padding = padding
self._bias_attr = bias_attr
self._param_attr = param_attr
def _build_once(self, input):
self._dtype = self._helper.input_dtype(input)
filter_shape = [self._filter_size * input.shape[1], self._num_filters]
self._filter_param = self.create_parameter(
attr=self._param_attr, shape=filter_shape, dtype=self._dtype)
def forward(self, input):
pre_bias = self._helper.create_variable_for_type_inference(self._dtype)
self._helper.append_op(
type='sequence_conv',
inputs={
'X': [input],
'Filter': [self._filter_param],
},
outputs={"Out": pre_bias},
attrs={
'contextStride': self._filter_stride,
'contextStart': -int(self._filter_size // 2),
'contextLength': self._filter_size
})
pre_act = self._helper.append_bias_op(pre_bias)
return self._helper.append_activation(pre_act)
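# A minimal usage sketch (hypothetical names and shapes, assuming paddle.fluid
# is imported as fluid; sequence_conv expects a LoD input of shape [T, hidden],
# where T is the total number of time steps across the batch):
#
#     with fluid.imperative.guard():
#         seq_conv = SequenceConv("seq_conv", num_filters=2, filter_size=3)
#         out = seq_conv(some_lod_input)  # -> shape [T, 2] before activation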
@@ -62,7 +62,7 @@ class Tracer(core.Tracer):
         if len(backward_refs) > 0:
             op.iop.register_backward_hooks(release_op)
-            # TODO(minqiyang): remove all inputs and outputs after seperate
+            # TODO(minqiyang): remove all inputs and outputs after separate
             # var and grad
             op.backward_refs = defaultdict(list)
             for k, v in six.iteritems(op.inputs):
......
@@ -212,7 +212,7 @@ class UniformInitializer(Initializer):
         if self._seed == 0:
             self._seed = block.program.random_seed
-        # to be compatible of fp16 initalizers
+        # to be compatible of fp16 initializers
         if var.dtype == VarDesc.VarType.FP16:
             out_dtype = VarDesc.VarType.FP32
             out_var = block.create_var(
@@ -756,7 +756,7 @@ class NumpyArrayInitializer(Initializer):
             values = [int(v) for v in self._value.flat]
         else:
             raise ValueError("Unsupported dtype %s", self._value.dtype)
-        if self._value.size > 1024 * 1024 * 5:
+        if self._value.size > 1024 * 1024 * 1024:
             raise ValueError("The size of input is too big. Please consider "
                              "saving it to file and 'load_op' to load it")
         op = block._prepend_op(
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .framework import Program, program_guard, unique_name, default_startup_program
from .param_attr import ParamAttr
from .initializer import Constant
from . import layers
from . import backward
from .imperative import Layer, nn
from . import executor
from . import core
import numpy as np
__all__ = ['run_check']
class SimpleLayer(Layer):
def __init__(self, name_scope):
super(SimpleLayer, self).__init__(name_scope)
self._fc1 = nn.FC(self.full_name(),
3,
ParamAttr(initializer=Constant(value=0.1)))
def forward(self, inputs):
x = self._fc1(inputs)
x = layers.reduce_sum(x)
return x
def run_check():
''' Install check: verify whether the Paddle Fluid installation succeeded.
Call this function when you need to verify the installation.
'''
print("Running Verify Fluid Program ... ")
prog = Program()
startup_prog = Program()
scope = core.Scope()
with executor.scope_guard(scope):
with program_guard(prog, startup_prog):
with unique_name.guard():
np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
inp = layers.data(
name="inp", shape=[2, 2], append_batch_size=False)
simple_layer = SimpleLayer("simple_layer")
out = simple_layer(inp)
param_grads = backward.append_backward(
out, parameter_list=[simple_layer._fc1._w.name])[0]
exe = executor.Executor(core.CPUPlace(
) if not core.is_compiled_with_cuda() else core.CUDAPlace(0))
exe.run(default_startup_program())
exe.run(feed={inp.name: np_inp},
fetch_list=[out.name, param_grads[1].name])
print(
"Your Paddle Fluid is installed successfully! Let's start deep Learning with Paddle Fluid now"
)
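# Typical usage (this is how the accompanying unit test invokes it, assuming
# this module is exposed as fluid.install_check):
#
#     import paddle.fluid as fluid
#     fluid.install_check.run_check()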
@@ -151,13 +151,7 @@ class LayerHelper(LayerHelperBase):
             act['use_mkldnn'] = self.kwargs.get('use_mkldnn')
         act_type = act.pop('type')
-        tmp = input_var
-        # NOTE(dzhwinter): some activation support inplace compution.
-        # NOTE(minqiyang): currently, we don't support inplace in imperative mode
-        if not _in_imperative_mode() and core.IsInplace(act_type):
-            tmp = input_var
-        else:
-            tmp = self.create_variable_for_type_inference(dtype=input_var.dtype)
+        tmp = self.create_variable_for_type_inference(dtype=input_var.dtype)
         self.append_op(
             type=act_type,
             inputs={"X": [input_var]},
......
@@ -268,11 +268,9 @@ class LayerHelperBase(object):
         """
         # Deepcopy the attr so that parameters can be shared in program
         attr = copy.deepcopy(attr)
-        if attr is None:
-            attr = ParamAttr._to_attr(attr)
+        attr = ParamAttr._to_attr(attr)
         if not attr:
             return None
         assert isinstance(attr, ParamAttr)
         suffix = 'b' if is_bias else 'w'
         if attr.name is None:
......
@@ -6206,7 +6206,8 @@ def one_hot(input, depth):
         type="one_hot",
         inputs={'X': input},
         attrs={'depth': depth},
-        outputs={'Out': one_hot_out})
+        outputs={'Out': one_hot_out},
+        stop_gradient=True)
     return one_hot_out
......
@@ -13,7 +13,7 @@
 # limitations under the License.

 from __future__ import print_function

+from six.moves import reduce

 from ..layer_helper import LayerHelper
 from ..param_attr import ParamAttr
 from ..framework import convert_np_dtype_to_dtype_
......
@@ -165,6 +165,8 @@ class Optimizer(object):
             name = self._name + "_" + name
         if (name in self._accumulators and
                 param.name in self._accumulators[name]):
+            if framework._in_imperative_mode():
+                return self._accumulators[name][param.name]
             raise Exception("Accumulator {} already exists for parameter {}".
                             format(name, param.name))
         if shape == None:
@@ -397,13 +399,14 @@ class Optimizer(object):
             for param in parameters:
                 if not param.trainable:
                     continue
-                # create gradient variable
-                grad_var = Variable(
-                    block=loss.block,
-                    name=param._ivar._grad_name(),
-                    stop_gradient=True,
-                    ivar=param._ivar._grad_ivar())
-                params_grads.append((param, grad_var))
+                if param._ivar._grad_ivar() is not None:
+                    # create gradient variable
+                    grad_var = Variable(
+                        block=loss.block,
+                        name=param._ivar._grad_name(),
+                        stop_gradient=True,
+                        ivar=param._ivar._grad_ivar())
+                    params_grads.append((param, grad_var))
             with program_guard(framework.default_main_program(),
                                framework.default_startup_program()):
                 optimize_ops = self._create_optimization_pass(params_grads)
......
@@ -68,9 +68,9 @@ class TestDistSaveLoadDense2x2(TestDistBase):
             train0_np = np.array(tr0_var)
             train1_np = np.array(tr1_var)

-            self.assertAlmostEqual(local_np.all(), train0_np.all(), delta=delta)
-            self.assertAlmostEqual(local_np.all(), train1_np.all(), delta=delta)
-            self.assertAlmostEqual(train0_np.all(), train1_np.all(), delta=delta)
+            np.testing.assert_almost_equal(local_np, train0_np, decimal=2)
+            np.testing.assert_almost_equal(local_np, train1_np, decimal=2)
+            np.testing.assert_almost_equal(train0_np, train1_np, decimal=2)

     def test_dist(self):
         need_envs = {
@@ -134,10 +134,8 @@ class TestDistSaveLoadWithPServerStateDense2x2(TestDistBase):
             train0_2_np = np.array(tr0_var_2)
             train1_2_np = np.array(tr1_var_2)

-            self.assertAlmostEqual(
-                train0_1_np.all(), train0_2_np.all(), delta=delta)
-            self.assertAlmostEqual(
-                train1_1_np.all(), train1_2_np.all(), delta=delta)
+            np.testing.assert_almost_equal(train0_1_np, train0_2_np, decimal=2)
+            np.testing.assert_almost_equal(train1_1_np, train1_2_np, decimal=2)

     def test_dist(self):
         need_envs = {
......
@@ -15,6 +15,7 @@
 import unittest
 import numpy as np
 import random
+import os
 import sys
 import paddle
@@ -23,16 +24,17 @@ import paddle.fluid.core as core
 from test_imperative_base import new_program_scope
 from paddle.fluid.imperative.base import to_variable

-NUM_USERS = 100
-NUM_ITEMS = 1000
-BATCH_SIZE = 32
-NUM_BATCHES = 2
+# Can use Amusic dataset as the DeepCF describes.
+DATA_PATH = os.environ.get('DATA_PATH', '')
+BATCH_SIZE = int(os.environ.get('BATCH_SIZE', 128))
+NUM_BATCHES = int(os.environ.get('NUM_BATCHES', 5))
+NUM_EPOCHES = int(os.environ.get('NUM_EPOCHES', 1))


-class MLP(fluid.imperative.Layer):
+class DMF(fluid.imperative.Layer):
     def __init__(self, name_scope):
-        super(MLP, self).__init__(name_scope)
+        super(DMF, self).__init__(name_scope)
         self._user_latent = fluid.imperative.FC(self.full_name(), 256)
         self._item_latent = fluid.imperative.FC(self.full_name(), 256)
@@ -61,9 +63,9 @@ class MLP(fluid.imperative.Layer):
         return fluid.layers.elementwise_mul(users, items)


-class DMF(fluid.imperative.Layer):
+class MLP(fluid.imperative.Layer):
     def __init__(self, name_scope):
-        super(DMF, self).__init__(name_scope)
+        super(MLP, self).__init__(name_scope)
         self._user_latent = fluid.imperative.FC(self.full_name(), 256)
         self._item_latent = fluid.imperative.FC(self.full_name(), 256)
         self._match_layers = []
@@ -87,21 +89,30 @@ class DMF(fluid.imperative.Layer):
 class DeepCF(fluid.imperative.Layer):
-    def __init__(self, name_scope):
+    def __init__(self, name_scope, num_users, num_items, matrix):
         super(DeepCF, self).__init__(name_scope)
-        self._user_emb = fluid.imperative.Embedding(self.full_name(),
-                                                    [NUM_USERS, 256])
-        self._item_emb = fluid.imperative.Embedding(self.full_name(),
-                                                    [NUM_ITEMS, 256])
+        self._num_users = num_users
+        self._num_items = num_items
+        self._rating_matrix = self.create_parameter(
+            fluid.ParamAttr(trainable=False),
+            matrix.shape,
+            matrix.dtype,
+            is_bias=False,
+            default_initializer=fluid.initializer.NumpyArrayInitializer(matrix))
+        self._rating_matrix._stop_gradient = True

         self._mlp = MLP(self.full_name())
         self._dmf = DMF(self.full_name())
         self._match_fc = fluid.imperative.FC(self.full_name(), 1, act='sigmoid')

     def forward(self, users, items):
-        users_emb = self._user_emb(users)
-        items_emb = self._item_emb(items)
+        # users_emb = self._user_emb(users)
+        # items_emb = self._item_emb(items)
+        users_emb = fluid.layers.gather(self._rating_matrix, users)
+        items_emb = fluid.layers.gather(
+            fluid.layers.transpose(self._rating_matrix, [1, 0]), items)
+        users_emb.stop_gradient = True
+        items_emb.stop_gradient = True

         mlp_predictive = self._mlp(users_emb, items_emb)
         dmf_predictive = self._dmf(users_emb, items_emb)
@@ -116,27 +127,79 @@ def get_data():
     user_ids = []
     item_ids = []
     labels = []
+    NUM_USERS = 100
+    NUM_ITEMS = 1000
+    matrix = np.zeros([NUM_USERS, NUM_ITEMS], dtype=np.float32)
     for uid in range(NUM_USERS):
         for iid in range(NUM_ITEMS):
-            # 10% positive
-            label = float(random.randint(1, 10) == 1)
+            label = float(random.randint(1, 6) == 1)
             user_ids.append(uid)
             item_ids.append(iid)
             labels.append(label)
-    indices = np.arange(NUM_USERS * NUM_ITEMS)
+            matrix[uid, iid] = label
+    indices = np.arange(len(user_ids))
+    np.random.shuffle(indices)
+    users_np = np.array(user_ids, dtype=np.int32)[indices]
+    items_np = np.array(item_ids, dtype=np.int32)[indices]
+    labels_np = np.array(labels, dtype=np.float32)[indices]
+    return np.expand_dims(users_np, -1), \
+        np.expand_dims(items_np, -1), \
+        np.expand_dims(labels_np, -1), NUM_USERS, NUM_ITEMS, matrix


+def load_data(DATA_PATH):
+    sys.stderr.write('loading from %s\n' % DATA_PATH)
+    likes = dict()
+    num_users = -1
+    num_items = -1
+    with open(DATA_PATH, 'r') as f:
+        for l in f.readlines():
+            uid, iid, rating = [int(v) for v in l.split('\t')]
+            num_users = max(num_users, uid + 1)
+            num_items = max(num_items, iid + 1)
+            if float(rating) > 0.0:
+                likes[(uid, iid)] = 1.0
+    user_ids = []
+    item_ids = []
+    labels = []
+    matrix = np.zeros([num_users, num_items], dtype=np.float32)
+    for uid, iid in likes.keys():
+        user_ids.append(uid)
+        item_ids.append(iid)
+        labels.append(1.0)
+        matrix[uid, iid] = 1.0
+        negative = 0
+        while negative < 3:
+            nuid = random.randint(0, num_users - 1)
+            niid = random.randint(0, num_items - 1)
+            if (nuid, niid) not in likes:
+                negative += 1
+                user_ids.append(nuid)
+                item_ids.append(niid)
+                labels.append(0.0)
+    indices = np.arange(len(user_ids))
     np.random.shuffle(indices)
-    users_np = np.array(user_ids, dtype=np.int64)[indices]
-    items_np = np.array(item_ids, dtype=np.int64)[indices]
+    users_np = np.array(user_ids, dtype=np.int32)[indices]
+    items_np = np.array(item_ids, dtype=np.int32)[indices]
     labels_np = np.array(labels, dtype=np.float32)[indices]
     return np.expand_dims(users_np, -1), \
         np.expand_dims(items_np, -1), \
-        np.expand_dims(labels_np, -1)
+        np.expand_dims(labels_np, -1), num_users, num_items, matrix


 class TestImperativeDeepCF(unittest.TestCase):
-    def test_gan_float32(self):
+    def test_deefcf(self):
         seed = 90
-        users_np, items_np, labels_np = get_data()
+        if DATA_PATH:
+            (users_np, items_np, labels_np, num_users, num_items,
+             matrix) = load_data(DATA_PATH)
+        else:
+            (users_np, items_np, labels_np, num_users, num_items,
+             matrix) = get_data()

         startup = fluid.Program()
         startup.random_seed = seed
@@ -145,11 +208,11 @@ class TestImperativeDeepCF(unittest.TestCase):
         scope = fluid.core.Scope()
         with new_program_scope(main=main, startup=startup, scope=scope):
-            users = fluid.layers.data('users', [1], dtype='int64')
-            items = fluid.layers.data('items', [1], dtype='int64')
+            users = fluid.layers.data('users', [1], dtype='int32')
+            items = fluid.layers.data('items', [1], dtype='int32')
             labels = fluid.layers.data('labels', [1], dtype='float32')
-            deepcf = DeepCF('deepcf')
+            deepcf = DeepCF('deepcf', num_users, num_items, matrix)
             prediction = deepcf(users, items)
             loss = fluid.layers.reduce_sum(
                 fluid.layers.log_loss(prediction, labels))
@@ -159,35 +222,44 @@ class TestImperativeDeepCF(unittest.TestCase):
             exe = fluid.Executor(fluid.CPUPlace(
             ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
             exe.run(startup)
-            for slice in range(0, BATCH_SIZE * NUM_BATCHES, BATCH_SIZE):
-                static_loss = exe.run(
-                    main,
-                    feed={
-                        users.name: users_np[slice:slice + BATCH_SIZE],
-                        items.name: items_np[slice:slice + BATCH_SIZE],
-                        labels.name: labels_np[slice:slice + BATCH_SIZE]
-                    },
-                    fetch_list=[loss])[0]
-                sys.stderr.write('static loss %s\n' % static_loss)
+            for e in range(NUM_EPOCHES):
+                sys.stderr.write('epoch %d\n' % e)
+                for slice in range(0, BATCH_SIZE * NUM_BATCHES, BATCH_SIZE):
+                    if slice + BATCH_SIZE >= users_np.shape[0]:
+                        break
+                    static_loss = exe.run(
+                        main,
+                        feed={
+                            users.name: users_np[slice:slice + BATCH_SIZE],
+                            items.name: items_np[slice:slice + BATCH_SIZE],
+                            labels.name: labels_np[slice:slice + BATCH_SIZE]
+                        },
+                        fetch_list=[loss])[0]
+                    sys.stderr.write('static loss %s\n' % static_loss)

         with fluid.imperative.guard():
             fluid.default_startup_program().random_seed = seed
             fluid.default_main_program().random_seed = seed

-            deepcf = DeepCF('deepcf')
-            for slice in range(0, BATCH_SIZE * NUM_BATCHES, BATCH_SIZE):
-                prediction = deepcf(
-                    to_variable(users_np[slice:slice + BATCH_SIZE]),
-                    to_variable(items_np[slice:slice + BATCH_SIZE]))
-                loss = fluid.layers.reduce_sum(
-                    fluid.layers.log_loss(prediction,
-                                          to_variable(labels_np[slice:slice +
-                                                                BATCH_SIZE])))
-                loss._backward()
-                adam = fluid.optimizer.AdamOptimizer(0.01)
-                adam.minimize(loss)
-                deepcf.clear_gradients()
-                dy_loss = loss._numpy()
+            deepcf = DeepCF('deepcf', num_users, num_items, matrix)
+            adam = fluid.optimizer.AdamOptimizer(0.01)
+            for e in range(NUM_EPOCHES):
+                sys.stderr.write('epoch %d\n' % e)
+                for slice in range(0, BATCH_SIZE * NUM_BATCHES, BATCH_SIZE):
+                    if slice + BATCH_SIZE >= users_np.shape[0]:
+                        break
+                    prediction = deepcf(
+                        to_variable(users_np[slice:slice + BATCH_SIZE]),
+                        to_variable(items_np[slice:slice + BATCH_SIZE]))
+                    loss = fluid.layers.reduce_sum(
+                        fluid.layers.log_loss(prediction,
+                                              to_variable(labels_np[
+                                                  slice:slice + BATCH_SIZE])))
+                    loss._backward()
+                    adam.minimize(loss)
+                    deepcf.clear_gradients()
+                    dy_loss = loss._numpy()
+                    sys.stderr.write('dynamic loss: %s %s\n' % (slice, dy_loss))

         self.assertEqual(static_loss, dy_loss)
......
@@ -59,7 +59,7 @@ class SimpleLSTMRNN(fluid.imperative.Layer):
                 dtype="float32",
                 default_initializer=fluid.initializer.UniformInitializer(
                     low=-self._init_scale, high=self._init_scale))
-            self.weight_1_arr.append(weight_1)
+            self.weight_1_arr.append(self.add_parameter('w_%d' % i, weight_1))
             bias_1 = self.create_parameter(
                 attr=fluid.ParamAttr(
                     initializer=fluid.initializer.UniformInitializer(
@@ -67,7 +67,7 @@ class SimpleLSTMRNN(fluid.imperative.Layer):
                 shape=[self._hidden_size * 4],
                 dtype="float32",
                 default_initializer=fluid.initializer.Constant(0.0))
-            self.bias_arr.append(bias_1)
+            self.bias_arr.append(self.add_parameter('b_%d' % i, bias_1))

     def forward(self, input_embedding, init_hidden=None, init_cell=None):
         self.cell_array = []
@@ -242,7 +242,7 @@ class TestImperativePtbRnn(unittest.TestCase):
         dy_loss = None
         last_hidden = None
         last_cell = None
-        batch_num = 50
+        batch_num = 200

         for i in range(batch_num):
             x_data = np.arange(12).reshape(4, 3).astype('int64')
@@ -264,8 +264,10 @@ class TestImperativePtbRnn(unittest.TestCase):
                     dy_param_init[param.name] = param._numpy()
             dy_loss._backward()
             sgd.minimize(dy_loss)
-            for param in ptb_model.parameters():
-                dy_param_updated[param.name] = param._numpy()
+            ptb_model.clear_gradients()
+            if i == batch_num - 1:
+                for param in ptb_model.parameters():
+                    dy_param_updated[param.name] = param._numpy()

         with new_program_scope():
             fluid.default_startup_program().random_seed = seed
@@ -323,25 +325,28 @@ class TestImperativePtbRnn(unittest.TestCase):
                     },
                     fetch_list=fetch_list)
                 static_loss_value = out[0]
-                static_last_cell_value = out[1]
-                static_last_hidden_value = out[2]
-                for k in range(3, len(out)):
-                    static_param_updated[static_param_name_list[k - 3]] = out[k]
-
-        self.assertTrue(
-            np.allclose(static_loss_value.all(), dy_loss._numpy().all()))
-        self.assertTrue(
-            np.allclose(static_last_cell_value.all(),
-                        last_cell._numpy().all()))
-        self.assertTrue(
-            np.allclose(static_last_hidden_value.all(),
-                        last_hidden._numpy().all()))
-        for key, value in six.iteritems(static_param_init):
-            self.assertTrue(
-                np.allclose(value.all(), dy_param_init[key].all()))
-        for key, value in six.iteritems(static_param_updated):
-            self.assertTrue(
-                np.allclose(value.all(), dy_param_updated[key].all()))
+                static_last_hidden_value = out[1]
+                static_last_cell_value = out[2]
+                if i == batch_num - 1:
+                    for k in range(3, len(out)):
+                        static_param_updated[static_param_name_list[k -
+                                                                    3]] = out[k]
+
+        self.assertTrue(np.allclose(static_loss_value, dy_loss._numpy()))
+        self.assertTrue(np.allclose(static_last_cell_value, last_cell._numpy()))
+        self.assertTrue(
+            np.allclose(static_last_hidden_value, last_hidden._numpy()))
+        for key, value in six.iteritems(static_param_init):
+            # print("static_init name: {}, value {}".format(key, value))
+            # print("dy_init name: {}, value {}".format(key, dy_param_init[key]))
+            self.assertTrue(np.allclose(value, dy_param_init[key], atol=1e-5))
+        for key, value in six.iteritems(static_param_updated):
+            # print("static name: {}, value {}".format(key, value))
+            # print("dy name: {}, value {}".format(key, dy_param_updated[key]))
+            self.assertTrue(
+                np.allclose(
+                    value, dy_param_updated[key], atol=1e-5))

 if __name__ == '__main__':
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import paddle.fluid as fluid
from paddle.fluid.imperative import Embedding, LayerNorm, FC, to_variable, Layer, guard
from test_imperative_base import new_program_scope
from paddle.fluid import core
import numpy as np
import six
np.set_printoptions(suppress=True)
# Copy from models
class TrainTaskConfig(object):
# support both CPU and GPU now.
use_gpu = True
# the epoch number to train.
pass_num = 30
# the number of sequences contained in a mini-batch.
# deprecated, set batch_size in args.
batch_size = 32
# the hyper parameters for Adam optimizer.
# This static learning_rate will be multiplied to the LearningRateScheduler
# derived learning rate the to get the final learning rate.
learning_rate = 2.0
beta1 = 0.9
beta2 = 0.997
eps = 1e-9
# the parameters for learning rate scheduling.
warmup_steps = 8000
# the weight used to mix up the ground-truth distribution and the fixed
# uniform distribution in label smoothing when training.
# Set this as zero if label smoothing is not wanted.
label_smooth_eps = 0.1
# the directory for saving trained models.
model_dir = "trained_models"
# the directory for saving checkpoints.
ckpt_dir = "trained_ckpts"
# the directory for loading checkpoint.
# If provided, continue training from the checkpoint.
ckpt_path = None
# the parameter to initialize the learning rate scheduler.
# It should be provided when checkpoints are used, since the checkpoint doesn't
# include the training step counter currently.
start_step = 0
# the frequency to save trained models.
save_freq = 10000
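# Note on the learning-rate comment above: the sync branch of the test below
# combines this static learning_rate with
# fluid.layers.learning_rate_scheduler.noam_decay, so the effective rate is
# roughly learning_rate * d_model**-0.5 * min(step**-0.5,
# step * warmup_steps**-1.5), i.e. the usual Transformer warmup schedule.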
class InferTaskConfig(object):
use_gpu = True
# the number of examples in one run for sequence generation.
batch_size = 10
# the parameters for beam search.
beam_size = 5
max_out_len = 256
# the number of decoded sentences to output.
n_best = 1
# the flags indicating whether to output the special tokens.
output_bos = False
output_eos = False
output_unk = True
# the directory for loading the trained model.
model_path = "trained_models/pass_1.infer.model"
class ModelHyperParams(object):
# The following five vocabulary-related configurations will be set
# automatically according to the passed vocabulary path and special tokens.
# size of source word dictionary.
src_vocab_size = 10000
# size of target word dictionary
trg_vocab_size = 10000
# index for <bos> token
bos_idx = 0
# index for <eos> token
eos_idx = 1
# index for <unk> token
unk_idx = 2
# max length of sequences deciding the size of position encoding table.
max_length = 4
# the dimension for word embeddings, which is also the last dimension of
# the input and output of multi-head attention, position-wise feed-forward
# networks, encoder and decoder.
d_model = 512
# size of the hidden layer in position-wise feed-forward networks.
d_inner_hid = 2048
# the dimension that keys are projected to for dot-product attention.
d_key = 64
# the dimension that values are projected to for dot-product attention.
d_value = 64
# number of heads used in multi-head attention.
n_head = 8
# number of sub-layers to be stacked in the encoder and decoder.
n_layer = 6
# dropout rates of different modules.
prepostprocess_dropout = 0.1
attention_dropout = 0.1
relu_dropout = 0.1
# to process before each sub-layer
preprocess_cmd = "n" # layer normalization
# to process after each sub-layer
postprocess_cmd = "da" # dropout + residual connection
# random seed used in dropout for CE.
dropout_seed = 1
# the flag indicating whether to share embedding and softmax weights.
# vocabularies in source and target should be same for weight sharing.
weight_sharing = True
def merge_cfg_from_list(cfg_list, g_cfgs):
"""
Set the above global configurations using the cfg_list.
"""
assert len(cfg_list) % 2 == 0
for key, value in zip(cfg_list[0::2], cfg_list[1::2]):
for g_cfg in g_cfgs:
if hasattr(g_cfg, key):
try:
value = eval(value)
except Exception: # for file path
pass
setattr(g_cfg, key, value)
break
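# A small sketch of how merge_cfg_from_list is meant to be used (the flag list
# below is hypothetical). String values are eval()'d when possible, so numbers
# become numbers while plain strings such as paths stay strings:
#
#     merge_cfg_from_list(["batch_size", "64", "model_dir", "my_models"],
#                         [TrainTaskConfig, ModelHyperParams])
#     # TrainTaskConfig.batch_size == 64, TrainTaskConfig.model_dir == "my_models"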
def position_encoding_init(n_position, d_pos_vec):
"""
Generate the initial values for the sinusoid position encoding table.
"""
channels = d_pos_vec
position = np.arange(n_position)
num_timescales = channels // 2
log_timescale_increment = (np.log(float(1e4) / float(1)) /
(num_timescales - 1))
inv_timescales = np.exp(np.arange(
num_timescales)) * -log_timescale_increment
scaled_time = np.expand_dims(position, 1) * np.expand_dims(inv_timescales,
0)
signal = np.concatenate([np.sin(scaled_time), np.cos(scaled_time)], axis=1)
signal = np.pad(signal, [[0, 0], [0, np.mod(channels, 2)]], 'constant')
position_enc = signal
return position_enc.astype("float32")
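# Quick sanity check of the table built above (a sketch, not part of the test):
# row `pos` holds [sin(pos * inv_timescales), cos(pos * inv_timescales)], so the
# result has one row per position and d_pos_vec columns.
#
#     table = position_encoding_init(ModelHyperParams.max_length,
#                                    ModelHyperParams.d_model)
#     assert table.shape == (ModelHyperParams.max_length, ModelHyperParams.d_model)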
def create_data(is_static=False):
if is_static:
return [
src_word_np, src_pos_np, src_slf_attn_bias_np, trg_word_np,
trg_pos_np, trg_slf_attn_bias_np, trg_src_attn_bias_np, lbl_word_np,
lbl_weight_np
]
else:
enc_inputs = [
to_variable(src_word_np), to_variable(src_pos_np),
to_variable(src_slf_attn_bias_np)
]
dec_inputs = [
to_variable(trg_word_np), to_variable(trg_pos_np),
to_variable(trg_slf_attn_bias_np), to_variable(trg_src_attn_bias_np)
]
label = to_variable(lbl_word_np)
weight = to_variable(lbl_weight_np)
return enc_inputs, dec_inputs, label, weight
def create_feed_dict_list(data, init=False):
if init:
data_input_names = encoder_data_input_fields + \
decoder_data_input_fields[:-1] + label_data_input_fields + pos_enc_param_names
else:
data_input_names = encoder_data_input_fields + \
decoder_data_input_fields[:-1] + label_data_input_fields
feed_dict_list = dict()
for i in range(len(data_input_names)):
feed_dict_list[data_input_names[i]] = data[i]
return feed_dict_list
def make_all_inputs(input_fields):
"""
Define the input data layers for the transformer model.
"""
inputs = []
for input_field in input_fields:
input_var = fluid.layers.data(
name=input_field,
shape=input_descs[input_field][0],
dtype=input_descs[input_field][1],
lod_level=input_descs[input_field][2]
if len(input_descs[input_field]) == 3 else 0,
append_batch_size=False)
inputs.append(input_var)
return inputs
# The placeholder for batch_size in compile time. Must be -1 currently to be
# consistent with some ops' infer-shape output in compile time, such as the
# sequence_expand op used in beamsearch decoder.
batch_size = 32
# The placeholder for sequence length in compile time.
seq_len = ModelHyperParams.max_length
# Here list the data shapes and data types of all inputs.
# The shapes here act as placeholder and are set to pass the infer-shape in
# compile time.
input_descs = {
# The actual data shape of src_word is:
# [batch_size, max_src_len_in_batch, 1]
"src_word": [(batch_size, seq_len, 1), "int64", 2],
# The actual data shape of src_pos is:
# [batch_size, max_src_len_in_batch, 1]
"src_pos": [(batch_size, seq_len, 1), "int64"],
# This input is used to remove attention weights on paddings in the
# encoder.
# The actual data shape of src_slf_attn_bias is:
# [batch_size, n_head, max_src_len_in_batch, max_src_len_in_batch]
"src_slf_attn_bias": [(batch_size, ModelHyperParams.n_head, seq_len,
seq_len), "float32"],
# The actual data shape of trg_word is:
# [batch_size, max_trg_len_in_batch, 1]
"trg_word": [(batch_size, seq_len, 1), "int64",
2], # lod_level is only used in fast decoder.
# The actual data shape of trg_pos is:
# [batch_size, max_trg_len_in_batch, 1]
"trg_pos": [(batch_size, seq_len, 1), "int64"],
# This input is used to remove attention weights on paddings and
# subsequent words in the decoder.
# The actual data shape of trg_slf_attn_bias is:
# [batch_size, n_head, max_trg_len_in_batch, max_trg_len_in_batch]
"trg_slf_attn_bias": [(batch_size, ModelHyperParams.n_head, seq_len,
seq_len), "float32"],
# This input is used to remove attention weights on paddings of the source
# input in the encoder-decoder attention.
# The actual data shape of trg_src_attn_bias is:
# [batch_size, n_head, max_trg_len_in_batch, max_src_len_in_batch]
"trg_src_attn_bias": [(batch_size, ModelHyperParams.n_head, seq_len,
seq_len), "float32"],
# This input is used in independent decoder program for inference.
# The actual data shape of enc_output is:
# [batch_size, max_src_len_in_batch, d_model]
"enc_output": [(batch_size, seq_len, ModelHyperParams.d_model), "float32"],
# The actual data shape of label_word is:
# [batch_size * max_trg_len_in_batch, 1]
"lbl_word": [(batch_size * seq_len, 1), "int64"],
# This input is used to mask out the loss of padding tokens.
# The actual data shape of label_weight is:
# [batch_size * max_trg_len_in_batch, 1]
"lbl_weight": [(batch_size * seq_len, 1), "float32"],
# This input is used in beam-search decoder.
"init_score": [(batch_size, 1), "float32", 2],
# This input is used in beam-search decoder for the first gather
# (cell states update)
"init_idx": [(batch_size, ), "int32"],
}
# Names of word embedding table which might be reused for weight sharing.
word_emb_param_names = (
"src_word_emb_table",
"trg_word_emb_table", )
# Names of position encoding table which will be initialized externally.
pos_enc_param_names = (
"src_pos_enc_table",
"trg_pos_enc_table", )
# separated inputs for different usages.
encoder_data_input_fields = (
"src_word",
"src_pos",
"src_slf_attn_bias", )
decoder_data_input_fields = (
"trg_word",
"trg_pos",
"trg_slf_attn_bias",
"trg_src_attn_bias",
"enc_output", )
label_data_input_fields = (
"lbl_word",
"lbl_weight", )
# In fast decoder, trg_pos (only containing the current time step) is generated
# by ops and trg_slf_attn_bias is not needed.
fast_decoder_data_input_fields = (
"trg_word",
"init_score",
"init_idx",
"trg_src_attn_bias", )
# if we use py_reader
use_py_reader = False
# if we run sync mode
sync = False
# how many batches we use
batch_num = 2
np.random.seed(1)
src_word_np = np.random.randint(
1,
ModelHyperParams.src_vocab_size - 1,
size=(batch_size, seq_len, 1),
dtype='int64')
src_pos_np = np.random.randint(
1, seq_len, size=(batch_size, seq_len, 1), dtype='int64')
src_slf_attn_bias_np = np.random.randn(batch_size, ModelHyperParams.n_head,
seq_len, seq_len).astype('float32')
trg_word_np = np.random.randint(
1,
ModelHyperParams.src_vocab_size - 1,
size=(batch_size, seq_len, 1),
dtype='int64')
trg_pos_np = np.random.randint(
1, seq_len, size=(batch_size, seq_len, 1), dtype='int64')
trg_slf_attn_bias_np = np.random.randn(batch_size, ModelHyperParams.n_head,
seq_len, seq_len).astype('float32')
trg_src_attn_bias_np = np.random.randn(batch_size, ModelHyperParams.n_head,
seq_len, seq_len).astype('float32')
lbl_word_np = np.random.randint(
1,
ModelHyperParams.src_vocab_size - 1,
size=(batch_size * seq_len, 1),
dtype='int64')
lbl_weight_np = np.random.randn(batch_size * seq_len, 1).astype('float32')
# np.random.seed = 1
# src_word_np = np.arange(0, 10).reshape([batch_size, seq_len, 1]).astype('int64')
# src_pos_np = np.random.randint(
# 1, seq_len, size=(batch_size, seq_len, 1), dtype='int64')
# src_slf_attn_bias_np = np.random.randn(batch_size, ModelHyperParams.n_head,
# seq_len, seq_len).astype('float32')
#
# trg_word_np = np.arange(0, 10).reshape([batch_size, seq_len, 1]).astype('int64')
# trg_pos_np = np.random.randint(
# 1, seq_len, size=(batch_size, seq_len, 1), dtype='int64')
# trg_slf_attn_bias_np = np.random.randn(batch_size, ModelHyperParams.n_head,
# seq_len, seq_len).astype('float32')
# trg_src_attn_bias_np = np.random.randn(batch_size, ModelHyperParams.n_head,
# seq_len, seq_len).astype('float32')
#
# lbl_word_np = np.arange(0, 10).reshape([batch_size * seq_len, 1]).astype('int64')
# lbl_weight_np = np.random.randn(batch_size * seq_len, 1).astype('float32')
#
pos_inp1 = position_encoding_init(ModelHyperParams.max_length,
ModelHyperParams.d_model)
pos_inp2 = position_encoding_init(ModelHyperParams.max_length,
ModelHyperParams.d_model)
class PrePostProcessLayer(Layer):
def __init__(self, name_scope, process_cmd, shape_len=None):
super(PrePostProcessLayer, self).__init__(name_scope)
for cmd in process_cmd:
if cmd == "n":
self._layer_norm = LayerNorm(
name_scope=self.full_name(),
begin_norm_axis=shape_len - 1,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(1.)),
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(0.)))
def forward(self, prev_out, out, process_cmd, dropout_rate=0.):
for cmd in process_cmd:
if cmd == "a": # add residual connection
out = out + prev_out if prev_out else out
elif cmd == "n": # add layer normalization
out = self._layer_norm(out)
elif cmd == "d": # add dropout
if dropout_rate:
out = fluid.layers.dropout(
out,
dropout_prob=dropout_rate,
seed=ModelHyperParams.dropout_seed,
is_test=False)
return out
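# With the settings used in this test, preprocess_cmd = "n" layer-normalizes the
# sub-layer input, and postprocess_cmd = "da" applies dropout to the sub-layer
# output and then adds the residual connection, i.e. roughly
#     out = prev_out + dropout(sublayer(layer_norm(prev_out))).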
class PositionwiseFeedForwardLayer(Layer):
def __init__(self, name_scope, d_inner_hid, d_hid, dropout_rate):
super(PositionwiseFeedForwardLayer, self).__init__(name_scope)
self._i2h = FC(name_scope=self.full_name(),
size=d_inner_hid,
num_flatten_dims=2,
act="relu")
self._h2o = FC(name_scope=self.full_name(),
size=d_hid,
num_flatten_dims=2)
self._dropout_rate = dropout_rate
def forward(self, x):
hidden = self._i2h(x)
if self._dropout_rate:
hidden = fluid.layers.dropout(
hidden,
dropout_prob=self._dropout_rate,
seed=ModelHyperParams.dropout_seed,
is_test=False)
out = self._h2o(hidden)
return out
class MultiHeadAttentionLayer(Layer):
def __init__(self,
name_scope,
d_key,
d_value,
d_model,
n_head=1,
dropout_rate=0.,
cache=None,
gather_idx=None,
static_kv=False):
super(MultiHeadAttentionLayer, self).__init__(name_scope)
self._n_head = n_head
self._d_key = d_key
self._d_value = d_value
self._d_model = d_model
self._dropout_rate = dropout_rate
self._q_fc = FC(name_scope=self.full_name(),
size=d_key * n_head,
bias_attr=False,
num_flatten_dims=2)
self._k_fc = FC(name_scope=self.full_name(),
size=d_key * n_head,
bias_attr=False,
num_flatten_dims=2)
self._v_fc = FC(name_scope=self.full_name(),
size=d_value * n_head,
bias_attr=False,
num_flatten_dims=2)
self._proj_fc = FC(name_scope=self.full_name(),
size=self._d_model,
bias_attr=False,
num_flatten_dims=2)
def forward(self, queries, keys, values, attn_bias):
# compute q ,k ,v
keys = queries if keys is None else keys
values = keys if values is None else values
q = self._q_fc(queries)
k = self._k_fc(keys)
v = self._v_fc(values)
# split head
reshaped_q = fluid.layers.reshape(
x=q, shape=[0, 0, self._n_head, self._d_key], inplace=False)
transpose_q = fluid.layers.transpose(x=reshaped_q, perm=[0, 2, 1, 3])
reshaped_k = fluid.layers.reshape(
x=k, shape=[0, 0, self._n_head, self._d_key], inplace=False)
transpose_k = fluid.layers.transpose(x=reshaped_k, perm=[0, 2, 1, 3])
reshaped_v = fluid.layers.reshape(
x=v, shape=[0, 0, self._n_head, self._d_value], inplace=False)
transpose_v = fluid.layers.transpose(x=reshaped_v, perm=[0, 2, 1, 3])
# scaled dot-product attention
product = fluid.layers.matmul(
x=transpose_q,
y=transpose_k,
transpose_y=True,
alpha=self._d_model**-0.5)
if attn_bias:
product += attn_bias
weights = fluid.layers.softmax(product)
if self._dropout_rate:
weights_droped = fluid.layers.dropout(
weights,
dropout_prob=self._dropout_rate,
seed=ModelHyperParams.dropout_seed,
is_test=False)
out = fluid.layers.matmul(weights_droped, transpose_v)
else:
out = fluid.layers.matmul(weights, transpose_v)
# combine heads
if len(out.shape) != 4:
raise ValueError("Input(x) should be a 4-D Tensor.")
trans_x = fluid.layers.transpose(out, perm=[0, 2, 1, 3])
final_out = fluid.layers.reshape(
x=trans_x,
shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]],
inplace=False)
# fc to output
proj_out = self._proj_fc(final_out)
return proj_out
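# The forward pass above is standard multi-head scaled dot-product attention;
# in formula form (restating the code, with d_model used as the scaling
# denominator exactly as in the matmul call above):
#     Attention(Q, K, V) = softmax(Q * K^T / sqrt(d_model) + attn_bias) * V
# after which the heads are concatenated and projected back to d_model.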
class EncoderSubLayer(Layer):
def __init__(self,
name_scope,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
preprocess_cmd="n",
postprocess_cmd="da"):
super(EncoderSubLayer, self).__init__(name_scope)
self._preprocess_cmd = preprocess_cmd
self._postprocess_cmd = postprocess_cmd
self._prepostprocess_dropout = prepostprocess_dropout
self._preprocess_layer = PrePostProcessLayer(self.full_name(),
self._preprocess_cmd, 3)
self._multihead_attention_layer = MultiHeadAttentionLayer(
self.full_name(), d_key, d_value, d_model, n_head,
attention_dropout)
self._postprocess_layer = PrePostProcessLayer(
self.full_name(), self._postprocess_cmd, None)
self._preprocess_layer2 = PrePostProcessLayer(self.full_name(),
self._preprocess_cmd, 3)
self._positionwise_feed_forward = PositionwiseFeedForwardLayer(
self.full_name(), d_inner_hid, d_model, relu_dropout)
self._postprocess_layer2 = PrePostProcessLayer(
self.full_name(), self._postprocess_cmd, None)
def forward(self, enc_input, attn_bias):
pre_process_multihead = self._preprocess_layer(
None, enc_input, self._preprocess_cmd, self._prepostprocess_dropout)
attn_output = self._multihead_attention_layer(pre_process_multihead,
None, None, attn_bias)
attn_output = self._postprocess_layer(enc_input, attn_output,
self._postprocess_cmd,
self._prepostprocess_dropout)
pre_process2_output = self._preprocess_layer2(
None, attn_output, self._preprocess_cmd,
self._prepostprocess_dropout)
ffd_output = self._positionwise_feed_forward(pre_process2_output)
return self._postprocess_layer2(attn_output, ffd_output,
self._postprocess_cmd,
self._prepostprocess_dropout)
class EncoderLayer(Layer):
def __init__(self,
name_scope,
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
preprocess_cmd="n",
postprocess_cmd="da"):
super(EncoderLayer, self).__init__(name_scope)
self._preprocess_cmd = preprocess_cmd
self._encoder_sublayers = list()
self._prepostprocess_dropout = prepostprocess_dropout
self._n_layer = n_layer
self._preprocess_layer = PrePostProcessLayer(self.full_name(),
self._preprocess_cmd, 3)
for i in range(n_layer):
self._encoder_sublayers.append(
self.add_sublayer(
'esl_%d' % i,
EncoderSubLayer(
self.full_name(), n_head, d_key, d_value, d_model,
d_inner_hid, prepostprocess_dropout, attention_dropout,
relu_dropout, preprocess_cmd, postprocess_cmd)))
def forward(self, enc_input, attn_bias):
for i in range(self._n_layer):
enc_output = self._encoder_sublayers[i](enc_input, attn_bias)
enc_input = enc_output
return self._preprocess_layer(None, enc_output, self._preprocess_cmd,
self._prepostprocess_dropout)
class PrepareEncoderDecoderLayer(Layer):
def __init__(self,
name_scope,
src_vocab_size,
src_emb_dim,
src_max_len,
dropout_rate,
word_emb_param_name=None,
pos_enc_param_name=None):
super(PrepareEncoderDecoderLayer, self).__init__(name_scope)
self._src_max_len = src_max_len
self._src_emb_dim = src_emb_dim
self._src_vocab_size = src_vocab_size
self._dropout_rate = dropout_rate
self._input_emb = Embedding(
name_scope=self.full_name(),
size=[src_vocab_size, src_emb_dim],
padding_idx=0,
param_attr=fluid.ParamAttr(
name=word_emb_param_name,
initializer=fluid.initializer.Normal(0., src_emb_dim**-0.5)))
if pos_enc_param_name is pos_enc_param_names[0]:
pos_inp = pos_inp1
else:
pos_inp = pos_inp2
self._pos_emb = Embedding(
name_scope=self.full_name(),
size=[self._src_max_len, src_emb_dim],
param_attr=fluid.ParamAttr(
name=pos_enc_param_name,
initializer=fluid.initializer.NumpyArrayInitializer(pos_inp),
trainable=False))
# use in imperative_mode to fit different length batch
# self._pos_emb._w = to_variable(
# position_encoding_init(self._src_max_len, self._src_emb_dim))
def forward(self, src_word, src_pos):
src_word_emb = self._input_emb(src_word)
src_word_emb = fluid.layers.scale(
x=src_word_emb, scale=self._src_emb_dim**0.5)
# TODO: change this to fit dynamic-length input
src_pos_emb = self._pos_emb(src_pos)
src_pos_emb.stop_gradient = True
enc_input = src_word_emb + src_pos_emb
return fluid.layers.dropout(
enc_input,
dropout_prob=self._dropout_rate,
seed=ModelHyperParams.dropout_seed,
is_test=False) if self._dropout_rate else enc_input
class WrapEncoderLayer(Layer):
def __init__(self, name_scope, src_vocab_size, max_length, n_layer, n_head,
d_key, d_value, d_model, d_inner_hid, prepostprocess_dropout,
attention_dropout, relu_dropout, preprocess_cmd,
postprocess_cmd, weight_sharing):
"""
The wrapper assembles together all needed layers for the encoder.
"""
super(WrapEncoderLayer, self).__init__(name_scope)
self._prepare_encoder_layer = PrepareEncoderDecoderLayer(
self.full_name(),
src_vocab_size,
d_model,
max_length,
prepostprocess_dropout,
word_emb_param_name=word_emb_param_names[0],
pos_enc_param_name=pos_enc_param_names[0])
self._encoder = EncoderLayer(
self.full_name(), n_layer, n_head, d_key, d_value, d_model,
d_inner_hid, prepostprocess_dropout, attention_dropout,
relu_dropout, preprocess_cmd, postprocess_cmd)
def forward(self, enc_inputs):
src_word, src_pos, src_slf_attn_bias = enc_inputs
enc_input = self._prepare_encoder_layer(src_word, src_pos)
enc_output = self._encoder(enc_input, src_slf_attn_bias)
return enc_output
class DecoderSubLayer(Layer):
def __init__(self,
name_scope,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
preprocess_cmd,
postprocess_cmd,
cache=None,
gather_idx=None):
super(DecoderSubLayer, self).__init__(name_scope)
self._postprocess_cmd = postprocess_cmd
self._preprocess_cmd = preprocess_cmd
self._prepostprcess_dropout = prepostprocess_dropout
self._pre_process_layer = PrePostProcessLayer(self.full_name(),
preprocess_cmd, 3)
self._multihead_attention_layer = MultiHeadAttentionLayer(
self.full_name(),
d_key,
d_value,
d_model,
n_head,
attention_dropout,
cache=cache,
gather_idx=gather_idx)
self._post_process_layer = PrePostProcessLayer(self.full_name(),
postprocess_cmd, None)
self._pre_process_layer2 = PrePostProcessLayer(self.full_name(),
preprocess_cmd, 3)
self._multihead_attention_layer2 = MultiHeadAttentionLayer(
self.full_name(),
d_key,
d_value,
d_model,
n_head,
attention_dropout,
cache=cache,
gather_idx=gather_idx,
static_kv=True)
self._post_process_layer2 = PrePostProcessLayer(self.full_name(),
postprocess_cmd, None)
self._pre_process_layer3 = PrePostProcessLayer(self.full_name(),
preprocess_cmd, 3)
self._positionwise_feed_forward_layer = PositionwiseFeedForwardLayer(
self.full_name(), d_inner_hid, d_model, relu_dropout)
self._post_process_layer3 = PrePostProcessLayer(self.full_name(),
postprocess_cmd, None)
def forward(self, dec_input, enc_output, slf_attn_bias, dec_enc_attn_bias):
pre_process_rlt = self._pre_process_layer(
None, dec_input, self._preprocess_cmd, self._prepostprcess_dropout)
slf_attn_output = self._multihead_attention_layer(pre_process_rlt, None,
None, slf_attn_bias)
slf_attn_output_pp = self._post_process_layer(
dec_input, slf_attn_output, self._postprocess_cmd,
self._prepostprcess_dropout)
pre_process_rlt2 = self._pre_process_layer2(None, slf_attn_output_pp,
self._preprocess_cmd,
self._prepostprcess_dropout)
enc_attn_output_pp = self._multihead_attention_layer2(
pre_process_rlt2, enc_output, enc_output, dec_enc_attn_bias)
enc_attn_output = self._post_process_layer2(
slf_attn_output, enc_attn_output_pp, self._postprocess_cmd,
self._prepostprcess_dropout)
pre_process_rlt3 = self._pre_process_layer3(None, enc_attn_output,
self._preprocess_cmd,
self._prepostprcess_dropout)
ffd_output = self._positionwise_feed_forward_layer(pre_process_rlt3)
dec_output = self._post_process_layer3(enc_attn_output, ffd_output,
self._postprocess_cmd,
self._prepostprcess_dropout)
return dec_output
class DecoderLayer(Layer):
def __init__(self,
name_scope,
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
preprocess_cmd,
postprocess_cmd,
caches=None,
gather_idx=None):
super(DecoderLayer, self).__init__(name_scope)
self._pre_process_layer = PrePostProcessLayer(self.full_name(),
preprocess_cmd, 3)
self._decoder_sub_layers = list()
self._n_layer = n_layer
self._preprocess_cmd = preprocess_cmd
self._prepostprocess_dropout = prepostprocess_dropout
for i in range(n_layer):
self._decoder_sub_layers.append(
self.add_sublayer(
'dsl_%d' % i,
DecoderSubLayer(
self.full_name(),
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
preprocess_cmd,
postprocess_cmd,
cache=None if caches is None else caches[i],
gather_idx=gather_idx)))
def forward(self, dec_input, enc_output, dec_slf_attn_bias,
dec_enc_attn_bias):
for i in range(self._n_layer):
tmp_dec_output = self._decoder_sub_layers[i](
dec_input, enc_output, dec_slf_attn_bias, dec_enc_attn_bias)
dec_input = tmp_dec_output
dec_output = self._pre_process_layer(None, tmp_dec_output,
self._preprocess_cmd,
self._prepostprocess_dropout)
return dec_output
class WrapDecoderLayer(Layer):
def __init__(self,
name_scope,
trg_vocab_size,
max_length,
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
preprocess_cmd,
postprocess_cmd,
weight_sharing,
caches=None,
gather_idx=None):
"""
The wrapper assembles together all needed layers for the decoder.
"""
super(WrapDecoderLayer, self).__init__(name_scope)
self._prepare_decoder_layer = PrepareEncoderDecoderLayer(
self.full_name(),
trg_vocab_size,
d_model,
max_length,
prepostprocess_dropout,
word_emb_param_name=word_emb_param_names[1],
pos_enc_param_name=pos_enc_param_names[1])
self._decoder_layer = DecoderLayer(
self.full_name(),
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
preprocess_cmd,
postprocess_cmd,
caches=caches,
gather_idx=gather_idx)
self._weight_sharing = weight_sharing
if not weight_sharing:
self._fc = FC(self.full_name(),
size=trg_vocab_size,
bias_attr=False)
def forward(self, dec_inputs=None, enc_output=None):
trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias = dec_inputs
dec_input = self._prepare_decoder_layer(trg_word, trg_pos)
dec_output = self._decoder_layer(dec_input, enc_output,
trg_slf_attn_bias, trg_src_attn_bias)
dec_output_reshape = fluid.layers.reshape(
dec_output, shape=[-1, dec_output.shape[-1]], inplace=False)
if self._weight_sharing:
predict = fluid.layers.matmul(
x=dec_output_reshape,
y=self._prepare_decoder_layer._input_emb._w,
transpose_y=True)
else:
predict = self._fc(dec_output_reshape)
if dec_inputs is None:
# Return probs for independent decoder program.
predict_out = fluid.layers.softmax(predict)
return predict_out
return predict
class TransFormer(Layer):
def __init__(self,
name_scope,
src_vocab_size,
trg_vocab_size,
max_length,
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
preprocess_cmd,
postprocess_cmd,
weight_sharing,
label_smooth_eps,
use_py_reader=False,
is_test=False):
super(TransFormer, self).__init__(name_scope)
self._label_smooth_eps = label_smooth_eps
self._trg_vocab_size = trg_vocab_size
if weight_sharing:
assert src_vocab_size == trg_vocab_size, (
"Vocabularies in source and target should be same for weight sharing."
)
self._wrap_encoder_layer = WrapEncoderLayer(
self.full_name(), src_vocab_size, max_length, n_layer, n_head,
d_key, d_value, d_model, d_inner_hid, prepostprocess_dropout,
attention_dropout, relu_dropout, preprocess_cmd, postprocess_cmd,
weight_sharing)
self._wrap_decoder_layer = WrapDecoderLayer(
self.full_name(), trg_vocab_size, max_length, n_layer, n_head,
d_key, d_value, d_model, d_inner_hid, prepostprocess_dropout,
attention_dropout, relu_dropout, preprocess_cmd, postprocess_cmd,
weight_sharing)
if weight_sharing:
self._wrap_decoder_layer._prepare_decoder_layer._input_emb._w = self._wrap_encoder_layer._prepare_encoder_layer._input_emb._w
def forward(self, enc_inputs, dec_inputs, label, weights):
enc_output = self._wrap_encoder_layer(enc_inputs)
predict = self._wrap_decoder_layer(dec_inputs, enc_output)
if self._label_smooth_eps:
label_out = fluid.layers.label_smooth(
label=fluid.layers.one_hot(
input=label, depth=self._trg_vocab_size),
epsilon=self._label_smooth_eps)
cost = fluid.layers.softmax_with_cross_entropy(
logits=predict,
label=label_out,
soft_label=True if self._label_smooth_eps else False)
weighted_cost = cost * weights
sum_cost = fluid.layers.reduce_sum(weighted_cost)
token_num = fluid.layers.reduce_sum(weights)
token_num.stop_gradient = True
avg_cost = sum_cost / token_num
return sum_cost, avg_cost, predict, token_num
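# Note on the loss above: with label_smooth_eps > 0 the one-hot target is
# softened to (1 - eps) * one_hot + eps / trg_vocab_size (the default behaviour
# of fluid.layers.label_smooth when no prior distribution is given), and
# softmax_with_cross_entropy is then applied with soft_label=True.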
class TestImperativeTransformer(unittest.TestCase):
def test_transformer_float32(self):
seed = 90
with guard():
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
transformer = TransFormer(
'transformer',
ModelHyperParams.src_vocab_size,
ModelHyperParams.trg_vocab_size,
ModelHyperParams.max_length + 1,
ModelHyperParams.n_layer,
ModelHyperParams.n_head,
ModelHyperParams.d_key,
ModelHyperParams.d_value,
ModelHyperParams.d_model,
ModelHyperParams.d_inner_hid,
ModelHyperParams.prepostprocess_dropout,
ModelHyperParams.attention_dropout,
ModelHyperParams.relu_dropout,
ModelHyperParams.preprocess_cmd,
ModelHyperParams.postprocess_cmd,
ModelHyperParams.weight_sharing,
TrainTaskConfig.label_smooth_eps,
use_py_reader=use_py_reader,
is_test=False)
if sync:
lr_decay = fluid.layers.learning_rate_scheduler.noam_decay(
ModelHyperParams.d_model, TrainTaskConfig.warmup_steps)
with fluid.default_main_program()._lr_schedule_guard():
learning_rate = lr_decay * TrainTaskConfig.learning_rate
optimizer = fluid.optimizer.Adam(
learning_rate=learning_rate,
beta1=TrainTaskConfig.beta1,
beta2=TrainTaskConfig.beta2,
epsilon=TrainTaskConfig.eps)
else:
optimizer = fluid.optimizer.SGD(learning_rate=0.003)
dy_param_init = dict()
dy_param_updated = dict()
for i in range(batch_num):
enc_inputs, dec_inputs, label, weights = create_data()
dy_sum_cost, dy_avg_cost, dy_predict, dy_token_num = transformer(
enc_inputs, dec_inputs, label, weights)
if i == 0:
for param in transformer.parameters():
dy_param_init[param.name] = param._numpy()
dy_avg_cost._backward()
optimizer.minimize(dy_avg_cost)
transformer.clear_gradients()
if i == batch_num - 1:
for param in transformer.parameters():
dy_param_updated[param.name] = param._numpy()
with new_program_scope():
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
transformer = TransFormer(
'transformer',
ModelHyperParams.src_vocab_size,
ModelHyperParams.trg_vocab_size,
ModelHyperParams.max_length + 1,
ModelHyperParams.n_layer,
ModelHyperParams.n_head,
ModelHyperParams.d_key,
ModelHyperParams.d_value,
ModelHyperParams.d_model,
ModelHyperParams.d_inner_hid,
ModelHyperParams.prepostprocess_dropout,
ModelHyperParams.attention_dropout,
ModelHyperParams.relu_dropout,
ModelHyperParams.preprocess_cmd,
ModelHyperParams.postprocess_cmd,
ModelHyperParams.weight_sharing,
TrainTaskConfig.label_smooth_eps,
use_py_reader=use_py_reader,
is_test=False)
exe = fluid.Executor(fluid.CPUPlace(
) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
optimizer = fluid.optimizer.SGD(learning_rate=0.003)
data_input_names = encoder_data_input_fields + decoder_data_input_fields[:
-1] + label_data_input_fields
all_inputs = make_all_inputs(data_input_names)
enc_inputs_len = len(encoder_data_input_fields)
dec_inputs_len = len(decoder_data_input_fields[:-1])
enc_inputs = all_inputs[0:enc_inputs_len]
dec_inputs = all_inputs[enc_inputs_len:enc_inputs_len +
dec_inputs_len]
label = all_inputs[-2]
weights = all_inputs[-1]
static_param_updated = dict()
static_param_init = dict()
static_param_name_list = list()
static_sum_cost, static_avg_cost, static_predict, static_token_num = transformer(
enc_inputs, dec_inputs, label, weights)
optimizer.minimize(static_avg_cost)
for param in transformer.parameters():
static_param_name_list.append(param.name)
out = exe.run(fluid.default_startup_program(),
fetch_list=static_param_name_list)
for i in range(len(static_param_name_list)):
static_param_init[static_param_name_list[i]] = out[i]
static_sum_cost_value = None
static_avg_cost_value = None
static_predict_value = None
static_token_num_value = None
for i in range(batch_num):
feed_dict = create_feed_dict_list(create_data(True))
fetch_list = [
static_sum_cost, static_avg_cost, static_predict,
static_token_num
]
fetch_list.extend(static_param_name_list)
out = exe.run(fluid.default_main_program(),
feed=feed_dict,
fetch_list=fetch_list)
static_sum_cost_value = out[0]
static_avg_cost_value = out[1]
static_predict_value = out[2]
static_token_num_value = out[3]
if i == batch_num - 1:
for k in range(4, len(out)):
static_param_updated[static_param_name_list[k -
4]] = out[k]
self.assertTrue(
np.allclose(static_avg_cost_value, dy_avg_cost._numpy()))
self.assertTrue(
np.allclose(static_sum_cost_value, dy_sum_cost._numpy()))
self.assertTrue(
np.allclose(
static_predict_value, dy_predict._numpy(), atol=1e-5))
self.assertTrue(
np.allclose(static_token_num_value, dy_token_num._numpy()))
for key, value in six.iteritems(static_param_init):
self.assertTrue(np.allclose(value, dy_param_init[key]))
for key, value in six.iteritems(static_param_updated):
self.assertTrue(
np.allclose(
value, dy_param_updated[key], atol=1e-4))
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import paddle.fluid as fluid
class TestInstallCheck(unittest.TestCase):
    def test_install_check(self):
        fluid.install_check.run_check()
...@@ -42,10 +42,14 @@ class LayerTest(unittest.TestCase):
    def tearDownClass(cls):
        pass

    def _get_place(self, force_to_use_cpu=False):
        # this option is for ops that only have a CPU kernel
        if force_to_use_cpu:
            return core.CPUPlace()
        else:
            if core.is_compiled_with_cuda():
                return core.CUDAPlace(0)
            return core.CPUPlace()

    @contextlib.contextmanager
    def static_graph(self):
...@@ -54,22 +58,52 @@ class LayerTest(unittest.TestCase):
        fluid.default_main_program().random_seed = self.seed
        yield

    def get_static_graph_result(self, feed, fetch_list, with_lod=False):
        exe = fluid.Executor(self._get_place())
        exe.run(fluid.default_startup_program())
        return exe.run(fluid.default_main_program(),
                       feed=feed,
                       fetch_list=fetch_list,
                       return_numpy=(not with_lod))

    @contextlib.contextmanager
    def dynamic_graph(self, force_to_use_cpu=False):
        with fluid.imperative.guard(
                self._get_place(force_to_use_cpu=force_to_use_cpu)):
            fluid.default_startup_program().random_seed = self.seed
            fluid.default_main_program().random_seed = self.seed
            yield


class TestLayer(LayerTest):
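# test_layer_norm below checks that the functional layers.layer_norm, the
# nn.LayerNorm wrapper in static graph mode, and nn.LayerNorm in imperative
# mode all produce the same result for the same input.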
def test_layer_norm(self):
inp = np.ones([3, 32, 32], dtype='float32')
with self.static_graph():
t = layers.data(
name='data',
shape=[3, 32, 32],
dtype='float32',
append_batch_size=False)
ret = layers.layer_norm(t)
static_ret = self.get_static_graph_result(
feed={'data': inp}, fetch_list=[ret])[0]
with self.static_graph():
t = layers.data(
name='data',
shape=[3, 32, 32],
dtype='float32',
append_batch_size=False)
lm = nn.LayerNorm('layer_norm')
ret = lm(t)
static_ret2 = self.get_static_graph_result(
feed={'data': inp}, fetch_list=[ret])[0]
with self.dynamic_graph():
lm = nn.LayerNorm('layer_norm')
dy_ret = lm(base.to_variable(inp))
self.assertTrue(np.allclose(static_ret, static_ret2))
self.assertTrue(np.allclose(dy_ret._numpy(), static_ret2))
    def test_relu(self):
        with self.static_graph():
            t = layers.data(name='t', shape=[3, 3], dtype='float32')
...@@ -228,6 +262,304 @@ class TestLayer(LayerTest):
            self.assertTrue(np.allclose(n, min_ret._numpy()))
            self.assertTrue(np.allclose(n2, max_ret._numpy()))
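# test_sequence_conv below feeds the same LoDTensor input to layers.sequence_conv
# and to the nn.SequenceConv wrapper and checks that both static-graph results match.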
def test_sequence_conv(self):
inp_np = np.arange(12).reshape([3, 4]).astype('float32')
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
else:
place = core.CPUPlace()
with self.static_graph():
seq = layers.data(
name='seq_in',
shape=[3, 4],
dtype='float32',
lod_level=1,
append_batch_size=False)
out = layers.sequence_conv(seq, 2)
static_rlt = self.get_static_graph_result(
feed={
"seq_in": fluid.create_lod_tensor(
data=inp_np,
recursive_seq_lens=[[1, 1, 1]],
place=place)
},
fetch_list=[out],
with_lod=True)[0]
with self.static_graph():
seq = layers.data(
name='seq_in',
shape=[3, 4],
dtype='float32',
lod_level=1,
append_batch_size=False)
seq_conv = nn.SequenceConv('seq_conv', num_filters=2)
out = seq_conv(seq)
static_rlt2 = self.get_static_graph_result(
feed={
"seq_in": fluid.create_lod_tensor(
data=inp_np,
recursive_seq_lens=[[1, 1, 1]],
place=place)
},
fetch_list=[out],
with_lod=True)[0]
self.assertTrue(
np.allclose(np.array(static_rlt), np.array(static_rlt2)))
def test_conv2d_transpose(self):
inp_np = np.arange(0, 24).reshape([2, 3, 2, 2]).astype('float32')
with self.static_graph():
img = layers.data(name='pixel', shape=[3, 2, 2], dtype='float32')
out = layers.conv2d_transpose(
input=img, num_filters=10, output_size=28)
static_rlt = self.get_static_graph_result(
feed={'pixel': inp_np}, fetch_list=[out])[0]
with self.static_graph():
img = layers.data(name='pixel', shape=[3, 2, 2], dtype='float32')
conv2d_transpose = nn.Conv2DTranspose(
'conv2d_transpose', num_filters=10, output_size=28)
out = conv2d_transpose(img)
static_rlt2 = self.get_static_graph_result(
feed={'pixel': inp_np}, fetch_list=[out])[0]
with self.dynamic_graph():
conv2d_transpose = nn.Conv2DTranspose(
'conv2d_transpose', num_filters=10, output_size=28)
dy_rlt = conv2d_transpose(base.to_variable(inp_np))
self.assertTrue(np.allclose(static_rlt2, static_rlt))
self.assertTrue(np.allclose(dy_rlt._numpy(), static_rlt))
def test_bilinear_tensor_product(self):
inp_np_x = np.array([[1, 2, 3]]).astype('float32')
inp_np_y = np.array([[4, 5, 6]]).astype('float32')
with self.static_graph():
data_x = layers.data(
name='x',
shape=[1, 3],
dtype="float32",
append_batch_size=False)
data_y = layers.data(
name='y',
shape=[1, 3],
dtype="float32",
append_batch_size=False)
out = layers.bilinear_tensor_product(data_x, data_y, 6)
static_rlt = self.get_static_graph_result(
feed={'x': inp_np_x,
'y': inp_np_y}, fetch_list=[out])[0]
with self.static_graph():
data_x = layers.data(
name='x',
shape=[1, 3],
dtype="float32",
append_batch_size=False)
data_y = layers.data(
name='y',
shape=[1, 3],
dtype="float32",
append_batch_size=False)
btp = nn.BilinearTensorProduct('btp', 6)
out = btp(data_x, data_y)
static_rlt2 = self.get_static_graph_result(
feed={'x': inp_np_x,
'y': inp_np_y}, fetch_list=[out])[0]
with self.dynamic_graph():
btp = nn.BilinearTensorProduct('btp', 6)
dy_rlt = btp(base.to_variable(inp_np_x), base.to_variable(inp_np_y))
self.assertTrue(np.allclose(static_rlt2, static_rlt))
self.assertTrue(np.allclose(dy_rlt._numpy(), static_rlt))
def test_prelu(self):
inp_np = np.ones([5, 200, 100, 100]).astype('float32')
with self.static_graph():
data_t = layers.data(
name="input",
shape=[5, 200, 100, 100],
dtype="float32",
append_batch_size=False)
mode = 'channel'
out = layers.prelu(
data_t, mode, param_attr=ParamAttr(initializer=Constant(1.0)))
static_rlt = self.get_static_graph_result(
feed={"input": inp_np}, fetch_list=[out])[0]
with self.static_graph():
data_t = layers.data(
name="input",
shape=[5, 200, 100, 100],
dtype="float32",
append_batch_size=False)
mode = 'channel'
prelu = nn.PRelu(
'prelu',
mode=mode,
param_attr=ParamAttr(initializer=Constant(1.0)))
out = prelu(data_t)
static_rlt2 = self.get_static_graph_result(
feed={"input": inp_np}, fetch_list=[out])[0]
with self.dynamic_graph():
mode = 'channel'
prelu = nn.PRelu(
'prelu',
mode=mode,
param_attr=ParamAttr(initializer=Constant(1.0)))
dy_rlt = prelu(base.to_variable(inp_np))
self.assertTrue(np.allclose(static_rlt2, static_rlt))
self.assertTrue(np.allclose(dy_rlt._numpy(), static_rlt))
def test_embeding(self):
inp_word = np.array([[[1]]]).astype('int64')
dict_size = 20
with self.static_graph():
data_t = layers.data(name='word', shape=[1], dtype='int64')
emb = layers.embedding(
input=data_t,
size=[dict_size, 32],
param_attr='emb.w',
is_sparse=False)
static_rlt = self.get_static_graph_result(
feed={'word': inp_word}, fetch_list=[emb])[0]
with self.static_graph():
data_t = layers.data(name='word', shape=[1], dtype='int64')
emb2 = nn.Embedding(
name_scope='embedding',
size=[dict_size, 32],
param_attr='emb.w',
is_sparse=False)
emb_rlt = emb2(data_t)
static_rlt2 = self.get_static_graph_result(
feed={'word': inp_word}, fetch_list=[emb_rlt])[0]
with self.dynamic_graph():
emb2 = nn.Embedding(
name_scope='embedding',
size=[dict_size, 32],
param_attr='emb.w',
is_sparse=False)
static_rlt3 = emb2(base.to_variable(inp_word))
self.assertTrue(np.allclose(static_rlt2, static_rlt))
self.assertTrue(np.allclose(static_rlt3._numpy(), static_rlt))
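# test_nce below builds the same window-based NCE loss three ways: layers.nce in
# static graph, nn.NCE in static graph, and nn.NCE in imperative mode (forced onto
# CPU because the nce op only has a CPU kernel), and checks that all three match.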
def test_nce(self):
window_size = 5
dict_size = 20
label_word = int(window_size // 2) + 1
inp_word = np.array([[[1]], [[2]], [[3]], [[4]], [[5]]]).astype('int64')
nid_freq_arr = np.random.dirichlet(np.ones(20) * 1000).astype('float32')
seed = 1
with self.static_graph():
words = []
for i in range(window_size):
words.append(
layers.data(
name='word_{0}'.format(i), shape=[1], dtype='int64'))
embs = []
for i in range(window_size):
if i == label_word:
continue
emb = layers.embedding(
input=words[i],
size=[dict_size, 32],
param_attr='emb.w',
is_sparse=False)
embs.append(emb)
embs = layers.concat(input=embs, axis=1)
nce_loss = layers.nce(input=embs,
label=words[label_word],
num_total_classes=dict_size,
num_neg_samples=2,
sampler="custom_dist",
custom_dist=nid_freq_arr.tolist(),
seed=seed,
param_attr='nce.w',
bias_attr='nce.b')
feed_dict = dict()
for i in range(window_size):
feed_dict['word_{0}'.format(i)] = inp_word[i]
static_rlt = self.get_static_graph_result(
feed=feed_dict, fetch_list=[nce_loss])[0]
with self.static_graph():
words = []
for i in range(window_size):
words.append(
layers.data(
name='word_{0}'.format(i), shape=[1], dtype='int64'))
emb = nn.Embedding(
'embedding',
size=[dict_size, 32],
param_attr='emb.w',
is_sparse=False)
embs2 = []
for i in range(window_size):
if i == label_word:
continue
emb_rlt = emb(words[i])
embs2.append(emb_rlt)
embs2 = layers.concat(input=embs2, axis=1)
nce = nn.NCE('nce',
num_total_classes=dict_size,
num_neg_samples=2,
sampler="custom_dist",
custom_dist=nid_freq_arr.tolist(),
seed=seed,
param_attr='nce.w',
bias_attr='nce.b')
nce_loss2 = nce(embs2, words[label_word])
feed_dict = dict()
for i in range(len(words)):
feed_dict['word_{0}'.format(i)] = inp_word[i]
static_rlt2 = self.get_static_graph_result(
feed=feed_dict, fetch_list=[nce_loss2])[0]
with self.dynamic_graph(force_to_use_cpu=True):
words = []
for i in range(window_size):
words.append(base.to_variable(inp_word[i]))
emb = nn.Embedding(
'embedding',
size=[dict_size, 32],
param_attr='emb.w',
is_sparse=False)
embs3 = []
for i in range(window_size):
if i == label_word:
continue
emb_rlt = emb(words[i])
embs3.append(emb_rlt)
embs3 = layers.concat(input=embs3, axis=1)
nce = nn.NCE('nce',
num_total_classes=dict_size,
num_neg_samples=2,
sampler="custom_dist",
custom_dist=nid_freq_arr.tolist(),
seed=seed,
param_attr='nce.w',
bias_attr='nce.b')
nce_loss3 = nce(embs3, words[label_word])
self.assertTrue(np.allclose(static_rlt2, static_rlt))
self.assertTrue(np.allclose(nce_loss3._numpy(), static_rlt))
class TestBook(unittest.TestCase):
    def test_fit_a_line(self):
...
...@@ -205,9 +205,9 @@ class TestListenAndServOp(unittest.TestCase):
        out = nce(x_array, param_array, bias_array, sample_weight,
                  label_array, 5, 2)

        np.testing.assert_almost_equal(o_cost, out[0], decimal=6)
        np.testing.assert_almost_equal(o_logits, out[1], decimal=6)
        np.testing.assert_almost_equal(o_labels, out[2], decimal=6)

    def test_nce_op_remote(self):
        os.environ['PADDLE_ENABLE_REMOTE_PREFETCH'] = "1"
...
...@@ -14,6 +14,7 @@
from __future__ import print_function

import paddle.fluid as fluid
import paddle.fluid.core as core
import unittest
import numpy
...@@ -183,6 +184,58 @@ class TestTensor(unittest.TestCase):
        tensor_array = numpy.array(tensor)
        self.assertEqual((0, 1), tensor_array.shape)
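    # run_sliece_tensor below verifies that slicing a fluid.Tensor with basic and
    # negative-step slices returns the same values as the equivalent numpy slices;
    # test_sliece_tensor runs it on CPU and, when available, on CUDA.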
def run_sliece_tensor(self, place):
tensor = fluid.Tensor()
shape = [3, 3, 3]
tensor._set_dims(shape)
tensor_array = numpy.array([[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
[[10, 11, 12], [13, 14, 15], [16, 17, 18]],
[[19, 20, 21], [22, 23, 24], [25, 26, 27]]])
tensor.set(tensor_array, place)
n1 = tensor[1]
t1 = tensor_array[1]
self.assertTrue((numpy.array(n1) == numpy.array(t1)).all())
n2 = tensor[1:]
t2 = tensor_array[1:]
self.assertTrue((numpy.array(n2) == numpy.array(t2)).all())
n3 = tensor[0:2:]
t3 = tensor_array[0:2:]
self.assertTrue((numpy.array(n3) == numpy.array(t3)).all())
n4 = tensor[2::-2]
t4 = tensor_array[2::-2]
self.assertTrue((numpy.array(n4) == numpy.array(t4)).all())
n5 = tensor[2::-2][0]
t5 = tensor_array[2::-2][0]
self.assertTrue((numpy.array(n5) == numpy.array(t5)).all())
n6 = tensor[2:-1:-1]
t6 = tensor_array[2:-1:-1]
self.assertTrue((numpy.array(n6) == numpy.array(t6)).all())
n7 = tensor[0:, 0:]
t7 = tensor_array[0:, 0:]
self.assertTrue((numpy.array(n7) == numpy.array(t7)).all())
n8 = tensor[0::1, 0::-1, 2:]
t8 = tensor_array[0::1, 0::-1, 2:]
self.assertTrue((numpy.array(n8) == numpy.array(t8)).all())
def test_sliece_tensor(self):
# run cpu first
place = core.CPUPlace()
self.run_sliece_tensor(place)
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
self.run_sliece_tensor(place)
if __name__ == '__main__':
    unittest.main()
...@@ -16,8 +16,10 @@ from __future__ import print_function
import unittest

from paddle.fluid.framework import default_main_program, Program, convert_np_dtype_to_dtype_
import paddle.fluid as fluid
import paddle.fluid.core as core
import numpy as np
from test_imperative_base import new_program_scope


class TestVariable(unittest.TestCase):
...@@ -60,6 +62,100 @@ class TestVariable(unittest.TestCase):
            name='step_scopes', type=core.VarDesc.VarType.STEP_SCOPES)
        self.assertEqual(core.VarDesc.VarType.STEP_SCOPES, var.type)
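    # _test_slice below checks Variable.__getitem__ in static graph mode: it first
    # verifies the shapes produced by several slice expressions on a created
    # variable, then runs sliced results through the executor and compares them
    # against the corresponding numpy slices.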
def _test_slice(self):
b = default_main_program().current_block()
w = b.create_var(dtype="float64", shape=[784, 100, 100], lod_level=0)
for i in range(3):
nw = w[i]
self.assertEqual((1, 100, 100), nw.shape)
nw = w[:]
self.assertEqual((784, 100, 100), nw.shape)
nw = w[:, :, ...]
self.assertEqual((784, 100, 100), nw.shape)
nw = w[::2, ::2, :]
self.assertEqual((392, 50, 100), nw.shape)
nw = w[::-2, ::-2, :]
self.assertEqual((392, 50, 100), nw.shape)
self.assertEqual(0, nw.lod_level)
place = fluid.CPUPlace()
main = fluid.Program()
with fluid.program_guard(main):
exe = fluid.Executor(place)
tensor_array = np.array(
[[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
[[10, 11, 12], [13, 14, 15], [16, 17, 18]],
[[19, 20, 21], [22, 23, 24], [25, 26, 27]]]).astype('float32')
var = fluid.layers.assign(tensor_array)
var1 = var[0, 1, 1]
var2 = var[1:]
var3 = var[0:1]
var4 = var[..., ]
var5 = var[2::-2]
var6 = var[1, 1:, 1:]
var7 = var[1, ..., 1:]
var8 = var[1, ...]
local_out = exe.run(main,
fetch_list=[
var, var1, var2, var3, var4, var5, var6,
var7, var8
])
self.assertTrue((np.array(local_out[1]) == np.array(tensor_array[
0, 1, 1])).all())
self.assertTrue((np.array(local_out[2]) == np.array(tensor_array[
1:])).all())
self.assertTrue((np.array(local_out[3]) == np.array(tensor_array[
0:1])).all())
self.assertTrue((np.array(local_out[4]) == np.array(
tensor_array[..., ])).all())
self.assertTrue((np.array(local_out[5]) == np.array(tensor_array[
2::-2])).all())
self.assertTrue((np.array(local_out[6]) == np.array(tensor_array[
1, 1:, 1:])).all())
self.assertTrue((np.array(local_out[7]) == np.array(tensor_array[
1, ..., 1:])).all())
self.assertTrue((np.array(local_out[8]) == np.array(tensor_array[
1, ...])).all())
def test_slice(self):
self._test_slice()
class TestVariableImperative(unittest.TestCase):
def _test_slice(self):
b = default_main_program().current_block()
w = b.create_var(dtype="float64", shape=[784, 100, 100], lod_level=0)
for i in range(3):
nw = w[i]
self.assertEqual([1, 100, 100], nw.shape)
nw = w[:]
self.assertEqual([784, 100, 100], nw.shape)
nw = w[:, :, :]
self.assertEqual([784, 100, 100], nw.shape)
nw = w[::2, ::2, :]
self.assertEqual([392, 50, 100], nw.shape)
nw = w[::-2, ::-2, :]
self.assertEqual([392, 50, 100], nw.shape)
nw = w[0::-2, 0::-2, :]
self.assertEqual([1, 1, 100], nw.shape)
def test_slice(self):
with fluid.imperative.guard():
self._test_slice()
if __name__ == '__main__':
    unittest.main()