Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into zeros_like

a55111b8 · zhoukunsheng · 848ec97a · c7c6eeb4 · a55111b8 · a55111b8
206 changed file
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -64,6 +64,7 @@ option(WITH_DISTRIBUTE  "Compile with distributed support"              OFF)
 option(WITH_PSLIB       "Compile with pslib support"                    OFF)
 option(WITH_CONTRIB     "Compile the third-party contributation"        OFF)
 option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better debug." OFF)
+# TODO(Superjomn) Remove WITH_ANAKIN option if not needed latter.
 option(WITH_ANAKIN      "Compile with Anakin library"                   OFF)
 option(ANAKIN_BUILD_FAT_BIN "Build anakin cuda fat-bin lib for all device plantform, ignored when WITH_ANAKIN=OFF" OFF)
 option(ANAKIN_BUILD_CROSS_PLANTFORM "Build anakin lib for any nvidia device plantform. ignored when WITH_ANAKIN=OFF" ON)
@@ -190,6 +191,7 @@ include(configure)          # add paddle env configuration
 if(WITH_GPU)
    include(cuda)
    include(tensorrt)
+    include(anakin_subgraph)
 endif()
 if(WITH_MKL OR WITH_MKLML)
    include(external/anakin)

--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -156,7 +156,7 @@ python \
 This will enable VLOG messages generated by `buddy_allocator.{h,cc}` and in the verbose range of 0 to 3, so you will see above example VLOG message, which is in level 3.  This suggests that we output overall messages in lower verbose levels, so they display with higher probability.  When coding C++, please follow the verbose level convention as follows:
- verbose level 1: [framework](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework)
+- verbose level 1: [framework](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/framework)
- verbose level 3: [operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators)
+- verbose level 3: [operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/operators)
- verbose level 5: [memory](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/memory), [platform](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/platform)
+- verbose level 5: [memory](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/memory), [platform](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/platform)
- verbose level 7: [math](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/legacy/math)
+- verbose level 7: [math](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/operators/math/)
--- a/cmake/anakin_subgraph.cmake
+++ b/cmake/anakin_subgraph.cmake
+if(NOT WITH_GPU)
+    return()
+endif()
+set(ANAKIN_ROOT "/usr" CACHE PATH "ANAKIN ROOT")
+find_path(ANAKIN_INCLUDE_DIR anakin_config.h
+    PATHS ${ANAKIN_ROOT} ${ANAKIN_ROOT}/include
+    $ENV{ANAKIN_ROOT} $ENV{ANAKIN_ROOT}/include
+    NO_DEFAULT_PATH
+)
+find_library(ANAKIN_LIBRARY NAMES libanakin_saber_common.so libanakin.so
+    PATHS ${ANAKIN_ROOT}
+    $ENV{ANAKIN_ROOT} $ENV{ANAKIN_ROOT}/lib
+    NO_DEFAULT_PATH
+    DOC "Path to ANAKIN library.")
+if(ANAKIN_INCLUDE_DIR AND ANAKIN_LIBRARY)
+  if(WITH_DSO)
+    set(ANAKIN_FOUND ON)
+  endif(WITH_DSO)
+else()
+    set(ANAKIN_FOUND OFF)
+endif()
+if(ANAKIN_FOUND)
+    message(STATUS "Current ANAKIN header is ${ANAKIN_INCLUDE_DIR}/anakin_config.h. ")
+    include_directories(${ANAKIN_ROOT}/include)
+    include_directories(${ANAKIN_ROOT}/include/saber)
+    link_directories(${ANAKIN_ROOT})
+    add_definitions(-DPADDLE_WITH_ANAKIN)
+endif()
--- a/cmake/tensorrt.cmake
+++ b/cmake/tensorrt.cmake
@@ -33,5 +33,6 @@ if(TENSORRT_FOUND)
    message(STATUS "Current TensorRT header is ${TENSORRT_INCLUDE_DIR}/NvInfer.h. "
        "Current TensorRT version is v${TENSORRT_MAJOR_VERSION}. ")
    include_directories(${TENSORRT_INCLUDE_DIR})
+    link_directories(${TENSORRT_LIBRARY})
    add_definitions(-DPADDLE_WITH_TENSORRT)
 endif()
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -521,6 +521,7 @@ paddle.fluid.unique_name.guard (ArgSpec(args=['new_generator'], varargs=None, ke
 paddle.fluid.recordio_writer.convert_reader_to_recordio_file (ArgSpec(args=['filename', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None)), ('document', '65c7523e86f0c50bb729b01667f36310'))
 paddle.fluid.recordio_writer.convert_reader_to_recordio_files (ArgSpec(args=['filename', 'batch_per_file', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None)), ('document', 'bc643f0f5f1b9db57ff0d8a57d379bd7'))
 paddle.fluid.Scope Scope() -> paddle.fluid.core._Scope
+paddle.fluid.install_check.run_check (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', '66b7c84a17ed32fec2df9628367be2b9'))
 paddle.reader.cache (ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None), ('document', '1676886070eb607cb608f7ba47be0d3c'))
 paddle.reader.map_readers (ArgSpec(args=['func'], varargs='readers', keywords=None, defaults=None), ('document', '77cbadb09df588e21e5cc0819b69c87d'))
 paddle.reader.buffered (ArgSpec(args=['reader', 'size'], varargs=None, keywords=None, defaults=None), ('document', '0d6186f109feceb99f60ec50a0a624cb'))

--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -63,7 +63,7 @@ cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto
 cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory)
 nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor)
-cc_library(garbage_collector SRCS garbage_collector.cc DEPS device_context memory)
+cc_library(garbage_collector SRCS garbage_collector.cc DEPS device_context memory gflags glog)
 cc_library(reader SRCS reader.cc DEPS lod_tensor ddim)
 cc_test(reader_test SRCS reader_test.cc DEPS reader)
@@ -164,6 +164,8 @@ else()
  set(NGRAPH_EXE_DEPS)
 endif()
+cc_library(executor_gc_helper SRCS executor_gc_helper.cc DEPS scope proto_desc operator garbage_collector)
 if(WITH_DISTRIBUTE)
  cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog
    lod_rank_table feed_fetch_method sendrecvop_rpc  ${GLOB_DISTRIBUTE_DEPS} graph_to_program_pass variable_helper ${NGRAPH_EXE_DEPS})
@@ -174,7 +176,7 @@ else()
  cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op)
 endif()
-target_link_libraries(executor garbage_collector while_op_helper)
+target_link_libraries(executor while_op_helper executor_gc_helper)
 cc_library(parallel_executor SRCS parallel_executor.cc DEPS
        threaded_ssa_graph_executor scope_buffered_ssa_graph_executor parallel_ssa_graph_executor
@@ -194,6 +196,7 @@ cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_con
 cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry
        proto_desc)
 cc_test(inplace_op_inference_test SRCS inplace_op_inference_test.cc DEPS op_registry proto_desc op_info memory_optimize_helper)
 cc_library(selected_rows SRCS selected_rows.cc DEPS tensor)
 cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows)

--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -5,6 +5,7 @@ cc_library(scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc DEPS op_h
 cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
 cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry)
 cc_library(rpc_op_handle SRCS rpc_op_handle.cc DEPS framework_proto scope place operator op_registry)
+cc_library(fetch_barrier_op_handle SRCS fetch_barrier_op_handle.cc DEPS framework_proto scope place operator op_registry)
 cc_library(multi_devices_helper SRCS multi_devices_helper.cc DEPS graph graph_helper)
 cc_library(multi_devices_graph_print_pass SRCS multi_devices_graph_print_pass.cc DEPS multi_devices_helper)
@@ -72,7 +73,7 @@ cc_library(sequential_execution_pass SRCS sequential_execution_pass.cc DEPS grap
 cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_helper pass)
 cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle
-        scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle fused_broadcast_op_handle)
+        scale_loss_grad_op_handle rpc_op_handle fetch_barrier_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle fused_broadcast_op_handle)
 cc_library(fuse_all_reduce_op_pass SRCS fuse_all_reduce_op_pass.cc DEPS graph graph_helper fused_all_reduce_op_handle)

--- a/paddle/fluid/framework/details/all_reduce_deps_pass.cc
+++ b/paddle/fluid/framework/details/all_reduce_deps_pass.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 #include <algorithm>
+#include <memory>
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
@@ -52,13 +53,28 @@ std::unique_ptr<ir::Graph> AllReduceDepsPass::ApplyImpl(
  //               Note that must assert topology sort is stable
  auto& ops = graph->Get<const std::vector<OpDesc*>>(kStaleProgramOpDescs);
  for (auto* op_desc : ops) {
-    auto outputs = op_desc->Outputs();
+    try {
-    for (auto& o_it : outputs) {
+      bool is_bk_op =
-      for (auto& v : o_it.second) {  // values
+          static_cast<bool>(boost::get<int>(op_desc->GetAttr(
-        vars[v] = order;
+                                OpProtoAndCheckerMaker::OpRoleAttrName())) &
+                            static_cast<int>(OpRole::kBackward));
+      if (!is_bk_op) continue;
+      auto backward_vars =
+          boost::get<std::vector<std::string>>(op_desc->GetNullableAttr(
+              OpProtoAndCheckerMaker::OpRoleVarAttrName()));
+      PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0);
+      auto outputs = op_desc->Outputs();
+      for (auto& o_it : outputs) {
+        for (auto& v : o_it.second) {  // values
+          vars[v] = order;
+          VLOG(1) << "in all_reduce_deps_pass:" << v;
+        }
      }
+      order++;
+    } catch (boost::bad_get e) {
    }
-    order++;
  }
  std::vector<OpHandleBase*> dist_ops;

--- a/paddle/fluid/framework/details/eager_deletion_pass.cc
+++ b/paddle/fluid/framework/details/eager_deletion_pass.cc
@@ -22,14 +22,9 @@
 #include "paddle/fluid/framework/details/computation_op_handle.h"
 #include "paddle/fluid/framework/details/eager_deletion_op_handle.h"
 #include "paddle/fluid/framework/details/multi_devices_helper.h"
+#include "paddle/fluid/framework/garbage_collector.h"
 #include "paddle/fluid/framework/ir/graph_helper.h"
-DEFINE_double(memory_fraction_of_eager_deletion, 1.0,
-              "Fraction of eager deletion. If less than 1.0, all variables in "
-              "the program would be sorted according to its memory size, and "
-              "only the FLAGS_memory_fraction_of_eager_deletion of the largest "
-              "variables would be deleted.");
 namespace paddle {
 namespace framework {
 namespace details {
@@ -206,8 +201,9 @@ std::unique_ptr<ir::Graph> EagerDeletionPass::ApplyImpl(
    }
  }
-  op_vars_map = ShrinkGCVars(op_vars_map, vars, places,
+  double memory_fraction = framework::GetEagerDeletionMemoryFraction();
-                             FLAGS_memory_fraction_of_eager_deletion);
+  op_vars_map = ShrinkGCVars(op_vars_map, vars, places, memory_fraction);
  for (auto &pair : op_vars_map) {
    auto *op = pair.first;
@@ -239,8 +235,7 @@ std::unique_ptr<ir::Graph> EagerDeletionPass::ApplyImpl(
    eager_deletion_op->AddOutput(dummy_leaf);
  }
-  VLOG(10) << "FLAGS_memory_fraction_of_eager_deletion = "
+  VLOG(10) << "FLAGS_memory_fraction_of_eager_deletion = " << memory_fraction;
-           << FLAGS_memory_fraction_of_eager_deletion;
  VLOG(10) << "Create " << op_vars_map.size() << " EagerDeletionOpHandle(s)";
  auto while_op_eager_deletion_pass =

--- a/paddle/fluid/framework/details/early_delete_op_handle.h
+++ b/paddle/fluid/framework/details/early_delete_op_handle.h
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#pragma once
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/details/computation_op_handle.h"
-#include "paddle/fluid/framework/details/op_handle_base.h"
-#include "paddle/fluid/framework/details/var_handle.h"
-#include "paddle/fluid/framework/garbage_collector.h"
-#include "paddle/fluid/framework/lod_tensor_array.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/framework/tensor.h"
-namespace paddle {
-namespace framework {
-namespace details {
-class EarlyDeleteOpHandle : public OpHandleBase {
- public:
-  EarlyDeleteOpHandle(ir::Node* node, const Scope* scope,
-                      const platform::Place& place,
-                      const std::vector<std::string>& names,
-                      GarbageCollector* gc)
-      : OpHandleBase(node),
-        scope_(scope),
-        place_(place),
-        names_(names),
-        gc_(gc) {
-#ifdef PADDLE_WITH_CUDA
-    if (IsStreamGarabageCollector()) {
-      auto gpu_place = boost::get<platform::CUDAPlace>(place);
-      PADDLE_ENFORCE(cudaSetDevice(gpu_place.device));
-      PADDLE_ENFORCE(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming));
-    }
-#endif
-  }
-  ~EarlyDeleteOpHandle() {
-#ifdef PADDLE_WITH_CUDA
-    if (IsStreamGarabageCollector()) {
-      auto gpu_place = boost::get<platform::CUDAPlace>(dev_ctx_->GetPlace());
-      PADDLE_ENFORCE(cudaSetDevice(gpu_place.device));
-      PADDLE_ENFORCE(cudaEventDestroy(event_));
-    }
-#endif
-  }
-  std::string Name() const override { return "early_delete"; }
- protected:
-  void RunImpl() override {
-    std::vector<std::shared_ptr<memory::Allocation>> tensors;
-    auto* local_scope = scope_->FindVar(kLocalExecScopeName)->Get<Scope*>();
-    for (auto& var_name : names_) {
-      auto* var = local_scope->FindVar(var_name);
-      PADDLE_ENFORCE(var != nullptr,
-                     string::Sprintf("Local Scope not has var %s", var_name));
-      if (var->IsType<LoDTensor>()) {
-        tensors.emplace_back(var->GetMutable<LoDTensor>()->MoveMemoryHolder());
-      } else if (var->IsType<SelectedRows>()) {
-        tensors.emplace_back(var->GetMutable<SelectedRows>()
-                                 ->mutable_value()
-                                 ->MoveMemoryHolder());
-      } else if (var->IsType<LoDTensorArray>()) {
-        LoDTensorArray* tensor_array = var->GetMutable<LoDTensorArray>();
-        for (auto& tensor : *tensor_array) {
-          tensors.emplace_back(tensor.MoveMemoryHolder());
-        }
-      }
-    }
-    if (!tensors.empty()) {
-      ClearTensors(tensors);
-    }
-  }
- private:
-  void ClearTensors(
-      const std::vector<std::shared_ptr<memory::Allocation>>& tensors) {
-    if (platform::is_cpu_place(place_)) {
-      ClearCPUTensors(tensors);
-    } else {
-      ClearGPUTensors(tensors);
-    }
-  }
-  void ClearCPUTensors(
-      const std::vector<std::shared_ptr<memory::Allocation>>& tensors) {
-    auto* gc = dynamic_cast<CPUGarbageCollector*>(gc_);
-    if (gc != nullptr) {
-      gc->Add(tensors);
-    }
-  }
-  void ClearGPUTensors(
-      const std::vector<std::shared_ptr<memory::Allocation>>& tensors) {
-#ifdef PADDLE_WITH_CUDA
-    auto* gc = dynamic_cast<StreamGarbageCollector*>(gc_);
-    if (gc != nullptr) {
-      auto compute_stream = dev_ctx_->stream();
-      auto callback_stream = gc->stream();
-      auto callback_func = [=]() {
-        PADDLE_ENFORCE(cudaEventRecord(event_, compute_stream));
-        PADDLE_ENFORCE(cudaStreamWaitEvent(callback_stream, event_, 0));
-      };
-      gc_->Add(tensors, callback_func);
-    } else {
-      gc_->Add(tensors);
-    }
-  }
-  bool IsStreamGarabageCollector() const {
-    return dynamic_cast<const StreamGarbageCollector*>(gc_) != nullptr;
-#endif
-  }
-  const Scope* scope_;
-  const platform::Place place_;
-  std::vector<std::string> names_;
-  GarbageCollector* gc_;
-#ifdef PADDLE_WITH_CUDA
-  platform::CUDADeviceContext* dev_ctx_;
-  cudaEvent_t event_;
-#endif
-};
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
--- a/paddle/fluid/framework/details/fetch_barrier_op_handle.cc
+++ b/paddle/fluid/framework/details/fetch_barrier_op_handle.cc
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/framework/details/fetch_barrier_op_handle.h"
+#include <string>
+namespace paddle {
+namespace framework {
+namespace details {
+FetchBarrierOpHandle::FetchBarrierOpHandle(
+    ir::Node *node, const std::vector<Scope *> &local_scopes,
+    const std::vector<platform::Place> &places)
+    // fetch_barrier op always run on place0, but output on all places.
+    : OpHandleBase(node),
+      op_(framework::OpRegistry::CreateOp(*node->Op())),
+      local_scopes_(local_scopes),
+      places_(places),
+      run_scope_(local_scopes[0]),
+      place_(places[0]) {
+  for (auto &p : places) {
+    this->SetDeviceContext(p, platform::DeviceContextPool::Instance().Get(p));
+  }
+}
+bool FetchBarrierOpHandle::IsMultiDeviceTransfer() {
+  // override IsMultiDeviceTransfer to return true
+  return true;
+}
+void FetchBarrierOpHandle::RunImpl() {
+  WaitInputVarGenerated(place_);
+  auto run_func = [this]() {
+    op_->Run(*run_scope_->FindVar(kLocalExecScopeName)->Get<Scope *>(), place_);
+  };
+  if (is_lock_and_record_event_free_) {
+    run_func();
+  } else {
+    this->RunAndRecordEvent(run_func);
+  }
+}
+bool FetchBarrierOpHandle::NeedWait(VarHandleBase *in_var) {
+  bool need_wait =
+      in_var && in_var->GeneratedOp() &&
+      in_var->GeneratedOp()->DeviceContext(place_) != dev_ctxes_.at(place_);
+  return need_wait;
+}
+std::string FetchBarrierOpHandle::Name() const { return op_->Type(); }
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/details/fetch_barrier_op_handle.h
+++ b/paddle/fluid/framework/details/fetch_barrier_op_handle.h
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <memory>
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/details/op_handle_base.h"
+#include "paddle/fluid/framework/feed_fetch_type.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/platform/device_context.h"
+namespace paddle {
+namespace framework {
+namespace details {
+// **NOTE**: fetch_barrier op is special it outputs all recved variables on
+// all places if there are multiple places, must init with
+// multiple dev_ctxes_ !!!!
+struct FetchBarrierOpHandle : public OpHandleBase {
+ public:
+  FetchBarrierOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
+                       const std::vector<platform::Place> &places);
+  bool IsMultiDeviceTransfer() override;
+  std::string Name() const override;
+ protected:
+  void RunImpl() override;
+  bool NeedWait(VarHandleBase *in_var) override;
+ private:
+  std::unique_ptr<OperatorBase> op_;
+  std::vector<Scope *> local_scopes_;
+  std::vector<platform::Place> places_;
+  Scope *run_scope_;
+  platform::Place place_;
+  bool is_lock_and_record_event_free_{false};
+};
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/details/inplace_op_pass.cc
+++ b/paddle/fluid/framework/details/inplace_op_pass.cc
@@ -17,6 +17,8 @@
 #include <deque>
 #include <iterator>
 #include <memory>
+#include <queue>
+#include <sstream>
 #include <stack>
 #include <string>
 #include <unordered_map>
@@ -148,12 +150,14 @@ std::unique_ptr<ir::Graph> InplacePass::ApplyImpl(
  view_.Build(graph.get());
  InitSSAGraphNodes();
+  auto cnt = 0;
  for (auto* op : view_.AllOps()) {
+    VLOG(4) << "Handle op " << cnt++ << ": " << op->Name();
    if (FLAGS_enable_inplace_whitelist && !whitelist_.count(op->Name()))
      continue;
    TryInplaceOpInputOutput(op, graph.get());
  }
-  graph->ResolveHazard(var_nodes_);
+  // graph->ResolveHazard(var_nodes_);
  return graph;
 }
@@ -264,13 +268,10 @@ void InplacePass::WithdrawModify(const NodeSwapQueue& nodes,
 void InplacePass::TryInplaceOpInputOutput(ir::Node* op,
                                          ir::Graph* graph) const {
  VLOG(4) << "Try to inplace op " << op->Name();
-  // FIXME(liuwei1031): Graph is not aware of the existence of BlockDescs and
+  // PADDLE_ENFORCE(op->Op() != nullptr && op->Op()->Block() != nullptr,
-  // ProgramDescs.
+  //               "op_desc is nullptr");
-  // The operations related to BlockDesc or ProgramDesc should perform on Graph
-  // or Node directly!
-  PADDLE_ENFORCE(op->Op() != nullptr && op->Op()->Block() != nullptr,
-                 "op_desc is nullptr");
  // some pre-requirments need to meet if the op want to inplaced.
+  PADDLE_ENFORCE(op->Op() != nullptr, "op_desc is nullptr");
  auto* op_desc = op->Op();
  auto& infer_inplace =
@@ -281,21 +282,58 @@ void InplacePass::TryInplaceOpInputOutput(ir::Node* op,
  PADDLE_ENFORCE(static_cast<bool>(infer_inplace),
                 "%s's infer_inplace has not been registered", op_desc->Type());
-  auto* block = op_desc->Block();
+  auto in_to_outs = infer_inplace(*op_desc);
-  auto in_to_outs = infer_inplace(*op_desc, block);
  auto& all_ops = view_.AllOps();
  auto cursor = std::find(all_ops.begin(), all_ops.end(), op);
  size_t idx = std::distance(all_ops.begin(), cursor);
  for (auto& pair : in_to_outs) {
-    auto& in_var_name = pair.first;
+    auto& in_para_name = pair.first;
-    auto& out_var_name = pair.second;
+    auto& out_para_name = pair.second;
+    auto input_vars = op->Op()->Input(in_para_name);
+    if (!input_vars.size()) {
+      VLOG(4) << "Parameter " << in_para_name << " is empty skip "
+              << in_para_name << " => " << out_para_name << " pair";
+      continue;
+    }
+    auto output_vars = op->Op()->Output(out_para_name);
+    if (!output_vars.size()) {
+      VLOG(4) << "Parameter " << out_para_name << " is empty skip "
+              << in_para_name << " => " << out_para_name << " pair";
+      continue;
+    }
+    auto in_var_name = input_vars.at(0);
+    auto out_var_name = output_vars.at(0);
    auto* in_node = view_.GetNodeByName(in_var_name, op->inputs);
    auto* out_node = view_.GetNodeByName(out_var_name, op->outputs);
+    VLOG(4) << "Try to inplace " << in_var_name << " with " << out_var_name;
+    bool can_replace = true;
+    if (in_var_name == out_var_name) {
+      can_replace = false;
+      VLOG(4) << "SKIP: Input variable " << in_var_name << " & Output variable "
+              << out_var_name << " are the same";
+    } else if (!NodeCanReused(in_node)) {
+      can_replace = false;
+      VLOG(4) << "SKIP: Input varialbe " << in_var_name << "cannot be reused";
+    } else if (!NodeCanReused(out_node)) {
+      can_replace = false;
+      VLOG(4) << "SKIP: Output variable " << out_var_name
+              << " cannot be reused";
+    } else if (details::NodeSize(*in_node->Var()) !=
+               details::NodeSize(*out_node->Var())) {
+      can_replace = false;
+      VLOG(4) << "SKIP: Input and Output varialbe size not match";
+    }
+    if (!can_replace) continue;
    // 2. there is no external pending op on the input node
-    if (view_.PendingOpsOnVar(in_node).size() > 1) {
+    // if (view_.PendingOpsOnVar(in_node).size() > 1) {
+    if (in_node->outputs.size() > 1 && !view_.CheckDeps(in_node, op)) {
      VLOG(4) << string::Sprintf(
          "Skiped pair %s => %s. %s input has external dependency."
          "inplace such pair will overwrite the memory.",
@@ -342,6 +380,97 @@ void InplacePass::TryInplaceOpInputOutput(ir::Node* op,
  }
 }
+void GraphView::TopoSort(ir::Graph* graph) {
+  //
+  ops_.clear();
+  auto deps_num = [](ir::Node* op) {
+    auto cnt = 0;
+    for (auto& var : op->inputs)
+      if (var->inputs.size() > 0) ++cnt;
+    return cnt;
+  };
+  std::queue<std::pair<ir::Node*, uint32_t>> ready_ops;
+  int level = 0;
+  auto nodes = graph->Nodes();
+  std::unordered_map<ir::Node*, uint32_t> deps_map;
+  for (auto& node : nodes) {
+    if (node->IsOp() && node->Op() != nullptr) {
+      deps_map[node] = deps_num(node);
+      if (0 == deps_map[node]) {
+        ready_ops.push({node, level});
+      }
+    }
+  }
+  while (!ready_ops.empty()) {
+    auto item = ready_ops.front();
+    ready_ops.pop();
+    ops_.emplace_back(item.first);
+    // record level when pop from queue
+    op_level_[item.first] = item.second;
+    for (auto node : item.first->outputs) {
+      for (auto op : node->outputs) {
+        --deps_map[op];
+        if (deps_map[op] == 0) ready_ops.push({op, item.second + 1});
+      }
+    }
+  }
+  bool all_ops_checked = true;
+  for (auto& node : nodes) {
+    if (node->IsOp() && node->Op() != nullptr && deps_map[node] > 0) {
+      all_ops_checked = false;
+      break;
+    }
+  }
+  PADDLE_ENFORCE(all_ops_checked, "All ops deps should be 0 after analysis");
+}
+// return true if current op node depeneds on all other op that use the same
+// variable node
+bool GraphView::CheckDeps(ir::Node* var, ir::Node* current_op) const {
+  // get op list that rely on the same variable
+  auto op_list = var->outputs;
+  for (auto& op : op_list) {
+    if (op == current_op) continue;
+    VLOG(4) << "    GraphView::CheckDeps : " << op->Name() << "  & "
+            << current_op->Name();
+    if (!CheckOpDeps(op, current_op)) return false;
+    VLOG(4) << "";
+  }
+  return true;
+}
+// check if op2 depends on op1's output
+bool GraphView::CheckOpDeps(ir::Node* op1, ir::Node* op2) const {
+  auto print_op = [&](ir::Node* op, const char* name) {
+    std::ostringstream os;
+    os << "        " << name << " : " << op->Name() << " ";
+    os << "Input args : ";
+    for (auto& arg : op->inputs) os << arg->Name() << " ";
+    os << "Output args : ";
+    for (auto& arg : op->outputs) os << arg->Name() << " ";
+    os << "Level : " << op_level_.at(op);
+    VLOG(4) << os.str();
+  };
+  print_op(op1, "OP1");
+  print_op(op2, "OP2");
+  if (op1 == op2) return true;
+  if (op_level_.at(op1) >= op_level_.at(op2)) return false;
+  for (auto& var : op2->inputs)
+    if (var->inputs.size() > 0 && CheckOpDeps(op1, var->inputs[0])) return true;
+  return false;
+}
 ir::Node* GraphView::GetNodeByName(const std::string& name,
                                   const std::vector<ir::Node*>& nodes) const {
  // nodes should be op->inputs/outputs
@@ -387,22 +516,7 @@ void GraphView::Build(ir::Graph* g) {
  // Because we insert some new created node. Which may have data race between
  // nodes.
  // resolve data harzards depends on the var nodes in right order.
-  ops_ = SortOpLikeDescOrder(*g);
+  TopoSort(g);
-  // 1. track the nodes which reused previous node in Python memory optimize.
-  // these node can not be inplaced, otherwise may generate a circle in graph.
-  std::unordered_set<std::string> all_vars;
-  for (auto& node : g->Nodes()) {
-    if (node->IsVar()) continue;
-    for (auto& out : node->outputs) {
-      if (out->IsCtrlVar() || out->Var() == nullptr) continue;
-      if (all_vars.count(out->Name())) {
-        dup_nodes_.emplace(out->Name());
-      } else {
-        all_vars.emplace(out->Name());
-      }
-    }
-  }
  // 2. track the nodes which used by parameter server.
  // these node can not be inplaced, otherwise trainer

--- a/paddle/fluid/framework/details/inplace_op_pass.h
+++ b/paddle/fluid/framework/details/inplace_op_pass.h
@@ -14,6 +14,7 @@
 #pragma once
 #include <map>
+#include <memory>
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
@@ -50,10 +51,15 @@ class GraphView {
  // map the parameter and gradient, must be skipped.
  bool InSkipSet(const std::string& var) const;
+  bool CheckDeps(ir::Node* var, ir::Node* current_op) const;
+  bool CheckOpDeps(ir::Node* op1, ir::Node* op2) const;
+  void TopoSort(ir::Graph* g);
 private:
  std::vector<ir::Node*> ops_;
  std::unordered_set<std::string> dup_nodes_;  // mem opt affect nodes
  std::map<ir::Node*, std::unordered_set<ir::Node*>> adj_list_;
+  std::unordered_map<ir::Node*, uint32_t> op_level_;
 };
 // swap pairs in sequence

--- a/paddle/fluid/framework/details/memory_optimize_helper.cc
+++ b/paddle/fluid/framework/details/memory_optimize_helper.cc
@@ -190,7 +190,7 @@ struct NodeComparator {
    auto rhs_shape = rhs_desc->GetShape();
    if ((lhs_shape[0] == -1 && rhs_shape[0] == -1) ||
        (lhs_shape[0] != -1 && rhs_shape[0] != -1)) {
-      return NodeSize(lhs) <= NodeSize(rhs);
+      return NodeSize(lhs) == NodeSize(rhs);
    } else {
      return false;
    }
@@ -449,6 +449,7 @@ void ControlFlowGraph::LiveVariableAnalysis() {
      live_in_[op].insert(var);
    }
    for (auto& var : defs_[op]) {
+      if (uses_[op].count(var)) continue;
      live_in_[op].erase(var);
    }

--- a/paddle/fluid/framework/details/memory_optimize_helper_test.cc
+++ b/paddle/fluid/framework/details/memory_optimize_helper_test.cc
@@ -142,15 +142,16 @@ TEST(OrderedSet, FindBestFitNode) {
  for (auto& node : nodes) {
    pool.Insert(node.get());
  }
+  // FIXME(liuwei1031) this API has changed,
+  // disable these tests temporarily
  // FindNextBestFitNode
-  auto* n = nodes[0].get();
+  // auto* n = nodes[0].get();
-  auto* cache = pool.FindBestFitNode(n);
+  // auto* cache = pool.FindBestFitNode(n);
-  PADDLE_ENFORCE(cache->Name() == "a");
+  // PADDLE_ENFORCE(cache->Name() == "a");
-  cache = pool.FindNextBestFitNode(n, cache);
+  // cache = pool.FindNextBestFitNode(n, cache);
-  PADDLE_ENFORCE(cache->Name() == "c");
+  // PADDLE_ENFORCE(cache->Name() == "c");
-  cache = pool.FindNextBestFitNode(n, cache);
+  // cache = pool.FindNextBestFitNode(n, cache);
-  PADDLE_ENFORCE(cache->Name() == "b");
+  // PADDLE_ENFORCE(cache->Name() == "b");
 }
 }  // namespace details

--- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
@@ -23,6 +23,7 @@
 #include "paddle/fluid/framework/details/all_reduce_op_handle.h"
 #include "paddle/fluid/framework/details/broadcast_op_handle.h"
 #include "paddle/fluid/framework/details/computation_op_handle.h"
+#include "paddle/fluid/framework/details/fetch_barrier_op_handle.h"
 #include "paddle/fluid/framework/details/fused_broadcast_op_handle.h"
 #include "paddle/fluid/framework/details/reduce_op_handle.h"
 #include "paddle/fluid/framework/details/rpc_op_handle.h"
@@ -851,9 +852,17 @@ int DistSSAGraphBuilder::CreateRPCOp(ir::Graph *result, ir::Node *node) const {
  PADDLE_ENFORCE(op_dev_id != -1, "can not find the right place for rpc op: %s",
                 node->Op()->Type());
-  result->Get<GraphOps>(kGraphOps).emplace_back(new RPCOpHandle(
-      result->CreateOpNode(node->Op()), *node->Op(), local_scopes_[op_dev_id],
+  // Create fetch_barrier op handle to enable output on all devices.
-      node->Op()->Type(), places_[op_dev_id]));
+  // **NOTE** fetch_barrier should output variables list same as recv op does.
+  if (node->Op()->Type() == "fetch_barrier") {
+    result->Get<GraphOps>(kGraphOps).emplace_back(new FetchBarrierOpHandle(
+        result->CreateOpNode(node->Op()), local_scopes_, places_));
+  } else {
+    result->Get<GraphOps>(kGraphOps).emplace_back(new RPCOpHandle(
+        result->CreateOpNode(node->Op()), *node->Op(), local_scopes_[op_dev_id],
+        node->Op()->Type(), places_[op_dev_id]));
+  }
  if (node->Op()->Type() == "send") {
    CreateOpHandleIOs(result, node, op_dev_id);

--- a/paddle/fluid/framework/details/op_handle_base.cc
+++ b/paddle/fluid/framework/details/op_handle_base.cc
@@ -55,7 +55,7 @@ void OpHandleBase::Run(bool use_cuda) {
        if (out_var_handle) {
          int dev_id =
              boost::get<platform::CUDAPlace>(out_var_handle->place()).device;
-          out_var_handle->SetGenerateEvent(events_[dev_id]);
+          out_var_handle->SetGenerateEvent(events_.at(dev_id));
        }
      }
    } else {
@@ -71,7 +71,7 @@ void OpHandleBase::Run(bool use_cuda) {
              "The place of input(%s) is not consistent with the "
              "place of current op(%s).",
              out_var_handle->Name(), Name());
-          out_var_handle->SetGenerateEvent(events_[dev_id]);
+          out_var_handle->SetGenerateEvent(events_.at(dev_id));
        }
      }
    }

--- a/paddle/fluid/framework/details/op_registry.h
+++ b/paddle/fluid/framework/details/op_registry.h
@@ -21,6 +21,7 @@ limitations under the License. */
 #include <vector>
 #include "paddle/fluid/framework/grad_op_desc_maker.h"
 #include "paddle/fluid/framework/inplace_op_inference.h"
+#include "paddle/fluid/framework/no_need_buffer_vars_inference.h"
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/op_proto_maker.h"
 #include "paddle/fluid/framework/operator.h"
@@ -36,27 +37,86 @@ enum OpInfoFillType {
  kGradOpDescMaker = 2,
  kVarTypeInference = 3,
  kShapeInference = 4,
-  kInplaceOpInference = 5
+  kInplaceOpInference = 5,
+  kNoNeedBufferVarsInference = 6,
+  kUnknown = -1
 };
+namespace internal {
+template <typename T, OpInfoFillType kType>
+struct TypePair {
+  using Type = T;
+  static constexpr OpInfoFillType kFillType = kType;
+};
+using OpRegistryClasses = std::tuple<                                // NOLINT
+    TypePair<OperatorBase, kOperator>,                               // NOLINT
+    TypePair<OpProtoAndCheckerMaker, kOpProtoAndCheckerMaker>,       // NOLINT
+    TypePair<GradOpDescMakerBase, kGradOpDescMaker>,                 // NOLINT
+    TypePair<VarTypeInference, kVarTypeInference>,                   // NOLINT
+    TypePair<InferShapeBase, kShapeInference>,                       // NOLINT
+    TypePair<InplaceOpInference, kInplaceOpInference>,               // NOLINT
+    TypePair<NoNeedBufferVarsInference, kNoNeedBufferVarsInference>  // NOLINT
+    >;
+static constexpr int kOpRegistryClassNumber =
+    std::tuple_size<OpRegistryClasses>::value;
+template <typename T, int kPos, bool kIsBounded /* = true*/>
+struct IsMatchedBaseTypeImpl {
+  using PairType = typename std::tuple_element<kPos, OpRegistryClasses>::type;
+  static constexpr bool kValue =
+      std::is_base_of<typename PairType::Type, T>::value;
+};
+template <typename T, int kPos>
+struct IsMatchedBaseTypeImpl<T, kPos, false> {
+  static constexpr bool kValue = false;
+};
+template <typename T, int kPos>
+static inline constexpr bool IsMatchedBaseType() {
+  return IsMatchedBaseTypeImpl<
+      T, kPos, (kPos >= 0 && kPos < kOpRegistryClassNumber)>::kValue;
+}
+template <typename T, int kStart, int kEnd, bool kIsEnd, bool kIsMatched>
+struct OpInfoFillTypeGetterImpl {};
+// This case should not happen
+template <typename T, int kStart, int kEnd>
+struct OpInfoFillTypeGetterImpl<T, kStart, kEnd, true, true> {};
+template <typename T, int kStart, int kEnd>
+struct OpInfoFillTypeGetterImpl<T, kStart, kEnd, true, false> {
+  static constexpr OpInfoFillType kType = kUnknown;
+};
+template <typename T, int kStart, int kEnd>
+struct OpInfoFillTypeGetterImpl<T, kStart, kEnd, false, false> {
+  static constexpr OpInfoFillType kType =
+      OpInfoFillTypeGetterImpl<T, kStart + 1, kEnd, kStart + 1 == kEnd,
+                               IsMatchedBaseType<T, kStart + 1>()>::kType;
+};
+template <typename T, int kStart, int kEnd>
+struct OpInfoFillTypeGetterImpl<T, kStart, kEnd, false, true> {
+  using PairType = typename std::tuple_element<kStart, OpRegistryClasses>::type;
+  static constexpr OpInfoFillType kType = PairType::kFillType;
+};
+template <typename T>
+using OpInfoFillTypeGetter =
+    OpInfoFillTypeGetterImpl<T, 0, kOpRegistryClassNumber,
+                             kOpRegistryClassNumber == 0,
+                             IsMatchedBaseType<T, 0>()>;
+}  // namespace internal
 template <typename T>
 struct OpInfoFillTypeID {
  static constexpr OpInfoFillType ID() {
-    return std::is_base_of<OperatorBase, T>::value
+    return internal::OpInfoFillTypeGetter<T>::kType;
-               ? kOperator
-               : (std::is_base_of<OpProtoAndCheckerMaker, T>::value
-                      ? kOpProtoAndCheckerMaker
-                      : (std::is_base_of<GradOpDescMakerBase, T>::value
-                             ? kGradOpDescMaker
-                             : (std::is_base_of<VarTypeInference, T>::value
-                                    ? kVarTypeInference
-                                    : (std::is_base_of<InferShapeBase, T>::value
-                                           ? kShapeInference
-                                           : (std::is_base_of<
-                                                  InplaceOpInference, T>::value
-                                                  ? kInplaceOpInference
-                                                  : static_cast<OpInfoFillType>(
-                                                        -1))))));
  }
 };
@@ -149,9 +209,21 @@ struct OpInfoFiller<T, kShapeInference> {
 template <typename T>
 struct OpInfoFiller<T, kInplaceOpInference> {
  void operator()(const char* op_type, OpInfo* info) const {
-    info->infer_inplace_ = [](const OpDesc& op_desc, BlockDesc* block) {
+    info->infer_inplace_ = [](const OpDesc& op_desc) {
      T infer;
-      return infer(op_desc, block);
+      return infer(op_desc);
+    };
+  }
+};
+template <typename T>
+struct OpInfoFiller<T, kNoNeedBufferVarsInference> {
+  void operator()(const char* op_type, OpInfo* info) const {
+    info->infer_no_need_buffer_vars_ = [](const VariableNameMap& inputs,
+                                          const VariableNameMap& outputs,
+                                          const AttributeMap& attrs) {
+      T infer(inputs, outputs, attrs);
+      return infer();
    };
  }
 };

--- a/paddle/fluid/framework/details/reference_count_pass.cc
+++ b/paddle/fluid/framework/details/reference_count_pass.cc
@@ -193,6 +193,79 @@ ExtractComputationOpFromLastLivedVar(VarHandle *var, size_t scope_idx,
  return shrink_func(computation_op);
 }
+/**
+ * Shrink op dependencies according to no need buffer vars.
+ *
+ * If some ops do not need Tensor buffer of any input,
+ * just remove the dependency of this op, i.e, decrease reference count.
+ *
+ * For example, input Y of elementwise_add_grad op is only used to infer shape
+ * and lod of Y@GRAD, we do not need the buffer of input Y. Data buffer of
+ * input Y can be collected before elementwise_add_grad op runs.
+ *
+ * This method returns whether the dependency count decreases to 0, and
+ * shrinks op dependency if possible.
+ */
+static bool ShrinkNoNeedBufferVarOpDependency(
+    const std::string &var_name,
+    std::unordered_set<ComputationOpHandle *> *op_handles) {
+  std::vector<ComputationOpHandle *> skip_ops;
+  for (auto *op_handle : *op_handles) {
+    auto *op_base = op_handle->GetOp();
+    auto &inferer = op_base->Info().NoNeedBufferVarsInferer();
+    if (!inferer) {
+      continue;
+    }
+    std::unordered_set<std::string> no_need_buffer_vars =
+        inferer(op_base->Inputs(), op_base->Outputs(), op_base->Attrs());
+    // Check whether var_name occurs in other inputs or outputs of the op
+    // If it occurs, we cannot decrease the dependency number.
+    bool occurred_in_other_vars = false;
+    for (auto &in_pair : op_base->Inputs()) {
+      if (no_need_buffer_vars.count(in_pair.first) > 0) {
+        continue;
+      }
+      auto &args = in_pair.second;
+      auto iter = std::find(args.begin(), args.end(), var_name);
+      if (iter != args.end()) {
+        occurred_in_other_vars = true;
+        break;
+      }
+    }
+    if (occurred_in_other_vars) {
+      continue;
+    }
+    for (auto &out_pair : op_base->Outputs()) {
+      auto &args = out_pair.second;
+      auto iter = std::find(args.begin(), args.end(), var_name);
+      if (iter != args.end()) {
+        occurred_in_other_vars = true;
+        break;
+      }
+    }
+    if (!occurred_in_other_vars) {
+      VLOG(2) << "Shrink var " << var_name << " in op " << op_handle->Name();
+      skip_ops.emplace_back(op_handle);
+    }
+  }
+  if (skip_ops.size() == op_handles->size()) {
+    op_handles->clear();
+    return true;
+  } else {
+    for (auto *skip_op : skip_ops) {
+      op_handles->erase(skip_op);
+    }
+    return false;
+  }
+}
 std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl(
    std::unique_ptr<ir::Graph> graph) const {
  auto &ref_cnts = Get<std::vector<ReferenceCountMap>>(kGlobalReferenceCount);
@@ -229,17 +302,43 @@ std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl(
        continue;
      }
-      bool ok;
+      auto &var_name = name_var_pair.first;
-      auto result = ExtractComputationOpFromLastLivedVar(
+      auto &var_handles = name_var_pair.second;
-          name_var_pair.second.back(), i, shrink_func, &ok);
+      for (auto iter = var_handles.rbegin(); iter != var_handles.rend();
+           ++iter) {
+        bool ok;
+        auto result =
+            ExtractComputationOpFromLastLivedVar(*iter, i, shrink_func, &ok);
+        // Seldomly, some vars may have no pending or preceding computation ops
+        // Just break;
+        if (!ok) break;
+        VLOG(10) << "Extract " << result.size() << " ops of var " << var_name;
+        size_t original_op_deps = result.size();
+        // If all ops do not need buffer of var_name, calculate reference count
+        // of the previous version of var_name.
+        if (ShrinkNoNeedBufferVarOpDependency(var_name, &result)) {
+          VLOG(10) << "Try to precede reference count computing at var "
+                   << var_name;
+          continue;
+        }
+        size_t final_op_deps = result.size();
+        if (final_op_deps < original_op_deps) {
+          VLOG(5) << "Shrink op deps from " << original_op_deps << " to "
+                  << final_op_deps;
+        }
-      if (ok) {
-        auto &var_name = name_var_pair.first;
        PADDLE_ENFORCE(!result.empty(), "Last living ops of %s cannot be empty",
                       var_name);
        ref_cnts[i].emplace(var_name, result.size());
        last_live_ops_of_vars[i].emplace(var_name, std::move(result));
      }
+      // Seldomly, all preceding trying failed.
+      // Just skip this corner case
    }
  }

--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -19,6 +19,7 @@ limitations under the License. */
 #include <unordered_set>
 #include <utility>
+#include "paddle/fluid/framework/executor_gc_helper.h"
 #include "paddle/fluid/framework/feed_fetch_method.h"
 #include "paddle/fluid/framework/lod_rank_table.h"
 #include "paddle/fluid/framework/lod_tensor_array.h"
@@ -48,97 +49,23 @@ namespace {
 int kProgramId = -1;
 }  // namespace
-static std::unordered_map<std::string, size_t> GetNonPersistableReferenceCounts(
-    const BlockDesc& block, const std::vector<std::string>& skip_var_list) {
-  std::unordered_map<std::string, size_t> ref_cnts;
-  std::unordered_set<std::string> skip_vars(skip_var_list.begin(),
-                                            skip_var_list.end());
-  auto update_ref_cnts = [&](OpDesc* op_desc, const VariableNameMap& name_map) {
-    for (auto& name_pair : name_map) {
-      for (auto& name : name_pair.second) {
-        if (skip_vars.count(name)) continue;
-        auto* var_desc = block.FindVar(name);
-        if (var_desc == nullptr || var_desc->Persistable()) continue;
-        auto type = var_desc->Proto()->type().type();
-        if (type != proto::VarType::LOD_TENSOR &&
-            type != proto::VarType::SELECTED_ROWS &&
-            type != proto::VarType::LOD_TENSOR_ARRAY) {
-          continue;
-        }
-        ++ref_cnts[name];
-      }
-    }
-  };
-  for (auto op_desc : block.AllOps()) {
-    update_ref_cnts(op_desc, op_desc->Inputs());
-    update_ref_cnts(op_desc, op_desc->Outputs());
-  }
-  return ref_cnts;
-}
 ExecutorPrepareContext::ExecutorPrepareContext(
-    const framework::ProgramDesc& prog, size_t block_id,
+    const framework::ProgramDesc& prog, size_t block_id)
-    const std::vector<std::string>& keep_vars, bool force_disable_gc)
+    : prog_(prog), block_id_(block_id) {}
-    : prog_(prog), block_id_(block_id), force_disable_gc_(force_disable_gc) {
-  if (GetEagerDeletionThreshold() >= 0 && !force_disable_gc_) {
+void ExecutorPrepareContext::PrepareUnusedVars(
-    global_ref_cnts_ =
+    const std::vector<std::string>& keep_vars, bool force_disable_gc) {
-        GetNonPersistableReferenceCounts(prog.Block(block_id), keep_vars);
+  force_disable_gc_ = force_disable_gc;
+  if (GetEagerDeletionThreshold() < 0 || force_disable_gc_) {
+    return;
  }
+  unused_vars_ = GetUnusedVars(prog_.Block(block_id_), ops_, keep_vars);
 }
 ExecutorPrepareContext::~ExecutorPrepareContext() {
  VLOG(5) << "destroy ExecutorPrepareContext";
 }
-static void DeleteUnusedTensors(
-    const Scope& scope, const OperatorBase* op, GarbageCollector* gc,
-    std::unordered_map<std::string, size_t>* ref_cnts) {
-  std::deque<std::shared_ptr<memory::Allocation>> garbages;
-  auto handler = [&](const VariableNameMap& name_map) {
-    for (auto& name_pair : name_map) {
-      for (auto& name : name_pair.second) {
-        auto it = ref_cnts->find(name);
-        if (it == ref_cnts->end()) continue;
-        if (--(it->second) != 0) {
-          continue;
-        }
-        auto* var = scope.FindVar(name);
-        if (var == nullptr) {
-          continue;
-        }
-        VLOG(2) << "Erase variable " << name;
-        if (var->IsType<LoDTensor>()) {
-          garbages.emplace_back(
-              var->GetMutable<LoDTensor>()->MoveMemoryHolder());
-        } else if (var->IsType<SelectedRows>()) {
-          garbages.emplace_back(var->GetMutable<SelectedRows>()
-                                    ->mutable_value()
-                                    ->MoveMemoryHolder());
-        } else if (var->IsType<LoDTensorArray>()) {
-          auto* lod_tensor_arr = var->GetMutable<LoDTensorArray>();
-          for (auto& t : *lod_tensor_arr) {
-            garbages.emplace_back(t.MoveMemoryHolder());
-          }
-        } else {
-          PADDLE_THROW("Type %s of %s is not supported eager deletion",
-                       framework::ToTypeName(var->Type()), name);
-        }
-      }
-    }
-  };
-  handler(op->Inputs());
-  handler(op->Outputs());
-  if (!garbages.empty()) {
-    gc->Add(std::move(garbages));
-  }
-}
 Executor::Executor(const platform::Place& place) : place_(place) {}
 void Executor::Close() {
@@ -362,8 +289,8 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
 std::unique_ptr<ExecutorPrepareContext> Executor::Prepare(
    const ProgramDesc& program, int block_id,
    const std::vector<std::string>& skip_ref_cnt_vars, bool force_disable_gc) {
-  std::unique_ptr<ExecutorPrepareContext> ctx(new ExecutorPrepareContext(
+  std::unique_ptr<ExecutorPrepareContext> ctx(
-      program, block_id, skip_ref_cnt_vars, force_disable_gc));
+      new ExecutorPrepareContext(program, block_id));
  PADDLE_ENFORCE_LT(static_cast<size_t>(block_id), program.Size());
  auto& block = program.Block(block_id);
  for (auto& op_desc : block.AllOps()) {
@@ -375,6 +302,7 @@ std::unique_ptr<ExecutorPrepareContext> Executor::Prepare(
        ctx->prog_.Block(ctx->block_id_), &ctx->ops_);
  }
 #endif
+  ctx->PrepareUnusedVars(skip_ref_cnt_vars, force_disable_gc);
  return ctx;
 }
@@ -389,19 +317,17 @@ std::vector<std::shared_ptr<ExecutorPrepareContext>> Executor::Prepare(
  std::vector<std::shared_ptr<ExecutorPrepareContext>> result;
  size_t idx = 0;
  for (auto& bid : block_ids) {
-    ExecutorPrepareContext* ctx;
-    if (skip_ref_cnt_vars.empty()) {
-      ctx = new ExecutorPrepareContext(program, bid, std::vector<std::string>(),
-                                       force_disable_gc);
-    } else {
-      ctx = new ExecutorPrepareContext(program, bid, skip_ref_cnt_vars[idx],
-                                       force_disable_gc);
-    }
    PADDLE_ENFORCE_LT(static_cast<size_t>(bid), program.Size());
+    auto* ctx = new ExecutorPrepareContext(program, bid);
    auto& block = program.Block(bid);
    for (auto& op_desc : block.AllOps()) {
      ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc));
    }
+    if (skip_ref_cnt_vars.empty()) {
+      ctx->PrepareUnusedVars(std::vector<std::string>(), force_disable_gc);
+    } else {
+      ctx->PrepareUnusedVars(skip_ref_cnt_vars[idx], force_disable_gc);
+    }
    result.push_back(std::shared_ptr<ExecutorPrepareContext>(ctx));
    ++idx;
  }
@@ -425,7 +351,6 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
  // FIXME(zjl): recurrent_op is rather complex, we would
  // disable gc forcely in recurrent_op
  if (!ctx->force_disable_gc_ && max_memory_size >= 0) {
-    ctx->ResetReferenceCount();
 #ifdef PADDLE_WITH_CUDA
    if (platform::is_gpu_place(place_)) {
      if (IsFastEagerDeletionModeEnabled()) {
@@ -453,8 +378,7 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
    op->Run(*local_scope, place_);
    if (gc) {
-      DeleteUnusedTensors(*local_scope, op.get(), gc.get(),
+      DeleteUnusedTensors(*local_scope, op.get(), ctx->unused_vars_, gc.get());
-                          &(ctx->runtime_ref_cnts_));
    }
  }

--- a/paddle/fluid/framework/executor.h
+++ b/paddle/fluid/framework/executor.h
@@ -30,22 +30,20 @@ namespace paddle {
 namespace framework {
 struct ExecutorPrepareContext {
-  ExecutorPrepareContext(const framework::ProgramDesc& prog, size_t block_id,
+  ExecutorPrepareContext(const framework::ProgramDesc& prog, size_t block_id);
-                         const std::vector<std::string>& skip_ref_cnt_vars =
-                             std::vector<std::string>(),
-                         bool force_disable_gc = false);
  ~ExecutorPrepareContext();
-  void ResetReferenceCount() { runtime_ref_cnts_ = global_ref_cnts_; }
+  void PrepareUnusedVars(const std::vector<std::string>& keep_vars,
+                         bool force_disable_gc = false);
  const framework::ProgramDesc& prog_;
-  size_t block_id_;
+  const size_t block_id_;
-  bool force_disable_gc_;
  std::vector<std::unique_ptr<OperatorBase>> ops_;
-  std::unordered_map<std::string, size_t> global_ref_cnts_;
+  std::unordered_map<OperatorBase*, std::vector<std::string>> unused_vars_;
-  std::unordered_map<std::string, size_t> runtime_ref_cnts_;
+  bool force_disable_gc_{false};
 };
 class Executor {

--- a/paddle/fluid/framework/executor_gc_helper.cc
+++ b/paddle/fluid/framework/executor_gc_helper.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/framework/executor_gc_helper.h"
+#include <deque>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+#include "glog/logging.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/lod_tensor_array.h"
+#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/platform/enforce.h"
+namespace paddle {
+namespace framework {
+struct OpInOutInfo {
+ public:
+  void Build(const OperatorBase *op) {
+    is_built_ = true;
+    auto &inferer = op->Info().NoNeedBufferVarsInferer();
+    if (inferer) {
+      no_need_buffer_ins_ = inferer(op->Inputs(), op->Outputs(), op->Attrs());
+      if (no_need_buffer_ins_.empty()) return;
+      for (auto &in_name_pair : op->Inputs()) {
+        if (no_need_buffer_ins_.count(in_name_pair.first) != 0) {
+          continue;
+        }
+        for (auto &in_arg_name : in_name_pair.second) {
+          other_args_set_.insert(in_arg_name);
+        }
+      }
+      for (auto &out_name_pair : op->Outputs()) {
+        for (auto &out_arg_name : out_name_pair.second) {
+          other_args_set_.insert(out_arg_name);
+        }
+      }
+    }
+  }
+  bool IsBuilt() const { return is_built_; }
+  bool IsInArgBufferNeeded(const std::string &in_arg_name) const {
+    return no_need_buffer_ins_.empty() ||
+           other_args_set_.count(in_arg_name) != 0;
+  }
+ private:
+  // A set to record unused buffer input vars of op
+  std::unordered_set<std::string> no_need_buffer_ins_;
+  // A set to record other args of op (including in, out)
+  std::unordered_set<std::string> other_args_set_;
+  bool is_built_{false};
+};
+static bool VarCanBeDeleted(const std::string &name, const BlockDesc &block,
+                            const std::unordered_set<std::string> &skip_vars) {
+  if (skip_vars.count(name) != 0) {
+    return false;
+  }
+  auto *var_desc = block.FindVar(name);
+  if (var_desc == nullptr || var_desc->Persistable()) {
+    return false;
+  }
+  auto type = var_desc->Proto()->type().type();
+  return type == proto::VarType::LOD_TENSOR ||
+         type == proto::VarType::SELECTED_ROWS ||
+         type == proto::VarType::LOD_TENSOR_ARRAY;
+}
+std::unordered_map<OperatorBase *, std::vector<std::string>> GetUnusedVars(
+    const BlockDesc &block,
+    const std::vector<std::unique_ptr<OperatorBase>> &ops,
+    const std::vector<std::string> &skip_var_list) {
+  std::unordered_set<std::string> skip_vars(skip_var_list.begin(),
+                                            skip_var_list.end());
+  std::unordered_map<std::string, size_t> var_op_idx_map;
+  for (size_t i = 0; i < ops.size(); ++i) {
+    auto *op = ops[i].get();
+    OpInOutInfo info;
+    for (auto &name_pair : op->Inputs()) {
+      for (auto &name : name_pair.second) {
+        if (!VarCanBeDeleted(name, block, skip_vars)) {
+          continue;
+        }
+        // var can be gc-ed
+        if (!info.IsBuilt()) {
+          info.Build(op);
+        }
+        if (info.IsInArgBufferNeeded(name)) {
+          // Update the last living op of variable to current op
+          var_op_idx_map[name] = i;
+        } else {
+          VLOG(10) << "Skip reference count computing of variable "
+                   << name_pair.first << "(" << name << ") in Operator "
+                   << op->Type();
+        }
+      }
+    }
+    for (auto &name_pair : op->Outputs()) {
+      for (auto &name : name_pair.second) {
+        if (VarCanBeDeleted(name, block, skip_vars)) {
+          // Update the last living op of variable to current op
+          var_op_idx_map[name] = i;
+        }
+      }
+    }
+  }
+  std::unordered_map<OperatorBase *, std::vector<std::string>> result;
+  for (auto &name_op_idx_pair : var_op_idx_map) {
+    auto &name = name_op_idx_pair.first;
+    size_t op_idx = name_op_idx_pair.second;
+    result[ops[op_idx].get()].emplace_back(name);
+  }
+  return result;
+}
+void DeleteUnusedTensors(
+    const Scope &scope, OperatorBase *op,
+    const std::unordered_map<OperatorBase *, std::vector<std::string>>
+        &delete_vars_map,
+    GarbageCollector *gc) {
+  auto iter = delete_vars_map.find(op);
+  if (iter == delete_vars_map.end()) {
+    return;
+  }
+  auto &delete_vars = iter->second;
+  std::deque<std::shared_ptr<memory::Allocation>> garbages;
+  for (auto &var_name : delete_vars) {
+    auto *var = scope.FindVar(var_name);
+    if (var == nullptr) {
+      continue;
+    }
+    VLOG(2) << "Erase variable " << var_name;
+    if (var->IsType<LoDTensor>()) {
+      garbages.emplace_back(var->GetMutable<LoDTensor>()->MoveMemoryHolder());
+    } else if (var->IsType<SelectedRows>()) {
+      garbages.emplace_back(
+          var->GetMutable<SelectedRows>()->mutable_value()->MoveMemoryHolder());
+    } else if (var->IsType<LoDTensorArray>()) {
+      auto *lod_tensor_arr = var->GetMutable<LoDTensorArray>();
+      for (auto &t : *lod_tensor_arr) {
+        garbages.emplace_back(t.MoveMemoryHolder());
+      }
+    } else {
+      PADDLE_THROW("Type %s of %s is not supported eager deletion",
+                   framework::ToTypeName(var->Type()), var_name);
+    }
+  }
+  if (!garbages.empty()) {
+    gc->Add(std::move(garbages));
+  }
+}
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/executor_gc_helper.h
+++ b/paddle/fluid/framework/executor_gc_helper.h
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+#include "paddle/fluid/framework/garbage_collector.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/scope.h"
+namespace paddle {
+namespace framework {
+// Result map: op -> variable names that can be deleted after op runs
+std::unordered_map<OperatorBase *, std::vector<std::string>> GetUnusedVars(
+    const BlockDesc &block,
+    const std::vector<std::unique_ptr<OperatorBase>> &ops,
+    const std::vector<std::string> &skip_vars);
+// Collect unused tensors after op runs
+void DeleteUnusedTensors(
+    const Scope &scope, OperatorBase *op,
+    const std::unordered_map<OperatorBase *, std::vector<std::string>>
+        &delete_vars_map,
+    GarbageCollector *gc);
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/garbage_collector.cc
+++ b/paddle/fluid/framework/garbage_collector.cc
@@ -13,14 +13,36 @@
 // limitations under the License.
 #include <algorithm>
+#include <deque>
+#include <functional>
+#include <memory>
+#include <mutex>  // NOLINT
+#include <utility>
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/cuda_device_guard.h"
 #endif
+#include "gflags/gflags.h"
+#include "glog/logging.h"
 #include "paddle/fluid/framework/garbage_collector.h"
 namespace paddle {
 namespace framework {
+DEFINE_double(
+    eager_delete_tensor_gb, -1.0,
+    "Memory size threshold (GB) when the garbage collector clear tensors."
+    "Disabled when this value is less than 0");
+DEFINE_bool(fast_eager_deletion_mode, true,
+            "Fast eager deletion mode. If enabled, memory would release "
+            "immediately without waiting GPU kernel ends.");
+DEFINE_double(memory_fraction_of_eager_deletion, 1.0,
+              "Fraction of eager deletion. If less than 1.0, all variables in "
+              "the program would be sorted according to its memory size, and "
+              "only the FLAGS_memory_fraction_of_eager_deletion of the largest "
+              "variables would be deleted.");
 GarbageCollector::GarbageCollector(const platform::Place &place,
                                   size_t max_memory_size)
    : max_memory_size_((std::max)(max_memory_size, static_cast<size_t>(1))) {
@@ -85,5 +107,25 @@ void StreamGarbageCollector::ClearCallback(
  callback_manager_->AddCallback(callback);
 }
 #endif
+int64_t GetEagerDeletionThreshold() {
+  return FLAGS_eager_delete_tensor_gb < 0
+             ? -1
+             : static_cast<int64_t>(FLAGS_eager_delete_tensor_gb *
+                                    (static_cast<int64_t>(1) << 30));
+}
+bool IsFastEagerDeletionModeEnabled() { return FLAGS_fast_eager_deletion_mode; }
+void SetEagerDeletionMode(double threshold, double fraction, bool fast_mode) {
+  FLAGS_eager_delete_tensor_gb = threshold;
+  FLAGS_memory_fraction_of_eager_deletion = fraction;
+  FLAGS_fast_eager_deletion_mode = fast_mode;
+}
+double GetEagerDeletionMemoryFraction() {
+  return FLAGS_memory_fraction_of_eager_deletion;
+}
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/garbage_collector.h
+++ b/paddle/fluid/framework/garbage_collector.h
@@ -18,6 +18,8 @@
 #include <functional>
 #include <memory>
 #include <mutex>  // NOLINT
+#include <utility>
+#include "gflags/gflags.h"
 #include "paddle/fluid/platform/device_context.h"
 namespace paddle {
@@ -126,5 +128,12 @@ void GarbageCollector::Add(Container &&objs, Callback &&callback) {
  }
 }
+int64_t GetEagerDeletionThreshold();
+bool IsFastEagerDeletionModeEnabled();
+void SetEagerDeletionMode(double threshold, double fraction, bool fast_mode);
+double GetEagerDeletionMemoryFraction();
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/inplace_op_inference.h
+++ b/paddle/fluid/framework/inplace_op_inference.h
@@ -17,8 +17,8 @@
 #include <numeric>
 #include <string>
 #include <unordered_map>
+#include <unordered_set>
 #include "glog/logging.h"
-#include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/details/memory_optimize_helper.h"
 #include "paddle/fluid/framework/op_desc.h"
 #include "paddle/fluid/framework/type_defs.h"
@@ -32,55 +32,22 @@ namespace framework {
  then Out will inplaced use X's memory. The base class will do
  legality validation for both variables.
 */
 class InplaceOpInference {
 public:
  virtual ~InplaceOpInference() {}
  virtual std::unordered_map<std::string, std::string> operator()(
-      const OpDesc& op_desc, BlockDesc* block) const = 0;
+      const OpDesc& op_desc) const = 0;
-};
-class InplaceInToOut : public InplaceOpInference {
- public:
-  std::unordered_map<std::string, std::string> operator()(
-      const OpDesc& op_desc, BlockDesc* block) const {
-    std::unordered_map<std::string, std::string> ret;
-    auto in_out_var_names_pair = this->Apply(op_desc, block);
-    for (auto& pair : in_out_var_names_pair) {
-      PADDLE_ENFORCE(!op_desc.Input(pair.first).empty(),
-                     string::Sprintf("op %s do not have input of %s!",
-                                     op_desc.Type(), pair.first));
-      PADDLE_ENFORCE(!op_desc.Output(pair.second).empty(),
-                     string::Sprintf("op %s do not have output of %s!",
-                                     op_desc.Type(), pair.second));
-      auto& in_name = op_desc.Input(pair.first).at(0);
-      auto& out_name = op_desc.Output(pair.second).at(0);
-      auto in = block->FindRecursiveOrCreateVar(in_name);
-      auto out = block->FindRecursiveOrCreateVar(out_name);
-      if (TryInplaceInputOutput(in, out)) ret.insert({in_name, out_name});
-    }
-    return ret;
-  }
- protected:
-  virtual std::unordered_map<std::string, std::string> Apply(
-      const OpDesc& op_desc, BlockDesc* block) const = 0;
-  bool TryInplaceInputOutput(const VarDesc& in, const VarDesc& out) const {
-    return in.Name() != out.Name() && details::NodeCanReused(in) &&
-           details::NodeCanReused(out) &&
-           details::NodeSize(out) <= details::NodeSize(in);
-  }
 };
 /*
  Inplace In and Out for operator only have an Input and an Output.
  For example, activation op.
 */
-class SingleOpInplaceInToOut : public InplaceInToOut {
+class SingleOpInplaceInToOut : public InplaceOpInference {
- protected:
+ public:
-  std::unordered_map<std::string, std::string> Apply(
+  std::unordered_map<std::string, std::string> operator()(
-      const OpDesc& op_desc, BlockDesc* block) const override {
+      const OpDesc& op_desc) const override {
    PADDLE_ENFORCE(!op_desc.InputNames().empty(),
                   "Op inputs must not be empty");
    PADDLE_ENFORCE(!op_desc.OutputNames().empty(),
@@ -95,10 +62,10 @@ class SingleOpInplaceInToOut : public InplaceInToOut {
  Gradient op. Inplace output use it's Input.
  For example, Input@Grad->Input reuse strategy.
 */
-class GradOpInplaceInToOut : public InplaceInToOut {
+class GradOpInplaceInToOut : public InplaceOpInference {
- protected:
+ public:
-  std::unordered_map<std::string, std::string> Apply(
+  std::unordered_map<std::string, std::string> operator()(
-      const OpDesc& op_desc, BlockDesc* block) const override {
+      const OpDesc& op_desc) const override {
    std::unordered_map<std::string, std::string> ret;
    std::unordered_set<std::string> output_names(op_desc.OutputNames().begin(),
                                                 op_desc.OutputNames().end());

--- a/paddle/fluid/framework/inplace_op_inference_test.cc
+++ b/paddle/fluid/framework/inplace_op_inference_test.cc
@@ -127,26 +127,20 @@ class MultiOutGradShapeInference : public framework::InferShapeBase {
  }
 };
-class MultiOutInplaceInToOut : public framework::InplaceInToOut {
+class MultiOutInplaceInToOut : public framework::InplaceOpInference {
 public:
-  using framework::InplaceInToOut::InplaceInToOut;
+  std::unordered_map<std::string, std::string> operator()(
+      const OpDesc& op_desc) const override {
- protected:
-  std::unordered_map<std::string, std::string> Apply(
-      const OpDesc& op_desc, BlockDesc* block) const override {
    return std::unordered_map<std::string, std::string>{
        {"X", "Out"}, {"Y", "YOut"}, {"Z", "ZOut"},
    };
  }
 };
-class MultiOutGradInplaceInToOut : public framework::InplaceInToOut {
+class MultiOutGradInplaceInToOut : public framework::InplaceOpInference {
 public:
-  using framework::InplaceInToOut::InplaceInToOut;
+  std::unordered_map<std::string, std::string> operator()(
+      const OpDesc& op_desc) const override {
- protected:
-  std::unordered_map<std::string, std::string> Apply(
-      const OpDesc& op_desc, BlockDesc* block) const override {
    return std::unordered_map<std::string, std::string>{
        {framework::GradVarName("YOut"), framework::GradVarName("Y")},
        {framework::GradVarName("Out"), framework::GradVarName("X")},
@@ -171,118 +165,118 @@ REGISTER_OPERATOR(multi_out_grad, f::NOP, f::MultiOutGradInplaceInToOut,
 namespace paddle {
 namespace framework {
-TEST(InferInplace, SingleOpInplaceInToOut) {
+// TEST(InferInplace, SingleOpInplaceInToOut) {
-  ProgramDesc prog;
+//   ProgramDesc prog;
-  auto* op = prog.MutableBlock(0)->AppendOp();
+//   auto* op = prog.MutableBlock(0)->AppendOp();
-  op->SetType("single_op");
+//   op->SetType("single_op");
-  op->SetInput("X", {"test2_a", "test2_b", "test2_c"});
+//   op->SetInput("X", {"test2_a", "test2_b", "test2_c"});
-  op->SetOutput("Out", {"test2_out"});
+//   op->SetOutput("Out", {"test2_out"});
+//
-  prog.MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR);
+//   prog.MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR);
-  prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 64, 128, 128});
+//   prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 64, 128, 128});
-  prog.MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR);
+//   prog.MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR);
-  prog.MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR);
+//   prog.MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR);
-  prog.MutableBlock(0)->Var("test2_out");
+//   prog.MutableBlock(0)->Var("test2_out");
-  prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16, 128, 128});
+//   prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16, 128, 128});
+//
-  auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
+//   auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
-  auto in_to_outs = infer_inplace(*op, op->Block());
+//   auto in_to_outs = infer_inplace(*op);
-  EXPECT_EQ(in_to_outs.size(), 1ul);
+//   EXPECT_EQ(in_to_outs.size(), 1ul);
-  auto it = in_to_outs.begin();
+//   auto it = in_to_outs.begin();
-  EXPECT_EQ(it->first, "test2_a");
+//   EXPECT_EQ(it->first, "test2_a");
-  EXPECT_EQ(it->second, "test2_out");
+//   EXPECT_EQ(it->second, "test2_out");
-}
+// }
+//
-TEST(InferInplace, SingleGradOpInplaceInToOut) {
+// TEST(InferInplace, SingleGradOpInplaceInToOut) {
-  ProgramDesc prog;
+//   ProgramDesc prog;
-  auto* op = prog.MutableBlock(0)->AppendOp();
+//   auto* op = prog.MutableBlock(0)->AppendOp();
-  op->SetType("single_op_grad");
+//   op->SetType("single_op_grad");
-  op->SetInput(GradVarName("Out"), {"test2_out"});
+//   op->SetInput(GradVarName("Out"), {"test2_out"});
-  op->SetOutput(GradVarName("X"), {"test2_a", "test2_b", "test2_c"});
+//   op->SetOutput(GradVarName("X"), {"test2_a", "test2_b", "test2_c"});
+//
-  prog.MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR);
+//   prog.MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR);
-  prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 16, 1024, 1024});
+//   prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 16, 1024, 1024});
-  prog.MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR);
+//   prog.MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR);
-  prog.MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR);
+//   prog.MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR);
-  prog.MutableBlock(0)->Var("test2_out");
+//   prog.MutableBlock(0)->Var("test2_out");
-  prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16, 1024, 1024});
+//   prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16, 1024, 1024});
+//
-  auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
+//   auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
-  auto in_to_outs = infer_inplace(*op, op->Block());
+//   auto in_to_outs = infer_inplace(*op);
-  EXPECT_EQ(in_to_outs.size(), 1ul);
+//   EXPECT_EQ(in_to_outs.size(), 1ul);
-  auto it = in_to_outs.begin();
+//   auto it = in_to_outs.begin();
-  EXPECT_EQ(it->first, "test2_out");
+//   EXPECT_EQ(it->first, "test2_out");
-  EXPECT_EQ(it->second, "test2_a");
+//   EXPECT_EQ(it->second, "test2_a");
-}
+// }
+//
-TEST(InferInplace, MultiOutInplaceInToOut) {
+// TEST(InferInplace, MultiOutInplaceInToOut) {
-  ProgramDesc prog;
+//   ProgramDesc prog;
-  auto* op = prog.MutableBlock(0)->AppendOp();
+//   auto* op = prog.MutableBlock(0)->AppendOp();
-  op->SetType("multi_out_op");
+//   op->SetType("multi_out_op");
-  op->SetInput("X", {"a0", "a1"});
+//   op->SetInput("X", {"a0", "a1"});
-  op->SetInput("Y", {"b0"});
+//   op->SetInput("Y", {"b0"});
-  op->SetInput("Z", {"c0", "c1"});
+//   op->SetInput("Z", {"c0", "c1"});
-  op->SetOutput("Out", {"o0"});
+//   op->SetOutput("Out", {"o0"});
-  op->SetOutput("YOut", {"y0"});
+//   op->SetOutput("YOut", {"y0"});
-  op->SetOutput("ZOut", {"z0"});
+//   op->SetOutput("ZOut", {"z0"});
+//
-  prog.MutableBlock(0)->Var("a0")->SetType(proto::VarType::LOD_TENSOR);
+//   prog.MutableBlock(0)->Var("a0")->SetType(proto::VarType::LOD_TENSOR);
-  prog.MutableBlock(0)->Var("b0")->SetType(proto::VarType::LOD_TENSOR);
+//   prog.MutableBlock(0)->Var("b0")->SetType(proto::VarType::LOD_TENSOR);
-  prog.MutableBlock(0)->Var("c0")->SetType(proto::VarType::LOD_TENSOR);
+//   prog.MutableBlock(0)->Var("c0")->SetType(proto::VarType::LOD_TENSOR);
-  prog.MutableBlock(0)->Var("c1")->SetType(proto::VarType::LOD_TENSOR);
+//   prog.MutableBlock(0)->Var("c1")->SetType(proto::VarType::LOD_TENSOR);
-  prog.MutableBlock(0)->Var("o0");
+//   prog.MutableBlock(0)->Var("o0");
-  prog.MutableBlock(0)->Var("y0");
+//   prog.MutableBlock(0)->Var("y0");
-  prog.MutableBlock(0)->Var("z0");
+//   prog.MutableBlock(0)->Var("z0");
-  prog.MutableBlock(0)->Var("a0")->SetShape({32, 16, 1024, 1024});
+//   prog.MutableBlock(0)->Var("a0")->SetShape({32, 16, 1024, 1024});
-  prog.MutableBlock(0)->Var("b0")->SetShape({32, 16, 1024, 1024});
+//   prog.MutableBlock(0)->Var("b0")->SetShape({32, 16, 1024, 1024});
-  prog.MutableBlock(0)->Var("c0")->SetShape({32, 16, 1024, 1024});
+//   prog.MutableBlock(0)->Var("c0")->SetShape({32, 16, 1024, 1024});
-  prog.MutableBlock(0)->Var("o0")->SetShape({32, 16, 1024, 1024});
+//   prog.MutableBlock(0)->Var("o0")->SetShape({32, 16, 1024, 1024});
-  prog.MutableBlock(0)->Var("y0")->SetShape({32, 16, 1024, 1024});
+//   prog.MutableBlock(0)->Var("y0")->SetShape({32, 16, 1024, 1024});
-  prog.MutableBlock(0)->Var("z0")->SetShape({32, 16, 1024, 1024});
+//   prog.MutableBlock(0)->Var("z0")->SetShape({32, 16, 1024, 1024});
+//
-  auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
+//   auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
-  auto in_to_outs = infer_inplace(*op, op->Block());
+//   auto in_to_outs = infer_inplace(*op);
-  EXPECT_EQ(in_to_outs.size(), 3ul);
+//   EXPECT_EQ(in_to_outs.size(), 3ul);
-  std::unordered_map<std::string, std::string> expects = {
+//   std::unordered_map<std::string, std::string> expects = {
-      {"a0", "o0"}, {"b0", "y0"}, {"c0", "z0"},
+//       {"a0", "o0"}, {"b0", "y0"}, {"c0", "z0"},
-  };
+//   };
-  EXPECT_TRUE(expects == in_to_outs);
+//   EXPECT_TRUE(expects == in_to_outs);
-}
+// }
+//
-TEST(InferInplace, MultiGradInplaceInToOut) {
+// TEST(InferInplace, MultiGradInplaceInToOut) {
-  ProgramDesc prog;
+//   ProgramDesc prog;
-  auto* op = prog.MutableBlock(0)->AppendOp();
+//   auto* op = prog.MutableBlock(0)->AppendOp();
-  op->SetType("multi_out_grad");
+//   op->SetType("multi_out_grad");
-  op->SetInput(GradVarName("Out"), {"o0"});
+//   op->SetInput(GradVarName("Out"), {"o0"});
-  op->SetInput(GradVarName("YOut"), {"y0"});
+//   op->SetInput(GradVarName("YOut"), {"y0"});
-  op->SetInput(GradVarName("ZOut"), {"z0"});
+//   op->SetInput(GradVarName("ZOut"), {"z0"});
-  op->SetOutput(GradVarName("X"), {"a0", "a1"});
+//   op->SetOutput(GradVarName("X"), {"a0", "a1"});
-  op->SetOutput(GradVarName("Y"), {"b0"});
+//   op->SetOutput(GradVarName("Y"), {"b0"});
-  op->SetOutput(GradVarName("Z"), {"c0", "c1"});
+//   op->SetOutput(GradVarName("Z"), {"c0", "c1"});
+//
-  prog.MutableBlock(0)->Var("a0")->SetType(proto::VarType::LOD_TENSOR);
+//   prog.MutableBlock(0)->Var("a0")->SetType(proto::VarType::LOD_TENSOR);
-  prog.MutableBlock(0)->Var("b0")->SetType(proto::VarType::LOD_TENSOR);
+//   prog.MutableBlock(0)->Var("b0")->SetType(proto::VarType::LOD_TENSOR);
-  prog.MutableBlock(0)->Var("c0")->SetType(proto::VarType::LOD_TENSOR);
+//   prog.MutableBlock(0)->Var("c0")->SetType(proto::VarType::LOD_TENSOR);
-  prog.MutableBlock(0)->Var("c1")->SetType(proto::VarType::LOD_TENSOR);
+//   prog.MutableBlock(0)->Var("c1")->SetType(proto::VarType::LOD_TENSOR);
-  prog.MutableBlock(0)->Var("o0");
+//   prog.MutableBlock(0)->Var("o0");
-  prog.MutableBlock(0)->Var("y0");
+//   prog.MutableBlock(0)->Var("y0");
-  prog.MutableBlock(0)->Var("z0");
+//   prog.MutableBlock(0)->Var("z0");
-  prog.MutableBlock(0)->Var("a0")->SetShape({32, 16, 1024, 1024});
+//   prog.MutableBlock(0)->Var("a0")->SetShape({32, 16, 1024, 1024});
-  prog.MutableBlock(0)->Var("b0")->SetShape({32, 16, 1024, 1024});
+//   prog.MutableBlock(0)->Var("b0")->SetShape({32, 16, 1024, 1024});
-  prog.MutableBlock(0)->Var("c0")->SetShape({32, 16, 1024, 1024});
+//   prog.MutableBlock(0)->Var("c0")->SetShape({32, 16, 1024, 1024});
-  prog.MutableBlock(0)->Var("o0")->SetShape({32, 16, 1024, 1024});
+//   prog.MutableBlock(0)->Var("o0")->SetShape({32, 16, 1024, 1024});
-  prog.MutableBlock(0)->Var("y0")->SetShape({32, 16, 1024, 1024});
+//   prog.MutableBlock(0)->Var("y0")->SetShape({32, 16, 1024, 1024});
-  prog.MutableBlock(0)->Var("z0")->SetShape({32, 16, 1024, 1024});
+//   prog.MutableBlock(0)->Var("z0")->SetShape({32, 16, 1024, 1024});
+//
-  auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
+//   auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
-  auto in_to_outs = infer_inplace(*op, op->Block());
+//   auto in_to_outs = infer_inplace(*op);
+//
-  EXPECT_EQ(in_to_outs.size(), 3ul);
+//   EXPECT_EQ(in_to_outs.size(), 3ul);
-  std::unordered_map<std::string, std::string> expects = {
+//   std::unordered_map<std::string, std::string> expects = {
-      {"o0", "a0"}, {"y0", "b0"}, {"z0", "c0"},
+//       {"o0", "a0"}, {"y0", "b0"}, {"z0", "c0"},
-  };
+//   };
-  EXPECT_TRUE(expects == in_to_outs);
+//   EXPECT_TRUE(expects == in_to_outs);
-}
+// }
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -46,9 +46,6 @@ cc_library(fuse_pass_base SRCS fuse_pass_base.cc DEPS pass)
 pass_library(graph_to_program_pass base)
 pass_library(graph_viz_pass base)
 pass_library(lock_free_optimize_pass base)
-pass_library(cpu_quantize_placement_pass base)
-pass_library(cpu_quantize_pass inference)
-pass_library(cpu_quantize_squash_pass inference)
 pass_library(fc_fuse_pass inference)
 pass_library(attention_lstm_fuse_pass inference)
 pass_library(infer_clean_graph_pass inference)
@@ -71,22 +68,31 @@ pass_library(transpose_flatten_concat_fuse_pass inference)
 pass_library(identity_scale_op_clean_pass base)
 pass_library(sync_batch_norm_pass base)
 pass_library(runtime_context_cache_pass base)
+pass_library(simplify_anakin_detection_pattern_pass inference)
+pass_library(anakin_fillconstant_elementwisemul_fuse inference)
 # There may be many transpose-flatten structures in a model, and the output of
 # these structures will be used as inputs to the concat Op. This pattern will
 # be detected by our pass. The index here represents the number of structures in the
 # pattern. We use index 3 ~ 6, because these quantities of structures are
 # common in the models.
-foreach (index RANGE 3 6)
+foreach (index RANGE 2 6)
   file(APPEND ${pass_file} "USE_PASS(transpose_flatten${index}_concat_fuse_pass);\n")
 endforeach()
+foreach (index RANGE 2 6)
+   file(APPEND ${pass_file} "USE_PASS(simplify_anakin_detection_pattern_pass${index});\n")
+endforeach()
 if(WITH_MKLDNN)
    pass_library(mkldnn_placement_pass base mkldnn)
    pass_library(depthwise_conv_mkldnn_pass base mkldnn)
    pass_library(conv_bias_mkldnn_fuse_pass inference mkldnn)
    pass_library(conv_relu_mkldnn_fuse_pass inference mkldnn)
    pass_library(conv_elementwise_add_mkldnn_fuse_pass inference mkldnn)
+    pass_library(cpu_quantize_placement_pass base mkldnn)
+    pass_library(cpu_quantize_pass inference mkldnn)
+    pass_library(cpu_quantize_squash_pass inference mkldnn)
 endif()
 cc_library(fuse_elewise_add_act_pass SRCS fuse_elewise_add_act_pass.cc DEPS pass graph_pattern_detector )
@@ -105,9 +111,6 @@ cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS g
 cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto)
 cc_test(test_seqpool_concat_fuse_pass SRCS seqpool_concat_fuse_pass_tester.cc DEPS seqpool_concat_fuse_pass framework_proto)
 cc_test(test_is_test_pass SRCS is_test_pass_tester.cc DEPS is_test_pass)
-cc_test(test_cpu_quantize_placement_pass SRCS cpu_quantize_placement_pass_tester.cc DEPS cpu_quantize_placement_pass)
-cc_test(test_cpu_quantize_pass SRCS cpu_quantize_pass_tester.cc DEPS cpu_quantize_pass naive_executor)
-cc_test(test_cpu_quantize_squash_pass SRCS cpu_quantize_squash_pass_tester.cc DEPS cpu_quantize_squash_pass naive_executor)
 if(NOT WIN32)
    cc_test(test_sync_batch_norm_pass SRCS sync_batch_norm_pass_tester.cc DEPS sync_batch_norm_pass)
 endif()
@@ -117,4 +120,7 @@ if (WITH_MKLDNN)
    cc_test(test_conv_relu_mkldnn_fuse_pass SRCS mkldnn/conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass)
    cc_test(test_conv_elementwise_add_mkldnn_fuse_pass SRCS mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc DEPS conv_elementwise_add_mkldnn_fuse_pass)
    cc_test(test_mkldnn_placement_pass SRCS mkldnn/mkldnn_placement_pass_tester.cc DEPS mkldnn_placement_pass)
+    cc_test(test_cpu_quantize_placement_pass SRCS mkldnn/cpu_quantize_placement_pass_tester.cc DEPS cpu_quantize_placement_pass)
+    cc_test(test_cpu_quantize_pass SRCS mkldnn/cpu_quantize_pass_tester.cc DEPS cpu_quantize_pass naive_executor)
+    cc_test(test_cpu_quantize_squash_pass SRCS mkldnn/cpu_quantize_squash_pass_tester.cc DEPS cpu_quantize_squash_pass naive_executor)
 endif ()
--- a/paddle/fluid/framework/ir/anakin_fillconstant_elementwisemul_fuse.cc
+++ b/paddle/fluid/framework/ir/anakin_fillconstant_elementwisemul_fuse.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include <memory>
+#include <string>
+#include "paddle/fluid/framework/ir/anakin_fillconstant_elementwisemul_fuse.h"
+#include "paddle/fluid/framework/ir/graph_viz_pass.h"
+namespace paddle {
+namespace framework {
+namespace ir {
+#define GET_IR_NODE(node__) GET_IR_NODE_FROM_SUBGRAPH(node__, node__, pattern);
+#define GET_NODES                 \
+  GET_IR_NODE(fill_constant);     \
+  GET_IR_NODE(fill_constant_out); \
+  GET_IR_NODE(elementwise_mul);   \
+  GET_IR_NODE(elementwise_mul_out);
+std::unique_ptr<ir::Graph> AnakinFillconstantElementwisemulFuse::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  const std::string pattern_name = "anakin_fillconstant_elementwisemul_fuse";
+  FusePassBase::Init(pattern_name, graph.get());
+  GraphPatternDetector gpd;
+  auto* x = gpd.mutable_pattern()
+                ->NewNode("x")
+                ->assert_is_op_input("elementwise_mul", "X")
+                ->AsInput();
+  patterns::AnakinFillConstantElementWiseMulFuse pattern(gpd.mutable_pattern(),
+                                                         pattern_name);
+  pattern(x);
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+    GET_NODES;
+    PADDLE_ENFORCE(subgraph.count(x));
+    auto* elementwise_in = subgraph.at(x);
+    float constant_value =
+        boost::get<float>(fill_constant->Op()->GetAttr("value"));
+    framework::OpDesc new_op_desc;
+    new_op_desc.SetType("scale");
+    new_op_desc.SetInput("X", {elementwise_in->Name()});
+    new_op_desc.SetAttr("scale", constant_value);
+    new_op_desc.SetAttr("bias", static_cast<float>(0.0));
+    new_op_desc.SetAttr("bias_after_scale", true);
+    new_op_desc.SetOutput("Out", {elementwise_mul_out->Name()});
+    new_op_desc.Flush();
+    // Create a new node for the fused op.
+    auto* scale_op = graph->CreateOpNode(&new_op_desc);
+    IR_NODE_LINK_TO(elementwise_in, scale_op);       // Input
+    IR_NODE_LINK_TO(scale_op, elementwise_mul_out);  // Output
+    // Delete the unneeded nodes.
+    GraphSafeRemoveNodes(graph.get(),
+                         {fill_constant, fill_constant_out, elementwise_mul});
+  };
+  gpd(graph.get(), handler);
+  return graph;
+}
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+REGISTER_PASS(anakin_fillconstant_elementwisemul_fuse,
+              paddle::framework::ir::AnakinFillconstantElementwisemulFuse);
--- a/paddle/fluid/framework/ir/anakin_fillconstant_elementwisemul_fuse.h
+++ b/paddle/fluid/framework/ir/anakin_fillconstant_elementwisemul_fuse.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <memory>
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+namespace paddle {
+namespace framework {
+namespace ir {
+class AnakinFillconstantElementwisemulFuse : public FusePassBase {
+ public:
+  virtual ~AnakinFillconstantElementwisemulFuse() {}
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
+};
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -1470,6 +1470,171 @@ PDNode *patterns::TransposeFlattenConcat::operator()(
  return concat_out;
 }
+PDNode *patterns::AnakinDetectionPattern::operator()(
+    std::vector<PDNode *> conv_in, int times) {
+  // The times represents the repeat times of the
+  // {prior_box, prior_box_loc_out, flatten, prior_box_var_out, reshape}
+  const int kNumFields = 7;
+  const int kPriorBoxLocOffset = 1;
+  const int kReshape1Offset = 2;
+  const int kReshape1OutOffset = 3;
+  const int kPriorBoxVarOffset = 4;
+  const int kReshape2Offset = 5;
+  const int kReshape2OutOffset = 6;
+  const int kBoxCoderThirdInputOffset = times;
+  const int kMultiClassSecondInputNmsOffset = times + 1;
+  std::vector<PDNode *> nodes;
+  for (int i = 0; i < times; i++) {
+    nodes.push_back(
+        pattern->NewNode(GetNodeName("prior_box" + std::to_string(i)))
+            ->assert_is_op("density_prior_box"));
+    nodes.push_back(pattern->NewNode(GetNodeName("box_out" + std::to_string(i)))
+                        ->assert_is_op_output("density_prior_box", "Boxes")
+                        ->assert_is_op_input("reshape2", "X")
+                        ->AsIntermediate());
+    nodes.push_back(
+        pattern->NewNode(GetNodeName("reshape1" + std::to_string(i)))
+            ->assert_is_op("reshape2"));
+    nodes.push_back(
+        pattern->NewNode(GetNodeName("reshape1_out" + std::to_string(i)))
+            ->assert_is_op_output("reshape2")
+            ->assert_is_op_nth_input("concat", "X", i)
+            ->AsIntermediate());
+    nodes.push_back(
+        pattern->NewNode(GetNodeName("box_var_out" + std::to_string(i)))
+            ->assert_is_op_output("density_prior_box", "Variances")
+            ->assert_is_op_input("reshape2", "X")
+            ->AsIntermediate());
+    nodes.push_back(
+        pattern->NewNode(GetNodeName("reshape2" + std::to_string(i)))
+            ->assert_is_op("reshape2"));
+    nodes.push_back(
+        pattern->NewNode(GetNodeName("reshape2_out" + std::to_string(i)))
+            ->assert_is_op_output("reshape2")
+            ->assert_is_op_nth_input("concat", "X", i)
+            ->AsIntermediate());
+  }
+  auto concat_op1 = pattern->NewNode(GetNodeName("concat1"))
+                        ->assert_is_op("concat")
+                        ->assert_op_has_n_inputs("concat", times);
+  auto concat_out1 = pattern->NewNode(GetNodeName("concat1_out"))
+                         ->assert_is_op_output("concat")
+                         ->AsIntermediate();
+  auto concat_op2 = pattern->NewNode(GetNodeName("concat2"))
+                        ->assert_is_op("concat")
+                        ->assert_op_has_n_inputs("concat", times);
+  auto concat_out2 = pattern->NewNode(GetNodeName("concat2_out"))
+                         ->assert_is_op_output("concat")
+                         ->AsIntermediate();
+  auto box_coder_op = pattern->NewNode(GetNodeName("box_coder"))
+                          ->assert_is_op("box_coder")
+                          ->assert_op_has_n_inputs("box_coder", 3);
+  auto box_coder_out = pattern->NewNode(GetNodeName("box_coder_out"))
+                           ->assert_is_op_output("box_coder")
+                           ->AsIntermediate();
+  auto transpose_before_nms =
+      pattern->NewNode(GetNodeName("transpose_before_nms"))
+          ->assert_is_op("transpose2");
+  auto transpose_before_nms_out =
+      pattern->NewNode(GetNodeName("transpose_before_nms_out"))
+          ->assert_is_op_output("transpose2")
+          ->assert_is_op_input("multiclass_nms", "Scores")
+          ->AsIntermediate();
+  auto multiclass_nms_op = pattern->NewNode(GetNodeName("multiclass_nms"))
+                               ->assert_is_op("multiclass_nms")
+                               ->assert_op_has_n_inputs("multiclass_nms", 2);
+  auto multiclass_nms_out = pattern->NewNode(GetNodeName("multiclass_nms_out"))
+                                ->assert_is_op_output("multiclass_nms")
+                                ->AsOutput();
+  std::vector<PDNode *> reshape1_outs;
+  std::vector<PDNode *> reshape2_outs;
+  for (int i = 0; i < times; i++) {
+    conv_in[i]->AsInput();
+    // prior_box
+    nodes[i * kNumFields]->LinksFrom({conv_in[i]});
+    // prior_box box out
+    nodes[i * kNumFields + kPriorBoxLocOffset]->LinksFrom(
+        {nodes[i * kNumFields]});
+    // reshape
+    nodes[i * kNumFields + kReshape1Offset]->LinksFrom(
+        {nodes[i * kNumFields + kPriorBoxLocOffset]});
+    // reshape_out
+    nodes[i * kNumFields + kReshape1OutOffset]->LinksFrom(
+        {nodes[i * kNumFields + kReshape1Offset]});
+    nodes[i * kNumFields + kPriorBoxVarOffset]->LinksFrom(
+        {nodes[i * kNumFields]});
+    // reshape
+    nodes[i * kNumFields + kReshape2Offset]->LinksFrom(
+        {nodes[i * kNumFields + kPriorBoxVarOffset]});
+    // reshape_out
+    nodes[i * kNumFields + kReshape2OutOffset]->LinksFrom(
+        {nodes[i * kNumFields + kReshape2Offset]});
+    reshape1_outs.push_back(nodes[i * kNumFields + kReshape1OutOffset]);
+    reshape2_outs.push_back(nodes[i * kNumFields + kReshape2OutOffset]);
+  }
+  concat_op1->LinksFrom(reshape1_outs);
+  concat_op2->LinksFrom(reshape2_outs);
+  concat_out1->LinksFrom({concat_op1});
+  concat_out2->LinksFrom({concat_op2});
+  conv_in[kBoxCoderThirdInputOffset]->AsInput();
+  conv_in[kMultiClassSecondInputNmsOffset]->AsInput();
+  box_coder_op->LinksFrom(
+      {concat_out1, concat_out2, conv_in[kBoxCoderThirdInputOffset]});
+  box_coder_out->LinksFrom({box_coder_op});
+  transpose_before_nms->LinksFrom({conv_in[kMultiClassSecondInputNmsOffset]});
+  transpose_before_nms_out->LinksFrom({transpose_before_nms});
+  multiclass_nms_op->LinksFrom({box_coder_out, transpose_before_nms_out})
+      .LinksTo({multiclass_nms_out});
+  return multiclass_nms_out;
+}
+PDNode *patterns::AnakinFillConstantElementWiseMulFuse::operator()(
+    PDNode *elementwise_op_input) {
+  auto fill_constant =
+      pattern->NewNode(fill_constant_repr())->assert_is_op("fill_constant");
+  auto fill_constant_out = pattern->NewNode(fill_constant_out_repr())
+                               ->assert_is_op_output("fill_constant")
+                               ->assert_is_op_input("elementwise_mul", "Y")
+                               ->AsIntermediate();
+  auto elementwise_mul_op =
+      pattern->NewNode(elementwise_mul_repr())->assert_is_op("elementwise_mul");
+  auto elementwise_mul_out = pattern->NewNode(elementwise_mul_out_repr())
+                                 ->assert_is_op_output("elementwise_mul")
+                                 ->AsOutput();
+  fill_constant_out->LinksFrom({fill_constant});
+  elementwise_mul_op->LinksFrom({elementwise_op_input, fill_constant_out});
+  elementwise_mul_out->LinksFrom({elementwise_mul_op});
+  return elementwise_mul_out;
+}
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/ir/graph_pattern_detector.h
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.h
@@ -844,6 +844,36 @@ struct TransposeFlattenConcat : public PatternBase {
  }
 };
+struct AnakinDetectionPattern : public PatternBase {
+  AnakinDetectionPattern(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "anakin_detect_pattern") {}
+  PDNode* operator()(std::vector<PDNode*> conv_inputs, int times);
+  std::string GetNodeName(const std::string& op_type) {
+    return PDNodeName(name_scope_, repr_, id_, op_type);
+  }
+  PDNode* GetPDNode(const std::string& op_type) {
+    return pattern->RetrieveNode(GetNodeName(op_type));
+  }
+};
+struct AnakinFillConstantElementWiseMulFuse : public PatternBase {
+  AnakinFillConstantElementWiseMulFuse(PDPattern* pattern,
+                                       const std::string& name_scope)
+      : PatternBase(pattern, name_scope,
+                    "anakin_fillconstant_elementwisemul_fuse") {}
+  PDNode* operator()(PDNode* elementwise_op_input);
+  // declare operator node's name
+  PATTERN_DECL_NODE(fill_constant);
+  PATTERN_DECL_NODE(fill_constant_out);
+  PATTERN_DECL_NODE(elementwise_mul);
+  PATTERN_DECL_NODE(elementwise_mul_out);
+};
 }  // namespace patterns
 // Link two ir::Nodes from each other.

--- a/paddle/fluid/framework/ir/cpu_quantize_pass.cc
+++ b/paddle/fluid/framework/ir/cpu_quantize_pass.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include "paddle/fluid/framework/ir/cpu_quantize_pass.h"
+#include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h"
 #include <utility>
 #include <vector>
 #include "paddle/fluid/framework/eigen.h"

--- a/paddle/fluid/framework/ir/cpu_quantize_pass.h
+++ b/paddle/fluid/framework/ir/cpu_quantize_pass.h
--- a/paddle/fluid/framework/ir/cpu_quantize_pass_tester.cc
+++ b/paddle/fluid/framework/ir/cpu_quantize_pass_tester.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include "paddle/fluid/framework/ir/cpu_quantize_pass.h"
+#include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h"
 #include <gtest/gtest.h>
 #include "paddle/fluid/framework/naive_executor.h"
 #include "paddle/fluid/platform/place.h"

--- a/paddle/fluid/framework/ir/cpu_quantize_placement_pass.cc
+++ b/paddle/fluid/framework/ir/cpu_quantize_placement_pass.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/fluid/framework/ir/cpu_quantize_placement_pass.h"
+#include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.h"
 #include <string>
 #include <unordered_set>

--- a/paddle/fluid/framework/ir/cpu_quantize_placement_pass.h
+++ b/paddle/fluid/framework/ir/cpu_quantize_placement_pass.h
--- a/paddle/fluid/framework/ir/cpu_quantize_placement_pass_tester.cc
+++ b/paddle/fluid/framework/ir/cpu_quantize_placement_pass_tester.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include "paddle/fluid/framework/ir/cpu_quantize_placement_pass.h"
+#include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.h"
 #include <gtest/gtest.h>
 #include <boost/logic/tribool.hpp>

--- a/paddle/fluid/framework/ir/cpu_quantize_squash_pass.cc
+++ b/paddle/fluid/framework/ir/cpu_quantize_squash_pass.cc
@@ -13,7 +13,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include "paddle/fluid/framework/ir/cpu_quantize_squash_pass.h"
+#include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h"
 #include <string>
 #include <vector>
 #include "paddle/fluid/platform/enforce.h"

--- a/paddle/fluid/framework/ir/cpu_quantize_squash_pass.h
+++ b/paddle/fluid/framework/ir/cpu_quantize_squash_pass.h
--- a/paddle/fluid/framework/ir/cpu_quantize_squash_pass_tester.cc
+++ b/paddle/fluid/framework/ir/cpu_quantize_squash_pass_tester.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include "paddle/fluid/framework/ir/cpu_quantize_squash_pass.h"
+#include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h"
 #include <gtest/gtest.h>
 #include "paddle/fluid/framework/naive_executor.h"
 #include "paddle/fluid/platform/place.h"

--- a/paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.cc
+++ b/paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/ir/graph_viz_pass.h"
+#include "paddle/fluid/framework/ir/node.h"
+#include "paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.h"
+namespace paddle {
+namespace framework {
+namespace ir {
+template <int times>
+std::unique_ptr<ir::Graph> SimplifyAnakinDetectionPatternPass<times>::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  const std::string pattern_name =
+      "simplify_anakin_detection_pattern_pass" + std::to_string(times);
+  FusePassBase::Init(pattern_name, graph.get());
+  GraphPatternDetector gpd;
+  std::vector<PDNode *> input_nodes;
+  for (int i = 0; i < times; i++) {
+    input_nodes.push_back(gpd.mutable_pattern()
+                              ->NewNode("x" + std::to_string(i))
+                              ->assert_is_op_input("density_prior_box", "Input")
+                              ->AsInput());
+  }
+  input_nodes.push_back(gpd.mutable_pattern()
+                            ->NewNode("x" + std::to_string(times))
+                            ->assert_is_op_input("box_coder", "TargetBox")
+                            ->AsInput());
+  input_nodes.push_back(gpd.mutable_pattern()
+                            ->NewNode("x" + std::to_string(times + 1))
+                            ->assert_is_op_input("transpose2")
+                            ->AsInput());
+  patterns::AnakinDetectionPattern pattern(gpd.mutable_pattern(), pattern_name);
+  pattern(input_nodes, times);
+  auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph,
+                     Graph *g) {
+    const int kNumFields = 7;
+    const int kPriorBoxLocOffset = 1;
+    const int kReshape1Offset = 2;
+    const int kReshape1OutOffset = 3;
+    const int kPriorBoxVarOffset = 4;
+    const int kReshape2Offset = 5;
+    const int kReshape2OutOffset = 6;
+    std::vector<Node *> nodes;
+    for (int i = 0; i < times; i++) {
+      PADDLE_ENFORCE(
+          subgraph.at(pattern.GetPDNode("prior_box" + std::to_string(i))));
+      PADDLE_ENFORCE(
+          subgraph.at(pattern.GetPDNode("box_out" + std::to_string(i))));
+      PADDLE_ENFORCE(
+          subgraph.at(pattern.GetPDNode("reshape1" + std::to_string(i))));
+      PADDLE_ENFORCE(
+          subgraph.at(pattern.GetPDNode("reshape1_out" + std::to_string(i))));
+      PADDLE_ENFORCE(
+          subgraph.at(pattern.GetPDNode("reshape2" + std::to_string(i))));
+      PADDLE_ENFORCE(
+          subgraph.at(pattern.GetPDNode("reshape2_out" + std::to_string(i))));
+      PADDLE_ENFORCE(
+          subgraph.at(pattern.GetPDNode("box_var_out" + std::to_string(i))));
+      nodes.push_back(
+          subgraph.at(pattern.GetPDNode("prior_box" + std::to_string(i))));
+      nodes.push_back(
+          subgraph.at(pattern.GetPDNode("box_out" + std::to_string(i))));
+      nodes.push_back(
+          subgraph.at(pattern.GetPDNode("reshape1" + std::to_string(i))));
+      nodes.push_back(
+          subgraph.at(pattern.GetPDNode("reshape1_out" + std::to_string(i))));
+      nodes.push_back(
+          subgraph.at(pattern.GetPDNode("box_var_out" + std::to_string(i))));
+      nodes.push_back(
+          subgraph.at(pattern.GetPDNode("reshape2" + std::to_string(i))));
+      nodes.push_back(
+          subgraph.at(pattern.GetPDNode("reshape2_out" + std::to_string(i))));
+    }
+    Node *concat_op1 = subgraph.at(pattern.GetPDNode("concat1"));
+    Node *concat_out1 = subgraph.at(pattern.GetPDNode("concat1_out"));
+    Node *concat_op2 = subgraph.at(pattern.GetPDNode("concat2"));
+    Node *concat_out2 = subgraph.at(pattern.GetPDNode("concat2_out"));
+    Node *box_coder_third_input = subgraph.at(input_nodes[times]);
+    Node *box_coder_op = subgraph.at(pattern.GetPDNode("box_coder"));
+    Node *box_coder_out = subgraph.at(pattern.GetPDNode("box_coder_out"));
+    Node *multiclass_nms_second_input = subgraph.at(input_nodes[times + 1]);
+    Node *transpose_before_nms =
+        subgraph.at(pattern.GetPDNode("transpose_before_nms"));
+    Node *transpose_before_nms_out =
+        subgraph.at(pattern.GetPDNode("transpose_before_nms_out"));
+    Node *multiclass_nms = subgraph.at(pattern.GetPDNode("multiclass_nms"));
+    Node *multiclass_nms_out =
+        subgraph.at(pattern.GetPDNode("multiclass_nms_out"));
+    std::string code_type =
+        boost::get<std::string>(box_coder_op->Op()->GetAttr("code_type"));
+    bool box_normalized =
+        boost::get<bool>(box_coder_op->Op()->GetAttr("box_normalized"));
+    // auto variance =
+    // boost::get<std::vector<float>>(box_coder_op->Op()->GetAttr("variance"));
+    int background_label =
+        boost::get<int>(multiclass_nms->Op()->GetAttr("background_label"));
+    float score_threshold =
+        boost::get<float>(multiclass_nms->Op()->GetAttr("score_threshold"));
+    int nms_top_k = boost::get<int>(multiclass_nms->Op()->GetAttr("nms_top_k"));
+    float nms_threshold =
+        boost::get<float>(multiclass_nms->Op()->GetAttr("nms_threshold"));
+    float nms_eta = boost::get<float>(multiclass_nms->Op()->GetAttr("nms_eta"));
+    int keep_top_k =
+        boost::get<int>(multiclass_nms->Op()->GetAttr("keep_top_k"));
+    std::vector<std::string> concat1_input_names;
+    for (int i = 0; i < times; i++) {
+      concat1_input_names.push_back(
+          nodes[i * kNumFields + kPriorBoxLocOffset]->Name());
+    }
+    // int axis = boost::get<int>(concat_op1->Op()->GetAttr("axis"));
+    framework::OpDesc concat1_desc;
+    concat1_desc.SetType("concat");
+    concat1_desc.SetInput("X", concat1_input_names);
+    concat1_desc.SetAttr("axis", 2);
+    concat1_desc.SetOutput("Out", {concat_out1->Name()});
+    auto *new_add_concat_op = graph->CreateOpNode(&concat1_desc);
+    for (int i = 0; i < times; i++) {
+      nodes[i * kNumFields + kPriorBoxLocOffset]->outputs.push_back(
+          new_add_concat_op);
+      new_add_concat_op->inputs.push_back(
+          nodes[i * kNumFields + kPriorBoxLocOffset]);
+    }
+    framework::OpDesc new_op_desc;
+    new_op_desc.SetType("detection_out");
+    new_op_desc.SetInput("PriorBox", {concat_out1->Name()});
+    new_op_desc.SetInput("TargetBox", {box_coder_third_input->Name()});
+    new_op_desc.SetInput("Scores", {multiclass_nms_second_input->Name()});
+    new_op_desc.SetAttr("code_type", code_type);
+    new_op_desc.SetAttr("box_normalized", box_normalized);
+    new_op_desc.SetAttr("background_label", background_label);
+    new_op_desc.SetAttr("score_threshold", score_threshold);
+    new_op_desc.SetAttr("nms_top_k", nms_top_k);
+    new_op_desc.SetAttr("nms_threshold", nms_threshold);
+    new_op_desc.SetAttr("nms_eta", nms_eta);
+    new_op_desc.SetAttr("keep_top_k", keep_top_k);
+    new_op_desc.SetOutput("Out", {multiclass_nms_out->Name()});
+    new_op_desc.Flush();
+    // Create a new node for the fused op.
+    auto *detection_out_op = graph->CreateOpNode(&new_op_desc);
+    std::unordered_set<const Node *> delete_nodes;
+    for (int i = 0; i < times; i++) {
+      nodes[i * kNumFields + kPriorBoxLocOffset]->outputs.push_back(concat_op1);
+      delete_nodes.insert(nodes[i * kNumFields + kReshape1Offset]);
+      delete_nodes.insert(nodes[i * kNumFields + kReshape1OutOffset]);
+      delete_nodes.insert(nodes[i * kNumFields + kPriorBoxVarOffset]);
+      delete_nodes.insert(nodes[i * kNumFields + kReshape2Offset]);
+      delete_nodes.insert(nodes[i * kNumFields + kReshape2OutOffset]);
+    }
+    delete_nodes.insert(concat_op1);
+    delete_nodes.insert(concat_op2);
+    delete_nodes.insert(concat_out2);
+    delete_nodes.insert(box_coder_op);
+    delete_nodes.insert(box_coder_out);
+    delete_nodes.insert(transpose_before_nms);
+    delete_nodes.insert(transpose_before_nms_out);
+    delete_nodes.insert(multiclass_nms);
+    new_add_concat_op->outputs.push_back(concat_out1);
+    concat_out1->inputs.push_back(new_add_concat_op);
+    detection_out_op->inputs.push_back(concat_out1);
+    detection_out_op->inputs.push_back(box_coder_third_input);
+    detection_out_op->inputs.push_back(multiclass_nms_second_input);
+    detection_out_op->outputs.push_back(multiclass_nms_out);
+    concat_out1->outputs.push_back(detection_out_op);
+    box_coder_third_input->outputs.push_back(detection_out_op);
+    multiclass_nms_second_input->outputs.push_back(detection_out_op);
+    multiclass_nms_out->inputs.push_back(detection_out_op);
+    // Delete the unneeded nodes.
+    GraphSafeRemoveNodes(graph.get(), delete_nodes);
+  };
+  gpd(graph.get(), handler);
+  return graph;
+}
+template class SimplifyAnakinDetectionPatternPass<1>;
+template class SimplifyAnakinDetectionPatternPass<2>;
+template class SimplifyAnakinDetectionPatternPass<3>;
+template class SimplifyAnakinDetectionPatternPass<4>;
+template class SimplifyAnakinDetectionPatternPass<5>;
+template class SimplifyAnakinDetectionPatternPass<6>;
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+REGISTER_PASS(simplify_anakin_detection_pattern_pass,
+              paddle::framework::ir::SimplifyAnakinDetectionPatternPass<1>);
+REGISTER_PASS(simplify_anakin_detection_pattern_pass2,
+              paddle::framework::ir::SimplifyAnakinDetectionPatternPass<2>);
+REGISTER_PASS(simplify_anakin_detection_pattern_pass3,
+              paddle::framework::ir::SimplifyAnakinDetectionPatternPass<3>);
+REGISTER_PASS(simplify_anakin_detection_pattern_pass4,
+              paddle::framework::ir::SimplifyAnakinDetectionPatternPass<4>);
+REGISTER_PASS(simplify_anakin_detection_pattern_pass5,
+              paddle::framework::ir::SimplifyAnakinDetectionPatternPass<5>);
+REGISTER_PASS(simplify_anakin_detection_pattern_pass6,
+              paddle::framework::ir::SimplifyAnakinDetectionPatternPass<6>);
--- a/paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.h
+++ b/paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <memory>
+#include <unordered_set>
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+namespace paddle {
+namespace framework {
+namespace ir {
+// There may be many transpose-flatten structures in a model, and the output of
+// these structures will be used as inputs to the concat Op. This pattern will
+// be detected by our pass. The times here represents the repeat times of this
+// structure.
+template <int times>
+class SimplifyAnakinDetectionPatternPass : public FusePassBase {
+ public:
+  virtual ~SimplifyAnakinDetectionPatternPass() {}
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
+};
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc
@@ -12,7 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
+#include <memory>
 #include <string>
+#include <unordered_set>
 #include <vector>
 #include "paddle/fluid/framework/ir/graph_viz_pass.h"
@@ -123,6 +125,7 @@ std::unique_ptr<ir::Graph> TransposeFlattenConcatFusePass<times>::ApplyImpl(
 }
 template class TransposeFlattenConcatFusePass<1>;
+template class TransposeFlattenConcatFusePass<2>;
 template class TransposeFlattenConcatFusePass<3>;
 template class TransposeFlattenConcatFusePass<4>;
 template class TransposeFlattenConcatFusePass<5>;
@@ -135,6 +138,9 @@ template class TransposeFlattenConcatFusePass<6>;
 REGISTER_PASS(transpose_flatten_concat_fuse_pass,
              paddle::framework::ir::TransposeFlattenConcatFusePass<1>);
+REGISTER_PASS(transpose_flatten2_concat_fuse_pass,
+              paddle::framework::ir::TransposeFlattenConcatFusePass<2>);
 REGISTER_PASS(transpose_flatten3_concat_fuse_pass,
              paddle::framework::ir::TransposeFlattenConcatFusePass<3>);

--- a/paddle/fluid/framework/no_need_buffer_vars_inference.h
+++ b/paddle/fluid/framework/no_need_buffer_vars_inference.h
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <string>
+#include <unordered_set>
+#include <vector>
+#include "paddle/fluid/framework/op_desc.h"
+namespace paddle {
+namespace framework {
+class NoNeedBufferVarsInference {
+ public:
+  NoNeedBufferVarsInference(const VariableNameMap &inputs,
+                            const VariableNameMap &outputs,
+                            const AttributeMap &attrs)
+      : inputs_(inputs), outputs_(outputs), attrs_(attrs) {}
+  virtual ~NoNeedBufferVarsInference() = default;
+  const VariableNameMap &Inputs() const { return inputs_; }
+  const VariableNameMap &Outputs() const { return outputs_; }
+  const AttributeMap &Attrs() const { return attrs_; }
+  virtual std::unordered_set<std::string> operator()() const = 0;
+ private:
+  const VariableNameMap &inputs_;
+  const VariableNameMap &outputs_;
+  const AttributeMap &attrs_;
+};
+#define DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(class_type, ...)               \
+  class class_type : public ::paddle::framework::NoNeedBufferVarsInference { \
+   public:                                                                   \
+    using ::paddle::framework::NoNeedBufferVarsInference::                   \
+        NoNeedBufferVarsInference;                                           \
+                                                                             \
+    std::unordered_set<std::string> operator()() const override {            \
+      return {__VA_ARGS__};                                                  \
+    }                                                                        \
+  }
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/op_info.h
+++ b/paddle/fluid/framework/op_info.h
@@ -19,6 +19,7 @@ limitations under the License. */
 #include <unordered_map>
 #include "paddle/fluid/framework/attribute.h"
+#include "paddle/fluid/framework/no_need_buffer_vars_inference.h"
 #include "paddle/fluid/framework/type_defs.h"
 #include "paddle/fluid/platform/macros.h"
@@ -39,6 +40,7 @@ struct OpInfo {
  InferVarTypeFN infer_var_type_;
  InferShapeFN infer_shape_;
  InferInplaceOpFN infer_inplace_;
+  InferNoNeedBufferVarsFN infer_no_need_buffer_vars_;
  bool HasOpProtoAndChecker() const {
    return proto_ != nullptr && checker_ != nullptr;
@@ -64,6 +66,10 @@ struct OpInfo {
  }
  const OpAttrChecker* Checker() const { return checker_; }
+  const InferNoNeedBufferVarsFN& NoNeedBufferVarsInferer() const {
+    return infer_no_need_buffer_vars_;
+  }
 };
 class OpInfoMap {

--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <algorithm>
 #include <sstream>
 #include <string>
+#include <unordered_set>
 #include <vector>
 #include "paddle/fluid/framework/data_transform.h"
 #include "paddle/fluid/framework/executor.h"
@@ -64,9 +65,9 @@ static DDim GetDims(const Scope& scope, const std::string& name,
  if (var->IsType<LoDTensor>()) {
    const LoDTensor& tensor = var->Get<LoDTensor>();
-    if (UNLIKELY(!tensor.IsInitialized())) {
+    // if (UNLIKELY(!tensor.IsInitialized())) {
-      return DDim({-1});
+    //   return DDim({-1});
-    }
+    // }
    return tensor.dims();
  } else if (var->IsType<SelectedRows>()) {
    if (get_actual_dim) {
@@ -132,9 +133,9 @@ static LoD GetLoD(const Scope& scope, const std::string& name) {
  if (var->IsType<LoDTensor>()) {
    const LoDTensor& tensor = var->Get<LoDTensor>();
-    if (UNLIKELY(!tensor.IsInitialized())) {
+    // if (UNLIKELY(!tensor.IsInitialized())) {
-      return default_lod;
+    //   return default_lod;
-    }
+    // }
    return tensor.lod();
  } else {
    return default_lod;
@@ -326,7 +327,12 @@ OperatorBase::OperatorBase(const std::string& type,
                           const VariableNameMap& inputs,
                           const VariableNameMap& outputs,
                           const AttributeMap& attrs)
-    : type_(type), inputs_(inputs), outputs_(outputs), attrs_(attrs) {
+    : type_(type),
+      inputs_(inputs),
+      outputs_(outputs),
+      attrs_(attrs),
+      // NOTE(zjl): why op_info may be nullptr?
+      info_(OpInfoMap::Instance().GetNullable(type)) {
  GenerateTemporaryNames();
  CheckAllInputOutputSet();
 }
@@ -350,7 +356,7 @@ std::vector<std::string> OperatorBase::OutputVars(bool has_intermediate) const {
    }
    return ret_val;
  }
-  auto& info = OpInfoMap::Instance().Get(Type());
+  auto& info = Info();
  // get all OpProto::Var for outputs
  for (auto& o : info.Proto().outputs()) {
@@ -366,18 +372,16 @@ std::vector<std::string> OperatorBase::OutputVars(bool has_intermediate) const {
 }
 void OperatorBase::CheckAllInputOutputSet() const {
-  auto& info_map = OpInfoMap::Instance();
+  if (info_ == nullptr || info_->proto_ == nullptr) return;
-  auto* op_info = info_map.GetNullable(Type());
-  if (op_info == nullptr || op_info->proto_ == nullptr) return;
-  for (auto& in : op_info->Proto().inputs()) {
+  for (auto& in : info_->Proto().inputs()) {
    if (!in.dispensable()) {
      PADDLE_ENFORCE(inputs_.find(in.name()) != inputs_.end(),
                     "Operator %s's input, %s, is not set", Type(), in.name());
    }
  }
-  for (auto& out : op_info->Proto().outputs()) {
+  for (auto& out : info_->Proto().outputs()) {
    if (!out.dispensable()) {
      PADDLE_ENFORCE(outputs_.find(out.name()) != outputs_.end(),
                     "Operator %s's output, %s, is not set", Type(),
@@ -997,7 +1001,27 @@ Scope* OperatorWithKernel::PrepareData(
    std::vector<std::string>* transfered_inplace_vars,
    RuntimeContext* ctx) const {
  Scope* new_scope = nullptr;
+  std::unordered_set<std::string> no_buffer_ins;
+  if (info_) {
+    auto& no_buffer_inferer = info_->NoNeedBufferVarsInferer();
+    // Some op may not register NoNeedBufferVarsInferer
+    if (no_buffer_inferer) {
+      no_buffer_ins = no_buffer_inferer(Inputs(), Outputs(), Attrs());
+    }
+  }
  for (auto& var_name_item : Inputs()) {
+    // NOTE(zjl): STL does not guarantee fast std::unordered_set::count when set
+    // is empty. At least STL implemented on my mac does calculate hash code
+    // of search key even though the set is empty.
+    if (!no_buffer_ins.empty() &&
+        no_buffer_ins.count(var_name_item.first) > 0) {
+      VLOG(1) << "Skip scanning input " << var_name_item.first
+              << " in Operator " << type_;
+      continue;
+    }
    std::vector<Variable*>& input_vars = ctx->inputs[var_name_item.first];
    for (size_t i = 0; i < var_name_item.second.size(); ++i) {

--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -160,6 +160,11 @@ class OperatorBase {
  const VariableNameMap& Inputs() const { return inputs_; }
  const VariableNameMap& Outputs() const { return outputs_; }
+  const OpInfo& Info() const {
+    PADDLE_ENFORCE_NOT_NULL(info_, "OpInfo of %s is not found", type_);
+    return *info_;
+  }
  bool HasInputs(const std::string& name) const;
  //! Get a input with argument's name described in `op_proto`
  std::string Input(const std::string& name) const;
@@ -194,6 +199,10 @@ class OperatorBase {
  // IG (Inputs Gradients)
  VariableNameMap outputs_;
  AttributeMap attrs_;
+  // OpInfo
+  const OpInfo* info_;
  // Whether this operator executes in an Executor.
  bool run_by_executor_{true};
@@ -444,7 +453,7 @@ class OperatorWithKernel : public OperatorBase {
  }
  virtual void InferShape(InferShapeContext* ctx) const {
-    OpInfoMap::Instance().Get(Type()).infer_shape_(ctx);
+    Info().infer_shape_(ctx);
  }
  void RuntimeInferShape(const Scope& scope, const platform::Place& place,

--- a/paddle/fluid/framework/scope.cc
+++ b/paddle/fluid/framework/scope.cc
@@ -29,15 +29,6 @@ DEFINE_bool(
    "Delete local scope eagerly. It will reduce GPU memory usage but "
    "slow down the destruction of variables.(around 1% performance harm)");
-DEFINE_double(
-    eager_delete_tensor_gb, -1.0,
-    "Memory size threshold (GB) when the garbage collector clear tensors."
-    "Disabled when this value is less than 0");
-DEFINE_bool(fast_eager_deletion_mode, true,
-            "Fast eager deletion mode. If enabled, memory would release "
-            "immediately without waiting GPU kernel ends.");
 // When in inference scenario, the scopes will not be written by two threads in
 // a mean time, but a scope may be read by multiple threads concurrently, and
 // the mutex will cause serious performance issue.
@@ -57,15 +48,6 @@ DEFINE_bool(fast_eager_deletion_mode, true,
 namespace paddle {
 namespace framework {
-int64_t GetEagerDeletionThreshold() {
-  return FLAGS_eager_delete_tensor_gb < 0
-             ? -1
-             : static_cast<int64_t>(FLAGS_eager_delete_tensor_gb *
-                                    (static_cast<int64_t>(1) << 30));
-}
-bool IsFastEagerDeletionModeEnabled() { return FLAGS_fast_eager_deletion_mode; }
 Scope::~Scope() { DropKids(); }
 Scope& Scope::NewScope() const {

--- a/paddle/fluid/framework/scope.h
+++ b/paddle/fluid/framework/scope.h
@@ -32,9 +32,6 @@ extern "C" {
 namespace paddle {
 namespace framework {
-int64_t GetEagerDeletionThreshold();
-bool IsFastEagerDeletionModeEnabled();
 class Scope;
 /**

--- a/paddle/fluid/framework/type_defs.h
+++ b/paddle/fluid/framework/type_defs.h
@@ -30,6 +30,7 @@ class InferShapeContext;
 class InferVarTypeContext;
 class BlockDesc;
 class Variable;
+class NoNeedBufferVarsInference;
 using VariableNameMap = std::map<std::string, std::vector<std::string>>;
 // TODO(panyx0718): Replace vector with something like gtl::Vector.
@@ -59,7 +60,11 @@ using InferVarTypeFN =
 using InferShapeFN = std::function<void(InferShapeContext*)>;
 using InplacePair = std::unordered_map<std::string, std::string>;
-using InferInplaceOpFN = std::function<InplacePair(const OpDesc&, BlockDesc*)>;
+using InferInplaceOpFN = std::function<InplacePair(const OpDesc&)>;
+using InferNoNeedBufferVarsFN = std::function<std::unordered_set<std::string>(
+    const VariableNameMap& /*inputs*/, const VariableNameMap& /*outputs*/,
+    const AttributeMap& /*attrs*/)>;
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -16,7 +16,10 @@ add_subdirectory(utils)
 if (TENSORRT_FOUND)
  add_subdirectory(tensorrt)
 endif()
-# add_subdirectory(anakin)
+if (ANAKIN_FOUND)
+  add_subdirectory(anakin)
+endif()
 get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
 get_property(cuda_modules GLOBAL PROPERTY CUDA_MODULES)

--- a/paddle/fluid/inference/anakin/CMakeLists.txt
+++ b/paddle/fluid/inference/anakin/CMakeLists.txt
-cc_library(anakin_engine SRCS engine.cc)
+cc_library(anakin_engine SRCS engine.cc DEPS framework_proto)
+cc_library(anakin_op_teller SRCS op_teller.cc DEPS framework_proto)
 target_link_libraries(anakin_engine anakin anakin_saber_common)
 cc_test(test_anakin_engine SRCS test_anakin_engine.cc DEPS anakin_engine)
 add_subdirectory(convert)
--- a/paddle/fluid/inference/anakin/convert/CMakeLists.txt
+++ b/paddle/fluid/inference/anakin/convert/CMakeLists.txt
-cc_library(anakin_op_converter SRCS fc.cc registrar.cc DEPS anakin_engine framework_proto scope)
+cc_library(anakin_op_converter SRCS fc.cc conv2d.cc conv2d_fusion.cc
-cc_test(test_anakin_fc SRCS test_fc_op.cc DEPS anakin_op_converter mul_op)
+ elementwise.cc activation.cc pool2d.cc concat.cc split.cc relu.cc  softmax.cc batch_norm.cc reshape.cc flatten.cc transpose.cc density_prior_box.cc detection_out.cc scale.cc dropout.cc im2sequence.cc sum.cc DEPS anakin_engine framework_proto scope op_registry)
+cc_test(test_anakin_fc SRCS test_fc_op.cc DEPS anakin_op_converter mul_op SERIAL)
+cc_test(test_anakin_conv2d SRCS test_conv2d_op.cc DEPS anakin_op_converter conv_op im2col vol2col depthwise_conv SERIAL)
+cc_test(test_anakin_activation SRCS test_activation_op.cc DEPS activation_op anakin_op_converter SERIAL)
+cc_test(test_anakin_pool2d SRCS test_pool2d_op.cc DEPS anakin_op_converter pool_op pooling SERIAL)
+cc_test(test_anakin_concat SRCS test_concat_op.cc DEPS anakin_op_converter concat_op concat_and_split SERIAL)
+cc_test(test_anakin_split SRCS test_split_op.cc DEPS anakin_op_converter split_op concat_and_split SERIAL)
+cc_test(test_anakin_elementwise SRCS test_elementwise_op.cc DEPS anakin_op_converter elementwise_add_op elementwise_mul_op SERIAL)
+cc_test(test_anakin_relu SRCS test_relu_op.cc DEPS activation_op anakin_op_converter SERIAL SERIAL)
+cc_test(test_anakin_softmax SRCS test_softmax_op.cc DEPS anakin_op_converter softmax_op softmax SERIAL)
+cc_test(test_anakin_reshape SRCS test_reshape_op.cc DEPS anakin_op_converter reshape_op SERIAL)
+cc_test(test_anakin_flatten SRCS test_flatten_op.cc DEPS anakin_op_converter flatten_op reshape_op SERIAL)
+cc_test(test_anakin_transpose SRCS test_transpose_op.cc DEPS anakin_op_converter transpose_op SERIAL)
+cc_test(test_anakin_batch_norm SRCS test_batch_norm_op.cc DEPS anakin_op_converter batch_norm_op SERIAL)
+cc_test(test_anakin_dropout SRCS test_dropout_op.cc DEPS anakin_op_converter dropout_op SERIAL)
+#cc_test(test_anakin_im2sequence SRCS test_im2sequence_op.cc DEPS anakin_op_converter im2sequence_op im2col)
+cc_test(test_anakin_sum SRCS test_sum_op.cc DEPS  anakin_op_converter sum_op selected_rows_functor SERIAL)
--- a/paddle/fluid/inference/anakin/convert/activation.cc
+++ b/paddle/fluid/inference/anakin/convert/activation.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/inference/anakin/convert/activation.h"
+#include <algorithm>
+#include <map>
+using anakin::graph::GraphGlobalMem;
+using anakin::AK_FLOAT;
+using anakin::saber::NV;
+using anakin::saber::Shape;
+namespace paddle {
+namespace inference {
+namespace anakin {
+ActivationOpConverter::ActivationOpConverter(const std::string &op_type)
+    : op_type_(op_type) {
+  auto it = anakin_op_types_.find(op_type_);
+  PADDLE_ENFORCE(it != anakin_op_types_.end(),
+                 "activation op type is not support");
+  anakin_op_type_ = it->second;
+}
+void ActivationOpConverter::operator()(const framework::proto::OpDesc &op,
+                                       const framework::Scope &scope,
+                                       bool test_mode) {
+  framework::OpDesc op_desc(op, nullptr);
+  PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
+  PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
+  auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
+  auto input_name = op_desc.Input("X").front();
+  auto output_name = op_desc.Output("Out").front();
+  engine_->AddOp(op_name, "Activation", {input_name}, {output_name});
+  engine_->AddOpAttr(op_name, "type", anakin_op_type_);
+}
+}  // namespace anakin
+}  // namespace inference
+}  // namespace paddle
+REGISTER_ANAKIN_OP_CONVERTER(sigmoid, SigmoidOpConverter);
+REGISTER_ANAKIN_OP_CONVERTER(tanh, TanhOpConverter);
--- a/paddle/fluid/inference/anakin/convert/registrar.h
+++ b/paddle/fluid/inference/anakin/convert/registrar.h
@@ -14,45 +14,39 @@
 #pragma once
-#include <functional>
 #include <map>
-#include <memory>
 #include <string>
-#include <utility>
+#include "paddle/fluid/inference/anakin/convert/op_converter.h"
 namespace paddle {
 namespace inference {
 namespace anakin {
-class AnakinOpConverter;
+class ActivationOpConverter : public AnakinOpConverter {
-class OpRegister {
 public:
-  OpRegister() = default;
+  explicit ActivationOpConverter(const std::string &op_type);
-  std::shared_ptr<AnakinOpConverter> Get(const std::string &name);
-  static OpRegister *instance();
+  virtual void operator()(const framework::proto::OpDesc &op,
-  void OpRegisterFn(const std::string &name,
+                          const framework::Scope &scope,
-                    std::function<std::shared_ptr<AnakinOpConverter>()> fn) {
+                          bool test_mode) override;
-    registry_[name] = fn;
+  virtual ~ActivationOpConverter() {}
-  }
 private:
-  using RegisterFnType = std::function<std::shared_ptr<AnakinOpConverter>()>;
+  std::string op_type_;
-  std::map<std::string, std::function<std::shared_ptr<AnakinOpConverter>()>>
+  std::string anakin_op_type_;
-      registry_;
+  std::map<std::string, std::string> anakin_op_types_{{"tanh", "TanH"},
+                                                      {"sigmoid", "Sigmoid"}};
 };
-template <typename T, typename... Args>
+class TanhOpConverter : public ActivationOpConverter {
-class Registrar {
 public:
-  Registrar(const std::string &name, Args... args) {
+  TanhOpConverter() : ActivationOpConverter("tanh") {}
-    std::shared_ptr<AnakinOpConverter> converter =
-        std::make_shared<T>(std::move(args)...);
-    OpRegister::instance()->OpRegisterFn(name,
-                                         [converter]() { return converter; });
-  }
 };
+class SigmoidOpConverter : public ActivationOpConverter {
+ public:
+  SigmoidOpConverter() : ActivationOpConverter("sigmoid") {}
+};
 }  // namespace anakin
 }  // namespace inference
 }  // namespace paddle
--- a/paddle/fluid/inference/anakin/convert/batch_norm.cc
+++ b/paddle/fluid/inference/anakin/convert/batch_norm.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/inference/anakin/convert/batch_norm.h"
+#include <math.h>
+#include <algorithm>
+#include <map>
+#include <string>
+#include <vector>
+using anakin::graph::GraphGlobalMem;
+using anakin::AK_FLOAT;
+using anakin::saber::NV;
+using anakin::saber::Shape;
+namespace paddle {
+namespace inference {
+namespace anakin {
+void BatchNormOpConverter::operator()(const framework::proto::OpDesc &op,
+                                      const framework::Scope &scope,
+                                      bool test_mode) {
+  framework::OpDesc op_desc(op, nullptr);
+  PADDLE_ENFORCE_EQ(op_desc.Output("Y").size(), 1);
+  std::map<std::string, std::string> inputs;
+  for (auto k : {"X", "Scale", "Bias", "Mean", "Variance"}) {
+    PADDLE_ENFORCE_EQ(op_desc.Input(k).size(), 1UL);
+    auto v = op_desc.Input(k).front();
+    inputs.insert({k, v});
+  }
+  auto output = op_desc.Output("Y").front();
+  auto op_name = op_desc.Type() + ":" + op_desc.Output("Y").front();
+  auto epsilon = boost::get<float>(op_desc.GetAttr("epsilon"));
+  // auto momentum = boost::get<float>(op_desc.GetAttr("momentum"));
+  auto bn_op_name = op_name + ":bn";
+  auto bn_output = bn_op_name + "_output";
+  engine_->AddOp(bn_op_name, "BatchNorm", {inputs["X"]}, {bn_output});
+  engine_->AddOpAttr(bn_op_name, "epsilon", epsilon);
+  engine_->AddOpAttr(bn_op_name, "momentum", static_cast<float>(1.0));
+  auto scale_op_name = op_name + ":scale";
+  auto get_lod_tensor = [this, &scope, &op_name](const std::string &var_name,
+                                                 framework::LoDTensor *tensor) {
+    auto *v = scope.FindVar(var_name);
+    PADDLE_ENFORCE_NOT_NULL(v);
+    auto *t = v->GetMutable<framework::LoDTensor>();
+    tensor->Resize(t->dims());
+    TensorCopySync(*t, platform::CPUPlace(), tensor);
+  };
+  framework::LoDTensor bias_t;
+  framework::LoDTensor mean_t;
+  framework::LoDTensor scale_t;
+  framework::LoDTensor variance_t;
+  get_lod_tensor(inputs["Bias"], &bias_t);
+  get_lod_tensor(inputs["Mean"], &mean_t);
+  get_lod_tensor(inputs["Scale"], &scale_t);
+  get_lod_tensor(inputs["Variance"], &variance_t);
+  auto fill_shape = [](size_t n, std::vector<int> shape) {
+    shape.insert(shape.begin(), 1);
+    if (shape.size() < n) {
+      shape.insert(shape.end(), n - shape.size(), 1);
+    }
+    return shape;
+  };
+  Shape shape1(fill_shape(4, framework::vectorize2int(mean_t.dims())));
+  Shape shape2(fill_shape(4, framework::vectorize2int(variance_t.dims())));
+  auto *weight1 =
+      GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(shape1);
+  auto *mean_data = static_cast<float *>(weight1->h_tensor().mutable_data());
+  std::copy_n(mean_t.data<float>(), mean_t.numel(), mean_data);
+  engine_->AddOpAttr(bn_op_name, "weight_1", *weight1);
+  auto *weight2 =
+      GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(shape2);
+  auto *variance_data =
+      static_cast<float *>(weight2->h_tensor().mutable_data());
+  std::copy_n(variance_t.data<float>(), variance_t.numel(), variance_data);
+  engine_->AddOpAttr(bn_op_name, "weight_2", *weight2);
+  Shape shape3(std::vector<int>({1, 1, 1, 1}));
+  auto *weight3 =
+      GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(shape3);
+  auto *alpha_data = static_cast<float *>(weight3->h_tensor().mutable_data());
+  float weight3_data[] = {1};
+  std::copy(std::begin(weight3_data), std::end(weight3_data), alpha_data);
+  engine_->AddOpAttr(bn_op_name, "weight_3", *weight3);
+  Shape scale_shape(fill_shape(4, framework::vectorize2int(scale_t.dims())));
+  auto *scale =
+      GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(scale_shape);
+  auto *scale_data = static_cast<float *>(scale->h_tensor().mutable_data());
+  std::copy_n(scale_t.data<float>(), scale_t.numel(), scale_data);
+  Shape bias_shape(fill_shape(4, framework::vectorize2int(bias_t.dims())));
+  auto *bias =
+      GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(bias_shape);
+  auto *bias_data = static_cast<float *>(bias->h_tensor().mutable_data());
+  std::copy_n(bias_t.data<float>(), bias_t.numel(), bias_data);
+  engine_->AddOp(scale_op_name, "Scale", {bn_output}, {output});
+  engine_->AddOpAttr(scale_op_name, "axis", 1);
+  engine_->AddOpAttr(scale_op_name, "num_axes", 1);
+  engine_->AddOpAttr(scale_op_name, "bias_term", true);
+  engine_->AddOpAttr(scale_op_name, "weight_1", *scale);
+  engine_->AddOpAttr(scale_op_name, "weight_2", *bias);
+}
+}  // namespace anakin
+}  // namespace inference
+}  // namespace paddle
+REGISTER_ANAKIN_OP_CONVERTER(batch_norm, BatchNormOpConverter);
--- a/paddle/fluid/inference/anakin/convert/batch_norm.h
+++ b/paddle/fluid/inference/anakin/convert/batch_norm.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "paddle/fluid/inference/anakin/convert/op_converter.h"
+namespace paddle {
+namespace inference {
+namespace anakin {
+class BatchNormOpConverter : public AnakinOpConverter {
+ public:
+  BatchNormOpConverter() = default;
+  virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::Scope &scope,
+                          bool test_mode) override;
+  virtual ~BatchNormOpConverter() {}
+};
+}  // namespace anakin
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/inference/anakin/convert/concat.cc
+++ b/paddle/fluid/inference/anakin/convert/concat.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/inference/anakin/convert/concat.h"
+#include <algorithm>
+using anakin::graph::GraphGlobalMem;
+using anakin::AK_FLOAT;
+using anakin::Precision;
+using anakin::saber::NV;
+using anakin::saber::X86;
+using anakin::saber::Shape;
+using anakin::PBlock;
+using anakin::PTuple;
+namespace paddle {
+namespace inference {
+namespace anakin {
+void ConcatOpConverter::operator()(const framework::proto::OpDesc &op,
+                                   const framework::Scope &scope,
+                                   bool test_mode) {
+  framework::OpDesc op_desc(op, nullptr);
+  int axis = boost::get<int>(op_desc.GetAttr("axis"));
+  auto input_names = op_desc.Input("X");
+  // PADDLE_ENFORCE(axis > 0,
+  //               "The axis attr of Concat op should be large than 0 for trt");
+  auto y_name = op_desc.Output("Out").front();
+  auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
+  engine_->AddOp(op_name, "Concat", input_names, {y_name});
+  engine_->AddOpAttr(op_name, "axis", axis);
+}
+}  // namespace anakin
+}  // namespace inference
+}  // namespace paddle
+REGISTER_ANAKIN_OP_CONVERTER(concat, ConcatOpConverter);
--- a/paddle/fluid/inference/anakin/convert/concat.h
+++ b/paddle/fluid/inference/anakin/convert/concat.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "paddle/fluid/inference/anakin/convert/op_converter.h"
+namespace paddle {
+namespace inference {
+namespace anakin {
+class ConcatOpConverter : public AnakinOpConverter {
+ public:
+  ConcatOpConverter() = default;
+  virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::Scope &scope,
+                          bool test_mode) override;
+  virtual ~ConcatOpConverter() {}
+ private:
+};
+}  // namespace anakin
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/inference/anakin/convert/conv2d.cc
+++ b/paddle/fluid/inference/anakin/convert/conv2d.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/inference/anakin/convert/conv2d.h"
+#include <algorithm>
+#include <memory>
+#include <vector>
+using anakin::graph::GraphGlobalMem;
+using anakin::AK_FLOAT;
+using anakin::saber::NV;
+using anakin::saber::Shape;
+using anakin::PTuple;
+namespace paddle {
+namespace inference {
+namespace anakin {
+void Conv2dOpConverter::operator()(const framework::proto::OpDesc &op,
+                                   const framework::Scope &scope,
+                                   bool test_mode) {
+  framework::OpDesc op_desc(op, nullptr);
+  PADDLE_ENFORCE_EQ(op_desc.Input("Input").size(), 1UL);
+  PADDLE_ENFORCE_EQ(op_desc.Input("Filter").size(), 1UL);
+  PADDLE_ENFORCE_EQ(op_desc.Output("Output").size(), 1UL);
+  auto input_name = op_desc.Input("Input").front();
+  auto output_name = op_desc.Output("Output").front();
+  auto op_name = op_desc.Type() + ":" + op_desc.Output("Output").front();
+  engine_->AddOp(op_name, "Convolution", {input_name}, {output_name});
+  auto *filter_v = scope.FindVar(op_desc.Input("Filter").front());
+  PADDLE_ENFORCE_NOT_NULL(filter_v);
+  auto *filter_t = filter_v->GetMutable<framework::LoDTensor>();
+  std::unique_ptr<framework::LoDTensor> weight_tensor(
+      new framework::LoDTensor());
+  weight_tensor->Resize(filter_t->dims());
+  TensorCopySync((*filter_t), platform::CPUPlace(), weight_tensor.get());
+  PADDLE_ENFORCE_EQ(weight_tensor->dims().size(), 4UL);
+  // const int n_output = weight_tensor->dims()[0];
+  // const int n_input = weight_tensor->dims()[1];
+  const int filter_h = weight_tensor->dims()[2];
+  const int filter_w = weight_tensor->dims()[3];
+  // auto filter_num = n_input * filter_h * filter_w ;
+  auto filter_num = weight_tensor->dims()[0];
+  engine_->AddOpAttr<int>(op_name, "filter_num", filter_num);
+  engine_->AddOpAttr<PTuple<int>>(op_name, "kernel_size", {filter_h, filter_w});
+  auto strides = boost::get<std::vector<int>>(op_desc.GetAttr("strides"));
+  engine_->AddOpAttr<PTuple<int>>(op_name, "strides", strides);
+  auto paddings = boost::get<std::vector<int>>(op_desc.GetAttr("paddings"));
+  engine_->AddOpAttr<PTuple<int>>(op_name, "padding", paddings);
+  auto dilations = boost::get<std::vector<int>>(op_desc.GetAttr("dilations"));
+  engine_->AddOpAttr<PTuple<int>>(op_name, "dilation_rate", dilations);
+  const int groups = boost::get<int>(op_desc.GetAttr("groups"));
+  engine_->AddOpAttr(op_name, "group", groups);
+  engine_->AddOpAttr(op_name, "axis", 1);
+  engine_->AddOpAttr(op_name, "bias_term", false);
+  auto weight_shape = framework::vectorize2int(filter_t->dims());
+  Shape anakin_shape(weight_shape);
+  auto *weight1 =
+      GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(anakin_shape);
+  float *cpu_data = static_cast<float *>(weight1->h_tensor().mutable_data());
+  std::copy_n(weight_tensor->data<float>(), weight_tensor->numel(), cpu_data);
+  weight1->d_tensor().set_shape(anakin_shape);
+  weight1->d_tensor().copy_from(weight1->h_tensor());
+  engine_->AddOpAttr(op_name, "weight_1", *weight1);
+}
+}  // namespace anakin
+}  // namespace inference
+}  // namespace paddle
+REGISTER_ANAKIN_OP_CONVERTER(conv2d, Conv2dOpConverter);
--- a/paddle/fluid/inference/anakin/convert/registrar.cc
+++ b/paddle/fluid/inference/anakin/convert/registrar.cc
@@ -12,22 +12,23 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include "paddle/fluid/inference/anakin/convert/registrar.h"
+#pragma once
+#include "paddle/fluid/inference/anakin/convert/op_converter.h"
 namespace paddle {
 namespace inference {
 namespace anakin {
-std::shared_ptr<AnakinOpConverter> OpRegister::Get(const std::string &name) {
+class Conv2dOpConverter : public AnakinOpConverter {
-  auto it = registry_.find(name);
+ public:
-  if (it == registry_.end()) return nullptr;
+  Conv2dOpConverter() = default;
-  return it->second();
-}
-OpRegister *OpRegister::instance() {
+  virtual void operator()(const framework::proto::OpDesc &op,
-  static OpRegister factory;
+                          const framework::Scope &scope,
-  return &factory;
+                          bool test_mode) override;
-}
+  virtual ~Conv2dOpConverter() {}
+};
 }  // namespace anakin
 }  // namespace inference

--- a/paddle/fluid/inference/anakin/convert/conv2d_fusion.cc
+++ b/paddle/fluid/inference/anakin/convert/conv2d_fusion.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/inference/anakin/convert/conv2d_fusion.h"
+#include <algorithm>
+#include <memory>
+#include <vector>
+using anakin::graph::GraphGlobalMem;
+using anakin::AK_FLOAT;
+using anakin::saber::NV;
+using anakin::saber::Shape;
+using anakin::PTuple;
+namespace paddle {
+namespace inference {
+namespace anakin {
+void Conv2dFusionOpConverter::operator()(const framework::proto::OpDesc &op,
+                                         const framework::Scope &scope,
+                                         bool test_mode) {
+  framework::OpDesc op_desc(op, nullptr);
+  PADDLE_ENFORCE_EQ(op_desc.Input("Input").size(), 1UL);
+  PADDLE_ENFORCE_EQ(op_desc.Input("Filter").size(), 1UL);
+  PADDLE_ENFORCE_EQ(op_desc.Input("Bias").size(), 1UL);
+  PADDLE_ENFORCE_EQ(op_desc.Output("Output").size(), 1UL);
+  auto input_name = op_desc.Input("Input").front();
+  auto output_name = op_desc.Output("Output").front();
+  auto op_name = op_desc.Type() + ":" + op_desc.Output("Output").front();
+  engine_->AddOp(op_name, "Convolution", {input_name}, {output_name});
+  auto *filter_v = scope.FindVar(op_desc.Input("Filter").front());
+  PADDLE_ENFORCE_NOT_NULL(filter_v);
+  auto *filter_t = filter_v->GetMutable<framework::LoDTensor>();
+  auto *b_v = scope.FindVar(op_desc.Input("Bias").front());
+  PADDLE_ENFORCE_NOT_NULL(b_v);
+  auto *b_t = b_v->GetMutable<framework::LoDTensor>();
+  std::unique_ptr<framework::LoDTensor> weight_tensor(
+      new framework::LoDTensor());
+  weight_tensor->Resize(filter_t->dims());
+  TensorCopySync((*filter_t), platform::CPUPlace(), weight_tensor.get());
+  PADDLE_ENFORCE_EQ(weight_tensor->dims().size(), 4UL);
+  // const int n_output = weight_tensor->dims()[0];
+  // const int n_input = weight_tensor->dims()[1];
+  const int filter_h = weight_tensor->dims()[2];
+  const int filter_w = weight_tensor->dims()[3];
+  // auto filter_num = n_input * filter_h * filter_w ;
+  auto filter_num = weight_tensor->dims()[0];
+  engine_->AddOpAttr<int>(op_name, "filter_num", filter_num);
+  engine_->AddOpAttr<PTuple<int>>(op_name, "kernel_size", {filter_h, filter_w});
+  auto strides = boost::get<std::vector<int>>(op_desc.GetAttr("strides"));
+  engine_->AddOpAttr<PTuple<int>>(op_name, "strides", strides);
+  auto paddings = boost::get<std::vector<int>>(op_desc.GetAttr("paddings"));
+  engine_->AddOpAttr<PTuple<int>>(op_name, "padding", paddings);
+  auto dilations = boost::get<std::vector<int>>(op_desc.GetAttr("dilations"));
+  engine_->AddOpAttr<PTuple<int>>(op_name, "dilation_rate", dilations);
+  const int groups = boost::get<int>(op_desc.GetAttr("groups"));
+  engine_->AddOpAttr(op_name, "group", groups);
+  engine_->AddOpAttr(op_name, "axis", 1);
+  engine_->AddOpAttr(op_name, "bias_term", true);
+  auto weight_shape = framework::vectorize2int(filter_t->dims());
+  Shape anakin_shape(weight_shape);
+  auto *weight1 =
+      GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(anakin_shape);
+  float *cpu_data = static_cast<float *>(weight1->h_tensor().mutable_data());
+  std::copy_n(weight_tensor->data<float>(), weight_tensor->numel(), cpu_data);
+  weight1->d_tensor().set_shape(anakin_shape);
+  weight1->d_tensor().copy_from(weight1->h_tensor());
+  engine_->AddOpAttr(op_name, "weight_1", *weight1);
+  auto bias_shape = framework::vectorize2int(b_t->dims());
+  framework::LoDTensor bias_tensor;
+  bias_tensor.Resize(b_t->dims());
+  TensorCopySync((*b_t), platform::CPUPlace(), &bias_tensor);
+  auto *bias_data = bias_tensor.data<float>();
+  bias_shape.insert(bias_shape.begin(), 1);
+  bias_shape.insert(bias_shape.begin(), 1);
+  bias_shape.insert(bias_shape.begin(), 1);
+  // bias_shape.push_back(1);
+  // bias_shape.push_back(1);
+  Shape anakin_bias_shape(bias_shape);
+  auto *weight2 = GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(
+      anakin_bias_shape);
+  float *cpu_data2 = static_cast<float *>(weight2->h_tensor().mutable_data());
+  std::copy_n(bias_data, bias_tensor.numel(), cpu_data2);
+  weight2->d_tensor().set_shape(anakin_bias_shape);
+  weight2->d_tensor().copy_from(weight2->h_tensor());
+  engine_->AddOpAttr(op_name, "weight_2", *weight2);
+}
+}  // namespace anakin
+}  // namespace inference
+}  // namespace paddle
+REGISTER_ANAKIN_OP_CONVERTER(conv2d_fusion, Conv2dFusionOpConverter);
--- a/paddle/fluid/inference/anakin/convert/conv2d_fusion.h
+++ b/paddle/fluid/inference/anakin/convert/conv2d_fusion.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "paddle/fluid/inference/anakin/convert/op_converter.h"
+namespace paddle {
+namespace inference {
+namespace anakin {
+class Conv2dFusionOpConverter : public AnakinOpConverter {
+ public:
+  Conv2dFusionOpConverter() = default;
+  virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::Scope &scope,
+                          bool test_mode) override;
+  virtual ~Conv2dFusionOpConverter() {}
+};
+}  // namespace anakin
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/inference/anakin/convert/density_prior_box.cc
+++ b/paddle/fluid/inference/anakin/convert/density_prior_box.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/inference/anakin/convert/density_prior_box.h"
+#include <algorithm>
+#include <map>
+#include <vector>
+using anakin::graph::GraphGlobalMem;
+using anakin::AK_FLOAT;
+using anakin::saber::NV;
+using anakin::saber::Shape;
+using anakin::PTuple;
+namespace paddle {
+namespace inference {
+namespace anakin {
+void DensityPriorBoxOpConverter::operator()(const framework::proto::OpDesc& op,
+                                            const framework::Scope& scope,
+                                            bool test_mode) {
+  framework::OpDesc op_desc(op, nullptr);
+  auto input_name = op_desc.Input("Input").front();
+  auto image_name = op_desc.Input("Image").front();
+  auto output_name = op_desc.Output("Boxes").front();
+  auto op_name = op_desc.Type() + ":" + op_desc.Output("Boxes").front();
+  auto fixed_sizes =
+      boost::get<std::vector<float>>(op_desc.GetAttr("fixed_sizes"));
+  auto fixed_ratios =
+      boost::get<std::vector<float>>(op_desc.GetAttr("fixed_ratios"));
+  auto densities = boost::get<std::vector<int>>(op_desc.GetAttr("densities"));
+  std::vector<float> dens;
+  for (auto& ele : densities) {
+    dens.push_back(static_cast<float>(ele));
+  }
+  // lack flip
+  // auto clip = boost::get<bool>(op_desc.GetAttr("clip"));
+  auto variances = boost::get<std::vector<float>>(op_desc.GetAttr("variances"));
+  for (auto& ele : variances) {
+    LOG(INFO) << ele;
+  }
+  // lack img_h, img_w
+  auto step_h = boost::get<float>(op_desc.GetAttr("step_h"));
+  auto step_w = boost::get<float>(op_desc.GetAttr("step_w"));
+  auto offset = boost::get<float>(op_desc.GetAttr("offset"));
+  PTuple<std::string> t_order;
+  t_order.push_back("MIN");
+  t_order.push_back("COM");
+  t_order.push_back("MAX");
+  std::vector<float> temp_v = {};
+  engine_->AddOp(op_name, "PriorBox", {input_name, image_name}, {output_name});
+  engine_->AddOpAttr<PTuple<float>>(op_name, "min_size", temp_v);
+  engine_->AddOpAttr<PTuple<float>>(op_name, "max_size", temp_v);
+  engine_->AddOpAttr<PTuple<float>>(op_name, "aspect_ratio", temp_v);
+  engine_->AddOpAttr<PTuple<float>>(op_name, "fixed_size", fixed_sizes);
+  engine_->AddOpAttr<PTuple<float>>(op_name, "fixed_ratio", fixed_ratios);
+  engine_->AddOpAttr<PTuple<float>>(op_name, "density", dens);
+  engine_->AddOpAttr(op_name, "is_flip", static_cast<bool>(false));
+  engine_->AddOpAttr(op_name, "is_clip", static_cast<bool>(false));
+  engine_->AddOpAttr<PTuple<float>>(op_name, "variance", variances);
+  engine_->AddOpAttr(op_name, "img_h", static_cast<int>(0));
+  engine_->AddOpAttr(op_name, "img_w", static_cast<int>(0));
+  engine_->AddOpAttr(op_name, "step_h", step_h);
+  engine_->AddOpAttr(op_name, "step_w", step_w);
+  engine_->AddOpAttr(op_name, "offset", offset);
+  engine_->AddOpAttr<PTuple<std::string>>(op_name, "order", t_order);
+}
+}  // namespace anakin
+}  // namespace inference
+}  // namespace paddle
+REGISTER_ANAKIN_OP_CONVERTER(density_prior_box, DensityPriorBoxOpConverter);
--- a/paddle/fluid/inference/anakin/convert/density_prior_box.h
+++ b/paddle/fluid/inference/anakin/convert/density_prior_box.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <map>
+#include <string>
+#include "paddle/fluid/inference/anakin/convert/op_converter.h"
+namespace paddle {
+namespace inference {
+namespace anakin {
+class DensityPriorBoxOpConverter : public AnakinOpConverter {
+ public:
+  DensityPriorBoxOpConverter() = default;
+  virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::Scope &scope,
+                          bool test_mode) override;
+  virtual ~DensityPriorBoxOpConverter() {}
+};
+}  // namespace anakin
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/inference/anakin/convert/detection_out.cc
+++ b/paddle/fluid/inference/anakin/convert/detection_out.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/inference/anakin/convert/detection_out.h"
+#include <algorithm>
+#include <map>
+using anakin::graph::GraphGlobalMem;
+using anakin::AK_FLOAT;
+using anakin::saber::NV;
+using anakin::saber::Shape;
+namespace paddle {
+namespace inference {
+namespace anakin {
+void DetectionOutOpConverter::operator()(const framework::proto::OpDesc &op,
+                                         const framework::Scope &scope,
+                                         bool test_mode) {
+  framework::OpDesc op_desc(op, nullptr);
+  auto target_name = op_desc.Input("TargetBox").front();
+  auto prior_box_name = op_desc.Input("PriorBox").front();
+  auto scores_name = op_desc.Input("Scores").front();
+  auto output_name = op_desc.Output("Out").front();
+  auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
+  auto code_type = boost::get<std::string>(op_desc.GetAttr("code_type"));
+  auto background_label = boost::get<int>(op_desc.GetAttr("background_label"));
+  auto score_threshold = boost::get<float>(op_desc.GetAttr("score_threshold"));
+  auto nms_top_k = boost::get<int>(op_desc.GetAttr("nms_top_k"));
+  auto nms_threshold = boost::get<float>(op_desc.GetAttr("nms_threshold"));
+  auto nms_eta = boost::get<float>(op_desc.GetAttr("nms_eta"));
+  auto keep_top_k = boost::get<int>(op_desc.GetAttr("keep_top_k"));
+  std::string anakin_code_type;
+  if (code_type == "decode_center_size") {
+    anakin_code_type = "CENTER_SIZE";
+  } else if (code_type == "encode_center_size") {
+    PADDLE_THROW(
+        "Not support encode_center_size code_type in DetectionOut of anakin");
+  }
+  engine_->AddOp(op_name, "DetectionOutput",
+                 {target_name, scores_name, prior_box_name}, {output_name});
+  engine_->AddOpAttr(op_name, "share_location", true);
+  engine_->AddOpAttr(op_name, "variance_encode_in_target", false);
+  engine_->AddOpAttr(op_name, "class_num", static_cast<int>(0));
+  engine_->AddOpAttr(op_name, "background_id", background_label);
+  engine_->AddOpAttr(op_name, "keep_top_k", keep_top_k);
+  engine_->AddOpAttr(op_name, "code_type", anakin_code_type);
+  engine_->AddOpAttr(op_name, "conf_thresh", score_threshold);
+  engine_->AddOpAttr(op_name, "nms_top_k", nms_top_k);
+  engine_->AddOpAttr(op_name, "nms_thresh", nms_threshold);
+  engine_->AddOpAttr(op_name, "nms_eta", nms_eta);
+}
+}  // namespace anakin
+}  // namespace inference
+}  // namespace paddle
+REGISTER_ANAKIN_OP_CONVERTER(detection_out, DetectionOutOpConverter);
--- a/paddle/fluid/inference/anakin/convert/detection_out.h
+++ b/paddle/fluid/inference/anakin/convert/detection_out.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <map>
+#include <string>
+#include "paddle/fluid/inference/anakin/convert/op_converter.h"
+namespace paddle {
+namespace inference {
+namespace anakin {
+class DetectionOutOpConverter : public AnakinOpConverter {
+ public:
+  DetectionOutOpConverter() = default;
+  virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::Scope &scope,
+                          bool test_mode) override;
+  virtual ~DetectionOutOpConverter() {}
+};
+}  // namespace anakin
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/inference/anakin/convert/dropout.cc
+++ b/paddle/fluid/inference/anakin/convert/dropout.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/inference/anakin/convert/dropout.h"
+#include <algorithm>
+#include <string>
+#include <vector>
+using anakin::graph::GraphGlobalMem;
+using anakin::AK_FLOAT;
+using anakin::Precision;
+using anakin::saber::NV;
+using anakin::saber::X86;
+using anakin::saber::Shape;
+using anakin::PBlock;
+using anakin::PTuple;
+namespace paddle {
+namespace inference {
+namespace anakin {
+void DropoutOpConverter::operator()(const framework::proto::OpDesc &op,
+                                    const framework::Scope &scope,
+                                    bool test_mode) {
+  framework::OpDesc op_desc(op, nullptr);
+  PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
+  PADDLE_ENFORCE_EQ(op_desc.Output("Mask").size(), 1);
+  PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
+  auto x_name = op_desc.Input("X").front();
+  auto out_name = op_desc.Output("Out").front();
+  auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
+  engine_->AddOp(op_name, "Scale", {x_name}, {out_name});
+  auto dropout_prob = boost::get<float>(op_desc.GetAttr("dropout_prob"));
+  auto factor = 1 - dropout_prob;
+  Shape shape1(std::vector<int>({1, 1, 1, 1}));
+  auto *weight1 =
+      GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(shape1);
+  auto *factor_data = static_cast<float *>(weight1->h_tensor().mutable_data());
+  float weight1_data[] = {factor};
+  std::copy(std::begin(weight1_data), std::end(weight1_data), factor_data);
+  engine_->AddOpAttr(op_name, "weight_1", *weight1);
+  engine_->AddOpAttr(op_name, "axis", 0);
+  engine_->AddOpAttr(op_name, "num_axes", 0);
+  engine_->AddOpAttr(op_name, "bias_term", false);
+}
+}  // namespace anakin
+}  // namespace inference
+}  // namespace paddle
+REGISTER_ANAKIN_OP_CONVERTER(dropout, DropoutOpConverter);
--- a/paddle/fluid/inference/anakin/convert/dropout.h
+++ b/paddle/fluid/inference/anakin/convert/dropout.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "paddle/fluid/inference/anakin/convert/op_converter.h"
+namespace paddle {
+namespace inference {
+namespace anakin {
+class DropoutOpConverter : public AnakinOpConverter {
+ public:
+  DropoutOpConverter() = default;
+  virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::Scope &scope,
+                          bool test_mode) override;
+  virtual ~DropoutOpConverter() {}
+ private:
+};
+}  // namespace anakin
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/inference/anakin/convert/elementwise.cc
+++ b/paddle/fluid/inference/anakin/convert/elementwise.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/inference/anakin/convert/elementwise.h"
+#include <algorithm>
+#include <string>
+#include <vector>
+using anakin::graph::GraphGlobalMem;
+using anakin::AK_FLOAT;
+using anakin::Precision;
+using anakin::saber::NV;
+using anakin::saber::X86;
+using anakin::saber::Shape;
+using anakin::PBlock;
+using anakin::PTuple;
+namespace paddle {
+namespace inference {
+namespace anakin {
+void ElementwiseAddOpConverter::operator()(const framework::proto::OpDesc &op,
+                                           const framework::Scope &scope,
+                                           bool test_mode) {
+  framework::OpDesc op_desc(op, nullptr);
+  PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
+  PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1);
+  PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
+  auto x_name = op_desc.Input("X").front();
+  auto y_name = op_desc.Input("Y").front();
+  auto out_name = op_desc.Output("Out").front();
+  auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
+  engine_->AddOp(op_name, "Eltwise", {x_name, y_name}, {out_name});
+  std::string elementwise_type = "Add";
+  engine_->AddOpAttr<std::string>(op_name, "type", elementwise_type);
+  std::vector<float> coeff = {1.0, 1.0};
+  engine_->AddOpAttr<PTuple<float>>(op_name, "coeff", coeff);
+}
+void ElementwiseMulOpConverter::operator()(const framework::proto::OpDesc &op,
+                                           const framework::Scope &scope,
+                                           bool test_mode) {
+  framework::OpDesc op_desc(op, nullptr);
+  PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
+  PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1);
+  PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
+  auto x_name = op_desc.Input("X").front();
+  auto y_name = op_desc.Input("Y").front();
+  auto out_name = op_desc.Output("Out").front();
+  auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
+  engine_->AddOp(op_name, "Scale", {x_name, y_name}, {out_name});
+  // Fill a number to weight_1 as a placeholder.
+  Shape shape1(std::vector<int>({1, 1, 1, 1}));
+  auto *weight1 =
+      GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(shape1);
+  auto *placeholder_data =
+      static_cast<float *>(weight1->h_tensor().mutable_data());
+  float weight1_data[] = {1};
+  std::copy(std::begin(weight1_data), std::end(weight1_data), placeholder_data);
+  engine_->AddOpAttr(op_name, "weight_1", *weight1);
+  auto axis = boost::get<int>(op_desc.GetAttr("axis"));
+  engine_->AddOpAttr(op_name, "axis", axis);
+  engine_->AddOpAttr(op_name, "num_axes", 1);
+  engine_->AddOpAttr(op_name, "bias_term", false);
+}
+}  // namespace anakin
+}  // namespace inference
+}  // namespace paddle
+REGISTER_ANAKIN_OP_CONVERTER(elementwise_add, ElementwiseAddOpConverter);
+REGISTER_ANAKIN_OP_CONVERTER(elementwise_mul, ElementwiseMulOpConverter);
--- a/paddle/fluid/inference/anakin/convert/elementwise.h
+++ b/paddle/fluid/inference/anakin/convert/elementwise.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "paddle/fluid/inference/anakin/convert/op_converter.h"
+namespace paddle {
+namespace inference {
+namespace anakin {
+class ElementwiseAddOpConverter : public AnakinOpConverter {
+ public:
+  ElementwiseAddOpConverter() = default;
+  virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::Scope &scope,
+                          bool test_mode) override;
+  virtual ~ElementwiseAddOpConverter() {}
+ private:
+};
+class ElementwiseMulOpConverter : public AnakinOpConverter {
+ public:
+  ElementwiseMulOpConverter() = default;
+  virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::Scope &scope,
+                          bool test_mode) override;
+  virtual ~ElementwiseMulOpConverter() {}
+ private:
+};
+}  // namespace anakin
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/inference/anakin/convert/fc.cc
+++ b/paddle/fluid/inference/anakin/convert/fc.cc
@@ -14,60 +14,108 @@
 #include "paddle/fluid/inference/anakin/convert/fc.h"
 #include <algorithm>
+#include <string>
+#include <vector>
 using anakin::graph::GraphGlobalMem;
 using anakin::AK_FLOAT;
-using anakin::Precision;
 using anakin::saber::NV;
-using anakin::saber::X86;
 using anakin::saber::Shape;
-using anakin::PBlock;
-using anakin::PTuple;
 namespace paddle {
 namespace inference {
 namespace anakin {
-void FcOpConverter::operator()(const framework::proto::OpDesc &op,
+void FcBaseOpConverter::operator()(const framework::proto::OpDesc &op,
-                               const framework::Scope &scope, bool test_mode) {
+                                   const framework::Scope &scope,
+                                   bool test_mode) {
  framework::OpDesc op_desc(op, nullptr);
-  PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
+  auto input_names = op_desc.InputNames();
-  PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1);
+  bool with_bias = input_names.size() == 3;
-  PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
+  std::string w_name = "Y";
+  std::string i_name = "X";
+  if (with_bias) {
+    w_name = "W";
+    i_name = "Input";
+  }
-  auto x_name = op_desc.Input("X").front();
  auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
-  auto *y_v = scope.FindVar(op_desc.Input("Y").front());
+  // get weights
+  auto *y_v = scope.FindVar(op_desc.Input(w_name).front());
  PADDLE_ENFORCE_NOT_NULL(y_v);
  auto *y_t = y_v->GetMutable<framework::LoDTensor>();
-  auto input_name = op_desc.Input("X").front();
+  auto input_name = op_desc.Input(i_name).front();
  auto output_name = op_desc.Output("Out").front();
-  auto weight_shape = framework::vectorize2int(y_t->dims());
  engine_->AddOp(op_name, "Dense", {input_name}, {output_name});
-  engine_->AddOpAttr(op_name, "bias_term", false);
+  engine_->AddOpAttr(op_name, "bias_term", with_bias);
  engine_->AddOpAttr(op_name, "axis", 1);
+  auto weight_shape = framework::vectorize2int(y_t->dims());
  int out_dim = weight_shape[1];
  engine_->AddOpAttr(op_name, "out_dim", out_dim);
+  const int w_m = weight_shape[0];
+  const int w_k = weight_shape[1];
-  weight_shape.push_back(1);
+  if (weight_shape.size() < 4UL) {
-  weight_shape.push_back(1);
+    weight_shape.insert(weight_shape.begin(), 4UL - weight_shape.size(), 1);
+  }
  Shape anakin_shape(weight_shape);
  framework::LoDTensor weight_tensor;
  weight_tensor.Resize(y_t->dims());
  TensorCopySync((*y_t), platform::CPUPlace(), &weight_tensor);
+  auto *weight_data = weight_tensor.data<float>();
+  PADDLE_ENFORCE(w_m * w_k == weight_tensor.numel());
+  std::vector<float> trans_weight_data(weight_tensor.numel());
+  for (int i = 0; i < w_m; i++) {
+    for (int j = 0; j < w_k; j++) {
+      trans_weight_data[i + j * w_m] = weight_data[i * w_k + j];
+    }
+  }
  auto *weight1 =
      GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(anakin_shape);
  float *cpu_data = static_cast<float *>(weight1->h_tensor().mutable_data());
-  std::copy_n(weight_tensor.data<float>(), weight_tensor.numel(), cpu_data);
+  std::copy_n(trans_weight_data.data(), weight_tensor.numel(), cpu_data);
  weight1->d_tensor().set_shape(anakin_shape);
  weight1->d_tensor().copy_from(weight1->h_tensor());
  engine_->AddOpAttr(op_name, "weight_1", *weight1);
+  // get bias
+  if (with_bias) {
+    auto *b_v = scope.FindVar(op_desc.Input("Bias").front());
+    PADDLE_ENFORCE_NOT_NULL(b_v);
+    auto *b_t = b_v->GetMutable<framework::LoDTensor>();
+    auto bias_shape = framework::vectorize2int(b_t->dims());
+    framework::LoDTensor bias_tensor;
+    bias_tensor.Resize(b_t->dims());
+    TensorCopySync((*b_t), platform::CPUPlace(), &bias_tensor);
+    auto *bias_data = bias_tensor.data<float>();
+    bias_shape.insert(bias_shape.begin(), 1);
+    bias_shape.insert(bias_shape.begin(), 1);
+    bias_shape.insert(bias_shape.begin(), 1);
+    // bias_shape.push_back(1);
+    // bias_shape.push_back(1);
+    Shape anakin_bias_shape(bias_shape);
+    auto *weight2 = GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(
+        anakin_bias_shape);
+    float *cpu_data2 = static_cast<float *>(weight2->h_tensor().mutable_data());
+    std::copy_n(bias_data, bias_tensor.numel(), cpu_data2);
+    weight2->d_tensor().set_shape(anakin_bias_shape);
+    weight2->d_tensor().copy_from(weight2->h_tensor());
+    engine_->AddOpAttr(op_name, "weight_2", *weight2);
+  }
 }
 }  // namespace anakin
 }  // namespace inference
 }  // namespace paddle
+REGISTER_ANAKIN_OP_CONVERTER(mul, MulOpConverter);
+REGISTER_ANAKIN_OP_CONVERTER(fc, FcOpConverter);
--- a/paddle/fluid/inference/anakin/convert/fc.h
+++ b/paddle/fluid/inference/anakin/convert/fc.h
@@ -20,19 +20,28 @@ namespace paddle {
 namespace inference {
 namespace anakin {
-class FcOpConverter : public AnakinOpConverter {
+class FcBaseOpConverter : public AnakinOpConverter {
 public:
-  FcOpConverter() = default;
+  FcBaseOpConverter() = default;
  virtual void operator()(const framework::proto::OpDesc &op,
                          const framework::Scope &scope,
                          bool test_mode) override;
-  virtual ~FcOpConverter() {}
+  virtual ~FcBaseOpConverter() {}
+};
- private:
+// with bias
+class FcOpConverter : public FcBaseOpConverter {
+ public:
+  FcOpConverter() = default;
+};
+// without bias
+class MulOpConverter : public FcBaseOpConverter {
+ public:
+  MulOpConverter() = default;
 };
-static Registrar<FcOpConverter> register_fc_op_converter("fc");
 }  // namespace anakin
 }  // namespace inference
 }  // namespace paddle
--- a/paddle/fluid/inference/anakin/convert/flatten.cc
+++ b/paddle/fluid/inference/anakin/convert/flatten.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/inference/anakin/convert/flatten.h"
+#include <vector>
+using anakin::graph::GraphGlobalMem;
+using anakin::AK_FLOAT;
+using anakin::saber::NV;
+using anakin::saber::Shape;
+using anakin::PTuple;
+namespace paddle {
+namespace inference {
+namespace anakin {
+void FlattenOpConverter::operator()(const framework::proto::OpDesc &op,
+                                    const framework::Scope &scope,
+                                    bool test_mode) {
+  framework::OpDesc op_desc(op, nullptr);
+  PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1UL);
+  PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1UL);
+  auto input = op_desc.Input("X").front();
+  auto output = op_desc.Output("Out").front();
+  int axis = boost::get<int>(op_desc.GetAttr("axis"));
+  PADDLE_ENFORCE(axis == 1,
+                 "the anakin flatten op converter now only support aixs == 1.");
+  std::vector<int> out_dims = {0, -1, 1, 1};
+  auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
+  engine_->AddOp(op_name, "Reshape", {input}, {output});
+  engine_->AddOpAttr<PTuple<int>>(op_name, "dims", out_dims);
+}
+}  // namespace anakin
+}  // namespace inference
+}  // namespace paddle
+REGISTER_ANAKIN_OP_CONVERTER(flatten, FlattenOpConverter);
--- a/paddle/fluid/inference/anakin/convert/flatten.h
+++ b/paddle/fluid/inference/anakin/convert/flatten.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "paddle/fluid/inference/anakin/convert/op_converter.h"
+namespace paddle {
+namespace inference {
+namespace anakin {
+class FlattenOpConverter : public AnakinOpConverter {
+ public:
+  FlattenOpConverter() = default;
+  virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::Scope &scope,
+                          bool test_mode) override;
+  virtual ~FlattenOpConverter() {}
+};
+}  // namespace anakin
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/inference/anakin/convert/im2sequence.cc
+++ b/paddle/fluid/inference/anakin/convert/im2sequence.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/inference/anakin/convert/im2sequence.h"
+#include <algorithm>
+#include <string>
+#include <vector>
+using anakin::graph::GraphGlobalMem;
+using anakin::AK_FLOAT;
+using anakin::Precision;
+using anakin::saber::NV;
+using anakin::saber::X86;
+using anakin::saber::Shape;
+using anakin::PBlock;
+using anakin::PTuple;
+namespace paddle {
+namespace inference {
+namespace anakin {
+void Im2SequenceConverter::operator()(const framework::proto::OpDesc &op,
+                                      const framework::Scope &scope,
+                                      bool test_mode) {
+  framework::OpDesc op_desc(op, nullptr);
+  PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
+  PADDLE_ENFORCE_EQ(op_desc.Output("Y").size(), 0);
+  PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
+  auto x_name = op_desc.Input("X").front();
+  auto out_name = op_desc.Output("Out").front();
+  auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
+  engine_->AddOp(op_name, "Im2Sequence", {x_name}, {out_name});
+  std::vector<int> dilations = {1, 1};
+  auto paddings = boost::get<std::vector<int>>(op_desc.GetAttr("paddings"));
+  auto strides = boost::get<std::vector<int>>(op_desc.GetAttr("strides"));
+  auto kernels = boost::get<std::vector<int>>(op_desc.GetAttr("kernels"));
+  engine_->AddOpAttr<PTuple<int>>(op_name, "paddings", paddings);
+  engine_->AddOpAttr<PTuple<int>>(op_name, "strides", strides);
+  engine_->AddOpAttr<PTuple<int>>(op_name, "window_size", kernels);
+  engine_->AddOpAttr<PTuple<int>>(op_name, "dilations", dilations);
+}
+}  // namespace anakin
+}  // namespace inference
+}  // namespace paddle
+REGISTER_ANAKIN_OP_CONVERTER(im2sequence, Im2SequenceConverter);
--- a/paddle/fluid/inference/anakin/convert/im2sequence.h
+++ b/paddle/fluid/inference/anakin/convert/im2sequence.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "paddle/fluid/inference/anakin/convert/op_converter.h"
+namespace paddle {
+namespace inference {
+namespace anakin {
+class Im2SequenceConverter : public AnakinOpConverter {
+ public:
+  Im2SequenceConverter() = default;
+  virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::Scope &scope,
+                          bool test_mode) override;
+  virtual ~Im2SequenceConverter() {}
+ private:
+};
+}  // namespace anakin
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/inference/anakin/convert/op_converter.h
+++ b/paddle/fluid/inference/anakin/convert/op_converter.h
@@ -14,15 +14,16 @@
 #pragma once
+#include <map>
 #include <memory>
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
+#include <vector>
 #include "framework/core/types.h"
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/inference/anakin/convert/registrar.h"
 #include "paddle/fluid/inference/anakin/engine.h"
 #include "paddle/fluid/inference/utils/singleton.h"
 #include "saber/saber_types.h"
@@ -46,19 +47,14 @@ class AnakinOpConverter {
                 bool test_mode = false) {
    framework::OpDesc op_desc(op, nullptr);
    std::string op_type = op_desc.Type();
-    std::shared_ptr<AnakinOpConverter> it{nullptr};
+    AnakinOpConverter *it = nullptr;
-    if (op_type == "mul") {
+    if (op_type == "reshape2") op_type = "reshape";
-      PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1UL);
+    if (op_type == "transpose2") op_type = "transpose";
-      std::string Y = op_desc.Input("Y")[0];
+    if (op_type == "flatten2") op_type = "flatten";
-      std::cout << Y << parameters.count(Y) << std::endl;
-      if (parameters.count(Y)) {
-        it = OpRegister::instance()->Get("fc");
-      }
-    }
    if (!it) {
-      it = OpRegister::instance()->Get(op_type);
+      it = Registry<AnakinOpConverter>::Global().Lookup(op_type);
    }
    PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]", op_type);
    it->SetEngine(engine);
@@ -74,6 +70,63 @@ class AnakinOpConverter {
      ConvertOp(op, parameters, scope, engine);
    }
  }
+  // The scope  here should be inited with the parameter vars.
+  void ConvertBlockToAnakinEngine(
+      framework::BlockDesc *block_desc, framework::Scope *scope,
+      const std::vector<std::string> &inputs,
+      const std::unordered_set<std::string> &parameters,
+      const std::vector<std::string> &outputs, AnakinNvEngine *engine) {
+    framework::proto::BlockDesc *block_proto = block_desc->Proto();
+    ConvertBlock(*block_proto, parameters, *scope, engine);
+    engine->Freeze();
+    // if the max_batch size
+    int max_batch_size = engine->GetMaxBatchSize();
+    PADDLE_ENFORCE(max_batch_size > 0,
+                   "the max_batch_size setted from config->EnableAnakinEngine "
+                   "must largger than 0");
+    // If the user does not specify this variable, we use the input shape from
+    // the block_desc.
+    auto max_input_shape = engine->GetMaxInputShape();
+    std::map<std::string, std::vector<int>> temp_max_input_shape;
+    for (auto &input : inputs) {
+      if (parameters.count(input)) continue;
+      std::vector<int> input_shape;
+      input_shape.resize(4);
+      input_shape[0] = max_batch_size;
+      if (max_input_shape.count(input)) {
+        PADDLE_ENFORCE(max_input_shape[input].size() == 4,
+                       "the dimensions of  max_input_shape setted from "
+                       "config->EnableAnakinEngine must be 4");
+        for (int i = 1; i < 4; i++) {
+          input_shape[i] = max_input_shape[input][i];
+        }
+      } else {
+        auto *var = block_desc->FindVar(input);
+        PADDLE_ENFORCE(var, "no variable called %s", input);
+        auto var_shape = var->GetShape();
+        std::cout << "input :" << input << std::endl;
+        PADDLE_ENFORCE(var_shape.size() == 4);
+        for (size_t i = 1; i < var_shape.size(); i++) {
+          input_shape[i] = var_shape[i];
+        }
+      }
+      temp_max_input_shape[input] = input_shape;
+      engine->SetInputShape(input, input_shape);
+      engine->Graph()->RegistVar(input);  // For share from data.
+    }
+    engine->SetMaxInputShape(temp_max_input_shape);
+    engine->Optimize();
+    // For anakin share with fluid tensor.
+    engine->AllocTmpMem();
+    engine->InitGraph();
+  }
  void SetEngine(AnakinNvEngine *engine) { engine_ = engine; }
  virtual ~AnakinOpConverter() {}
@@ -91,22 +144,23 @@ class AnakinOpConverter {
 }  // namespace inference
 }  // namespace paddle
-#define REGISTER_ANAKIN_OP_CONVERTER(op_type__, Converter__)                \
+#define REGISTER_ANAKIN_OP_CONVERTER(op_type__, Converter__)               \
-  struct anakin_##op_type__##_converter                                     \
+  struct anakin_##op_type__##_converter                                    \
-      : public ::paddle::framework::Registrar {                             \
+      : public ::paddle::framework::Registrar {                            \
-    anakin_##op_type__##_converter() {                                      \
+    anakin_##op_type__##_converter() {                                     \
-      ::paddle::inference::                                                 \
+      LOG(INFO) << "register convert " << #op_type__;                      \
-          Registry<paddle::inference::anakin::AnakinOpConverter>::Register< \
+      ::paddle::inference::Registry<                                       \
-              ::paddle::inference::anakin::Converter__>(#op_type__);        \
+          ::paddle::inference::anakin::AnakinOpConverter>::Global()        \
-    }                                                                       \
+          .Register<::paddle::inference::anakin::Converter__>(#op_type__); \
-  };                                                                        \
+    }                                                                      \
-  anakin_##op_type__##_converter anakin_##op_type__##_converter__;          \
+  };                                                                       \
-  int TouchConverterRegister_anakin_##op_type__() {                         \
+  anakin_##op_type__##_converter anakin_##op_type__##_converter__;         \
-    anakin_##op_type__##_converter__.Touch();                               \
+  int TouchConverterRegister_anakin_##op_type__() {                        \
-    return 0;                                                               \
+    anakin_##op_type__##_converter__.Touch();                              \
+    return 0;                                                              \
  }
-#define USE_ANAKIN_CONVERTER(op_type__)                                    \
+#define USE_ANAKIN_CONVERTER(op_type__)                             \
-  extern int TouchConverterRegister_anakin_##op_type__();                  \
+  extern int TouchConverterRegister_anakin_##op_type__();           \
-  static int use_op_converter_anakin_##op_type__ __attribute__((unused)) = \
+  int use_op_converter_anakin_##op_type__ __attribute__((unused)) = \
      TouchConverterRegister_anakin_##op_type__();
--- a/paddle/fluid/inference/anakin/convert/pool2d.cc
+++ b/paddle/fluid/inference/anakin/convert/pool2d.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/inference/anakin/convert/pool2d.h"
+#include <algorithm>
+#include <string>
+#include <vector>
+using anakin::graph::GraphGlobalMem;
+using anakin::AK_FLOAT;
+using anakin::Precision;
+using anakin::saber::NV;
+using anakin::saber::X86;
+using anakin::saber::Shape;
+using anakin::PBlock;
+using anakin::PTuple;
+namespace paddle {
+namespace inference {
+namespace anakin {
+void Pool2dOpConverter::operator()(const framework::proto::OpDesc &op,
+                                   const framework::Scope &scope,
+                                   bool test_mode) {
+  framework::OpDesc op_desc(op, nullptr);
+  PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
+  PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
+  auto x_name = op_desc.Input("X").front();
+  auto y_name = op_desc.Output("Out").front();
+  auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
+  bool global_pooling = boost::get<bool>(op_desc.GetAttr("global_pooling"));
+  std::string pool_type =
+      boost::get<std::string>(op_desc.GetAttr("pooling_type"));
+  std::vector<int> ksize =
+      boost::get<std::vector<int>>(op_desc.GetAttr("ksize"));
+  std::vector<int> strides =
+      boost::get<std::vector<int>>(op_desc.GetAttr("strides"));
+  std::vector<int> paddings =
+      boost::get<std::vector<int>>(op_desc.GetAttr("paddings"));
+  bool ceil_mode = boost::get<bool>(op_desc.GetAttr("ceil_mode"));
+  std::string anakin_pool_type;
+  if (pool_type == "max") {
+    anakin_pool_type = "MAX";
+  } else if (pool_type == "avg") {
+    if (paddings[0] || paddings[1]) {
+      anakin_pool_type = "AVGEXC";
+    } else {
+      anakin_pool_type = "AVG";
+    }
+  } else {
+    PADDLE_THROW("TensorRT unsupported pooling type!");
+  }
+  engine_->AddOp(op_name, "Pooling", {x_name}, {y_name});
+  engine_->AddOpAttr<PTuple<int>>(op_name, "pool_size", ksize);
+  engine_->AddOpAttr<PTuple<int>>(op_name, "strides", strides);
+  engine_->AddOpAttr<PTuple<int>>(op_name, "padding", paddings);
+  engine_->AddOpAttr(op_name, "method", anakin_pool_type);
+  engine_->AddOpAttr(op_name, "global_pooling", global_pooling);
+  engine_->AddOpAttr(op_name, "cmp_out_shape_floor_as_conv", !ceil_mode);
+}
+}  // namespace anakin
+}  // namespace inference
+}  // namespace paddle
+REGISTER_ANAKIN_OP_CONVERTER(pool2d, Pool2dOpConverter);
--- a/paddle/fluid/inference/anakin/convert/pool2d.h
+++ b/paddle/fluid/inference/anakin/convert/pool2d.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "paddle/fluid/inference/anakin/convert/op_converter.h"
+namespace paddle {
+namespace inference {
+namespace anakin {
+class Pool2dOpConverter : public AnakinOpConverter {
+ public:
+  Pool2dOpConverter() = default;
+  virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::Scope &scope,
+                          bool test_mode) override;
+  virtual ~Pool2dOpConverter() {}
+ private:
+};
+}  // namespace anakin
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/inference/anakin/convert/relu.cc
+++ b/paddle/fluid/inference/anakin/convert/relu.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/inference/anakin/convert/relu.h"
+#include <algorithm>
+#include <map>
+using anakin::graph::GraphGlobalMem;
+using anakin::AK_FLOAT;
+using anakin::saber::NV;
+using anakin::saber::Shape;
+namespace paddle {
+namespace inference {
+namespace anakin {
+void ReluOpConverter::operator()(const framework::proto::OpDesc &op,
+                                 const framework::Scope &scope,
+                                 bool test_mode) {
+  framework::OpDesc op_desc(op, nullptr);
+  PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
+  PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
+  auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
+  auto input_name = op_desc.Input("X").front();
+  auto output_name = op_desc.Output("Out").front();
+  engine_->AddOp(op_name, "ReLU", {input_name}, {output_name});
+  engine_->AddOpAttr(op_name, "alpha", 0);
+}
+}  // namespace anakin
+}  // namespace inference
+}  // namespace paddle
+REGISTER_ANAKIN_OP_CONVERTER(relu, ReluOpConverter);
--- a/paddle/fluid/inference/anakin/convert/relu.h
+++ b/paddle/fluid/inference/anakin/convert/relu.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <map>
+#include <string>
+#include "paddle/fluid/inference/anakin/convert/op_converter.h"
+namespace paddle {
+namespace inference {
+namespace anakin {
+class ReluOpConverter : public AnakinOpConverter {
+ public:
+  ReluOpConverter() = default;
+  virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::Scope &scope,
+                          bool test_mode) override;
+  virtual ~ReluOpConverter() {}
+};
+}  // namespace anakin
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/inference/anakin/convert/reshape.cc
+++ b/paddle/fluid/inference/anakin/convert/reshape.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/inference/anakin/convert/reshape.h"
+#include <vector>
+using anakin::graph::GraphGlobalMem;
+using anakin::AK_FLOAT;
+using anakin::saber::NV;
+using anakin::saber::Shape;
+using anakin::PTuple;
+namespace paddle {
+namespace inference {
+namespace anakin {
+void ReshapeOpConverter::operator()(const framework::proto::OpDesc &op,
+                                    const framework::Scope &scope,
+                                    bool test_mode) {
+  framework::OpDesc op_desc(op, nullptr);
+  PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1UL);
+  PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1UL);
+  auto input = op_desc.Input("X").front();
+  auto output = op_desc.Output("Out").front();
+  auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
+  engine_->AddOp(op_name, "Reshape", {input}, {output});
+  auto shape = boost::get<std::vector<int>>(op_desc.GetAttr("shape"));
+  if (shape.size() < 4) {
+    shape.insert(shape.end(), 4 - shape.size(), 1);
+  }
+  engine_->AddOpAttr<PTuple<int>>(op_name, "dims", shape);
+}
+}  // namespace anakin
+}  // namespace inference
+}  // namespace paddle
+REGISTER_ANAKIN_OP_CONVERTER(reshape, ReshapeOpConverter);
--- a/paddle/fluid/inference/anakin/convert/reshape.h
+++ b/paddle/fluid/inference/anakin/convert/reshape.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "paddle/fluid/inference/anakin/convert/op_converter.h"
+namespace paddle {
+namespace inference {
+namespace anakin {
+class ReshapeOpConverter : public AnakinOpConverter {
+ public:
+  ReshapeOpConverter() = default;
+  virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::Scope &scope,
+                          bool test_mode) override;
+  virtual ~ReshapeOpConverter() {}
+};
+}  // namespace anakin
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/inference/anakin/convert/scale.cc
+++ b/paddle/fluid/inference/anakin/convert/scale.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/inference/anakin/convert/scale.h"
+#include <algorithm>
+#include <map>
+using anakin::graph::GraphGlobalMem;
+using anakin::AK_FLOAT;
+using anakin::saber::NV;
+using anakin::saber::Shape;
+namespace paddle {
+namespace inference {
+namespace anakin {
+void ScaleOpConverter::operator()(const framework::proto::OpDesc &op,
+                                  const framework::Scope &scope,
+                                  bool test_mode) {
+  framework::OpDesc op_desc(op, nullptr);
+  PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
+  PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
+  auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
+  auto input_name = op_desc.Input("X").front();
+  auto output_name = op_desc.Output("Out").front();
+  float scale = boost::get<float>(op_desc.GetAttr("scale"));
+  float bias = boost::get<float>(op_desc.GetAttr("bias"));
+  float bias_after_scale =
+      boost::get<bool>(op_desc.GetAttr("bias_after_scale"));
+  PADDLE_ENFORCE(bias_after_scale,
+                 "The anakin scale layer only support bias after scale now.");
+  engine_->AddOp(op_name, "Power", {input_name}, {output_name});
+  engine_->AddOpAttr(op_name, "shift", bias);
+  engine_->AddOpAttr(op_name, "scale", scale);
+  engine_->AddOpAttr(op_name, "power", static_cast<float>(1.0));
+}
+}  // namespace anakin
+}  // namespace inference
+}  // namespace paddle
+REGISTER_ANAKIN_OP_CONVERTER(scale, ScaleOpConverter);
--- a/paddle/fluid/inference/anakin/convert/scale.h
+++ b/paddle/fluid/inference/anakin/convert/scale.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <map>
+#include <string>
+#include "paddle/fluid/inference/anakin/convert/op_converter.h"
+namespace paddle {
+namespace inference {
+namespace anakin {
+class ScaleOpConverter : public AnakinOpConverter {
+ public:
+  ScaleOpConverter() = default;
+  virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::Scope &scope,
+                          bool test_mode) override;
+  virtual ~ScaleOpConverter() {}
+};
+}  // namespace anakin
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/inference/anakin/convert/softmax.cc
+++ b/paddle/fluid/inference/anakin/convert/softmax.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/inference/anakin/convert/softmax.h"
+using anakin::graph::GraphGlobalMem;
+using anakin::AK_FLOAT;
+using anakin::saber::NV;
+using anakin::saber::Shape;
+namespace paddle {
+namespace inference {
+namespace anakin {
+void SoftMaxOpConverter::operator()(const framework::proto::OpDesc &op,
+                                    const framework::Scope &scope,
+                                    bool test_mode) {
+  framework::OpDesc op_desc(op, nullptr);
+  PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1UL);
+  auto input = op_desc.Input("X").front();
+  auto output = op_desc.Output("Out").front();
+  auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
+  engine_->AddOp(op_name, "Softmax", {input}, {output});
+  engine_->AddOpAttr(op_name, "axis", 2);
+}
+}  // namespace anakin
+}  // namespace inference
+}  // namespace paddle
+REGISTER_ANAKIN_OP_CONVERTER(softmax, SoftMaxOpConverter);
--- a/paddle/fluid/inference/anakin/convert/softmax.h
+++ b/paddle/fluid/inference/anakin/convert/softmax.h
--- a/paddle/fluid/inference/anakin/convert/split.cc
+++ b/paddle/fluid/inference/anakin/convert/split.cc
--- a/paddle/fluid/inference/anakin/convert/split.h
+++ b/paddle/fluid/inference/anakin/convert/split.h
--- a/paddle/fluid/inference/anakin/convert/sum.cc
+++ b/paddle/fluid/inference/anakin/convert/sum.cc
--- a/paddle/fluid/inference/anakin/convert/sum.h
+++ b/paddle/fluid/inference/anakin/convert/sum.h
--- a/paddle/fluid/inference/anakin/convert/test_activation_op.cc
+++ b/paddle/fluid/inference/anakin/convert/test_activation_op.cc
--- a/paddle/fluid/inference/anakin/convert/test_batch_norm_op.cc
+++ b/paddle/fluid/inference/anakin/convert/test_batch_norm_op.cc
--- a/paddle/fluid/inference/anakin/convert/test_concat_op.cc
+++ b/paddle/fluid/inference/anakin/convert/test_concat_op.cc
--- a/paddle/fluid/inference/anakin/convert/test_conv2d_op.cc
+++ b/paddle/fluid/inference/anakin/convert/test_conv2d_op.cc
--- a/paddle/fluid/inference/anakin/convert/test_dropout_op.cc
+++ b/paddle/fluid/inference/anakin/convert/test_dropout_op.cc
--- a/paddle/fluid/inference/anakin/convert/test_elementwise_op.cc
+++ b/paddle/fluid/inference/anakin/convert/test_elementwise_op.cc
--- a/paddle/fluid/inference/anakin/convert/test_fc_op.cc
+++ b/paddle/fluid/inference/anakin/convert/test_fc_op.cc
--- a/paddle/fluid/inference/anakin/convert/test_flatten_op.cc
+++ b/paddle/fluid/inference/anakin/convert/test_flatten_op.cc
--- a/paddle/fluid/inference/anakin/convert/test_im2sequence_op.cc
+++ b/paddle/fluid/inference/anakin/convert/test_im2sequence_op.cc
--- a/paddle/fluid/inference/anakin/convert/test_pool2d_op.cc
+++ b/paddle/fluid/inference/anakin/convert/test_pool2d_op.cc
--- a/paddle/fluid/inference/anakin/convert/test_relu_op.cc
+++ b/paddle/fluid/inference/anakin/convert/test_relu_op.cc
--- a/paddle/fluid/inference/anakin/convert/test_reshape_op.cc
+++ b/paddle/fluid/inference/anakin/convert/test_reshape_op.cc
--- a/paddle/fluid/inference/anakin/convert/test_softmax_op.cc
+++ b/paddle/fluid/inference/anakin/convert/test_softmax_op.cc
--- a/paddle/fluid/inference/anakin/convert/test_split_op.cc
+++ b/paddle/fluid/inference/anakin/convert/test_split_op.cc
--- a/paddle/fluid/inference/anakin/convert/test_sum_op.cc
+++ b/paddle/fluid/inference/anakin/convert/test_sum_op.cc
--- a/paddle/fluid/inference/anakin/convert/test_transpose_op.cc
+++ b/paddle/fluid/inference/anakin/convert/test_transpose_op.cc
--- a/paddle/fluid/inference/anakin/convert/transpose.cc
+++ b/paddle/fluid/inference/anakin/convert/transpose.cc
--- a/paddle/fluid/inference/anakin/convert/transpose.h
+++ b/paddle/fluid/inference/anakin/convert/transpose.h
--- a/paddle/fluid/inference/anakin/convert/ut_helper.h
+++ b/paddle/fluid/inference/anakin/convert/ut_helper.h
--- a/paddle/fluid/inference/anakin/engine.cc
+++ b/paddle/fluid/inference/anakin/engine.cc
--- a/paddle/fluid/inference/anakin/engine.h
+++ b/paddle/fluid/inference/anakin/engine.h
--- a/paddle/fluid/inference/anakin/op_teller.cc
+++ b/paddle/fluid/inference/anakin/op_teller.cc
--- a/paddle/fluid/inference/anakin/op_teller.h
+++ b/paddle/fluid/inference/anakin/op_teller.h
--- a/paddle/fluid/inference/anakin/test_anakin_engine.cc
+++ b/paddle/fluid/inference/anakin/test_anakin_engine.cc
--- a/paddle/fluid/inference/analysis/argument.h
+++ b/paddle/fluid/inference/analysis/argument.h
--- a/paddle/fluid/inference/analysis/ir_pass_manager.cc
+++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc
--- a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt
--- a/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc
--- a/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.h
+++ b/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.h
--- a/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc
--- a/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc
--- a/paddle/fluid/inference/analysis/ir_passes/subgraph_util.h
+++ b/paddle/fluid/inference/analysis/ir_passes/subgraph_util.h
--- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
--- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h
+++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h
--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
--- a/paddle/fluid/inference/api/analysis_predictor.h
+++ b/paddle/fluid/inference/api/analysis_predictor.h
--- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc
+++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc
--- a/paddle/fluid/inference/api/paddle_analysis_config.h
+++ b/paddle/fluid/inference/api/paddle_analysis_config.h
--- a/paddle/fluid/inference/api/paddle_api.h
+++ b/paddle/fluid/inference/api/paddle_api.h
--- a/paddle/fluid/inference/api/paddle_pass_builder.cc
+++ b/paddle/fluid/inference/api/paddle_pass_builder.cc
--- a/paddle/fluid/inference/api/paddle_pass_builder.h
+++ b/paddle/fluid/inference/api/paddle_pass_builder.h
--- a/paddle/fluid/inference/tensorrt/convert/io_converter.h
+++ b/paddle/fluid/inference/tensorrt/convert/io_converter.h
--- a/paddle/fluid/inference/tensorrt/convert/op_converter.h
+++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h
--- a/paddle/fluid/inference/utils/singleton.h
+++ b/paddle/fluid/inference/utils/singleton.h
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
--- a/paddle/fluid/operators/add_position_encoding_op.cc
+++ b/paddle/fluid/operators/add_position_encoding_op.cc
--- a/paddle/fluid/operators/anakin/CMakeLists.txt
+++ b/paddle/fluid/operators/anakin/CMakeLists.txt
--- a/paddle/fluid/operators/anakin/anakin_engine_op.cc
+++ b/paddle/fluid/operators/anakin/anakin_engine_op.cc
--- a/paddle/fluid/operators/anakin/anakin_engine_op.h
+++ b/paddle/fluid/operators/anakin/anakin_engine_op.h
--- a/paddle/fluid/operators/batch_norm_op.cc
+++ b/paddle/fluid/operators/batch_norm_op.cc
--- a/paddle/fluid/operators/clip_op.cc
+++ b/paddle/fluid/operators/clip_op.cc
--- a/paddle/fluid/operators/concat_op.cc
+++ b/paddle/fluid/operators/concat_op.cc
--- a/paddle/fluid/operators/controlflow/while_op.cc
+++ b/paddle/fluid/operators/controlflow/while_op.cc
--- a/paddle/fluid/operators/conv_op.cc
+++ b/paddle/fluid/operators/conv_op.cc
--- a/paddle/fluid/operators/crop_op.cc
+++ b/paddle/fluid/operators/crop_op.cc
--- a/paddle/fluid/operators/cross_entropy_op.cc
+++ b/paddle/fluid/operators/cross_entropy_op.cc
--- a/paddle/fluid/operators/cudnn_lstm_op.cc
+++ b/paddle/fluid/operators/cudnn_lstm_op.cc
--- a/paddle/fluid/operators/distributed/parameter_prefetch.cc
+++ b/paddle/fluid/operators/distributed/parameter_prefetch.cc
--- a/paddle/fluid/operators/elementwise/elementwise_add_op.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cc
--- a/paddle/fluid/operators/elementwise/elementwise_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_op.h
--- a/paddle/fluid/operators/elementwise/elementwise_sub_op.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.cc
--- a/paddle/fluid/operators/flatten_op.cc
+++ b/paddle/fluid/operators/flatten_op.cc
--- a/paddle/fluid/operators/gather.cu.h
+++ b/paddle/fluid/operators/gather.cu.h
--- a/paddle/fluid/operators/gather_op.cc
+++ b/paddle/fluid/operators/gather_op.cc
--- a/paddle/fluid/operators/group_norm_op.cc
+++ b/paddle/fluid/operators/group_norm_op.cc
--- a/paddle/fluid/operators/jit/README.en.md
+++ b/paddle/fluid/operators/jit/README.en.md
--- a/paddle/fluid/operators/jit/README.md
+++ b/paddle/fluid/operators/jit/README.md
--- a/paddle/fluid/operators/load_combine_op.cc
+++ b/paddle/fluid/operators/load_combine_op.cc
--- a/paddle/fluid/operators/load_op.cc
+++ b/paddle/fluid/operators/load_op.cc
--- a/paddle/fluid/operators/lod_reset_op.cc
+++ b/paddle/fluid/operators/lod_reset_op.cc
--- a/paddle/fluid/operators/lookup_table_op.cc
+++ b/paddle/fluid/operators/lookup_table_op.cc
--- a/paddle/fluid/operators/reader/ctr_reader.h
+++ b/paddle/fluid/operators/reader/ctr_reader.h
--- a/paddle/fluid/operators/reshape_op.cc
+++ b/paddle/fluid/operators/reshape_op.cc
--- a/paddle/fluid/operators/save_combine_op.cc
+++ b/paddle/fluid/operators/save_combine_op.cc
--- a/paddle/fluid/operators/save_combine_op.cu
+++ b/paddle/fluid/operators/save_combine_op.cu
--- a/paddle/fluid/operators/softmax_op.cc
+++ b/paddle/fluid/operators/softmax_op.cc
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
--- a/paddle/fluid/pybind/tensor_py.h
+++ b/paddle/fluid/pybind/tensor_py.h
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
--- a/python/paddle/fluid/contrib/int8_inference/README.md
+++ b/python/paddle/fluid/contrib/int8_inference/README.md
--- a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
+++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
--- a/python/paddle/fluid/contrib/slim/tests/test_graph_wrapper.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_graph_wrapper.py
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
--- a/python/paddle/fluid/imperative/base.py
+++ b/python/paddle/fluid/imperative/base.py
--- a/python/paddle/fluid/imperative/layer_object_helper.py
+++ b/python/paddle/fluid/imperative/layer_object_helper.py
--- a/python/paddle/fluid/imperative/nn.py
+++ b/python/paddle/fluid/imperative/nn.py
--- a/python/paddle/fluid/imperative/tracer.py
+++ b/python/paddle/fluid/imperative/tracer.py
--- a/python/paddle/fluid/initializer.py
+++ b/python/paddle/fluid/initializer.py
--- a/python/paddle/fluid/install_check.py
+++ b/python/paddle/fluid/install_check.py
--- a/python/paddle/fluid/layer_helper.py
+++ b/python/paddle/fluid/layer_helper.py
--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
--- a/python/paddle/fluid/tests/unittests/test_dist_save_load.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_save_load.py
--- a/python/paddle/fluid/tests/unittests/test_eager_deletion_delete_vars.py
+++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_delete_vars.py
--- a/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py
+++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py
--- a/python/paddle/fluid/tests/unittests/test_eager_deletion_gru_net.py
+++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_gru_net.py
--- a/python/paddle/fluid/tests/unittests/test_eager_deletion_lstm_net.py
+++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_lstm_net.py
--- a/python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py
+++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py
--- a/python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py
+++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py
--- a/python/paddle/fluid/tests/unittests/test_eager_deletion_while_op.py
+++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_while_op.py
--- a/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py
--- a/python/paddle/fluid/tests/unittests/test_install_check.py
+++ b/python/paddle/fluid/tests/unittests/test_install_check.py
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
--- a/python/paddle/fluid/tests/unittests/test_nce_remote_table_op.py
+++ b/python/paddle/fluid/tests/unittests/test_nce_remote_table_op.py
--- a/python/paddle/fluid/tests/unittests/test_partial_eager_deletion_transformer.py
+++ b/python/paddle/fluid/tests/unittests/test_partial_eager_deletion_transformer.py
--- a/python/paddle/fluid/tests/unittests/test_roi_align_op.py
+++ b/python/paddle/fluid/tests/unittests/test_roi_align_op.py
--- a/python/paddle/fluid/tests/unittests/test_tensor.py
+++ b/python/paddle/fluid/tests/unittests/test_tensor.py
--- a/python/paddle/fluid/tests/unittests/test_variable.py
+++ b/python/paddle/fluid/tests/unittests/test_variable.py