diff --git a/CMakeLists.txt b/CMakeLists.txt
index 231224f9249848b6e4981a98e0538794bf5d3c08..bdd48565edeca051f54e8fe4eb51cd1dbd5e836a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -200,6 +200,14 @@ include(external/snappy)    # download snappy
 include(external/snappystream)
 include(external/threadpool)
 
+if(WITH_GPU)
+    include(cuda)
+    include(tensorrt)
+    include(external/anakin)
+else()
+  set(WITH_ANAKIN OFF CACHE STRING "Anakin is valid only when GPU is set." FORCE)
+endif()
+
 include(cudnn)              # set cudnn libraries, must before configure
 include(cupti)
 include(configure)          # add paddle env configuration
@@ -228,14 +236,6 @@ set(EXTERNAL_LIBS
     ${PYTHON_LIBRARIES}
 )
 
-if(WITH_GPU)
-    include(cuda)
-    include(tensorrt)
-    include(external/anakin)
-else()
-  set(WITH_ANAKIN OFF CACHE STRING "Anakin is valid only when GPU is set." FORCE)
-endif()
-
 if(WITH_AMD_GPU)
     find_package(HIP)
     include(hip)
diff --git a/cmake/cudnn.cmake b/cmake/cudnn.cmake
index 2c84061ff572de4687b4d496f8ded6deee8d1011..9eebea816cbfc91052c95ecf99ecc4b0bea4e4c2 100644
--- a/cmake/cudnn.cmake
+++ b/cmake/cudnn.cmake
@@ -21,6 +21,7 @@ list(APPEND CUDNN_CHECK_LIBRARY_DIRS
     ${CUDNN_ROOT}/lib64
     ${CUDNN_ROOT}/lib
     ${CUDNN_ROOT}/lib/${TARGET_ARCH}-linux-gnu
+    ${CUDNN_ROOT}/local/cuda-${CUDA_VERSION}/targets/${TARGET_ARCH}-linux/lib/
     $ENV{CUDNN_ROOT}
     $ENV{CUDNN_ROOT}/lib64
     $ENV{CUDNN_ROOT}/lib
diff --git a/cmake/external/anakin.cmake b/cmake/external/anakin.cmake
index fb3d8ef8d53436f387acc3069a0eb887e6f07c59..8b7d91f234594becdda805c089fac0bb4e4e8e44 100644
--- a/cmake/external/anakin.cmake
+++ b/cmake/external/anakin.cmake
@@ -8,6 +8,7 @@ set(ANAKIN_INCLUDE "${ANAKIN_INSTALL_DIR}" CACHE STRING "root of Anakin header f
 set(ANAKIN_LIBRARY "${ANAKIN_INSTALL_DIR}" CACHE STRING "path of Anakin library")
 
 set(ANAKIN_COMPILE_EXTRA_FLAGS 
+    -Wno-error=unused-but-set-variable -Wno-unused-but-set-variable
     -Wno-error=unused-variable -Wno-unused-variable 
     -Wno-error=format-extra-args -Wno-format-extra-args
     -Wno-error=comment -Wno-comment 
@@ -19,7 +20,7 @@ set(ANAKIN_COMPILE_EXTRA_FLAGS
     -Wno-reorder 
     -Wno-error=cpp)
 
-set(ANAKIN_LIBRARY_URL "https://github.com/pangge/Anakin/releases/download/3.0/anakin_release_simple.tar.gz")
+set(ANAKIN_LIBRARY_URL "https://github.com/pangge/Anakin/releases/download/Version0.1.0/anakin.tar.gz")
 
 # A helper function used in Anakin, currently, to use it, one need to recursively include
 # nearly all the header files.
@@ -41,9 +42,9 @@ if (NOT EXISTS "${ANAKIN_INSTALL_DIR}")
     message(STATUS "Download Anakin library from ${ANAKIN_LIBRARY_URL}")
     execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_INSTALL_DIR}")
     execute_process(COMMAND bash -c "rm -rf ${ANAKIN_INSTALL_DIR}/*")
-    execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; wget -q ${ANAKIN_LIBRARY_URL}")
+    execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; wget --no-check-certificate -q ${ANAKIN_LIBRARY_URL}")
     execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_INSTALL_DIR}")
-    execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; tar xzf anakin_release_simple.tar.gz")
+    execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; tar xzf anakin.tar.gz")
 endif()
 
 if (WITH_ANAKIN)
diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index 5f3bfa296546fcbc6a3410d7ae072ff74954bc74..3ef317bb7a1c25c5738342f34ae7994b0184a7de 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -263,9 +263,7 @@ paddle.fluid.layers.gaussian_random_batch_size_like ArgSpec(args=[], varargs='ar
 paddle.fluid.layers.scatter ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
 paddle.fluid.layers.sum ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
 paddle.fluid.layers.slice ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
-paddle.fluid.layers.polygon_box_transform ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
 paddle.fluid.layers.shape ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
-paddle.fluid.layers.iou_similarity ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
 paddle.fluid.layers.maxout ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
 paddle.fluid.layers.sigmoid ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
 paddle.fluid.layers.logsigmoid ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
@@ -306,7 +304,9 @@ paddle.fluid.layers.ssd_loss ArgSpec(args=['location', 'confidence', 'gt_box', '
 paddle.fluid.layers.detection_map ArgSpec(args=['detect_res', 'label', 'class_num', 'background_label', 'overlap_threshold', 'evaluate_difficult', 'has_state', 'input_states', 'out_states', 'ap_version'], varargs=None, keywords=None, defaults=(0, 0.3, True, None, None, None, 'integral'))
 paddle.fluid.layers.rpn_target_assign ArgSpec(args=['loc', 'scores', 'anchor_box', 'gt_box', 'rpn_batch_size_per_im', 'fg_fraction', 'rpn_positive_overlap', 'rpn_negative_overlap'], varargs=None, keywords=None, defaults=(256, 0.25, 0.7, 0.3))
 paddle.fluid.layers.anchor_generator ArgSpec(args=['input', 'anchor_sizes', 'aspect_ratios', 'variance', 'stride', 'offset', 'name'], varargs=None, keywords=None, defaults=(None, None, [0.1, 0.1, 0.2, 0.2], None, 0.5, None))
+paddle.fluid.layers.iou_similarity ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
 paddle.fluid.layers.box_coder ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
+paddle.fluid.layers.polygon_box_transform ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
 paddle.fluid.layers.accuracy ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None))
 paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk'], varargs=None, keywords=None, defaults=('ROC', 200, 1))
 paddle.fluid.layers.exponential_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,))
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index 139411f3e0d945f9265d19a28487c05d06722d69..6440607dbe4666ff3ff91dc526465706b3b9c1f0 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -7,6 +7,7 @@ cc_library(ddim SRCS ddim.cc DEPS eigen3 boost)
 cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
 nv_test(dim_test SRCS dim_test.cu DEPS ddim)
 cc_library(data_type SRCS data_type.cc DEPS framework_proto ddim device_context)
+cc_test(data_type_test SRCS data_type_test.cc DEPS data_type place tensor)
 if(WITH_GPU)
   nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context)
 else()
diff --git a/paddle/fluid/framework/data_type.cc b/paddle/fluid/framework/data_type.cc
index 60382faffb8e53870658b2d1ff83abc4008cb4cf..1a9ce746ea840bc088d222cc4e9bc05159d64734 100644
--- a/paddle/fluid/framework/data_type.cc
+++ b/paddle/fluid/framework/data_type.cc
@@ -17,6 +17,8 @@
 #include <string>
 #include <unordered_map>
 
+using float16 = paddle::platform::float16;
+
 namespace paddle {
 namespace framework {
 
@@ -53,7 +55,7 @@ static DataTypeMap* InitDataTypeMap() {
   RegisterType<cc_type>(retv, proto_type, #cc_type)
 
   // NOTE: Add your customize type here.
-  RegType(platform::float16, proto::VarType::FP16);
+  RegType(float16, proto::VarType::FP16);
   RegType(float, proto::VarType::FP32);
   RegType(double, proto::VarType::FP64);
   RegType(int, proto::VarType::INT32);
diff --git a/paddle/fluid/framework/data_type_test.cc b/paddle/fluid/framework/data_type_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..54c41c55ba63c0b2001cfcb6a9e94fbb0036d437
--- /dev/null
+++ b/paddle/fluid/framework/data_type_test.cc
@@ -0,0 +1,40 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/framework/data_type.h"
+
+#include <string>
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/tensor.h"
+
+TEST(DataType, float16) {
+  using paddle::framework::Tensor;
+  using paddle::platform::CPUPlace;
+  using paddle::platform::float16;
+  namespace f = paddle::framework;
+  f::proto::VarType::Type dtype = f::proto::VarType::FP16;
+
+  Tensor tensor;
+  CPUPlace cpu;
+  tensor.mutable_data(cpu, f::ToTypeIndex(dtype));
+
+  // test fp16 tensor
+  EXPECT_EQ(tensor.type(), std::type_index(typeid(float16)));
+
+  // test fp16 size
+  EXPECT_EQ(f::SizeOfType(f::ToTypeIndex(dtype)), 2u);
+
+  // test debug info
+  std::string type = "float16";
+  EXPECT_STREQ(f::DataTypeToString(dtype).c_str(), type.c_str());
+}
diff --git a/paddle/fluid/framework/details/exception_holder.h b/paddle/fluid/framework/details/exception_holder.h
new file mode 100644
index 0000000000000000000000000000000000000000..6e302a29233b96451df14b4685911be1cd87c1ab
--- /dev/null
+++ b/paddle/fluid/framework/details/exception_holder.h
@@ -0,0 +1,94 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <exception>
+#include <memory>
+#include <mutex>  // NOLINT
+#include <utility>
+
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+// ExceptionHolder keeps at most one caught exception (EnforceNotMet or
+// EOFException) so that it can be rethrown later through Throw().
+// Every member function takes mu_ before touching the stored state.
+class ExceptionHolder {
+ public:
+  // Stores a copy of `exp`, replacing any exception stored before.
+  void Catch(const platform::EnforceNotMet& exp) {
+    std::lock_guard<std::mutex> lock(mu_);
+    exception_.reset(new platform::EnforceNotMet(exp));
+    type_ = kEnforceNotMet;
+  }
+
+  // Stores a copy of `exp` only if the holder is currently empty.
+  void Catch(const platform::EOFException& exp) {
+    std::lock_guard<std::mutex> lock(mu_);
+    // EOFException will not cover up existing EnforceNotMet.
+    if (exception_.get() == nullptr) {
+      exception_.reset(new platform::EOFException(exp));
+      type_ = kEOF;
+    }
+  }
+
+  // Returns true while an exception is stored.
+  bool ExceptionCatched() const {
+    std::lock_guard<std::mutex> lock(mu_);
+    return exception_.get() != nullptr;
+  }
+
+  // Rethrows the stored exception with its concrete type and leaves the
+  // holder empty. Returns normally when nothing is stored.
+  void Throw() {
+    std::lock_guard<std::mutex> lock(mu_);
+    // Empty the holder up front: nothing placed after a `throw` would run.
+    std::unique_ptr<std::exception> held = std::move(exception_);
+    const ExceptionType held_type = type_;
+    type_ = kNone;
+    switch (held_type) {
+      case kNone:
+        break;
+      case kEnforceNotMet:
+        // `throw` copies *held, so releasing `held` while unwinding is safe.
+        throw *static_cast<platform::EnforceNotMet*>(held.get());
+      case kEOF:
+        throw *static_cast<platform::EOFException*>(held.get());
+      default:
+        LOG(FATAL) << "Unknown exception.";
+    }
+  }
+
+  // Drops the stored exception, if any.
+  void Clear() {
+    std::lock_guard<std::mutex> lock(mu_);
+    exception_.reset();
+    type_ = kNone;
+  }
+
+ private:
+  enum ExceptionType { kNone, kEnforceNotMet, kEOF };
+  ExceptionType type_{kNone};
+
+  std::unique_ptr<std::exception> exception_;
+  mutable std::mutex mu_;
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
index eec405073377b2782d7636c08e6eb3a7bd41202d..e556c84b0219eba2b92c456c205e03947171626b 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -83,7 +83,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
 
   // Clean run context
   run_op_futures_.clear();
-  exception_.reset();
+  exception_holder_.Clear();
 
   // Step 3. Execution
   while (!pending_vars.empty()) {
@@ -103,23 +103,11 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
     auto cur_ready_vars = ready_vars.PopAll(1, &timeout);
 
     if (timeout) {
-      std::unique_lock<std::mutex> l(exception_mu_);
-      if (exception_) {
-        l.unlock();
+      if (exception_holder_.ExceptionCatched()) {
         for (auto &run_op_future : run_op_futures_) {
           run_op_future.wait();
         }
-        l.lock();
-        std::exception *exp = exception_.get();
-        if (dynamic_cast<platform::EOFException *>(exp)) {
-          auto e = *static_cast<platform::EOFException *>(exp);
-          throw e;
-        } else if (dynamic_cast<platform::EnforceNotMet *>(exp)) {
-          auto e = *static_cast<platform::EnforceNotMet *>(exp);
-          throw e;
-        } else {
-          LOG(FATAL) << "Unknown exception.";
-        }
+        exception_holder_.Throw();
       } else {
         continue;
       }
@@ -229,14 +217,9 @@ void ThreadedSSAGraphExecutor::RunOp(
       ready_var_q->Extend(op->Outputs());
       VLOG(10) << op << " " << op->Name() << "Signal posted";
     } catch (platform::EOFException ex) {
-      std::lock_guard<std::mutex> l(exception_mu_);
-      // EOFException will not cover up existing EnforceNotMet.
-      if (exception_.get() == nullptr) {
-        exception_.reset(new platform::EOFException(ex));
-      }
+      exception_holder_.Catch(ex);
     } catch (platform::EnforceNotMet ex) {
-      std::lock_guard<std::mutex> l(exception_mu_);
-      exception_.reset(new platform::EnforceNotMet(ex));
+      exception_holder_.Catch(ex);
     } catch (...) {
       LOG(FATAL) << "Unknown exception catched";
     }
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
index b0aaf60701fcfdfac0d9f2e547e6fb5edf63156c..9135c1f5d435d5e2c60eb90c80803361aa31a3c4 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
@@ -24,6 +24,7 @@
 #include <functional>
 #include "ThreadPool.h"  // ThreadPool in thrird party
 #include "paddle/fluid/framework/blocking_queue.h"
+#include "paddle/fluid/framework/details/exception_holder.h"
 #include "paddle/fluid/framework/details/execution_strategy.h"
 #include "paddle/fluid/framework/details/fetch_op_handle.h"
 #include "paddle/fluid/framework/details/ssa_graph_executor.h"
@@ -59,8 +60,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
   std::vector<Scope *> local_scopes_;
   std::vector<platform::Place> places_;
   platform::DeviceContextPool fetch_ctxs_;
-  std::mutex exception_mu_;
-  std::unique_ptr<std::exception> exception_;
+  ExceptionHolder exception_holder_;
   std::atomic<int> running_ops_;
 
   void InsertPendingOp(std::unordered_map<OpHandleBase *, size_t> *pending_ops,
diff --git a/paddle/fluid/framework/ir/graph_helper_test.cc b/paddle/fluid/framework/ir/graph_helper_test.cc
index b517442bb73f43bc1cb1d639b6c6cf004b28d4cf..a260dd3da2a7863c06e51aa4feafd824ea254139 100644
--- a/paddle/fluid/framework/ir/graph_helper_test.cc
+++ b/paddle/fluid/framework/ir/graph_helper_test.cc
@@ -116,8 +116,8 @@ TEST(GraphHelperTest, Basic) {
   for (size_t i = 0; i < sorted.size(); ++i) {
     node_map[sorted[i]->Name()] = i;
   }
-  ASSERT_EQ(node_map.at("op1"), 0);
-  ASSERT_EQ(node_map.at("op2"), 1);
+  ASSERT_EQ(node_map.at("op1"), 0UL);
+  ASSERT_EQ(node_map.at("op2"), 1UL);
   ASSERT_TRUE(node_map.at("op3") < node_map.at("op5"));
 }
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/graph_test.cc b/paddle/fluid/framework/ir/graph_test.cc
index 73ef55756c330bdbc3be89c436967b2a88625a43..f9e6bdf3625bdced9d1a9195a979b0f46016d8bf 100644
--- a/paddle/fluid/framework/ir/graph_test.cc
+++ b/paddle/fluid/framework/ir/graph_test.cc
@@ -97,15 +97,15 @@ TEST(GraphTest, Basic) {
   std::vector<ir::Node *> nodes(g->Nodes().begin(), g->Nodes().end());
   for (ir::Node *n : nodes) {
     if (n->Name() == "sum") {
-      ASSERT_EQ(n->inputs.size(), 3);
-      ASSERT_EQ(n->outputs.size(), 1);
+      ASSERT_EQ(n->inputs.size(), 3UL);
+      ASSERT_EQ(n->outputs.size(), 1UL);
     } else if (n->Name() == "test_a" || n->Name() == "test_b" ||
                n->Name() == "test_c") {
-      ASSERT_EQ(n->inputs.size(), 0);
-      ASSERT_EQ(n->outputs.size(), 1);
+      ASSERT_EQ(n->inputs.size(), 0UL);
+      ASSERT_EQ(n->outputs.size(), 1UL);
     } else if (n->Name() == "test_out") {
-      ASSERT_EQ(n->inputs.size(), 1);
-      ASSERT_EQ(n->outputs.size(), 0);
+      ASSERT_EQ(n->inputs.size(), 1UL);
+      ASSERT_EQ(n->outputs.size(), 0UL);
     }
   }
-  ASSERT_EQ(nodes.size(), 5);
+  ASSERT_EQ(nodes.size(), 5UL);
diff --git a/paddle/fluid/framework/op_kernel_type_test.cc b/paddle/fluid/framework/op_kernel_type_test.cc
index db95861c510b52a5b52229541434e6437d3fb9f4..3e17a512ce154de88ac890f3b29f03385595d95c 100644
--- a/paddle/fluid/framework/op_kernel_type_test.cc
+++ b/paddle/fluid/framework/op_kernel_type_test.cc
@@ -29,6 +29,13 @@ TEST(OpKernelType, ToString) {
   ASSERT_EQ(paddle::framework::KernelTypeToString(op_kernel_type),
             "data_type[float]:data_layout[NCHW]:place[CPUPlace]:library_type["
             "CUDNN]");
+
+  using CUDAPlace = paddle::platform::CUDAPlace;
+  OpKernelType op_kernel_type2(DataType::FP16, CUDAPlace(0), DataLayout::kNCHW,
+                               LibraryType::kCUDNN);
+  ASSERT_EQ(paddle::framework::KernelTypeToString(op_kernel_type2),
+            "data_type[float16]:data_layout[NCHW]:place[CUDAPlace(0)]:library_"
+            "type[CUDNN]");
 }
 
 TEST(OpKernelType, Hash) {
diff --git a/paddle/fluid/framework/op_proto_maker.cc b/paddle/fluid/framework/op_proto_maker.cc
index 001b5cb5a8eb57cbe0a2e0ad7f64ef05f8149922..2288c7fe6609a765612b468d69ad35101b92b384 100644
--- a/paddle/fluid/framework/op_proto_maker.cc
+++ b/paddle/fluid/framework/op_proto_maker.cc
@@ -40,6 +40,43 @@ OpProtoAndCheckerMaker::VariableBuilder OpProtoAndCheckerMaker::AddOutput(
   return OpProtoAndCheckerMaker::VariableBuilder{output};
 }
 
+// Records in the proto that output `name` reuses input `reused_name`.
+// Enforces that `reused_name` is a declared input and that `name` is a
+// declared output which has no reuse entry yet.
+void OpProtoAndCheckerMaker::Reuse(const std::string& name,
+                                   const std::string& reused_name) {
+  // Step 1: the reused variable has to be among the declared inputs.
+  bool found = false;
+  for (auto& input : proto_->inputs()) {
+    if (input.name() == reused_name) {
+      found = true;
+      break;
+    }
+  }
+  PADDLE_ENFORCE(found,
+                 "Input/Output name: %s reused_name: %s, one of them is not "
+                 "exists or not matched.",
+                 name, reused_name);
+
+  // Step 2: attach the reuse entry to the matching output.
+  found = false;
+  for (int i = 0; i < proto_->outputs().size(); ++i) {
+    proto::OpProto::Var* output = proto_->mutable_outputs()->Mutable(i);
+    if (output->name() == name) {
+      PADDLE_ENFORCE(!output->has_reuse(),
+                     "Output(%s) has been set reused var of %s", name,
+                     output->reuse());
+      found = true;
+      output->set_reuse(reused_name);
+      break;
+    }
+  }
+  PADDLE_ENFORCE(found,
+                 "Input/Output name: %s reused_name: %s, one of them is not "
+                 "exists or not matched.",
+                 name, reused_name);
+}
+
 void OpProtoAndCheckerMaker::CheckNoDuplicatedInOutAttrs() {
   std::unordered_set<std::string> names;
   auto checker = [&](const std::string& name) {
diff --git a/paddle/fluid/framework/op_proto_maker.h b/paddle/fluid/framework/op_proto_maker.h
index 92f86bb5de520878d0a7b8d7214620580242c061..80970291c9c234f1306162f4ffa3c2528f88c35f 100644
--- a/paddle/fluid/framework/op_proto_maker.h
+++ b/paddle/fluid/framework/op_proto_maker.h
@@ -78,6 +78,8 @@ class OpProtoAndCheckerMaker {
   VariableBuilder AddOutput(const std::string &name,
                             const std::string &comment);
 
+  void Reuse(const std::string &name, const std::string &reused_name);
+
   template <typename T>
   TypedAttrChecker<T> &AddAttr(const std::string &name,
                                const std::string &comment,
diff --git a/paddle/fluid/framework/op_proto_maker_test.cc b/paddle/fluid/framework/op_proto_maker_test.cc
index 58f70cb39c0d96ed3b9ff35ea132ba75a37f5405..b71c7b646857e11f291748c4c7c2af92b6d53231 100644
--- a/paddle/fluid/framework/op_proto_maker_test.cc
+++ b/paddle/fluid/framework/op_proto_maker_test.cc
@@ -49,6 +49,15 @@ TEST(ProtoMaker, DuplicatedInOut) {
 }
 
 class TestInplaceProtoMaker : public paddle::framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() {
+    AddInput("X", "input of test op");
+    AddOutput("XOut", "output of test op").Reuse("X");
+  }
+};
+
+class TestInplaceProtoMaker2
+    : public paddle::framework::OpProtoAndCheckerMaker {
  public:
   void Make() {
     AddInput("X", "input of test op");
@@ -58,12 +67,100 @@ class TestInplaceProtoMaker : public paddle::framework::OpProtoAndCheckerMaker {
 };
 
 TEST(ProtoMaker, InplaceOutput) {
-  paddle::framework::proto::OpProto op_proto;
+  paddle::framework::proto::OpProto op_proto, op_proto2;
   paddle::framework::OpAttrChecker op_checker;
   TestInplaceProtoMaker proto_maker;
-  ASSERT_THROW(proto_maker(&op_proto, &op_checker),
+  TestInplaceProtoMaker2 proto_maker2;
+
+  proto_maker(&op_proto, &op_checker);
+
+  ASSERT_THROW(proto_maker2(&op_proto2, &op_checker),
                paddle::platform::EnforceNotMet);
-  // proto_maker(&op_proto, &op_checker);
-  // proto_maker.Make();
-  // ASSERT_THROW(proto_maker.Validate(), paddle::platform::EnforceNotMet);
 }
+
+// normal reuse
+class TestReuseProtoMaker : public paddle::framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() {
+    AddInput("X", "input of test op");
+    AddInput("Y", "input of test op");
+    AddOutput("Out", "output of test op");
+    AddOutput("XOut", "output of test op");
+    // avoid destructor exception.
+    // Validate();
+    TestReuse();
+  }
+
+  virtual void TestReuse() {}
+};
+
+// test duplicate reuse error
+class TestReuseProtoMaker2 : public TestReuseProtoMaker {
+ public:
+  void TestReuse() {
+    Reuse("Out", "X");
+    Reuse("Out", "Y");
+  }
+};
+
+// NotExists Input
+class TestReuseProtoMaker3 : public TestReuseProtoMaker {
+ public:
+  void TestReuse() {
+    Reuse("Out", "NotExists");
+    Reuse("XOut", "X");
+  }
+};
+
+// NotExists Output
+class TestReuseProtoMaker4 : public TestReuseProtoMaker {
+ public:
+  void TestReuse() { Reuse("NotExists", "X"); }
+};
+
+TEST(ProtoMaker, Reuse) {
+  paddle::framework::proto::OpProto op_proto;
+  paddle::framework::OpAttrChecker op_checker;
+  TestReuseProtoMaker proto_maker;
+  proto_maker(&op_proto, &op_checker);
+}
+
+// NOTE(dzhwinter):
+// There is a fatal CHECK in the base class destructor, which calls abort()
+// instead of
+// throwing an exception. If we throw an exception in Make(), we will trigger
+// the CHECK and terminate the tests.
+//
+// I tried to replace the default CHECK with an exception; however, that is
+// still not supported by glog.
+// the details:
+// https://github.com/google/glog/issues/249
+// https://github.com/facebookresearch/TensorComprehensions/issues/351
+/*
+TEST(ProtoMaker, ReuseWithException) {
+  paddle::framework::proto::OpProto op_proto2, op_proto3, op_proto4;
+  paddle::framework::OpAttrChecker op_checker;
+  TestReuseProtoMaker2 proto_maker2;
+  TestReuseProtoMaker3 proto_maker3;
+  TestReuseProtoMaker4 proto_maker4;
+  EXPECT_THROW(proto_maker2(&op_proto2, &op_checker),
+               paddle::platform::EnforceNotMet);
+
+  EXPECT_THROW(proto_maker3(&op_proto3, &op_checker),
+               paddle::platform::EnforceNotMet);
+
+  EXPECT_THROW(proto_maker4(&op_proto4, &op_checker),
+               paddle::platform::EnforceNotMet);
+}
+
+void FailureFunction() {
+  throw std::runtime_error("Check failed in destructor.");
+  // return 0;
+}
+
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  google::InstallFailureFunction(&FailureFunction);
+  return RUN_ALL_TESTS();
+}
+*/
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 7c1c29fd9a81c558f7fd05abf52cd0a6dd522190..cdac00739bc48648b41751e644a953d0d310ffbf 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -18,6 +18,7 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/data_transform.h"
 #include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/shape_inference.h"
 #include "paddle/fluid/framework/var_type.h"
@@ -57,7 +58,11 @@ static DDim GetDims(const Scope& scope, const std::string& name,
   }
 
   if (var->IsType<LoDTensor>()) {
-    return var->Get<LoDTensor>().dims();
+    const LoDTensor& tensor = var->Get<LoDTensor>();
+    if (UNLIKELY(!tensor.IsInitialized())) {
+      return DDim({-1});
+    }
+    return tensor.dims();
   } else if (var->IsType<SelectedRows>()) {
     if (get_actual_dim) {
       return var->Get<SelectedRows>().value().dims();
@@ -69,6 +74,26 @@ static DDim GetDims(const Scope& scope, const std::string& name,
   }
 }
 
+// Returns the element type name of variable `name` for debug strings, or ""
+// if it is missing, holds no tensor, or the tensor is not initialized yet.
+static std::string GetDtype(const Scope& scope, const std::string& name) {
+  Variable* var = scope.FindVar(name);
+  if (var == nullptr) {
+    return "";
+  }
+  const Tensor* tensor = nullptr;
+  if (var->IsType<LoDTensor>()) {
+    tensor = &var->Get<LoDTensor>();
+  } else if (var->IsType<SelectedRows>()) {
+    tensor = &var->Get<SelectedRows>().value();
+  }
+  // Apply the initialization guard to both kinds before asking for type().
+  if (tensor == nullptr || UNLIKELY(!tensor->IsInitialized())) {
+    return "";
+  }
+  return DataTypeToString(ToDataType(tensor->type()));
+}
+
 static int GetRowSize(const Scope& scope, const std::string& name) {
   Variable* var = scope.FindVar(name);
   if (var == nullptr) {
@@ -91,7 +116,11 @@ static LoD GetLoD(const Scope& scope, const std::string& name) {
   }
 
   if (var->IsType<LoDTensor>()) {
-    return var->Get<LoDTensor>().lod();
+    const LoDTensor& tensor = var->Get<LoDTensor>();
+    if (UNLIKELY(!tensor.IsInitialized())) {
+      return default_lod;
+    }
+    return tensor.lod();
   } else {
     return default_lod;
   }
@@ -172,6 +201,8 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const {
         if (row_size >= 0) {
           ss << "[row_size=" << row_size << "]";
         }
+        std::string dtype = GetDtype(*scope, input.second[i]);
+        ss << ":" << dtype;
         ss << "[" << GetDims(*scope, input.second[i], true) << "]";
         ss << "(" << GetLoD(*scope, input.second[i]) << ")";
       }
diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h
index ef224d68f1fc561f45e9d7a81425e62655457648..0bbfd66148e9bc9080654bf1b0b34477115a0e6b 100644
--- a/paddle/fluid/framework/tensor.h
+++ b/paddle/fluid/framework/tensor.h
@@ -82,7 +82,7 @@ class Tensor {
   template <typename T>
   const T* data() const;
 
-  bool IsInitialized() const;
+  inline bool IsInitialized() const;
 
   /**
    * @brief   Return a pointer to mutable memory block.
diff --git a/paddle/fluid/framework/tensor_test.cc b/paddle/fluid/framework/tensor_test.cc
index 0a1cb6d5703dace5e6be73285655ecd9d2ad89fb..cb2061c06a429d8e8116001a4aa4e8c46ea13428 100644
--- a/paddle/fluid/framework/tensor_test.cc
+++ b/paddle/fluid/framework/tensor_test.cc
@@ -15,6 +15,7 @@
 #include "paddle/fluid/framework/tensor.h"
 #include <gtest/gtest.h>
 #include <string>
+#include "paddle/fluid/platform/float16.h"
 
 namespace framework = paddle::framework;
 namespace platform = paddle::platform;
@@ -213,3 +214,17 @@ TEST(Tensor, Layout) {
   src.set_layout(framework::DataLayout::kAnyLayout);
   ASSERT_EQ(src.layout(), framework::DataLayout::kAnyLayout);
 }
+
+TEST(Tensor, FP16) {
+  using platform::float16;
+  framework::Tensor src;
+  float16* src_ptr = src.mutable_data<float16>({2, 3}, platform::CPUPlace());
+  for (int i = 0; i < 2 * 3; ++i) {
+    src_ptr[i] = static_cast<float16>(i);
+  }
+  EXPECT_EQ(src.memory_size(), 2 * 3 * sizeof(float16));
+  // EXPECT a human readable error message
+  // src.data<uint8_t>();
+  // Tensor holds the wrong type, it holds N6paddle8platform7float16E at
+  // [/paddle/Paddle/paddle/fluid/framework/tensor_impl.h:43]
+}
diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt
index 67d355d10d3c9e11b59c9ce9d208826523095459..27fe575cb6167a726ff92a8f3d2e47b6f536ba39 100644
--- a/paddle/fluid/inference/analysis/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/CMakeLists.txt
@@ -6,9 +6,11 @@ cc_library(analysis SRCS pass_manager.cc dot.cc node.cc data_flow_graph.cc graph
   tensorrt_subgraph_node_mark_pass.cc
   analyzer.cc
   helper.cc
+  model_store_pass.cc
   DEPS framework_proto proto_desc)
 cc_test(test_node SRCS node_tester.cc DEPS analysis)
 cc_test(test_dot SRCS dot_tester.cc DEPS analysis)
+cc_binary(inference_analyzer SRCS analyzer_main.cc DEPS analysis)
 
 set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests)
 
@@ -40,3 +42,4 @@ inference_analysis_test(test_tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass_
 inference_analysis_test(test_pass_manager SRCS pass_manager_tester.cc)
 inference_analysis_test(test_tensorrt_subgraph_node_mark_pass SRCS tensorrt_subgraph_node_mark_pass_tester.cc)
 inference_analysis_test(test_analyzer SRCS analyzer_tester.cc)
+inference_analysis_test(test_model_store_pass SRCS model_store_pass_tester.cc)
diff --git a/paddle/fluid/inference/analysis/analyzer.cc b/paddle/fluid/inference/analysis/analyzer.cc
index b3a1075e5adf4a24bf32017574c061f36c46ba8c..98bdfcc00b9f0e8f40dfc92e4021b2bd6fb19313 100644
--- a/paddle/fluid/inference/analysis/analyzer.cc
+++ b/paddle/fluid/inference/analysis/analyzer.cc
@@ -17,6 +17,7 @@
 #include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h"
 #include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
 #include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
+#include "paddle/fluid/inference/analysis/model_store_pass.h"
 #include "paddle/fluid/inference/analysis/pass_manager.h"
 #include "paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h"
 #include "paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h"
@@ -29,6 +30,9 @@ DEFINE_bool(inference_analysis_enable_tensorrt_subgraph_engine, false,
 DEFINE_string(inference_analysis_graphviz_log_root, "./",
               "Graphviz debuger for data flow graphs.");
 
+DEFINE_string(inference_analysis_output_storage_path, "",
+              "optimized model output path");
+
 namespace inference {
 namespace analysis {
 
@@ -47,6 +51,9 @@ class DfgPassManagerImpl final : public DfgPassManager {
       AddPass("tensorrt-subgraph", new TensorRTSubGraphPass(trt_teller));
     }
     AddPass("data-flow-graph-to-fluid", new DataFlowGraphToFluidPass);
+    if (!FLAGS_inference_analysis_output_storage_path.empty()) {
+      AddPass("model-store-pass", new ModelStorePass);
+    }
   }
 
   std::string repr() const override { return "dfg-pass-manager"; }
diff --git a/paddle/fluid/inference/analysis/analyzer.h b/paddle/fluid/inference/analysis/analyzer.h
index 0132bf5b9c6552391aaa19542669487f42b685a7..c82fdfff86c91b4e07e3c1b80987d3d8d796ad23 100644
--- a/paddle/fluid/inference/analysis/analyzer.h
+++ b/paddle/fluid/inference/analysis/analyzer.h
@@ -16,28 +16,23 @@ limitations under the License. */
 
 /*
  * This file contains Analyzer, an class that exposed as a library that analyze
- * and optimize
- * Fluid ProgramDesc for inference. Similar to LLVM, it has multiple flags to
- * control whether
- * an process is applied on the program.
+ * and optimize Fluid ProgramDesc for inference. Similar to LLVM, it has
+ * multiple flags to
+ * control whether a process is applied on the program.
  *
  * The processes are called Passes in analysis, the Passes are placed in a
- * pipeline, the first
- * Pass is the FluidToDataFlowGraphPass which transforms a Fluid ProgramDesc to
- * a data flow
- * graph, the last Pass is DataFlowGraphToFluidPass which transforms a data flow
- * graph to a
- * Fluid ProgramDesc. The passes in the middle of the pipeline can be any Passes
- * which take a
- * node or data flow graph as input.
+ * pipeline, the first Pass is the FluidToDataFlowGraphPass which transforms a
+ * Fluid ProgramDesc to
+ * a data flow graph, the last Pass is DataFlowGraphToFluidPass which transforms
+ * a data flow graph to a Fluid ProgramDesc. The passes in the middle of the
+ * pipeline can be any Passes
+ * which take a node or data flow graph as input.
  *
  * The Analyzer can be used in two methods, the first is a executable file which
- * can be used to
- * pre-process the inference model and can be controlled by passing difference
- * command flags;
+ * can be used to pre-process the inference model and can be controlled by
+ * passing different command flags;
  * the other way is to compose inside the inference API as a runtime pre-process
- * phase in the
- * inference service.
+ * phase in the inference service.
  */
 
 #include <gflags/gflags.h>
@@ -50,6 +45,7 @@ namespace paddle {
 // flag if not available.
 DECLARE_bool(inference_analysis_enable_tensorrt_subgraph_engine);
 DECLARE_string(inference_analysis_graphviz_log_root);
+DECLARE_string(inference_analysis_output_storage_path);
 
 namespace inference {
 namespace analysis {
diff --git a/paddle/fluid/inference/analysis/analyzer_main.cc b/paddle/fluid/inference/analysis/analyzer_main.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5e1fe3eb797cdced56a61aa2db0c3d18601824f8
--- /dev/null
+++ b/paddle/fluid/inference/analysis/analyzer_main.cc
@@ -0,0 +1,33 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/*
+ * This file implements the analyzer executable, a helper that analyzes and
+ * optimizes a trained model.
+ */
+#include "paddle/fluid/inference/analysis/analyzer.h"
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+
+int main(int argc, char** argv) {
+  google::ParseCommandLineFlags(&argc, &argv, true);
+  using paddle::inference::analysis::Analyzer;
+  using paddle::inference::analysis::Argument;
+
+  Argument argument;
+  Analyzer analyzer;
+  analyzer.Run(&argument);
+
+  return 0;
+}
diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc
index 25a440e7e71fddb38cc515f99d15231675a8172e..24bfb3993cf569561980006b6627b56327dd0f67 100644
--- a/paddle/fluid/inference/analysis/analyzer_tester.cc
+++ b/paddle/fluid/inference/analysis/analyzer_tester.cc
@@ -20,14 +20,18 @@ namespace paddle {
 namespace inference {
 namespace analysis {
 
-TEST_F(DFG_Tester, analysis_without_tensorrt) {
+TEST(Analyzer, analysis_without_tensorrt) {
   FLAGS_inference_analysis_enable_tensorrt_subgraph_engine = false;
+  Argument argument;
+  argument.fluid_model_dir.reset(new std::string(FLAGS_inference_model_dir));
   Analyzer analyser;
   analyser.Run(&argument);
 }
 
-TEST_F(DFG_Tester, analysis_with_tensorrt) {
+TEST(Analyzer, analysis_with_tensorrt) {
   FLAGS_inference_analysis_enable_tensorrt_subgraph_engine = true;
+  Argument argument;
+  argument.fluid_model_dir.reset(new std::string(FLAGS_inference_model_dir));
   Analyzer analyser;
   analyser.Run(&argument);
 }
diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h
index 6d316f20bff7a68754b0afec6463bd5d7579227f..a17d6281a2976f0600c7ce94c2d43e65d30de265 100644
--- a/paddle/fluid/inference/analysis/argument.h
+++ b/paddle/fluid/inference/analysis/argument.h
@@ -23,6 +23,7 @@
 
 #pragma once
 
+#include <string>
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/inference/analysis/data_flow_graph.h"
 
@@ -36,6 +37,16 @@ namespace analysis {
  * All the fields should be registered here for clearness.
  */
 struct Argument {
+  Argument() = default;
+  explicit Argument(const std::string& fluid_model_dir)
+      : fluid_model_dir(new std::string(fluid_model_dir)) {}
+  // The directory of the trained model.
+  std::unique_ptr<std::string> fluid_model_dir;
+  // The path of `__model__` and `param`, this is used when the file name of
+  // model and param is changed.
+  std::unique_ptr<std::string> fluid_model_program_path;
+  std::unique_ptr<std::string> fluid_model_param_path;
+
   // The graph that process by the Passes or PassManagers.
   std::unique_ptr<DataFlowGraph> main_dfg;
 
@@ -44,6 +55,9 @@ struct Argument {
 
   // The processed program desc.
   std::unique_ptr<framework::proto::ProgramDesc> transformed_program_desc;
+
+  // The output storage path of ModelStorePass.
+  std::unique_ptr<std::string> model_output_store_path;
 };
 
 #define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0)
diff --git a/paddle/fluid/inference/analysis/data_flow_graph.h b/paddle/fluid/inference/analysis/data_flow_graph.h
index bc1875f4d851c5d28d290357d94528fe3303f631..16aeae4d35e7bd54646053190da7f47eaca69aa0 100644
--- a/paddle/fluid/inference/analysis/data_flow_graph.h
+++ b/paddle/fluid/inference/analysis/data_flow_graph.h
@@ -176,7 +176,7 @@ struct GraphTraits<DataFlowGraph> {
 // sub-graph is the inputs nodes and output nodes that doesn't inside the
 // sub-graph.
 std::pair<std::vector<Node *>, std::vector<Node *>>
-ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph);
+ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph);  // NOLINT
 
 }  // namespace analysis
 }  // namespace inference
diff --git a/paddle/fluid/inference/analysis/data_flow_graph_tester.cc b/paddle/fluid/inference/analysis/data_flow_graph_tester.cc
index 7912f8d7f17ae3c79e8f73f36b7095fd52c9ac86..a881262665f156812da9e1576aa29b05fc398499 100644
--- a/paddle/fluid/inference/analysis/data_flow_graph_tester.cc
+++ b/paddle/fluid/inference/analysis/data_flow_graph_tester.cc
@@ -20,7 +20,7 @@ namespace inference {
 namespace analysis {
 
 TEST(DataFlowGraph, BFS) {
-  auto desc = LoadProgramDesc();
+  auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__");
   auto dfg = ProgramDescToDFG(desc);
   dfg.Build();
 
@@ -44,7 +44,7 @@ TEST(DataFlowGraph, BFS) {
 }
 
 TEST(DataFlowGraph, DFS) {
-  auto desc = LoadProgramDesc();
+  auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__");
   auto dfg = ProgramDescToDFG(desc);
   dfg.Build();
   GraphTraits<DataFlowGraph> trait(&dfg);
diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc
index d8fc5e580a98f76233f01fdc4d7987311f78ee45..4ef381db295b986b91173a728b6d98640f6f4f51 100644
--- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc
+++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc
@@ -26,21 +26,21 @@ namespace paddle {
 namespace inference {
 namespace analysis {
 
-TEST_F(DFG_Tester, Test) {
-  DataFlowGraph graph;
+TEST(DataFlowGraph, Test) {
+  Argument argument(FLAGS_inference_model_dir);
 
   FluidToDataFlowGraphPass pass0;
   DataFlowGraphToFluidPass pass1;
   ASSERT_TRUE(pass0.Initialize(&argument));
   ASSERT_TRUE(pass1.Initialize(&argument));
 
-  pass0.Run(&graph);
-  pass1.Run(&graph);
+  pass0.Run(argument.main_dfg.get());
+  pass1.Run(argument.main_dfg.get());
 
   pass0.Finalize();
   pass1.Finalize();
 
-  LOG(INFO) << graph.nodes.size();
+  LOG(INFO) << argument.main_dfg->nodes.size();
 }
 
 };  // namespace analysis
diff --git a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc
index 65842b1e850953e77e3d4d28416609be271af9f1..928be7917047382d9b86294f6039b26b0ebf6f49 100644
--- a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc
+++ b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc
@@ -23,12 +23,18 @@ namespace paddle {
 namespace inference {
 namespace analysis {
 
-TEST_F(DFG_Tester, dfg_graphviz_draw_pass_tester) {
-  auto dfg = ProgramDescToDFG(*argument.origin_program_desc);
+TEST(DFG_GraphvizDrawPass, dfg_graphviz_draw_pass_tester) {
+  Argument argument(FLAGS_inference_model_dir);
+  FluidToDataFlowGraphPass pass0;
+  ASSERT_TRUE(pass0.Initialize(&argument));
+  pass0.Run(argument.main_dfg.get());
+
+  // auto dfg = ProgramDescToDFG(*argument.origin_program_desc);
+
   DFG_GraphvizDrawPass::Config config("./", "test");
   DFG_GraphvizDrawPass pass(config);
   pass.Initialize(&argument);
-  pass.Run(&dfg);
+  pass.Run(argument.main_dfg.get());
 
   // test content
   std::ifstream file("./0-graph_test.dot");
diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc
index 88fdf8c9cb4ce5369d70d416bbcfe6a4c7f23a98..511631d3e067f14bc1230d9e4b4d92dbe604e1d4 100644
--- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc
+++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <glog/logging.h>
 #include <string>
 #include <vector>
 
@@ -25,8 +26,20 @@ namespace analysis {
 
 bool FluidToDataFlowGraphPass::Initialize(Argument *argument) {
   ANALYSIS_ARGUMENT_CHECK_FIELD(argument);
-  ANALYSIS_ARGUMENT_CHECK_FIELD(argument->origin_program_desc);
-  PADDLE_ENFORCE(argument);
+  if (argument->origin_program_desc) {
+    LOG(WARNING) << "argument's origin_program_desc is already set, might "
+                    "duplicate called";
+  }
+  if (!argument->fluid_model_program_path) {
+    ANALYSIS_ARGUMENT_CHECK_FIELD(argument->fluid_model_dir);
+    argument->fluid_model_program_path.reset(
+        new std::string(*argument->fluid_model_dir + "/__model__"));
+  }
+  ANALYSIS_ARGUMENT_CHECK_FIELD(argument->fluid_model_program_path);
+  auto program = LoadProgramDesc(*argument->fluid_model_program_path);
+  argument->origin_program_desc.reset(
+      new framework::proto::ProgramDesc(program));
+
   if (!argument->main_dfg) {
     argument->main_dfg.reset(new DataFlowGraph);
   }
diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc
index dadb84059d21adab44159a6145b345460663cb96..d218dcd05015aa4636c16569de4addf4936c8cd5 100644
--- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc
+++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc
@@ -21,8 +21,9 @@ namespace paddle {
 namespace inference {
 namespace analysis {
 
-TEST_F(DFG_Tester, Init) {
+TEST(FluidToDataFlowGraphPass, Test) {
   FluidToDataFlowGraphPass pass;
+  Argument argument(FLAGS_inference_model_dir);
   pass.Initialize(&argument);
   pass.Run(argument.main_dfg.get());
   // Analysis is sensitive to ProgramDesc, careful to change the original model.
diff --git a/paddle/fluid/inference/analysis/helper.h b/paddle/fluid/inference/analysis/helper.h
index f1064cd20f28092d80d3fd23a862da080b6cc2f3..a0f912b251d5ea29594a7f601d5b2bce91201790 100644
--- a/paddle/fluid/inference/analysis/helper.h
+++ b/paddle/fluid/inference/analysis/helper.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <cstdio>
+#include <fstream>
 #include <string>
 #include <typeindex>
 #include <unordered_map>
@@ -136,6 +137,28 @@ static void ExecShellCommand(const std::string &cmd, std::string *message) {
   }
 }
 
+// Loads the binary-serialized ProgramDesc stored in the file `model_path`
+// (for example "<model_dir>/__model__"). Enforces that the file can be opened,
+// fully read and parsed, so a broken model fails here with a clear message.
+static framework::proto::ProgramDesc LoadProgramDesc(
+    const std::string &model_path) {
+  std::ifstream fin(model_path, std::ios::in | std::ios::binary);
+  PADDLE_ENFORCE(fin.is_open(), "Cannot open file %s", model_path);
+  fin.seekg(0, std::ios::end);
+  // tellg() returns -1 on failure, which must not be used as a buffer size.
+  const std::streamoff file_size = fin.tellg();
+  PADDLE_ENFORCE(file_size >= 0, "Cannot get the size of file %s", model_path);
+  std::string buffer(static_cast<size_t>(file_size), ' ');
+  fin.seekg(0, std::ios::beg);
+  fin.read(&buffer[0], buffer.size());
+  PADDLE_ENFORCE(!fin.fail(), "Cannot read file %s", model_path);
+  fin.close();
+  framework::proto::ProgramDesc program_desc;
+  PADDLE_ENFORCE(program_desc.ParseFromString(buffer),
+                 "Cannot parse ProgramDesc from file %s", model_path);
+  return program_desc;
+}
+
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/model_store_pass.cc b/paddle/fluid/inference/analysis/model_store_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1c429176424bd5c1d8fa5e015c19d698f966880e
--- /dev/null
+++ b/paddle/fluid/inference/analysis/model_store_pass.cc
@@ -0,0 +1,76 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <fstream>
+#include <sstream>
+#include <string>
+
+#include "paddle/fluid/inference/analysis/analyzer.h"
+#include "paddle/fluid/inference/analysis/argument.h"
+#include "paddle/fluid/inference/analysis/model_store_pass.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+// Stores the optimized model: copies every file of the original model
+// directory to argument_->model_output_store_path and then overwrites
+// `__model__` there with the serialized transformed program desc.
+// Requires Initialize() to have been called, and DataFlowGraphToFluidPass to
+// have filled argument_->transformed_program_desc. The graph `x` is not read.
+void ModelStorePass::Run(DataFlowGraph *x) {
+  PADDLE_ENFORCE_NOT_NULL(argument_, "Initialize() should be called first.");
+  // The model directory is the source of the copy below, so it is required
+  // even when fluid_model_param_path has been set explicitly.
+  PADDLE_ENFORCE_NOT_NULL(argument_->fluid_model_dir);
+  if (!argument_->fluid_model_param_path) {
+    argument_->fluid_model_param_path.reset(
+        new std::string(*argument_->fluid_model_dir + "/param"));
+  }
+  PADDLE_ENFORCE_NOT_NULL(argument_->model_output_store_path);
+  // Copy all files of the original model directory to the destination.
+  std::stringstream ss;
+  // NOTE these commands only work on linux.
+  ss << "mkdir -p " << *argument_->model_output_store_path;
+  LOG(INFO) << "run command: " << ss.str();
+  PADDLE_ENFORCE_EQ(system(ss.str().c_str()), 0);
+  ss.str("");
+
+  ss << "cp " << *argument_->fluid_model_dir << "/*"
+     << " " << *argument_->model_output_store_path;
+  LOG(INFO) << "run command: " << ss.str();
+  PADDLE_ENFORCE_EQ(system(ss.str().c_str()), 0);
+
+  // Store program
+  PADDLE_ENFORCE_NOT_NULL(argument_->transformed_program_desc,
+                          "program desc is not transformed, should call "
+                          "DataFlowGraphToFluidPass first.");
+  const std::string program_output_path =
+      *argument_->model_output_store_path + "/__model__";
+  std::ofstream file(program_output_path, std::ios::binary);
+  PADDLE_ENFORCE(file.is_open(), "failed to open %s to write.",
+                 program_output_path);
+  const std::string serialized_message =
+      argument_->transformed_program_desc->SerializeAsString();
+  file.write(serialized_message.c_str(), serialized_message.size());
+  // Flush now so that a failed write is reported instead of silently ignored.
+  file.close();
+  PADDLE_ENFORCE(!file.fail(), "failed to write %s.", program_output_path);
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/model_store_pass.h b/paddle/fluid/inference/analysis/model_store_pass.h
new file mode 100644
index 0000000000000000000000000000000000000000..fac7083925776b6209d49255c9e67b930cb1250b
--- /dev/null
+++ b/paddle/fluid/inference/analysis/model_store_pass.h
@@ -0,0 +1,53 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/*
+ * This file defines ModelStorePass, which stores the runtime DFG as a Paddle
+ * model on disk, so that the model can be reloaded for prediction.
+ */
+
+#pragma once
+#include <string>
+#include "paddle/fluid/inference/analysis/pass.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+class ModelStorePass : public DataFlowGraphPass {
+ public:
+  bool Initialize(Argument* argument) override {
+    if (!argument) {
+      LOG(ERROR) << "invalid argument";
+      return false;
+    }
+    argument_ = argument;
+    return true;
+  }
+
+  void Run(DataFlowGraph* x) override;
+
+  std::string repr() const override { return "DFG-store-pass"; }
+  std::string description() const override {
+    return R"DD(This file defines ModelStorePass, which store the runtime DFG to a Paddle
+    model in the disk, and that model can be reloaded for prediction again.)DD";
+  }
+
+ private:
+  Argument* argument_{nullptr};
+};
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/model_store_pass_tester.cc b/paddle/fluid/inference/analysis/model_store_pass_tester.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5f3526dd504e77e58d79b4f675db86a22fd0f26b
--- /dev/null
+++ b/paddle/fluid/inference/analysis/model_store_pass_tester.cc
@@ -0,0 +1,43 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/analysis/model_store_pass.h"
+
+#include <gflags/gflags.h>
+#include <gtest/gtest.h>
+#include "paddle/fluid/inference/analysis/analyzer.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+DEFINE_string(inference_model_dir, "", "Model path");
+
+TEST(DFG_StorePass, test) {
+  Analyzer analyzer;
+  Argument argument(FLAGS_inference_model_dir);
+  argument.model_output_store_path.reset(
+      new std::string("./_dfg_store_pass_tmp"));
+  // disable storage in analyzer
+  FLAGS_inference_analysis_output_storage_path = "";
+  analyzer.Run(&argument);
+
+  ModelStorePass pass;
+  pass.Initialize(&argument);
+  pass.Run(argument.main_dfg.get());
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/pass.h b/paddle/fluid/inference/analysis/pass.h
index 6b4dbb3bb5ddd9f15f26758beef1d1b5bbf49142..6806f9ff7dada2c1e2328e1ffbfd225afefcf474 100644
--- a/paddle/fluid/inference/analysis/pass.h
+++ b/paddle/fluid/inference/analysis/pass.h
@@ -50,6 +50,7 @@ class Pass {
   // Create a debugger Pass that draw the DFG by graphviz toolkit.
   virtual Pass *CreateGraphvizDebugerPass() const { return nullptr; }
 
+  virtual void Run() { LOG(FATAL) << "not valid"; }
   // Run on a single Node.
   virtual void Run(Node *x) { LOG(FATAL) << "not valid"; }
   // Run on a single Function.
diff --git a/paddle/fluid/inference/analysis/pass_manager_tester.cc b/paddle/fluid/inference/analysis/pass_manager_tester.cc
index dac1c509d728114bd24a2ea1150c407646026fd4..13423e4837e12a96e7a5dfc9ca3f59bf8b14746a 100644
--- a/paddle/fluid/inference/analysis/pass_manager_tester.cc
+++ b/paddle/fluid/inference/analysis/pass_manager_tester.cc
@@ -56,7 +56,7 @@ class TestNodePass final : public NodePass {
   std::string description() const override { return "some doc"; }
 };
 
-TEST_F(DFG_Tester, DFG_pass_manager) {
+TEST(PassManager, DFG_pass_manager) {
   TestDfgPassManager manager;
   DFG_GraphvizDrawPass::Config config("./", "dfg.dot");
 
@@ -64,12 +64,15 @@ TEST_F(DFG_Tester, DFG_pass_manager) {
   manager.Register("graphviz", new DFG_GraphvizDrawPass(config));
   manager.Register("dfg-to-fluid", new DataFlowGraphToFluidPass);
 
+  Argument argument(FLAGS_inference_model_dir);
+
   ASSERT_TRUE(&argument);
   ASSERT_TRUE(manager.Initialize(&argument));
   manager.RunAll();
 }
 
-TEST_F(DFG_Tester, Node_pass_manager) {
+TEST(PassManager, Node_pass_manager) {
+  Argument argument(FLAGS_inference_model_dir);
   // Pre-process: initialize the DFG with the ProgramDesc first.
   FluidToDataFlowGraphPass pass0;
   pass0.Initialize(&argument);
diff --git a/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc b/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc
index 67dd4da54b95add703428e1fded61065f60353e8..39cc433b40fad17f4f12359d4e907a250a88bd63 100644
--- a/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc
+++ b/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc
@@ -31,8 +31,8 @@ SubGraphSplitter::NodeInsideSubgraphTeller teller = [](const Node* node) {
   return false;
 };
 
-TEST_F(DFG_Tester, Split) {
-  auto desc = LoadProgramDesc();
+TEST(SubGraphSplitter, Split) {
+  auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__");
   auto dfg = ProgramDescToDFG(desc);
   LOG(INFO) << "spliter\n" << dfg.DotString();
 
@@ -63,8 +63,8 @@ TEST_F(DFG_Tester, Split) {
   ASSERT_EQ(subgraphs.back().size(), 6UL);
 }
 
-TEST_F(DFG_Tester, Fuse) {
-  auto desc = LoadProgramDesc();
+TEST(SubGraphSplitter, Fuse) {
+  auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__");
   auto dfg = ProgramDescToDFG(desc);
 
   size_t count0 = dfg.nodes.size();
diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass_tester.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass_tester.cc
index a6c15e848b99ca318f4583e3d4b88345fe8e5ebc..c1d932878e559180af987594535959afdf475587 100644
--- a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass_tester.cc
+++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass_tester.cc
@@ -22,11 +22,11 @@ namespace paddle {
 namespace inference {
 namespace analysis {
 
-TEST_F(DFG_Tester, tensorrt_subgraph_node_mark_pass) {
+TEST(TensorRTSubgraphNodeMarkPass, test) {
   // init
   FluidToDataFlowGraphPass pass;
+  Argument argument(FLAGS_inference_model_dir);
   ASSERT_TRUE(pass.Initialize(&argument));
-  argument.main_dfg.reset(new DataFlowGraph);
   pass.Run(argument.main_dfg.get());
 
   TensorRTSubgraphNodeMarkPass::teller_t teller = [](const Node* node) {
@@ -41,7 +41,7 @@ TEST_F(DFG_Tester, tensorrt_subgraph_node_mark_pass) {
   for (auto& node : argument.main_dfg->nodes.nodes()) {
     counter += node->attr(ATTR_supported_by_tensorrt).Bool();
   }
-
+  ASSERT_EQ(counter, 2);
   LOG(INFO) << counter << " nodes marked";
 }
 
diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc
index 1d749d3fa3f39b351ccee6ebeb82467f7220a0b6..67a5af83d89b771536ea11be51b35244ff5c09d6 100644
--- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc
+++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc
@@ -25,7 +25,7 @@ namespace analysis {
 
 DEFINE_string(dot_dir, "./", "");
 
-TEST_F(DFG_Tester, tensorrt_single_pass) {
+TEST(TensorRTSubGraphPass, main) {
   std::unordered_set<std::string> teller_set(
       {"elementwise_add", "mul", "sigmoid"});
   SubGraphSplitter::NodeInsideSubgraphTeller teller = [&](const Node* node) {
@@ -35,7 +35,8 @@ TEST_F(DFG_Tester, tensorrt_single_pass) {
     return false;
   };
 
-  LOG(INFO) << "init";
+  Argument argument(FLAGS_inference_model_dir);
+
   DFG_GraphvizDrawPass::Config config{FLAGS_dot_dir, "origin"};
   DFG_GraphvizDrawPass::Config config1{FLAGS_dot_dir, "fusion"};
 
@@ -44,13 +45,11 @@ TEST_F(DFG_Tester, tensorrt_single_pass) {
   FluidToDataFlowGraphPass pass0;
   TensorRTSubGraphPass trt_pass(std::move(teller));
 
-  LOG(INFO) << "Initialize";
   dfg_pass.Initialize(&argument);
   dfg_pass1.Initialize(&argument);
   pass0.Initialize(&argument);
   trt_pass.Initialize(&argument);
 
-  LOG(INFO) << "Run";
   argument.main_dfg.reset(new DataFlowGraph);
   pass0.Run(argument.main_dfg.get());
   dfg_pass.Run(argument.main_dfg.get());
diff --git a/paddle/fluid/inference/analysis/ut_helper.h b/paddle/fluid/inference/analysis/ut_helper.h
index ce1191a567a4198f003520c40bf02487c48c56eb..1073a6f686eaeeaaae2d93ab044149b7df518085 100644
--- a/paddle/fluid/inference/analysis/ut_helper.h
+++ b/paddle/fluid/inference/analysis/ut_helper.h
@@ -20,7 +20,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/inference/analysis/data_flow_graph.h"
 #include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
-#include "paddle/fluid/inference/analysis/ut_helper.h"
+#include "paddle/fluid/inference/analysis/helper.h"
 
 namespace paddle {
 namespace inference {
@@ -32,27 +32,12 @@ namespace analysis {
 
 DEFINE_string(inference_model_dir, "", "inference test model dir");
 
-static framework::proto::ProgramDesc LoadProgramDesc(
-    const std::string& model_dir = FLAGS_inference_model_dir) {
-  std::string msg;
-  std::string net_file = FLAGS_inference_model_dir + "/__model__";
-  std::ifstream fin(net_file, std::ios::in | std::ios::binary);
-  PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s", net_file);
-  fin.seekg(0, std::ios::end);
-  msg.resize(fin.tellg());
-  fin.seekg(0, std::ios::beg);
-  fin.read(&(msg.at(0)), msg.size());
-  fin.close();
-  framework::proto::ProgramDesc program_desc;
-  program_desc.ParseFromString(msg);
-  return program_desc;
-}
-
 static DataFlowGraph ProgramDescToDFG(
     const framework::proto::ProgramDesc& desc) {
   DataFlowGraph graph;
   FluidToDataFlowGraphPass pass;
   Argument argument;
+  argument.fluid_model_dir.reset(new std::string(FLAGS_inference_model_dir));
   argument.origin_program_desc.reset(new framework::proto::ProgramDesc(desc));
   pass.Initialize(&argument);
   pass.Run(&graph);
@@ -63,7 +48,7 @@ static DataFlowGraph ProgramDescToDFG(
 class DFG_Tester : public ::testing::Test {
  protected:
   void SetUp() override {
-    auto desc = LoadProgramDesc(FLAGS_inference_model_dir);
+    auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__");
     argument.origin_program_desc.reset(new framework::proto::ProgramDesc(desc));
   }
 
diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt
index 3e60a61793339990648737c3d549d46cc5f5a887..259d79bedbf664f52b1189ca71567665a6d91180 100644
--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
@@ -19,6 +19,7 @@ endif(APPLE)
 
 
 set(inference_deps paddle_inference_api paddle_fluid_api)
+
 if(WITH_GPU AND TENSORRT_FOUND)
     set(inference_deps ${inference_deps} paddle_inference_tensorrt_subgraph_engine)
 endif()
@@ -63,6 +64,7 @@ endif()
 if (WITH_ANAKIN) # only needed in CI
     # Due to Anakin do not have official library releases and the versions of protobuf and cuda do not match Paddle's,
     # so anakin library will not be merged to our official inference library. To use anakin prediction API, one need to
-    # compile the libinference_anakin_api.a and anakin.so.
+    # compile the libinference_anakin_api.a and compile with anakin.so.
+    fetch_include_recursively(${ANAKIN_INCLUDE})
     nv_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc)
     nv_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc)
@@ -73,7 +75,7 @@ if (WITH_ANAKIN) # only needed in CI
     if (WITH_TESTING)
         cc_test(inference_anakin_test SRCS api_anakin_engine_tester.cc
                                   ARGS --model=${ANAKIN_INSTALL_DIR}/mobilenet_v2.anakin.bin
-                                  DEPS inference_anakin_api)
+                                  DEPS inference_anakin_api_shared)
         target_compile_options(inference_anakin_test BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
      endif(WITH_TESTING)
 endif()
diff --git a/paddle/fluid/inference/api/api_anakin_engine.cc b/paddle/fluid/inference/api/api_anakin_engine.cc
index 0206ac60103759deda91be741617bde63e003de6..6b374ceefbc180a5c22abe591f12e1c3d89bc64a 100644
--- a/paddle/fluid/inference/api/api_anakin_engine.cc
+++ b/paddle/fluid/inference/api/api_anakin_engine.cc
@@ -18,26 +18,36 @@
 
 namespace paddle {
 
-PaddleInferenceAnakinPredictor::PaddleInferenceAnakinPredictor(
+template <typename Target>
+PaddleInferenceAnakinPredictor<Target>::PaddleInferenceAnakinPredictor(
     const AnakinConfig &config) {
   CHECK(Init(config));
 }
 
-bool PaddleInferenceAnakinPredictor::Init(const AnakinConfig &config) {
+template <typename Target>
+bool PaddleInferenceAnakinPredictor<Target>::Init(const AnakinConfig &config) {
   if (!(graph_.load(config.model_file))) {
+    LOG(FATAL) << "fail to load graph from " << config.model_file;
     return false;
   }
-  graph_.ResetBatchSize("input_0", config.max_batch_size);
+  auto inputs = graph_.get_ins();
+  for (auto &input_str : inputs) {
+    graph_.ResetBatchSize(input_str, config.max_batch_size);
+  }
   // optimization for graph
   if (!(graph_.Optimize())) {
     return false;
   }
   // construct executer
-  executor_.init(graph_);
+  if (executor_p_ == nullptr) {
+    executor_p_ = new anakin::Net<Target, anakin::saber::AK_FLOAT,
+                                  anakin::Precision::FP32>(graph_, true);
+  }
   return true;
 }
 
-bool PaddleInferenceAnakinPredictor::Run(
+template <typename Target>
+bool PaddleInferenceAnakinPredictor<Target>::Run(
     const std::vector<PaddleTensor> &inputs,
     std::vector<PaddleTensor> *output_data, int batch_size) {
   for (const auto &input : inputs) {
@@ -46,7 +56,29 @@ bool PaddleInferenceAnakinPredictor::Run(
                  << "'s type is not float";
       return false;
     }
-    auto d_tensor_in_p = executor_.get_in(input.name);
+    auto d_tensor_in_p = executor_p_->get_in(input.name);
+    auto net_shape = d_tensor_in_p->valid_shape();
+    if (net_shape.size() != input.shape.size()) {
+      LOG(ERROR) << " input  " << input.name
+                 << "'s shape size should be equal to that of net";
+      return false;
+    }
+    int sum = 1;
+    for_each(input.shape.begin(), input.shape.end(), [&](int n) { sum *= n; });
+    if (sum > net_shape.count()) {
+      graph_.Reshape(input.name, input.shape);
+      delete executor_p_;
+      executor_p_ = new anakin::Net<Target, anakin::saber::AK_FLOAT,
+                                    anakin::Precision::FP32>(graph_, true);
+      d_tensor_in_p = executor_p_->get_in(input.name);
+    }
+
+    anakin::saber::Shape tmp_shape;
+    for (auto s : input.shape) {
+      tmp_shape.push_back(s);
+    }
+    d_tensor_in_p->reshape(tmp_shape);
+
     float *d_data_p = d_tensor_in_p->mutable_data();
     if (cudaMemcpy(d_data_p, static_cast<float *>(input.data.data()),
                    d_tensor_in_p->valid_size() * sizeof(float),
@@ -56,16 +88,17 @@ bool PaddleInferenceAnakinPredictor::Run(
     }
     cudaStreamSynchronize(NULL);
   }
-
-  executor_.prediction();
+  cudaDeviceSynchronize();
+  executor_p_->prediction();
+  cudaDeviceSynchronize();
 
   if (output_data->empty()) {
     LOG(ERROR) << "At least one output should be set with tensors' names.";
     return false;
   }
   for (auto &output : *output_data) {
-    auto *tensor = executor_.get_out(output.name);
-    output.shape = tensor->shape();
+    auto *tensor = executor_p_->get_out(output.name);
+    output.shape = tensor->valid_shape();
     if (output.data.length() < tensor->valid_size() * sizeof(float)) {
       output.data.Resize(tensor->valid_size() * sizeof(float));
     }
@@ -81,19 +114,23 @@ bool PaddleInferenceAnakinPredictor::Run(
   return true;
 }
 
-anakin::Net<anakin::NV, anakin::saber::AK_FLOAT, anakin::Precision::FP32>
-    &PaddleInferenceAnakinPredictor::get_executer() {
-  return executor_;
+template <typename Target>
+anakin::Net<Target, anakin::saber::AK_FLOAT, anakin::Precision::FP32>
+    &PaddleInferenceAnakinPredictor<Target>::get_executer() {
+  return *executor_p_;
 }
 
 // the cloned new Predictor of anakin share the same net weights from original
 // Predictor
-std::unique_ptr<PaddlePredictor> PaddleInferenceAnakinPredictor::Clone() {
+template <typename Target>
+std::unique_ptr<PaddlePredictor>
+PaddleInferenceAnakinPredictor<Target>::Clone() {
   VLOG(3) << "Anakin Predictor::clone";
-  std::unique_ptr<PaddlePredictor> cls(new PaddleInferenceAnakinPredictor());
+  std::unique_ptr<PaddlePredictor> cls(
+      new PaddleInferenceAnakinPredictor<Target>());
   // construct executer from other graph
   auto anakin_predictor_p =
-      dynamic_cast<PaddleInferenceAnakinPredictor *>(cls.get());
+      dynamic_cast<PaddleInferenceAnakinPredictor<Target> *>(cls.get());
   if (!anakin_predictor_p) {
     LOG(ERROR) << "fail to call Init";
     return nullptr;
@@ -103,14 +140,28 @@ std::unique_ptr<PaddlePredictor> PaddleInferenceAnakinPredictor::Clone() {
   return std::move(cls);
 }
 
+template class PaddleInferenceAnakinPredictor<anakin::NV>;
+template class PaddleInferenceAnakinPredictor<anakin::X86>;
+
 // A factory to help create difference predictor.
 template <>
 std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
     AnakinConfig, PaddleEngineKind::kAnakin>(const AnakinConfig &config) {
   VLOG(3) << "Anakin Predictor create.";
-  std::unique_ptr<PaddlePredictor> x(
-      new PaddleInferenceAnakinPredictor(config));
-  return x;
-}
+  // Dispatch on the requested backend: each branch instantiates the predictor
+  // template for the matching Anakin target; unknown targets yield nullptr.
+  if (config.target_type == AnakinConfig::NVGPU) {
+    VLOG(3) << "Anakin Predictor create on [ NVIDIA GPU ].";
+    return std::unique_ptr<PaddlePredictor>(
+        new PaddleInferenceAnakinPredictor<anakin::NV>(config));
+  } else if (config.target_type == AnakinConfig::X86) {
+    VLOG(3) << "Anakin Predictor create on [ Intel X86 ].";
+    return std::unique_ptr<PaddlePredictor>(
+        new PaddleInferenceAnakinPredictor<anakin::X86>(config));
+  } else {
+    LOG(ERROR) << "Anakin Predictor create on unknown platform.";
+    return nullptr;
+  }
+}
 
 }  // namespace paddle
diff --git a/paddle/fluid/inference/api/api_anakin_engine.h b/paddle/fluid/inference/api/api_anakin_engine.h
index def096c867ec85624f5b221782ef8b6240923c05..836badd9799228c6c294dcad5df73d039d36a1ff 100644
--- a/paddle/fluid/inference/api/api_anakin_engine.h
+++ b/paddle/fluid/inference/api/api_anakin_engine.h
@@ -20,14 +20,16 @@ limitations under the License. */
 #pragma once
 
 #include <vector>
-#include "paddle/fluid/inference/api/paddle_inference_api.h"
 
-// from anakin
 #include "framework/core/net/net.h"
+#include "framework/graph/graph.h"
+#include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "saber/core/shape.h"
 #include "saber/saber_types.h"
 
 namespace paddle {
 
+template <typename Target>
 class PaddleInferenceAnakinPredictor : public PaddlePredictor {
  public:
   PaddleInferenceAnakinPredictor() {}
@@ -42,19 +44,21 @@ class PaddleInferenceAnakinPredictor : public PaddlePredictor {
 
   std::unique_ptr<PaddlePredictor> Clone() override;
 
-  anakin::Net<anakin::NV, anakin::saber::AK_FLOAT, anakin::Precision::FP32>&
+  anakin::Net<Target, anakin::saber::AK_FLOAT, anakin::Precision::FP32>&
   get_executer();
 
-  ~PaddleInferenceAnakinPredictor() override{};
+  ~PaddleInferenceAnakinPredictor() override {
+    delete executor_p_;
+    executor_p_ = nullptr;
+  };
 
  private:
   bool Init(const AnakinConfig& config);
 
-  anakin::graph::Graph<anakin::NV, anakin::saber::AK_FLOAT,
-                       anakin::Precision::FP32>
+  anakin::graph::Graph<Target, anakin::saber::AK_FLOAT, anakin::Precision::FP32>
       graph_;
-  anakin::Net<anakin::NV, anakin::saber::AK_FLOAT, anakin::Precision::FP32>
-      executor_;
+  anakin::Net<Target, anakin::saber::AK_FLOAT, anakin::Precision::FP32>*
+      executor_p_{nullptr};
   AnakinConfig config_;
 };
 
diff --git a/paddle/fluid/inference/api/api_anakin_engine_tester.cc b/paddle/fluid/inference/api/api_anakin_engine_tester.cc
index d6d631bfbad4278fe99e4553a410a9d9162dcc7b..62e820b68c79a47d963bb174663bfc8c4ac22de3 100644
--- a/paddle/fluid/inference/api/api_anakin_engine_tester.cc
+++ b/paddle/fluid/inference/api/api_anakin_engine_tester.cc
@@ -12,18 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <gflags/gflags.h>
 #include <glog/logging.h>
 #include <gtest/gtest.h>
 
+#include "gflags/gflags.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 
-DEFINE_string(model, "", "Directory of the inference model.");
+DEFINE_string(model, "", "Directory of the inference model(mobile_v2).");
 
 namespace paddle {
 
 AnakinConfig GetConfig() {
   AnakinConfig config;
+  // using AnakinConfig::X86 if you need to use cpu to do inference
+  config.target_type = AnakinConfig::NVGPU;
   config.model_file = FLAGS_model;
   config.device = 0;
   config.max_batch_size = 1;
@@ -36,28 +38,27 @@ TEST(inference, anakin) {
       CreatePaddlePredictor<AnakinConfig, PaddleEngineKind::kAnakin>(config);
 
   float data[1 * 3 * 224 * 224] = {1.0f};
-
-  PaddleTensor tensor{.name = "input_0",
-                      .shape = std::vector<int>({1, 3, 224, 224}),
-                      .data = PaddleBuf(data, sizeof(data)),
-                      .dtype = PaddleDType::FLOAT32};
+  PaddleTensor tensor;
+  tensor.name = "input_0";
+  tensor.shape = std::vector<int>({1, 3, 224, 224});
+  tensor.data = PaddleBuf(data, sizeof(data));
+  tensor.dtype = PaddleDType::FLOAT32;
 
   // For simplicity, we set all the slots with the same data.
-  std::vector<PaddleTensor> paddle_tensor_feeds;
-  paddle_tensor_feeds.emplace_back(std::move(tensor));
+  std::vector<PaddleTensor> paddle_tensor_feeds(1, tensor);
 
-  PaddleTensor tensor_out{.name = "prob_out",
-                          .shape = std::vector<int>({1000, 1}),
-                          .data = PaddleBuf(),
-                          .dtype = PaddleDType::FLOAT32};
+  PaddleTensor tensor_out;
+  tensor_out.name = "prob_out";
+  tensor_out.shape = std::vector<int>({});
+  tensor_out.data = PaddleBuf();
+  tensor_out.dtype = PaddleDType::FLOAT32;
 
-  std::vector<PaddleTensor> outputs;
-  outputs.emplace_back(std::move(tensor_out));
+  std::vector<PaddleTensor> outputs(1, tensor_out);
 
   ASSERT_TRUE(predictor->Run(paddle_tensor_feeds, &outputs));
 
   float* data_o = static_cast<float*>(outputs[0].data.data());
-  for (size_t j = 0; j < 1000; ++j) {
+  for (size_t j = 0; j < outputs[0].data.length() / sizeof(float); ++j) {
     LOG(INFO) << "output[" << j << "]: " << data_o[j];
   }
 }
diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc
index 58fd7c6f8b05a846bd4a82068f09f5d9ef5a6516..08d7af6d3af7054061b15b904c69b2862c629562 100644
--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@@ -183,6 +183,13 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
     // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
     std::memcpy(static_cast<void *>(input_ptr), inputs[i].data.data(),
                 inputs[i].data.length());
+    // TODO(Superjomn) Low performance, need optimization for heavy LoD copy.
+    framework::LoD lod;
+    for (auto &level : inputs[i].lod) {
+      lod.emplace_back(level);
+    }
+    input.set_lod(lod);
+
     feeds->push_back(input);
   }
   return true;
@@ -248,6 +255,10 @@ bool NativePaddlePredictor::GetFetch(
       buffer.Resize(sizeof(float) * data.size());
     }
     std::memcpy(buffer.data(), data.data(), buffer.length());
+    // copy LoD
+    for (const auto &level : fetchs[i].lod()) {
+      outputs->at(i).lod.emplace_back(level);
+    }
     outputs->at(i).dtype = PaddleDType::FLOAT32;
     // TODO(panyx0718): support other types? fill tensor name? avoid a copy.
   }
diff --git a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc
index c0891e9c281961fa03d278a0f5c676f92672c419..45b5a7638b7dc6a54bbd905766fd5c284cb6aea1 100644
--- a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc
+++ b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc
@@ -90,6 +90,18 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor {
   void OptimizeInferenceProgram() {
     // Analyze inference_program
     Argument argument;
+    if (!config_.model_dir.empty()) {
+      argument.fluid_model_dir.reset(new std::string(config_.model_dir));
+    } else {
+      PADDLE_ENFORCE(
+          !config_.param_file.empty(),
+          "Either model_dir or (param_file, prog_file) should be set.");
+      PADDLE_ENFORCE(!config_.prog_file.empty());
+      argument.fluid_model_program_path.reset(
+          new std::string(config_.prog_file));
+      argument.fluid_model_param_path.reset(
+          new std::string(config_.param_file));
+    }
     argument.origin_program_desc.reset(
         new ProgramDesc(*inference_program_->Proto()));
     Singleton<Analyzer>::Global().Run(&argument);
diff --git a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc
index 62d98a796708612e7d4ff8abfd85125978ce22c7..fcbf9b89d608e7961e3ef81ac1c70e083dae1cc0 100644
--- a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc
+++ b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc
@@ -49,11 +49,10 @@ void CompareTensorRTWithFluid(bool enable_tensorrt) {
     std::vector<int64_t> data(20);
     for (int i = 0; i < 20; i++) data[i] = i;
 
-    PaddleTensor tensor{
-        .name = "",
-        .shape = std::vector<int>({10, 1}),
-        .data = PaddleBuf(data.data(), data.size() * sizeof(int64_t)),
-        .dtype = PaddleDType::INT64};
+    PaddleTensor tensor;
+    tensor.shape = std::vector<int>({10, 1});
+    tensor.data = PaddleBuf(data.data(), data.size() * sizeof(int64_t));
+    tensor.dtype = PaddleDType::INT64;
 
     // For simplicity, we set all the slots with the same data.
     std::vector<PaddleTensor> slots(4, tensor);
diff --git a/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc b/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc
index 5f96fecf93f7a6c42bc6b9fe4e0d985c626388d7..03ac79e9edf0d7ce6e167c3d34af5ba84bbc0e72 100644
--- a/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc
+++ b/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc
@@ -47,10 +47,10 @@ void Main(bool use_gpu) {
     //# 2. Prepare input.
     int64_t data[4] = {1, 2, 3, 4};
 
-    PaddleTensor tensor{.name = "",
-                        .shape = std::vector<int>({4, 1}),
-                        .data = PaddleBuf(data, sizeof(data)),
-                        .dtype = PaddleDType::INT64};
+    PaddleTensor tensor;
+    tensor.shape = std::vector<int>({4, 1});
+    tensor.data = PaddleBuf(data, sizeof(data));
+    tensor.dtype = PaddleDType::INT64;
 
     // For simplicity, we set all the slots with the same data.
     std::vector<PaddleTensor> slots(4, tensor);
@@ -94,10 +94,11 @@ void MainThreads(int num_threads, bool use_gpu) {
       for (int batch_id = 0; batch_id < num_batches; ++batch_id) {
         // 2. Dummy Input Data
         int64_t data[4] = {1, 2, 3, 4};
-        PaddleTensor tensor{.name = "",
-                            .shape = std::vector<int>({4, 1}),
-                            .data = PaddleBuf(data, sizeof(data)),
-                            .dtype = PaddleDType::INT64};
+        PaddleTensor tensor;
+        tensor.shape = std::vector<int>({4, 1});
+        tensor.data = PaddleBuf(data, sizeof(data));
+        tensor.dtype = PaddleDType::INT64;
+
         std::vector<PaddleTensor> inputs(4, tensor);
         std::vector<PaddleTensor> outputs;
         // 3. Run
diff --git a/paddle/fluid/inference/api/demo_ci/vis_demo.cc b/paddle/fluid/inference/api/demo_ci/vis_demo.cc
index 0a2a2b713ab21a3124d8a85ba469f64278623ec4..3800d49b34738d5a272033d75cb415ae9ad1fb8f 100644
--- a/paddle/fluid/inference/api/demo_ci/vis_demo.cc
+++ b/paddle/fluid/inference/api/demo_ci/vis_demo.cc
@@ -20,8 +20,8 @@ limitations under the License. */
 #include <glog/logging.h>  // use glog instead of PADDLE_ENFORCE to avoid importing other paddle header files.
 #include <fstream>
 #include <iostream>
+#include "paddle/fluid/inference/demo_ci/utils.h"
 #include "paddle/fluid/platform/enforce.h"
-#include "utils.h"
 
 #ifdef PADDLE_WITH_CUDA
 DECLARE_double(fraction_of_gpu_memory_to_use);
@@ -123,11 +123,11 @@ void Main(bool use_gpu) {
   file.close();
 
   // Inference.
-  PaddleTensor input{
-      .name = "xx",
-      .shape = record.shape,
-      .data = PaddleBuf(record.data.data(), record.data.size() * sizeof(float)),
-      .dtype = PaddleDType::FLOAT32};
+  PaddleTensor input;
+  input.shape = record.shape;
+  input.data =
+      PaddleBuf(record.data.data(), record.data.size() * sizeof(float));
+  input.dtype = PaddleDType::FLOAT32;
 
   VLOG(3) << "run executor";
   std::vector<PaddleTensor> output;
diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h
index 2f8b4f8596946988a728b5cf82de251bfda778a9..59b0df7968cce137843ba8cad38a62fdb8d3bfc1 100644
--- a/paddle/fluid/inference/api/paddle_inference_api.h
+++ b/paddle/fluid/inference/api/paddle_inference_api.h
@@ -44,7 +44,7 @@ class PaddleBuf {
   PaddleBuf(void* data, size_t length)
       : data_(data), length_(length), memory_owned_{false} {}
   // Own memory.
-  PaddleBuf(size_t length)
+  explicit PaddleBuf(size_t length)
       : data_(new char[length]), length_(length), memory_owned_(true) {}
   // Resize to `length` bytes.
   void Resize(size_t length);
@@ -67,9 +67,9 @@ struct PaddleTensor {
   PaddleTensor() = default;
   std::string name;  // variable name.
   std::vector<int> shape;
-  // TODO(Superjomn) for LoD support, add a vector<vector<int>> field if needed.
   PaddleBuf data;  // blob of data.
   PaddleDType dtype;
+  std::vector<std::vector<uint64_t>> lod;  // lod data
 };
 
 enum class PaddleEngineKind {
@@ -126,9 +126,11 @@ struct NativeConfig : public PaddlePredictor::Config {
 
 // Configurations for Anakin engine.
 struct AnakinConfig : public PaddlePredictor::Config {
+  enum TargetType { NVGPU = 0, X86 };  // backend the predictor is built for
   int device;
   std::string model_file;
   int max_batch_size{-1};
+  TargetType target_type{NVGPU};  // default keeps the former GPU-only path
 };
 
 struct TensorRTConfig : public NativeConfig {
diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
index 3864f337bdadc61e7531304e2cf2ee52a25253f2..d86c046f2e5b08a4c00cf6cad19627e6a196c798 100644
--- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
@@ -1,6 +1,6 @@
 # Add TRT tests
 nv_library(tensorrt_converter
-  SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc
+  SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc
   DEPS tensorrt_engine operator scope framework_proto op_registry)
 
 nv_test(test_op_converter SRCS test_op_converter.cc DEPS
@@ -13,6 +13,10 @@ nv_test(test_trt_fc_op SRCS test_fc_op.cc fc_op.cc
         DEPS ${FLUID_CORE_MODULES} tensorrt_engine mul_op SERIAL)
 nv_test(test_trt_activation_op SRCS test_activation_op.cc activation_op.cc
         DEPS ${FLUID_CORE_MODULES} tensorrt_engine activation_op SERIAL)
-
+nv_test(test_trt_conv_op SRCS test_conv2d_op.cc conv2d_op.cc
+        DEPS ${FLUID_CORE_MODULES} tensorrt_engine conv_op SERIAL)
 nv_test(test_trt_pool2d_op SRCS test_pool2d_op.cc pool2d_op.cc
         DEPS ${FLUID_CORE_MODULES} tensorrt_engine pool_op SERIAL)
+
+nv_test(test_trt_elementwise_op SRCS test_elementwise_op.cc elementwise_op.cc
+        DEPS ${FLUID_CORE_MODULES} tensorrt_engine elementwise_add_op SERIAL)
diff --git a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
index 8e7e23377d4b2fe7afd51f1f58048fc4ed3c6d99..dba1d50b2d1c487ced8e6ca51f2d257641ad5fc7 100644
--- a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
@@ -20,11 +20,60 @@ namespace tensorrt {
 
 class Conv2dOpConverter : public OpConverter {
  public:
-  Conv2dOpConverter() {}
   void operator()(const framework::proto::OpDesc& op,
                   const framework::Scope& scope, bool test_mode) override {
     LOG(INFO)
         << "convert a fluid conv2d op to tensorrt conv layer without bias";
+
+    framework::OpDesc op_desc(op, nullptr);
+    PADDLE_ENFORCE_EQ(op_desc.Input("Input").size(), 1);
+    PADDLE_ENFORCE_EQ(op_desc.Input("Filter").size(), 1);  // Y is a weight
+    PADDLE_ENFORCE_EQ(op_desc.Output("Output").size(), 1);
+
+    auto* X = engine_->GetITensor(op_desc.Input("Input").front());
+    // Declare weights
+    auto* Y_v = scope.FindVar(op_desc.Input("Filter").front());
+    PADDLE_ENFORCE_NOT_NULL(Y_v);
+    auto* Y_t = Y_v->GetMutable<framework::LoDTensor>();
+    auto* weight_data = Y_t->mutable_data<float>(platform::CPUPlace());
+
+    PADDLE_ENFORCE_EQ(Y_t->dims().size(), 4UL);
+    const int n_output = Y_t->dims()[0];
+    const int filter_h = Y_t->dims()[2];
+    const int filter_w = Y_t->dims()[3];
+
+    const int groups = boost::get<int>(op_desc.GetAttr("groups"));
+    const std::vector<int> dilations =
+        boost::get<std::vector<int>>(op_desc.GetAttr("dilations"));
+    const std::vector<int> strides =
+        boost::get<std::vector<int>>(op_desc.GetAttr("strides"));
+    const std::vector<int> paddings =
+        boost::get<std::vector<int>>(op_desc.GetAttr("paddings"));
+
+    nvinfer1::DimsHW nv_ksize(filter_h, filter_w);
+    nvinfer1::DimsHW nv_dilations(dilations[0], dilations[1]);
+    nvinfer1::DimsHW nv_strides(strides[0], strides[1]);
+    nvinfer1::DimsHW nv_paddings(paddings[0], paddings[1]);
+
+    TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT,
+                                  static_cast<void*>(weight_data),
+                                  Y_t->memory_size() / sizeof(float)};
+
+    TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
+    auto* layer = TRT_ENGINE_ADD_LAYER(
+        engine_, Convolution, *const_cast<nvinfer1::ITensor*>(X), n_output,
+        nv_ksize, weight.get(), bias.get());
+    PADDLE_ENFORCE(layer != nullptr);
+    layer->setStride(nv_strides);
+    layer->setPadding(nv_paddings);
+    layer->setDilation(nv_dilations);
+    layer->setNbGroups(groups);
+
+    auto output_name = op_desc.Output("Output").front();
+    engine_->SetITensor(output_name, layer->getOutput(0));
+    if (test_mode) {
+      engine_->DeclareOutput(output_name);
+    }
   }
 };
 
diff --git a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3744550f60a1696aedd8a3ecd24f1b21d22325b9
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc
@@ -0,0 +1,210 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+// Converts an elementwise op whose Y input is a weight to a TensorRT IScaleLayer.
+class ElementwiseWeightOpConverter : public OpConverter {
+ public:
+  void operator()(const framework::proto::OpDesc& op,
+                  const framework::Scope& scope, bool test_mode) override {
+    // Wrap the proto so inputs and outputs can be looked up by name.
+    framework::OpDesc op_desc(op, nullptr);
+    LOG(INFO) << "convert a fluid elementwise op to tensorrt IScaleLayer";
+
+    PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
+    PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1);  // Y is a weight
+    PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
+
+    auto* X = engine_->GetITensor(op_desc.Input("X").front());
+    nvinfer1::Dims dims_x = X->getDimensions();
+    PADDLE_ENFORCE(dims_x.nbDims >= 3);
+
+    auto* Y_v = scope.FindVar(op_desc.Input("Y").front());
+    PADDLE_ENFORCE_NOT_NULL(Y_v);
+    auto* Y_t = Y_v->GetMutable<framework::LoDTensor>();
+    auto* weight_data = Y_t->mutable_data<float>(platform::CPUPlace());
+    auto scale_mode = nvinfer1::ScaleMode::kELEMENTWISE;
+
+    std::vector<int> dims_y = framework::vectorize2int(Y_t->dims());
+    if (static_cast<int>(dims_y.size()) == dims_x.nbDims + 1) {
+      if (dims_y[0] == 1) dims_y.erase(dims_y.begin());
+    }
+
+    if (static_cast<int>(dims_y.size()) == 1 && dims_y[0] == dims_x.d[0]) {
+      scale_mode = nvinfer1::ScaleMode::kCHANNEL;
+    } else if (static_cast<int>(dims_y.size()) == dims_x.nbDims &&
+               dims_y[0] == dims_x.d[0]) {
+      scale_mode = nvinfer1::ScaleMode::kELEMENTWISE;
+      for (int i = 1; i < dims_x.nbDims; i++) {
+        if (dims_y[i] != dims_x.d[i]) {
+          scale_mode = nvinfer1::ScaleMode::kCHANNEL;
+          break;
+        }
+      }
+      if (scale_mode == nvinfer1::ScaleMode::kCHANNEL) {
+        for (int i = 1; i < dims_x.nbDims; i++) {
+          if (dims_y[i] != 1)
+            PADDLE_THROW(
+                "TensorRT unsupported weight shape for Elementwise op!");
+        }
+      }
+    } else {
+      PADDLE_THROW("TensorRT unsupported weight Shape for Elementwise op!");
+    }
+
+    TensorRTEngine::Weight shift_weights{nvinfer1::DataType::kFLOAT,
+                                         static_cast<void*>(weight_data),
+                                         Y_t->memory_size() / sizeof(float)};
+    TensorRTEngine::Weight scale_weights{nvinfer1::DataType::kFLOAT, nullptr,
+                                         0};
+    TensorRTEngine::Weight power_weights{nvinfer1::DataType::kFLOAT, nullptr,
+                                         0};
+
+    nvinfer1::IScaleLayer* layer = TRT_ENGINE_ADD_LAYER(
+        engine_, Scale, *const_cast<nvinfer1::ITensor*>(X), scale_mode,
+        shift_weights.get(), scale_weights.get(), power_weights.get());
+    PADDLE_ENFORCE(layer != nullptr);  // fail here rather than on getOutput()
+    auto output_name = op_desc.Output("Out")[0];
+    engine_->SetITensor(output_name, layer->getOutput(0));
+    if (test_mode) {  // the test framework can not determine which is the
+                      // output, so place the declaration inside.
+      engine_->DeclareOutput(output_name);
+    }
+  }
+};
+
+// Converts an elementwise op on two tensors to a TensorRT IElementWiseLayer.
+class ElementwiseTensorOpConverter : public OpConverter {
+ public:
+  void operator()(const framework::proto::OpDesc& op,
+                  const framework::Scope& scope, bool test_mode) override {
+    // Wrap the proto so inputs and outputs can be looked up by name.
+    framework::OpDesc op_desc(op, nullptr);
+    LOG(INFO) << "convert a fluid elementwise op to tensorrt IElementWiseLayer";
+
+    PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
+    PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1);  // Y is a tensor
+    PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
+
+    auto* X = engine_->GetITensor(op_desc.Input("X").front());
+    auto* Y = engine_->GetITensor(op_desc.Input("Y").front());
+    nvinfer1::Dims dims_x = X->getDimensions();
+    nvinfer1::Dims dims_y = Y->getDimensions();
+
+    // The two input tensors must have exactly the same dims.
+    PADDLE_ENFORCE(dims_x.nbDims >= 3);
+    if (dims_x.nbDims == dims_y.nbDims) {
+      for (int i = 0; i < dims_x.nbDims; i++) {
+        if (dims_x.d[i] != dims_y.d[i])
+          PADDLE_THROW("TensorRT unsupported tensor shape for Elementwise op!");
+      }
+    } else {
+      PADDLE_THROW("TensorRT unsupported tensor shape for Elementwise op!");
+    }
+
+    auto op_pair = ops.find(op_type_);
+    if (op_pair == ops.end()) {
+      PADDLE_THROW("Wrong elementwise op type!");
+    }
+    nvinfer1::IElementWiseLayer* layer = TRT_ENGINE_ADD_LAYER(
+        engine_, ElementWise, *const_cast<nvinfer1::ITensor*>(X),
+        *const_cast<nvinfer1::ITensor*>(Y), op_pair->second);
+    PADDLE_ENFORCE(layer != nullptr);  // fail here rather than on getOutput()
+
+    auto output_name = op_desc.Output("Out")[0];
+    engine_->SetITensor(output_name, layer->getOutput(0));
+    if (test_mode) {  // the test framework can not determine which is the
+                      // output, so place the declaration inside.
+      engine_->DeclareOutput(output_name);
+    }
+  }
+
+ protected:
+  static const std::unordered_map<std::string, nvinfer1::ElementWiseOperation>
+      ops;
+  std::string op_type_;
+};
+
+const std::unordered_map<std::string, nvinfer1::ElementWiseOperation>
+    ElementwiseTensorOpConverter::ops = {
+        {"add", nvinfer1::ElementWiseOperation::kSUM},
+        {"mul", nvinfer1::ElementWiseOperation::kPROD},
+        {"sub", nvinfer1::ElementWiseOperation::kSUB},
+        {"div", nvinfer1::ElementWiseOperation::kDIV},
+        {"min", nvinfer1::ElementWiseOperation::kMIN},
+        {"pow", nvinfer1::ElementWiseOperation::kPOW},
+        {"max", nvinfer1::ElementWiseOperation::kMAX},
+};
+
+class ElementwiseTensorAddOpConverter : public ElementwiseTensorOpConverter {
+ public:
+  ElementwiseTensorAddOpConverter() { op_type_ = "add"; }
+};
+
+class ElementwiseTensorMulOpConverter : public ElementwiseTensorOpConverter {
+ public:
+  ElementwiseTensorMulOpConverter() { op_type_ = "mul"; }
+};
+
+class ElementwiseTensorSubOpConverter : public ElementwiseTensorOpConverter {
+ public:
+  ElementwiseTensorSubOpConverter() { op_type_ = "sub"; }
+};
+
+class ElementwiseTensorDivOpConverter : public ElementwiseTensorOpConverter {
+ public:
+  ElementwiseTensorDivOpConverter() { op_type_ = "div"; }
+};
+
+class ElementwiseTensorMinOpConverter : public ElementwiseTensorOpConverter {
+ public:
+  ElementwiseTensorMinOpConverter() { op_type_ = "min"; }
+};
+
+class ElementwiseTensorMaxOpConverter : public ElementwiseTensorOpConverter {
+ public:
+  ElementwiseTensorMaxOpConverter() { op_type_ = "max"; }
+};
+
+class ElementwiseTensorPowOpConverter : public ElementwiseTensorOpConverter {
+ public:
+  ElementwiseTensorPowOpConverter() { op_type_ = "pow"; }
+};
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
+REGISTER_TRT_OP_CONVERTER(elementwise_add_weight, ElementwiseWeightOpConverter);
+
+REGISTER_TRT_OP_CONVERTER(elementwise_add_tensor,
+                          ElementwiseTensorAddOpConverter);
+REGISTER_TRT_OP_CONVERTER(elementwise_sub_tensor,
+                          ElementwiseTensorSubOpConverter);
+REGISTER_TRT_OP_CONVERTER(elementwise_div_tensor,
+                          ElementwiseTensorDivOpConverter);
+REGISTER_TRT_OP_CONVERTER(elementwise_mul_tensor,
+                          ElementwiseTensorMulOpConverter);
+REGISTER_TRT_OP_CONVERTER(elementwise_max_tensor,
+                          ElementwiseTensorMaxOpConverter);
+REGISTER_TRT_OP_CONVERTER(elementwise_min_tensor,
+                          ElementwiseTensorMinOpConverter);
+REGISTER_TRT_OP_CONVERTER(elementwise_pow_tensor,
+                          ElementwiseTensorPowOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/fc_op.cc b/paddle/fluid/inference/tensorrt/convert/fc_op.cc
index 409efac6799b6fb8d27a1343a55e7a508760868f..39fe1f609d7b94638506877fc301f19ef33ec8ac 100644
--- a/paddle/fluid/inference/tensorrt/convert/fc_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc
@@ -38,7 +38,7 @@ void Reorder2(nvinfer1::DimsHW shape, const T* idata, nvinfer1::DimsHW istrides,
 }
 // indata c * k
 // Reorder the data layout from CK to KC.
-void ReorderCKtoKC(TensorRTEngine::Weight& iweights,
+void ReorderCKtoKC(TensorRTEngine::Weight& iweights,  // NOLINT
                    TensorRTEngine::Weight* oweights) {
   int c = iweights.dims[0];
   int k = iweights.dims[1];
diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h
index 968f7eb99ce8519edaa585fd3cb642bd80cc63cc..1b6a0ad82f3ceb00cec15c28c8121adc22271b7a 100644
--- a/paddle/fluid/inference/tensorrt/convert/op_converter.h
+++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h
@@ -55,6 +55,31 @@ class OpConverter {
         it = Registry<OpConverter>::Lookup("fc");
       }
     }
+
+    if (op_desc.Type().find("elementwise") != std::string::npos) {
+      // The last three characters of the op type name the operation; the
+      // converter key is "elementwise_<op>_weight" or "elementwise_<op>_tensor".
+      static std::unordered_set<std::string> add_tensor_op_set{
+          "add", "mul", "sub", "div", "max", "min", "pow"};
+      // TODO(xingzhaolong): support mul, sub and div with a weight operand.
+      static std::unordered_set<std::string> add_weight_op_set{"add"};
+      PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1UL);
+      int op_type_len = op_desc.Type().size();
+      std::string op_type = op_desc.Type().substr(op_type_len - 3);
+      std::string Y = op_desc.Input("Y")[0];
+      if (parameters.count(Y)) {
+        PADDLE_ENFORCE(add_weight_op_set.count(op_type) > 0,
+                       "Unsupported elementwise type: " + op_type);
+        it =
+            Registry<OpConverter>::Lookup("elementwise_" + op_type + "_weight");
+      } else {
+        PADDLE_ENFORCE(add_tensor_op_set.count(op_type) > 0,
+                       "Unsupported elementwise type: " + op_type);
+        it =
+            Registry<OpConverter>::Lookup("elementwise_" + op_type + "_tensor");
+      }
+    }
+
     if (!it) {
       it = Registry<OpConverter>::Lookup(op_desc.Type());
     }
diff --git a/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f8711c6b60d74639529624c25429bc245de46479
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc
@@ -0,0 +1,57 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+TEST(conv2d_op, test) {
+  std::unordered_set<std::string> parameters({"conv2d-Y"});  // filter weight
+  framework::Scope scope;
+  TRTConvertValidation validator(5, parameters, scope, 1 << 15);
+  // Declare the op's input, its filter parameter and its output.
+  validator.DeclInputVar("conv2d-X", nvinfer1::Dims3(2, 5, 5));
+  validator.DeclParamVar("conv2d-Y", nvinfer1::Dims4(3, 2, 3, 3));
+  validator.DeclOutputVar("conv2d-Out", nvinfer1::Dims3(3, 5, 5));
+
+  // Describe the conv2d op: inputs, output, then attributes.
+  framework::OpDesc desc;
+  desc.SetType("conv2d");
+  desc.SetInput("Input", {"conv2d-X"});
+  desc.SetInput("Filter", {"conv2d-Y"});
+  desc.SetOutput("Output", {"conv2d-Out"});
+
+  const std::vector<int> strides({1, 1});
+  const std::vector<int> paddings({1, 1});
+  const std::vector<int> dilations({1, 1});
+  const int groups = 1;
+
+  desc.SetAttr("strides", strides);
+  desc.SetAttr("paddings", paddings);
+  desc.SetAttr("dilations", dilations);
+  desc.SetAttr("groups", groups);
+
+  validator.SetOp(*desc.Proto());
+
+  validator.Execute(3);
+}
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+USE_OP(conv2d);
diff --git a/paddle/fluid/inference/tensorrt/convert/test_elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/test_elementwise_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7537d02a35b66a41c158cd8eb1b1e5d4107e7d84
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/test_elementwise_op.cc
@@ -0,0 +1,73 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+TEST(elementwise_op, add_weight_test) {
+  std::unordered_set<std::string> parameters({"elementwise_add-Y"});
+  framework::Scope scope;
+  TRTConvertValidation validator(10, parameters, scope, 1 << 15);
+  validator.DeclInputVar("elementwise_add-X", nvinfer1::DimsCHW(10, 3, 3));
+  validator.DeclParamVar("elementwise_add-Y", nvinfer1::Dims3(10, 1, 1));
+  // Y is listed in `parameters`, so it is declared as a param var, not an input.
+  validator.DeclOutputVar("elementwise_add-Out", nvinfer1::DimsCHW(10, 3, 3));
+
+  // Prepare Op description
+  framework::OpDesc desc;
+  desc.SetType("elementwise_add");
+  desc.SetInput("X", {"elementwise_add-X"});
+  desc.SetInput("Y", {"elementwise_add-Y"});
+  desc.SetOutput("Out", {"elementwise_add-Out"});
+
+  int axis = 1;
+  desc.SetAttr("axis", axis);
+
+  validator.SetOp(*desc.Proto());
+
+  validator.Execute(8);
+}
+
+TEST(elementwise_op, add_tensor_test) {
+  std::unordered_set<std::string> parameters;
+  framework::Scope scope;
+  TRTConvertValidation validator(8, parameters, scope, 1 << 15);
+  validator.DeclInputVar("elementwise_add-X", nvinfer1::DimsCHW(10, 3, 3));
+  validator.DeclInputVar("elementwise_add-Y", nvinfer1::Dims3(10, 3, 3));
+  // `parameters` is empty here, so Y is declared as an ordinary input var.
+  validator.DeclOutputVar("elementwise_add-Out", nvinfer1::DimsCHW(10, 3, 3));
+
+  // Prepare Op description
+  framework::OpDesc desc;
+  desc.SetType("elementwise_add");
+  desc.SetInput("X", {"elementwise_add-X"});
+  desc.SetInput("Y", {"elementwise_add-Y"});
+  desc.SetOutput("Out", {"elementwise_add-Out"});
+
+  // "axis" is left unset: the default axis of elementwise ops is -1.
+
+  validator.SetOp(*desc.Proto());
+
+  validator.Execute(8);
+}
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+USE_OP(elementwise_add);
diff --git a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc
index 9b79f86b0edba983019bd932f52b08711ff36d41..d6651a5b244ba31a01220e6299cb2016ae61fe64 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc
@@ -25,12 +25,42 @@ TEST(OpConverter, ConvertBlock) {
   framework::ProgramDesc prog;
   auto* block = prog.MutableBlock(0);
   auto* conv2d_op = block->AppendOp();
+
+  // init trt engine
+  cudaStream_t stream_;
+  std::unique_ptr<TensorRTEngine> engine_;
+  engine_.reset(new TensorRTEngine(5, 1 << 15, &stream_));
+  engine_->InitNetwork();
+  PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0);
+
+  engine_->DeclareInput("conv2d-X", nvinfer1::DataType::kFLOAT,
+                        nvinfer1::Dims3(2, 5, 5));
+
   conv2d_op->SetType("conv2d");
+  conv2d_op->SetInput("Input", {"conv2d-X"});
+  conv2d_op->SetInput("Filter", {"conv2d-Y"});
+  conv2d_op->SetOutput("Output", {"conv2d-Out"});
 
-  OpConverter converter;
+  const std::vector<int> strides({1, 1});
+  const std::vector<int> paddings({1, 1});
+  const std::vector<int> dilations({1, 1});
+  const int groups = 1;
+
+  conv2d_op->SetAttr("strides", strides);
+  conv2d_op->SetAttr("paddings", paddings);
+  conv2d_op->SetAttr("dilations", dilations);
+  conv2d_op->SetAttr("groups", groups);
+
+  // init scope
   framework::Scope scope;
-  converter.ConvertBlock(*block->Proto(), {}, scope,
-                         nullptr /*TensorRTEngine*/);
+  std::vector<int> dim_vec = {3, 2, 3, 3};
+  auto* x = scope.Var("conv2d-Y");
+  auto* x_tensor = x->GetMutable<framework::LoDTensor>();
+  x_tensor->Resize(framework::make_ddim(dim_vec));
+
+  OpConverter converter;
+  converter.ConvertBlock(*block->Proto(), {"conv2d-Y"}, scope,
+                         engine_.get() /*TensorRTEngine*/);
 }
 
 }  // namespace tensorrt
diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h
index 39529cc2c799212f91107b1b86dd2c8c3642b6da..63c2f978f253df11100ecca83acae5eab6a0337d 100644
--- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h
+++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h
@@ -149,7 +149,7 @@ class TRTConvertValidation {
     cudaStreamSynchronize(*engine_->stream());
 
     ASSERT_FALSE(op_desc_->OutputArgumentNames().empty());
-    const size_t output_space_size = 2000;
+    const size_t output_space_size = 3000;
     for (const auto& output : op_desc_->OutputArgumentNames()) {
       std::vector<float> fluid_out;
       std::vector<float> trt_out(output_space_size);
diff --git a/paddle/fluid/operators/conv_cudnn_op.cu.cc b/paddle/fluid/operators/conv_cudnn_op.cu.cc
index b3781ded01c09edd59df09fd064b37052ad0333a..22cbf680c0670552fb014043c69fcadc56863529 100644
--- a/paddle/fluid/operators/conv_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc
@@ -20,10 +20,10 @@ limitations under the License. */
 #include "paddle/fluid/platform/cudnn_helper.h"
 #include "paddle/fluid/platform/float16.h"
 
-DEFINE_bool(cudnn_deterministic, true,
+DEFINE_bool(cudnn_deterministic, false,
             "Whether allow using an autotuning algorithm for convolution "
             "operator. The autotuning algorithm may be non-deterministic. If "
-            "false, the algorithm is deterministic.");
+            "true, the algorithm is deterministic.");
 
 namespace paddle {
 namespace operators {
@@ -272,7 +272,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
     auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     auto handle = dev_ctx.cudnn_handle();
     if (input_grad) {
-      if (FLAGS_cudnn_deterministic) {
+      if (!FLAGS_cudnn_deterministic) {
         CUDNN_ENFORCE(
             platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
                 handle, cudnn_filter_desc,
@@ -297,7 +297,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
     }
 
     if (filter_grad) {
-      if (FLAGS_cudnn_deterministic) {
+      if (!FLAGS_cudnn_deterministic) {
         CUDNN_ENFORCE(
             platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
                 handle, cudnn_input_desc, cudnn_output_grad_desc,
diff --git a/paddle/fluid/operators/conv_mkldnn_op.cc b/paddle/fluid/operators/conv_mkldnn_op.cc
index 5098bd8700e11c9a2faeba90c38ed2d9499b17cf..0511eb42a073ac305634110a71a35e501f062132 100644
--- a/paddle/fluid/operators/conv_mkldnn_op.cc
+++ b/paddle/fluid/operators/conv_mkldnn_op.cc
@@ -55,7 +55,7 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler {
 
   std::shared_ptr<mkldnn::memory> AcquireSrcMemoryFromWeightsPrimitive(
       const std::shared_ptr<mkldnn::memory> user_memory_p,
-      std::vector<mkldnn::primitive>& pipeline) {
+      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
     auto src_pd = conv_bwd_weights_pd_->src_primitive_desc();
     auto user_pd = user_memory_p->get_primitive_desc();
     return this->AcquireMemory(src_pd, user_pd, user_memory_p,
@@ -64,7 +64,7 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler {
 
   std::shared_ptr<mkldnn::memory> AcquireDiffDstMemoryFromWeightsPrimitive(
       const std::shared_ptr<mkldnn::memory> user_memory_p,
-      std::vector<mkldnn::primitive>& pipeline) {
+      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
     auto diff_dst_pd = conv_bwd_weights_pd_->diff_dst_primitive_desc();
     auto user_pd = user_memory_p->get_primitive_desc();
     return this->AcquireMemory(diff_dst_pd, user_pd, user_memory_p,
@@ -80,7 +80,7 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler {
 
   std::shared_ptr<mkldnn::memory> AcquireDiffDstMemoryFromDataPrimitive(
       const std::shared_ptr<mkldnn::memory> user_memory_p,
-      std::vector<mkldnn::primitive>& pipeline) {
+      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
     auto diff_dst_pd = conv_bwd_data_pd_->diff_dst_primitive_desc();
     auto user_pd = user_memory_p->get_primitive_desc();
     return this->AcquireMemory(diff_dst_pd, user_pd, user_memory_p,
@@ -89,7 +89,7 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler {
 
   std::shared_ptr<mkldnn::memory> AcquireWeightsMemoryFromDataPrimitive(
       const std::shared_ptr<mkldnn::memory> user_weights_memory_p,
-      std::vector<mkldnn::primitive>& pipeline) {
+      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
     auto weights_pd = conv_bwd_data_pd_->weights_primitive_desc();
     auto user_pd = user_weights_memory_p->get_primitive_desc();
     return this->AcquireMemory(weights_pd, user_pd, user_weights_memory_p,
@@ -109,7 +109,7 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler {
 
   std::shared_ptr<mkldnn::memory> AcquireSrcMemoryFromPrimitive(
       const std::shared_ptr<mkldnn::memory> user_memory_p,
-      std::vector<mkldnn::primitive>& pipeline) {
+      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
     auto src_pd = conv_pd_->src_primitive_desc();
     auto user_pd = user_memory_p->get_primitive_desc();
     return this->AcquireMemory(src_pd, user_pd, user_memory_p, "@src_mem_p",
@@ -118,7 +118,7 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler {
 
   std::shared_ptr<mkldnn::memory> AcquireWeightsMemoryFromPrimitive(
       const std::shared_ptr<mkldnn::memory> user_weights_memory_p,
-      std::vector<mkldnn::primitive>& pipeline) {
+      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
     auto user_weights_pd = user_weights_memory_p->get_primitive_desc();
     auto weights_pd = conv_pd_->weights_primitive_desc();
     return this->AcquireMemory(weights_pd, user_weights_pd,
@@ -197,12 +197,12 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler {
 
   // Generate keys for storing/retriving primitives for this operator
   // TODO(jczaja): Make hashing function more optimial
-  static std::string GetHash(memory::dims& input_dims,
-                             memory::dims& weights_dims,
-                             std::vector<int>& strides,
-                             std::vector<int>& paddings,
-                             std::vector<int>& dilations, int groups,
-                             const std::string& suffix) {
+  static std::string GetHash(memory::dims& input_dims,     // NOLINT
+                             memory::dims& weights_dims,   // NOLINT
+                             std::vector<int>& strides,    // NOLINT
+                             std::vector<int>& paddings,   // NOLINT
+                             std::vector<int>& dilations,  // NOLINT
+                             int groups, const std::string& suffix) {
     return dims2str(input_dims) + dims2str(weights_dims) + dims2str(strides) +
            dims2str(paddings) + dims2str(dilations) + std::to_string(groups) +
            suffix;
diff --git a/paddle/fluid/operators/elementwise_add_mkldnn_op.cc b/paddle/fluid/operators/elementwise_add_mkldnn_op.cc
index 1a5427b39241b666eeaf12b173ea00443bb5f6e4..c86cd57316078778e5930c9b524b931d523028d7 100644
--- a/paddle/fluid/operators/elementwise_add_mkldnn_op.cc
+++ b/paddle/fluid/operators/elementwise_add_mkldnn_op.cc
@@ -47,12 +47,12 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel<T> {
     int axis = ctx.Attr<int>("axis");
 
     auto x_dims = x->dims();
-    auto y_dims = y->dims();
+    auto y_dims_untrimed = y->dims();
     auto z_dims = z->dims();
 
     // Execute default elementwise_add operator when
     // broadcast operations need to performed.
-    if (x_dims != y_dims) {
+    if (x_dims != y_dims_untrimed) {
       auto sum_func = [](T a, T b) -> T { return a + b; };
 
       TransformFunctor<decltype(sum_func), T,
@@ -62,11 +62,11 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel<T> {
               ctx.template device_context<paddle::platform::CPUDeviceContext>(),
               sum_func);
 
-      axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
+      axis = (axis == -1 ? x_dims.size() - y_dims_untrimed.size() : axis);
       PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(),
                      "Axis should be in range [0, x_dims)");
 
-      trim_trailing_singular_dims(&y_dims);
+      auto y_dims = trim_trailing_singular_dims(y_dims_untrimed);
       axis = (y_dims.size() == 0) ? x_dims.size() : axis;
 
       int pre, n, post;
@@ -88,7 +88,7 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel<T> {
                      "Wrong layout/format set for Y tensor");
 
       std::vector<int> src_x_tz = framework::vectorize2int(x_dims);
-      std::vector<int> src_y_tz = framework::vectorize2int(y_dims);
+      std::vector<int> src_y_tz = framework::vectorize2int(y_dims_untrimed);
       std::vector<int> dst_tz = framework::vectorize2int(z_dims);
 
       std::vector<memory::primitive_desc> srcs_pd;
@@ -142,36 +142,39 @@ class EltwiseAddMKLDNNGradKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& ctx) const override {
     using Tensor = framework::Tensor;
 
-    auto* x = ctx.Input<Tensor>("X");
-    auto* y = ctx.Input<Tensor>("Y");
-    auto* out = ctx.Input<Tensor>("Out");
     auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
     auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
     auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
     int axis = ctx.Attr<int>("axis");
+    // skip out, x, y,
+    // dout length is larger or equal than dx, dy.
+    auto* out = dout;
+    auto *x = dout, *y = dout;
 
     auto set_mkldnn_format = [](Tensor* in, const Tensor* out) {
       in->set_layout(DataLayout::kMKLDNN);
       in->set_format(out->format());
     };
 
-    if (x->dims() == y->dims()) {
-      auto blas = math::GetBlas<paddle::platform::CPUDeviceContext, T>(ctx);
-      if (dx) {
-        blas.VCOPY(dout->numel(), dout->data<T>(),
-                   dx->mutable_data<T>(ctx.GetPlace()));
-        set_mkldnn_format(dx, dout);
-      }
-
-      if (dy) {
-        blas.VCOPY(dout->numel(), dout->data<T>(),
-                   dy->mutable_data<T>(ctx.GetPlace()));
-        set_mkldnn_format(dy, dout);
+    if (dx != nullptr && dy != nullptr && dx->dims() == dy->dims()) {
+      if (dx->dims() == dy->dims()) {
+        auto blas = math::GetBlas<paddle::platform::CPUDeviceContext, T>(ctx);
+        if (dx) {
+          blas.VCOPY(dout->numel(), dout->data<T>(),
+                     dx->mutable_data<T>(ctx.GetPlace()));
+          set_mkldnn_format(dx, dout);
+        }
+
+        if (dy) {
+          blas.VCOPY(dout->numel(), dout->data<T>(),
+                     dy->mutable_data<T>(ctx.GetPlace()));
+          set_mkldnn_format(dy, dout);
+        }
       }
     } else {
       // Execute default kernel when broadcast is needed
-      ElemwiseGradCompute<paddle::platform::CPUDeviceContext, T,
-                          IdentityGrad<T>, IdentityGrad<T>>(
+      ElemwiseExplicitGradCompute<paddle::platform::CPUDeviceContext, T,
+                                  IdentityGrad<T>, IdentityGrad<T>>(
           ctx, *x, *y, *out, *dout, axis, dx, dy, IdentityGrad<T>(),
           IdentityGrad<T>());
     }
diff --git a/paddle/fluid/operators/elementwise_add_op.cc b/paddle/fluid/operators/elementwise_add_op.cc
index d2c20537136fc3ac9d1bece24a2238f26215c922..3c97ac995c649ecd0d196a584240e1e7ac04f08e 100644
--- a/paddle/fluid/operators/elementwise_add_op.cc
+++ b/paddle/fluid/operators/elementwise_add_op.cc
@@ -15,7 +15,9 @@ limitations under the License. */
 #include "paddle/fluid/operators/elementwise_add_op.h"
 #include "paddle/fluid/operators/elementwise_op.h"
 namespace ops = paddle::operators;
-REGISTER_ELEMWISE_OP(elementwise_add, "Add", "Out = X + Y");
+REGISTER_ELEMWISE_GRAD_MAKER(elementwise_add, Add);
+REGISTER_ELEMWISE_EXPLICIT_OP(elementwise_add, "Add", "Out = X + Y", "Out",
+                              "X");
 REGISTER_OP_CPU_KERNEL(
     elementwise_add,
     ops::ElementwiseAddKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/paddle/fluid/operators/elementwise_add_op.h b/paddle/fluid/operators/elementwise_add_op.h
index baf04c30b17cb333fc8a6544afd6c479442f835b..5356105e2e551c0528694091608fc7585dce66d2 100644
--- a/paddle/fluid/operators/elementwise_add_op.h
+++ b/paddle/fluid/operators/elementwise_add_op.h
@@ -95,9 +95,10 @@ void default_elementwise_add_grad(const framework::ExecutionContext& ctx,
                                   framework::Tensor* dy) {
   int axis = ctx.Attr<int>("axis");
 
-  ElemwiseGradCompute<DeviceContext, T, IdentityGrad<T>, IdentityGrad<T>>(
-      ctx, *x, *y, *out, *dout, axis, dx, dy, IdentityGrad<T>(),
-      IdentityGrad<T>());
+  ElemwiseExplicitGradCompute<DeviceContext, T, IdentityGrad<T>,
+                              IdentityGrad<T>>(ctx, *x, *y, *out, *dout, axis,
+                                               dx, dy, IdentityGrad<T>(),
+                                               IdentityGrad<T>());
 }
 
 template <typename DeviceContext, typename T>
@@ -140,14 +141,15 @@ class ElementwiseAddGradKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& ctx) const override {
     using Tensor = framework::Tensor;
 
-    auto* x = ctx.Input<Tensor>("X");
-    auto* y = ctx.Input<Tensor>("Y");
-    auto* out = ctx.Input<Tensor>("Out");
     auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
     auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
     auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
+    // skip out, x, y
+    auto* out = dout;
+    auto *x = dout, *y = dout;
 
-    if (platform::is_cpu_place(ctx.GetPlace()) && (x->dims() == y->dims())) {
+    if (platform::is_cpu_place(ctx.GetPlace()) && dx != nullptr &&
+        dy != nullptr && (dx->dims() == dy->dims())) {
       elementwise_add_grad<DeviceContext, T>(ctx, x, y, out, dout, dx, dy);
     } else {
       default_elementwise_add_grad<DeviceContext, T>(ctx, x, y, out, dout, dx,
diff --git a/paddle/fluid/operators/elementwise_div_op.cc b/paddle/fluid/operators/elementwise_div_op.cc
index 824b1221e5a77c8799dc34820b7f0db180c2439e..84c8a65e5f859d276ae6d5f1a3f25c9d713a7a61 100644
--- a/paddle/fluid/operators/elementwise_div_op.cc
+++ b/paddle/fluid/operators/elementwise_div_op.cc
@@ -15,7 +15,9 @@ limitations under the License. */
 #include "paddle/fluid/operators/elementwise_div_op.h"
 #include "paddle/fluid/operators/elementwise_op.h"
 namespace ops = paddle::operators;
+
 REGISTER_ELEMWISE_OP(elementwise_div, "Div", "Out = X / Y");
+
 REGISTER_OP_CPU_KERNEL(
     elementwise_div,
     ops::ElementwiseDivKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/paddle/fluid/operators/elementwise_op.h b/paddle/fluid/operators/elementwise_op.h
index bb88970e42c194d9437609b62435f1a89e2b446b..d8a12e800ad733800c1ec333f15d31d4dcd1a3a5 100644
--- a/paddle/fluid/operators/elementwise_op.h
+++ b/paddle/fluid/operators/elementwise_op.h
@@ -78,7 +78,9 @@ class ElementwiseOpMaker : public framework::OpProtoAndCheckerMaker {
   void Make() final {
     AddInput("X", "(Tensor), The first input tensor of elementwise op.");
     AddInput("Y", "(Tensor), The second input tensor of elementwise op.");
-    AddOutput("Out", "The output of elementwise op.").Reuse("X");
+    // AddOutput("SavedShape", "(Tensor), save X, Y shape for grad to save
+    // memory.").AsIntermediate();
+    AddOutput("Out", "The output of elementwise op.");
     AddAttr<int>("axis",
                  "(int, default -1). The start dimension index "
                  "for broadcasting Y onto X.")
@@ -125,11 +127,13 @@ But the output only shares the LoD information with the input $X$.
 
 )DOC",
                                GetName(), GetEquation()));
+    SetReuse();
   }
 
  protected:
   virtual std::string GetName() const = 0;
   virtual std::string GetEquation() const = 0;
+  virtual void SetReuse() {}
 };
 
 class ElementwiseOpGrad : public framework::OperatorWithKernel {
@@ -162,8 +166,8 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel {
 
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    auto input_data_type =
-        framework::ToDataType(ctx.Input<Tensor>("X")->type());
+    auto input_data_type = framework::ToDataType(
+        ctx.Input<Tensor>(framework::GradVarName("Out"))->type());
 
 #ifdef PADDLE_WITH_MKLDNN
     if (platform::CanMKLDNNBeUsed(ctx)) {
@@ -175,9 +179,58 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel {
     return framework::OpKernelType(input_data_type, ctx.GetPlace());
   }
 };
+
+// Grad op for Add/Sub-like ops: shapes come from Out@GRAD and Y; X/Out unused.
+class ElementwiseOpExplicitGrad : public ElementwiseOpGrad {
+ public:
+  using operators::ElementwiseOpGrad::ElementwiseOpGrad;
+  using operators::ElementwiseOpGrad::GetExpectedKernelType;
+  using Tensor = framework::Tensor;
+  // Requires Out@GRAD; Input(Y) is required only when Y@GRAD is an output.
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) should not be null");
+
+    auto x_grad_name = framework::GradVarName("X");
+    if (ctx->HasOutput(x_grad_name)) {
+      auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
+      ctx->SetOutputDim(x_grad_name, out_dims);  // same shape as Out@GRAD
+    }
+    auto y_grad_name = framework::GradVarName("Y");
+    if (ctx->HasOutput(y_grad_name)) {
+      PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null");
+      auto y_dims = ctx->GetInputDim("Y");
+      ctx->SetOutputDim(y_grad_name, y_dims);  // same shape as Y
+    }
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
+// Defines <kernel_type>GradMaker: builds the <kernel_type>_grad op desc from Y
+// and Out@GRAD only, so the forward X and Out are not passed to the grad op.
+#define REGISTER_ELEMWISE_GRAD_MAKER(kernel_type, op_name)                   \
+  class kernel_type##GradMaker                                               \
+      : public paddle::framework::SingleGradOpDescMaker {                    \
+   public:                                                                   \
+    using ::paddle::framework::SingleGradOpDescMaker::SingleGradOpDescMaker; \
+                                                                             \
+   protected:                                                                \
+    std::unique_ptr<paddle::framework::OpDesc> Apply() const override {      \
+      std::unique_ptr<paddle::framework::OpDesc> op(                         \
+          new paddle::framework::OpDesc());                                  \
+      op->SetType(#kernel_type "_grad");                                     \
+      op->SetInput("Y", Input("Y"));                                         \
+      op->SetInput(::paddle::framework::GradVarName("Out"),                  \
+                   OutputGrad("Out"));                                       \
+      op->SetAttrMap(Attrs());                                               \
+      op->SetOutput(::paddle::framework::GradVarName("X"), InputGrad("X"));  \
+      op->SetOutput(::paddle::framework::GradVarName("Y"), InputGrad("Y"));  \
+      return op;                                                             \
+    }                                                                        \
+  }
+
 #define REGISTER_ELEMWISE_OP(op_type, op_name, equation)                \
   class __ElemwiseOp##op_type##Maker__                                  \
       : public ::paddle::operators::ElementwiseOpMaker {                \
@@ -190,3 +243,18 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel {
                     ::paddle::operators::ElementwiseOpInferVarType,     \
                     ::paddle::framework::DefaultGradOpDescMaker<true>); \
   REGISTER_OPERATOR(op_type##_grad, ::paddle::operators::ElementwiseOpGrad)
+
+#define REGISTER_ELEMWISE_EXPLICIT_OP(op_type, op_name, equation, ...) \
+  class __ElemwiseOp##op_type##Maker__                                 \
+      : public ::paddle::operators::ElementwiseOpMaker {               \
+   protected:                                                          \
+    std::string GetName() const override { return op_name; }           \
+    std::string GetEquation() const override { return equation; }      \
+    void SetReuse() override { Reuse(__VA_ARGS__); }                   \
+  };                                                                   \
+  REGISTER_OPERATOR(op_type, ::paddle::operators::ElementwiseOp,       \
+                    __ElemwiseOp##op_type##Maker__,                    \
+                    ::paddle::operators::ElementwiseOpInferVarType,    \
+                    op_type##GradMaker);                               \
+  REGISTER_OPERATOR(op_type##_grad,                                    \
+                    ::paddle::operators::ElementwiseOpExplicitGrad)
diff --git a/paddle/fluid/operators/elementwise_op_function.h b/paddle/fluid/operators/elementwise_op_function.h
index 8b052611f80ddf874ca48c1c58e13346528a834e..eb8272e90c32c3a0be2c0ce1bc679571af876317 100644
--- a/paddle/fluid/operators/elementwise_op_function.h
+++ b/paddle/fluid/operators/elementwise_op_function.h
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <glog/logging.h>
 #include <algorithm>
+#include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
@@ -65,17 +67,21 @@ inline void get_mid_dims(const framework::DDim& x_dims,
   }
 }
 
-inline void trim_trailing_singular_dims(framework::DDim* dims) {
+inline framework::DDim trim_trailing_singular_dims(
+    const framework::DDim& dims) {
   // Remove trailing dimensions of size 1 for y
-  auto actual_dims_size = dims->size();
+  auto actual_dims_size = dims.size();
   for (; actual_dims_size != 0; --actual_dims_size) {
-    if ((*dims)[actual_dims_size - 1] != 1) break;
+    if (dims[actual_dims_size - 1] != 1) break;
   }
-  if (actual_dims_size != dims->size()) {
-    auto actual_dims = framework::vectorize(*dims);
-    actual_dims.resize(actual_dims_size);
-    *dims = framework::make_ddim(actual_dims);
+  // Copy the kept leading dims as 64-bit values so none is narrowed to int.
+  std::vector<int64_t> trim_dims;
+  trim_dims.resize(actual_dims_size);
+  for (int i = 0; i < actual_dims_size; ++i) {
+    trim_dims[i] = dims[i];
   }
+  framework::DDim actual_dims = framework::make_ddim(trim_dims);
+  return actual_dims;
 }
 
 template <typename T, typename DeviceContext>
@@ -456,6 +462,71 @@ static void ElemwiseGradBroadcast2CUDA(cudaStream_t stream, const T* x,
 
 #endif
 
+template <typename DeviceContext, typename T, typename DX_OP, typename DY_OP>
+void ElemwiseGradComputeNoBroadcast(
+    const framework::ExecutionContext& ctx, const framework::DDim& x_dim,
+    const framework::DDim& y_dim, const framework::Tensor& x,
+    const framework::Tensor& y, const framework::Tensor& out,
+    const framework::Tensor& dout, int axis, framework::Tensor* dx,
+    framework::Tensor* dy, DX_OP dx_op, DY_OP dy_op) {  // y_dim, axis unused
+  size_t N = static_cast<size_t>(framework::product(x_dim));  // numel of x_dim
+  platform::ForRange<DeviceContext> for_range(
+      ctx.template device_context<DeviceContext>(), N);
+  for_range(ElemwiseGradNoBroadcast<T, DX_OP, DY_OP>{
+      x.data<T>(), y.data<T>(), out.data<T>(), dout.data<T>(), dx_op, dy_op,
+      dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
+      dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace())});
+}
+
+template <typename DeviceContext, typename T, typename DX_OP, typename DY_OP>
+void ElemwiseGradComputeWithBroadcast(
+    const framework::ExecutionContext& ctx, const framework::DDim& x_dim,
+    const framework::DDim& y_dim_untrimed, const framework::Tensor& x,
+    const framework::Tensor& y, const framework::Tensor& out,
+    const framework::Tensor& dout, int axis, framework::Tensor* dx,
+    framework::Tensor* dy, DX_OP dx_op, DY_OP dy_op) {
+  axis = (axis == -1 ? x_dim.size() - y_dim_untrimed.size() : axis);
+  auto y_dim = trim_trailing_singular_dims(y_dim_untrimed);  // drop trailing 1s
+  axis = (y_dim.size() == 0) ? x_dim.size() : axis;  // nothing left of Y
+  // pre/n/post are filled by get_mid_dims from x_dim, y_dim and axis.
+  int pre, n, post;
+  get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post);
+  if (post == 1) {
+    int h = pre;
+    int w = n;
+    if (platform::is_gpu_place(ctx.GetPlace())) {
+#ifdef __NVCC__  // GPU call is compiled only under nvcc
+      ElemwiseGradBroadcast1CUDA(
+          ctx.template device_context<DeviceContext>().stream(), x.data<T>(),
+          y.data<T>(), out.data<T>(), dout.data<T>(), h, w, dx_op, dy_op,
+          dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
+          dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace()));
+#endif
+    } else {
+      ElemwiseGradBroadcast1CPU(
+          x.data<T>(), y.data<T>(), out.data<T>(), dout.data<T>(), h, w, dx_op,
+          dy_op, dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
+          dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace()));
+    }
+  } else {  // post != 1
+    if (platform::is_gpu_place(ctx.GetPlace())) {
+#ifdef __NVCC__  // GPU call is compiled only under nvcc
+      ElemwiseGradBroadcast2CUDA(
+          ctx.template device_context<DeviceContext>().stream(), x.data<T>(),
+          y.data<T>(), out.data<T>(), dout.data<T>(), pre, n, post, dx_op,
+          dy_op, dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
+          dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace()));
+#endif
+    } else {
+      ElemwiseGradBroadcast2CPU(
+          x.data<T>(), y.data<T>(), out.data<T>(), dout.data<T>(), pre, n, post,
+          dx_op, dy_op,
+          dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
+          dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace()));
+    }
+  }
+}
+
 template <typename DeviceContext, typename T, typename DX_OP, typename DY_OP>
 void ElemwiseGradCompute(const framework::ExecutionContext& ctx,
                          const framework::Tensor& x, const framework::Tensor& y,
@@ -463,63 +534,50 @@ void ElemwiseGradCompute(const framework::ExecutionContext& ctx,
                          const framework::Tensor& dout, int axis,
                          framework::Tensor* dx, framework::Tensor* dy,
                          DX_OP dx_op, DY_OP dy_op) {
+  const framework::DDim x_dim = x.dims();
+  const framework::DDim y_dim = y.dims();
   if (x.dims() == y.dims()) {
-    size_t N = static_cast<size_t>(framework::product(x.dims()));
-    platform::ForRange<DeviceContext> for_range(
-        ctx.template device_context<DeviceContext>(), N);
-    for_range(ElemwiseGradNoBroadcast<T, DX_OP, DY_OP>{
-        x.data<T>(), y.data<T>(), out.data<T>(), dout.data<T>(), dx_op, dy_op,
-        dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
-        dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace())});
+    ElemwiseGradComputeNoBroadcast<DeviceContext, T, DX_OP, DY_OP>(
+        ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op);
   } else {  // Y is a scalar
-    auto x_dim = x.dims();
-    auto y_dim = y.dims();
-
-    axis = (axis == -1 ? x_dim.size() - y_dim.size() : axis);
-    trim_trailing_singular_dims(&y_dim);
-    axis = (y_dim.size() == 0) ? x_dim.size() : axis;
-
-    int pre, n, post;
-    get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post);
-    if (post == 1) {
-      int h = pre;
-      int w = n;
-      if (platform::is_gpu_place(ctx.GetPlace())) {
-#ifdef __NVCC__
-        ElemwiseGradBroadcast1CUDA(
-            ctx.template device_context<DeviceContext>().stream(), x.data<T>(),
-            y.data<T>(), out.data<T>(), dout.data<T>(), h, w, dx_op, dy_op,
-            dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
-            dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace()));
-#endif
-      } else {
-        ElemwiseGradBroadcast1CPU(
-            x.data<T>(), y.data<T>(), out.data<T>(), dout.data<T>(), h, w,
-            dx_op, dy_op,
-            dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
-            dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace()));
-      }
-    } else {
-      if (platform::is_gpu_place(ctx.GetPlace())) {
-#ifdef __NVCC__
-        ElemwiseGradBroadcast2CUDA(
-            ctx.template device_context<DeviceContext>().stream(), x.data<T>(),
-            y.data<T>(), out.data<T>(), dout.data<T>(), pre, n, post, dx_op,
-            dy_op,
-            dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
-            dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace()));
-#endif
-      } else {
-        ElemwiseGradBroadcast2CPU(
-            x.data<T>(), y.data<T>(), out.data<T>(), dout.data<T>(), pre, n,
-            post, dx_op, dy_op,
-            dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
-            dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace()));
-      }
+    ElemwiseGradComputeWithBroadcast<DeviceContext, T, DX_OP, DY_OP>(
+        ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op);
+  }
+}
+
+// NOTE(dzhwinter): Only used in elementwise_add, elementwise_sub.
+// The explicit-gradient form lets the grad op drop X, Y and Out as inputs:
+// callers pass dout as a stand-in for x, y and out so the shared elementwise
+// code can be reused. Shapes are taken from dout and dy, never from x or y.
+template <typename DeviceContext, typename T, typename DX_OP, typename DY_OP>
+void ElemwiseExplicitGradCompute(const framework::ExecutionContext& ctx,
+                                 const framework::Tensor& x,
+                                 const framework::Tensor& y,
+                                 const framework::Tensor& out,
+                                 const framework::Tensor& dout, int axis,
+                                 framework::Tensor* dx, framework::Tensor* dy,
+                                 DX_OP dx_op, DY_OP dy_op) {
+  if (dy == nullptr) {
+    const framework::DDim dx_dims = dout.dims();
+    auto dy_dims = dx_dims;
+    ElemwiseGradComputeNoBroadcast<DeviceContext, T, DX_OP, DY_OP>(
+        ctx, dx_dims, dy_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op);
+  } else {
+    if (dout.dims() == dy->dims()) {
+      const framework::DDim dx_dims = dout.dims();
+      const framework::DDim dy_dims = dy->dims();
+      ElemwiseGradComputeNoBroadcast<DeviceContext, T, DX_OP, DY_OP>(
+          ctx, dx_dims, dy_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op);
+    } else {  // dims differ: dy takes the broadcast path
+      auto dx_dims = dout.dims();
+      const framework::DDim dy_dims = dy->dims();
+      ElemwiseGradComputeWithBroadcast<DeviceContext, T, DX_OP, DY_OP>(
+          ctx, dx_dims, dy_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op);
     }
   }
 }
 
+// Deprecated
 template <typename DeviceContext, typename T, typename functor,
           typename broadcastfunctor, typename broadcast2functor>
 void ElementwiseGradCompute(const framework::ExecutionContext& ctx,
@@ -547,7 +605,7 @@ void ElementwiseGradCompute(const framework::ExecutionContext& ctx,
   }
 
   axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
-  trim_trailing_singular_dims(&y_dims);
+  y_dims = trim_trailing_singular_dims(y_dims);
   axis = (y_dims.size() == 0) ? x_dims.size() : axis;
 
   int pre, n, post;
@@ -574,19 +632,19 @@ void ElementwiseComputeEx(const framework::ExecutionContext& ctx,
       x, y, z, ctx.template device_context<DeviceContext>(), func);
 
   auto x_dims = x->dims();
-  auto y_dims = y->dims();
-  PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(),
+  auto y_dims_untrimed = y->dims();
+  PADDLE_ENFORCE_GE(x_dims.size(), y_dims_untrimed.size(),
                     "Rank of first input must >= rank of second input.");
 
-  if (x_dims == y_dims) {
+  if (x_dims == y_dims_untrimed) {
     functor.Run();
     return;
   }
 
-  axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
+  axis = (axis == -1 ? x_dims.size() - y_dims_untrimed.size() : axis);
   PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(),
                  "Axis should be in range [0, x_dims)");
-  trim_trailing_singular_dims(&y_dims);
+  auto y_dims = trim_trailing_singular_dims(y_dims_untrimed);
   axis = (y_dims.size() == 0) ? x_dims.size() : axis;
 
   int pre, n, post;
diff --git a/paddle/fluid/operators/elementwise_sub_op.cc b/paddle/fluid/operators/elementwise_sub_op.cc
index a7562b166b373ee2a8c9b6f379431d88d3e45fcb..b7224261e6a7ca82dff92a25f5fe8818c08e676d 100644
--- a/paddle/fluid/operators/elementwise_sub_op.cc
+++ b/paddle/fluid/operators/elementwise_sub_op.cc
@@ -15,7 +15,10 @@ limitations under the License. */
 #include "paddle/fluid/operators/elementwise_sub_op.h"
 #include "paddle/fluid/operators/elementwise_op.h"
 namespace ops = paddle::operators;
-REGISTER_ELEMWISE_OP(elementwise_sub, "Sub", "Out = X - Y");
+REGISTER_ELEMWISE_GRAD_MAKER(elementwise_sub, Sub);
+REGISTER_ELEMWISE_EXPLICIT_OP(elementwise_sub, "Sub", "Out = X - Y", "Out",
+                              "X");
+
 REGISTER_OP_CPU_KERNEL(
     elementwise_sub,
     ops::ElementwiseSubKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/paddle/fluid/operators/elementwise_sub_op.h b/paddle/fluid/operators/elementwise_sub_op.h
index fe088b8203722a43b9aba7be3878b8f4ca68ba12..11c7e3fe628001f095836a788f2bcc7c4ee7ad4b 100644
--- a/paddle/fluid/operators/elementwise_sub_op.h
+++ b/paddle/fluid/operators/elementwise_sub_op.h
@@ -4,7 +4,7 @@ Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
 
-    http://www.apache.org/licenses/LICENSE-2.0
+   http://www.apache.org/licenses/LICENSE-2.0
 
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
@@ -55,14 +55,15 @@ class ElementwiseSubGradKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& ctx) const override {
     using Tensor = framework::Tensor;
 
-    auto* x = ctx.Input<Tensor>("X");
-    auto* y = ctx.Input<Tensor>("Y");
-    auto* out = ctx.Input<Tensor>("Out");
     auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
     auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
     auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
     int axis = ctx.Attr<int>("axis");
-    ElemwiseGradCompute<DeviceContext, T, SubGradDX<T>, SubGradDY<T>>(
+    // The grad op does not read X, Y or Out; dout stands in for all three.
+    auto* out = dout;
+    auto *x = dout, *y = dout;
+
+    ElemwiseExplicitGradCompute<DeviceContext, T, SubGradDX<T>, SubGradDY<T>>(
         ctx, *x, *y, *out, *dout, axis, dx, dy, SubGradDX<T>(), SubGradDY<T>());
   }
 };
diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc
index 31a7458f637921c290fc71ac748143867b4aae19..fefc7125b4de7274589670d29be4511469d5064a 100644
--- a/paddle/fluid/operators/softmax_op.cc
+++ b/paddle/fluid/operators/softmax_op.cc
@@ -137,7 +137,8 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel {
                       ctx->GetInputDim(framework::GradVarName("Out")),
                       "Input(Out) and its gradients should have a same shape.");
 
-    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+    ctx->SetOutputDim(framework::GradVarName("X"),
+                      ctx->GetInputDim(framework::GradVarName("Out")));
   }
 
  protected:
@@ -160,8 +161,8 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel {
       layout_ = framework::DataLayout::kMKLDNN;
     }
 #endif
-    auto input_data_type =
-        framework::ToDataType(ctx.Input<Tensor>("X")->type());
+    auto input_data_type = framework::ToDataType(
+        ctx.Input<Tensor>(framework::GradVarName("Out"))->type());
     if (input_data_type == framework::proto::VarType::FP16) {
       PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
                      "float16 can only be used on GPU place");
@@ -172,13 +173,31 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel {
   }
 };
 
+// Grad-op maker for softmax: softmax_grad is fed only Out and Out@GRAD (the
+// forward input X is not passed) and produces X@GRAD.
+class SoftmaxOpGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    // Own the desc from the start so it is freed if a setter below throws.
+    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
+    op->SetType("softmax_grad");
+    op->SetInput("Out", Output("Out"));
+    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    op->SetAttrMap(Attrs());
+    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    return op;
+  }
+};
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
 
 REGISTER_OPERATOR(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
+                  ops::SoftmaxOpGradMaker);
 REGISTER_OPERATOR(softmax_grad, ops::SoftmaxOpGrad);
 REGISTER_OP_CPU_KERNEL(
     softmax, ops::SoftmaxKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index 0f334b2892d77b836728cf79898d1832e90e7c00..a8bc16f1b5b9b624e88e355d8ce4741fcec34bc3 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -534,7 +534,7 @@ EOF
         make -j `nproc` inference_lib_dist
         cd ${PADDLE_ROOT}/build
         cp -r fluid_install_dir fluid
-        tar -cf fluid.tgz fluid
+        tar -czf fluid.tgz fluid
       fi
 }
 
diff --git a/python/paddle/batch.py b/python/paddle/batch.py
index d48c54fcbb66487617b1946bc69724870c8f879c..008509660739d61245526278735064472b8b06dd 100644
--- a/python/paddle/batch.py
+++ b/python/paddle/batch.py
@@ -40,4 +40,10 @@ def batch(reader, batch_size, drop_last=False):
         if drop_last == False and len(b) != 0:
             yield b
 
+    # Batch size check
+    batch_size = int(batch_size)
+    if batch_size <= 0:
+        raise ValueError("batch_size should be a positive integeral value, "
+                         "but got batch_size={}".format(batch_size))
+
     return batch_reader
diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py
index 3ef4afa691b1dfba07fb132753f380727bb4f3ae..60a12686f8ff43f5ee7e30650a208296963bda3d 100644
--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -37,6 +37,7 @@ __all__ = [
 __auto__ = [
     'iou_similarity',
     'box_coder',
+    'polygon_box_transform',
 ]
 
 __all__ += __auto__
diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py
index 9e97ec9a6f55680a2eb44ad712ac002df4fecda5..01db8645b3aff77371f01c3dec51c85f99065552 100644
--- a/python/paddle/fluid/layers/ops.py
+++ b/python/paddle/fluid/layers/ops.py
@@ -66,9 +66,7 @@ __all__ = [
     'scatter',
     'sum',
     'slice',
-    'polygon_box_transform',
     'shape',
-    'iou_similarity',
     'maxout',
 ] + __activations__
 
diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py
index 10028a8c6e33edcea27650d925ca7378b770f143..3bbd11d9836a62cdf9f2a84fc75e933001e12159 100644
--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
@@ -121,7 +121,7 @@ class ParallelExecutor(object):
             else:
                 cpu_num = int(
                     os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
-                exec_strategy.num_threads = cpu_num
+                exec_strategy.num_threads = cpu_num * 2
 
         if build_strategy is None:
             build_strategy = BuildStrategy()
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 43f68ff4592df6757691b06db52cf5e0e2ebc6d7..c8e881a672ad25654bd28604abfafc2c569af7ca 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -49,6 +49,7 @@ list(REMOVE_ITEM TEST_OPS test_dist_train)
 list(REMOVE_ITEM TEST_OPS test_parallel_executor_crf)
 list(REMOVE_ITEM TEST_OPS test_parallel_executor_fetch_feed)
 list(REMOVE_ITEM TEST_OPS test_dist_se_resnext)
+list(REMOVE_ITEM TEST_OPS test_dist_transformer)
 foreach(TEST_OP ${TEST_OPS})
     py_test_modules(${TEST_OP} MODULES ${TEST_OP})
 endforeach(TEST_OP)
@@ -61,4 +62,5 @@ if(WITH_DISTRIBUTE)
 endif()
 py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL)
 py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL)
+py_test_modules(test_dist_transformer MODULES test_dist_transformer SERIAL)
 py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext SERIAL)
diff --git a/python/paddle/fluid/tests/unittests/dist_transformer.py b/python/paddle/fluid/tests/unittests/dist_transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..ee8020a73546cb9037e9dc4be589c62bb1b6b937
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/dist_transformer.py
@@ -0,0 +1,280 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import argparse
+import time
+import math
+
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid import core
+import os
+import sys
+import transformer_model
+import paddle.dataset.wmt16 as wmt16
+
+# Fix seed for test
+fluid.default_startup_program().random_seed = 1
+fluid.default_main_program().random_seed = 1
+
+WMT16_RECORDIO_FILE = "/tmp/wmt16.recordio"
+
+
+class ModelHyperParams(object):
+    """Hyper-parameters of the Transformer model built by this test."""
+    # Dictionary size for source and target language. This model directly uses
+    # paddle.dataset.wmt16, in which the <bos>, <eos> and <unk> tokens have
+    # already been added but the <pad> token has not. Transformer requires the
+    # sequences in a mini-batch to be padded to the same length, so a <pad>
+    # token is added to the original dictionary of paddle.dataset.wmt16.
+
+    # size of source word dictionary.
+    src_vocab_size = 10000
+    # index for <pad> token in source language.
+    src_pad_idx = src_vocab_size
+
+    # size of target word dictionary.
+    trg_vocab_size = 10000
+    # index for <pad> token in target language.
+    trg_pad_idx = trg_vocab_size
+
+    # position value corresponding to the <pad> token.
+    pos_pad_idx = 0
+
+    # max length of sequences. Callers add 1 to it to include the position
+    # padding token for position encoding.
+    max_length = 50
+
+    # the dimension for word embeddings, which is also the last dimension of
+    # the input and output of multi-head attention, position-wise feed-forward
+    # networks, encoder and decoder.
+    d_model = 512
+    # size of the hidden layer in position-wise feed-forward networks.
+    d_inner_hid = 1024
+    # the dimension that keys are projected to for dot-product attention.
+    d_key = 64
+    # the dimension that values are projected to for dot-product attention.
+    d_value = 64
+    # number of head used in multi-head attention.
+    n_head = 8
+    # number of sub-layers to be stacked in the encoder and decoder.
+    n_layer = 6
+    # dropout rate used by all dropout layers.
+    dropout = 0.1
+
+
+def prepare_batch_input(insts, src_pad_idx, trg_pad_idx, n_head):
+    """
+    Pad the instances to the max sequence length in batch, and generate the
+    corresponding position data and attention bias. Returns a list of numpy
+    arrays: words, positions, attention biases, label words and label weights.
+    """
+
+    def __pad_batch_data(insts,
+                         pad_idx,
+                         is_target=False,
+                         return_pos=True,
+                         return_attn_bias=True,
+                         return_max_len=True):
+        """
+        Pad the instances to the max sequence length in batch, and generate the
+        corresponding position data and attention bias.
+        """
+        return_list = []
+        max_len = max(len(inst) for inst in insts)
+        inst_data = np.array(
+            [inst + [pad_idx] * (max_len - len(inst)) for inst in insts])
+        return_list += [inst_data.astype("int64").reshape([-1, 1])]
+        if return_pos:
+            inst_pos = np.array([[
+                pos_i + 1 if w_i != pad_idx else 0
+                for pos_i, w_i in enumerate(inst)
+            ] for inst in inst_data])
+            return_list += [inst_pos.astype("int64").reshape([-1, 1])]
+        if return_attn_bias:
+            if is_target:
+                # This is used to avoid attention on paddings and subsequent
+                # words.
+                slf_attn_bias_data = np.ones((inst_data.shape[0], max_len,
+                                              max_len))
+                slf_attn_bias_data = np.triu(slf_attn_bias_data, 1).reshape(
+                    [-1, 1, max_len, max_len])
+                slf_attn_bias_data = np.tile(slf_attn_bias_data,
+                                             [1, n_head, 1, 1]) * [-1e9]
+            else:
+                # This is used to avoid attention on paddings.
+                slf_attn_bias_data = np.array([[0] * len(inst) + [-1e9] *
+                                               (max_len - len(inst))
+                                               for inst in insts])
+                slf_attn_bias_data = np.tile(
+                    slf_attn_bias_data.reshape([-1, 1, 1, max_len]),
+                    [1, n_head, max_len, 1])
+            return_list += [slf_attn_bias_data.astype("float32")]
+        if return_max_len:
+            return_list += [max_len]
+        return return_list if len(return_list) > 1 else return_list[0]
+
+    src_word, src_pos, src_slf_attn_bias, src_max_len = __pad_batch_data(
+        [inst[0] for inst in insts], src_pad_idx, is_target=False)
+    trg_word, trg_pos, trg_slf_attn_bias, trg_max_len = __pad_batch_data(
+        [inst[1] for inst in insts], trg_pad_idx, is_target=True)
+    # One row of the source padding bias, repeated for each target position.
+    trg_src_attn_bias = np.tile(src_slf_attn_bias[:, :, ::src_max_len, :],
+                                [1, 1, trg_max_len, 1]).astype("float32")
+    lbl_word = __pad_batch_data([inst[2] for inst in insts], trg_pad_idx, False,
+                                False, False, False)
+    lbl_weight = (lbl_word != trg_pad_idx).astype("float32").reshape([-1, 1])
+
+    return [
+        src_word, src_pos, trg_word, trg_pos, src_slf_attn_bias,
+        trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight
+    ]
+
+
+def transformer(use_feed):
+    """Build the test Transformer network sized by ModelHyperParams."""
+    assert not use_feed, "transfomer doesn't support feed yet"
+    hp = ModelHyperParams
+    # The +1 on the vocabulary sizes and max length makes room for <pad>.
+    return transformer_model.transformer(
+        hp.src_vocab_size + 1, hp.trg_vocab_size + 1, hp.max_length + 1,
+        hp.n_layer, hp.n_head, hp.d_key, hp.d_value, hp.d_model,
+        hp.d_inner_hid, hp.dropout, hp.src_pad_idx, hp.trg_pad_idx,
+        hp.pos_pad_idx)
+
+
+def get_model():
+    """Build the network, attach an Adam optimizer, and return the loss."""
+    loss = transformer(use_feed=False)
+    fluid.optimizer.Adam().minimize(loss)
+    return loss
+
+
+def get_transpiler(trainer_id, main_program, pserver_endpoints, trainers):
+    """Return a DistributeTranspiler that has transpiled main_program."""
+    transpiler = fluid.DistributeTranspiler()
+    transpiler.transpile(
+        trainer_id=trainer_id,
+        program=main_program,
+        pservers=pserver_endpoints, trainers=trainers)
+    return transpiler
+
+
+class DistTransformer2x2(object):
+    """Runs the test Transformer as a parameter server or as a trainer."""
+    def run_pserver(self, pserver_endpoints, trainers, current_endpoint,
+                    trainer_id):
+        """Transpile the model, then run current_endpoint's pserver program."""
+        get_model()
+        t = get_transpiler(trainer_id, fluid.default_main_program(),
+                           pserver_endpoints, trainers)
+        pserver_prog = t.get_pserver_program(current_endpoint)
+        startup_prog = t.get_startup_program(current_endpoint, pserver_prog)
+        place = fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        exe.run(startup_prog)
+        exe.run(pserver_prog)
+
+    def _wait_ps_ready(self, pid):
+        """Poll every 3 seconds until /tmp/paddle.<pid>.port exists."""
+        retry_times = 20
+        while True:
+            assert retry_times >= 0, "wait ps ready failed"
+            time.sleep(3)
+            print("waiting ps ready: ", pid)
+            try:
+                # listen_and_serv_op creates a file holding the listen port
+                # under /tmp once it is ready to process RPC calls.
+                os.stat("/tmp/paddle.%d.port" % pid)
+                return
+            except os.error:
+                retry_times -= 1
+
+    def run_trainer(self, place, endpoints, trainer_id, trainers, is_dist=True):
+        """Run seven steps on the device of `place`; print first/last loss."""
+        avg_cost = get_model()
+        if is_dist:
+            t = get_transpiler(trainer_id, fluid.default_main_program(),
+                               endpoints, trainers)
+            trainer_prog = t.get_trainer_program()
+        else:
+            trainer_prog = fluid.default_main_program()
+        startup_exe = fluid.Executor(place)
+        startup_exe.run(fluid.default_startup_program())
+
+        strategy = fluid.ExecutionStrategy()
+        strategy.num_threads = 1
+        strategy.allow_op_delay = False
+        exe = fluid.ParallelExecutor(
+            use_cuda=isinstance(place, fluid.CUDAPlace),
+            loss_name=avg_cost.name, exec_strategy=strategy)
+        first_loss, = exe.run(fetch_list=[avg_cost.name])
+        print(first_loss)
+        for _ in range(5):
+            exe.run(fetch_list=[avg_cost.name])
+        last_loss, = exe.run(fetch_list=[avg_cost.name])
+        print(last_loss)
+
+
+def main(role="pserver",
+         endpoints="127.0.0.1:9123",
+         trainer_id=0,
+         current_endpoint="127.0.0.1:9123",
+         trainers=1,
+         is_dist=True):
+    """Dump padded WMT16 batches to WMT16_RECORDIO_FILE, then run `role`."""
+    reader = paddle.batch(
+        wmt16.train(ModelHyperParams.src_vocab_size,
+                    ModelHyperParams.trg_vocab_size),
+        batch_size=transformer_model.batch_size)
+
+    with fluid.recordio_writer.create_recordio_writer(
+            WMT16_RECORDIO_FILE) as writer:
+        for batch in reader():
+            for tensor in prepare_batch_input(
+                    batch, ModelHyperParams.src_pad_idx,
+                    ModelHyperParams.trg_pad_idx, ModelHyperParams.n_head):
+                t = fluid.LoDTensor()
+                t.set(tensor, fluid.CPUPlace())
+                writer.append_tensor(t)
+            writer.complete_append_tensor()
+
+    model = DistTransformer2x2()
+    if role == "pserver":
+        model.run_pserver(endpoints, trainers, current_endpoint, trainer_id)
+    else:
+        p = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
+        model.run_trainer(p, endpoints, trainer_id, trainers, is_dist)
+
+
+if __name__ == "__main__":
+    if len(sys.argv) != 7:
+        # Exit with the usage text instead of failing on sys.argv[...] below.
+        sys.exit(
+            "Usage: python dist_transformer.py [pserver/trainer] [endpoints] [trainer_id] [current_endpoint] [trainers] [is_dist]"
+        )
+    role = sys.argv[1]
+    endpoints = sys.argv[2]
+    trainer_id = int(sys.argv[3])
+    current_endpoint = sys.argv[4]
+    trainers = int(sys.argv[5])
+    is_dist = sys.argv[6] == "TRUE"
+    main(
+        role=role, endpoints=endpoints,
+        trainer_id=trainer_id,
+        current_endpoint=current_endpoint,
+        trainers=trainers,
+        is_dist=is_dist)
diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py
index 82b5e7cf0b3633eb04ab97c5300b1926b9d47cb6..2ddfd47fe0c33b0e9771fe6f502b90eb77161100 100644
--- a/python/paddle/fluid/tests/unittests/op_test.py
+++ b/python/paddle/fluid/tests/unittests/op_test.py
@@ -66,6 +66,10 @@ def get_numeric_gradient(place,
         tensor_to_check_dtype = np.float32
     elif tensor_to_check_dtype == core.VarDesc.VarType.FP64:
         tensor_to_check_dtype = np.float64
+    elif tensor_to_check_dtype == core.VarDesc.VarType.FP16:
+        tensor_to_check_dtype = np.float16
+        # cast delta to np.float16; it auto-converts to float32/float64
+        delta = np.array(delta).astype(np.float16)
     else:
         raise ValueError("Not supported data type " + str(
             tensor_to_check_dtype))
@@ -73,13 +77,24 @@ def get_numeric_gradient(place,
     gradient_flat = np.zeros(shape=(tensor_size, ), dtype=tensor_to_check_dtype)
 
     def __get_elem__(tensor, i):
-        if tensor_to_check_dtype == np.float32:
+        if tensor_to_check_dtype == np.float16:
+            numpy_tensor = np.array(tensor).astype(np.float16)
+            numpy_tensor = numpy_tensor.flatten()
+            return numpy_tensor[i]
+        elif tensor_to_check_dtype == np.float32:
             return tensor._get_float_element(i)
         else:
             return tensor._get_double_element(i)
 
     def __set_elem__(tensor, i, e):
-        if tensor_to_check_dtype == np.float32:
+        if tensor_to_check_dtype == np.float16:
+            numpy_tensor = np.array(tensor).astype(np.float16)
+            shape = numpy_tensor.shape
+            numpy_tensor = numpy_tensor.flatten()
+            numpy_tensor[i] = e
+            numpy_tensor = numpy_tensor.reshape(shape).view(np.uint16)
+            tensor.set(numpy_tensor, place)
+        elif tensor_to_check_dtype == np.float32:
             tensor._set_float_element(i, e)
         else:
             tensor._set_double_element(i, e)
@@ -133,6 +148,11 @@ class OpTest(unittest.TestCase):
         if not self.call_once:
             self.call_once = True
             self.dtype = data_type
+            # See the comment of np_dtype_to_fluid_dtype.
+            # Inputs typed uint16 are assumed to hold float16 data, so
+            # record float16 as the dtype used for the LoDTensor.
+            if self.dtype == np.uint16:
+                self.dtype = np.float16
 
     def infer_dtype_from_inputs_outputs(self, inputs, outputs):
         def infer_dtype(numpy_dict):
@@ -161,19 +181,25 @@ class OpTest(unittest.TestCase):
                 for name, np_value in self.inputs[var_name]:
                     tensor = core.LoDTensor()
                     if isinstance(np_value, tuple):
-                        tensor.set(np_value[0], place)
+                        tensor.set(
+                            OpTest.np_value_to_fluid_value(np_value[0]), place)
                         tensor.set_recursive_sequence_lengths(np_value[1])
                     else:
-                        tensor.set(np_value, place)
+                        tensor.set(
+                            OpTest.np_value_to_fluid_value(np_value), place)
                     feed_map[name] = tensor
             else:
                 tensor = core.LoDTensor()
                 if isinstance(self.inputs[var_name], tuple):
-                    tensor.set(self.inputs[var_name][0], place)
+                    tensor.set(
+                        OpTest.np_value_to_fluid_value(self.inputs[var_name][
+                            0]), place)
                     tensor.set_recursive_sequence_lengths(self.inputs[var_name][
                         1])
                 else:
-                    tensor.set(self.inputs[var_name], place)
+                    tensor.set(
+                        OpTest.np_value_to_fluid_value(self.inputs[var_name]),
+                        place)
                 feed_map[var_name] = tensor
 
         return feed_map
@@ -307,13 +333,22 @@ class OpTest(unittest.TestCase):
                     np.allclose(
                         actual_t, expect_t, atol=atol),
                     "Output (" + out_name + ") has diff at " + str(place) +
-                    str(actual_t) + "\n" + str(expect_t))
+                    "\nExpect " + str(expect_t) + "\n" + "But Got" +
+                    str(actual_t))
                 if isinstance(expect, tuple):
                     self.assertListEqual(actual.recursive_sequence_lengths(),
                                          expect[1], "Output (" + out_name +
                                          ") has different lod at " + str(place))
 
     def _get_places(self):
+        if self.dtype == np.float16:
+            # float16 is only run on a CUDA device that supports it; with no
+            # such device there is no place to test on.
+            if core.is_compiled_with_cuda() and core.op_support_gpu(
+                    self.op_type) and core.is_float16_supported(
+                        core.CUDAPlace(0)):
+                return [core.CUDAPlace(0)]
+            return []
         places = [fluid.CPUPlace()]
         if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type):
             places.append(core.CUDAPlace(0))
@@ -344,9 +379,9 @@ class OpTest(unittest.TestCase):
             def err_msg():
                 offset = np.argmax(diff_mat > max_relative_error)
                 return ("%s Variable %s max gradient diff %f over limit %f, "
-                        "the first error element is %d, %f, %f") % (
-                            msg_prefix, name, max_diff, max_relative_error,
-                            offset, a.flatten()[offset], b.flatten()[offset])
+                        "the first error element is %d, expected %f, but got %f"
+                        ) % (msg_prefix, name, max_diff, max_relative_error,
+                             offset, a.flatten()[offset], b.flatten()[offset])
 
             self.assertLessEqual(max_diff, max_relative_error, err_msg())
 
@@ -435,6 +470,21 @@ class OpTest(unittest.TestCase):
             input.dtype = np.uint16
         return input
 
+    @staticmethod
+    def fluid_dtype_to_np_dtype(self, dtype):
+        """Map np.uint16 back to np.float16; other dtypes pass through.
+        NOTE(review): declared @staticmethod yet takes `self` - confirm intent.
+        """
+        if dtype == np.uint16:
+            dtype = np.float16
+        return dtype
+
+    @staticmethod
+    def np_value_to_fluid_value(input):
+        if input.dtype == np.float16:
+            input = input.view(np.uint16)  # same bytes, viewed as uint16
+        return input
+
     def _get_gradient(self,
                       input_to_check,
                       place,
@@ -457,7 +507,7 @@ class OpTest(unittest.TestCase):
             if isinstance(place, fluid.CUDAPlace(0)):
                 use_cuda = True
             executor = fluid.ParallelExecutor(
-                use_cuda=use_cuda, loss_name=loss.name, main_program=program)
+                use_cuda=use_cuda, loss_name=loss.name, main_program=prog)
         else:
             executor = Executor(place)
         return map(np.array,
diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py
new file mode 100644
index 0000000000000000000000000000000000000000..1aaab6f906ef6482bc515bb3c42d82431902e1d8
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_dist_base.py
@@ -0,0 +1,137 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import time
+
+import unittest
+import os
+import sys
+import signal
+import subprocess
+
+
+class TestDistBase(unittest.TestCase):
+    def setUp(self):
+        self._trainers = 2
+        self._pservers = 2
+        self._ps_endpoints = "127.0.0.1:9123,127.0.0.1:9124"
+        self._python_interp = "python"
+
+    def start_pserver(self, model_file):
+        ps0_ep, ps1_ep = self._ps_endpoints.split(",")
+        ps0_cmd = "%s %s pserver %s 0 %s %d TRUE" % \
+            (self._python_interp, model_file, self._ps_endpoints, ps0_ep,
+             self._trainers)
+        ps1_cmd = "%s %s pserver %s 0 %s %d TRUE" % \
+            (self._python_interp, model_file, self._ps_endpoints, ps1_ep,
+             self._trainers)
+
+        ps0_proc = subprocess.Popen(
+            ps0_cmd.split(" "), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        ps1_proc = subprocess.Popen(
+            ps1_cmd.split(" "), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        return ps0_proc, ps1_proc
+
+    def _wait_ps_ready(self, pid):
+        retry_times = 50
+        while True:
+            assert retry_times >= 0, "wait ps ready failed"
+            time.sleep(3)
+            try:
+                # listen_and_serv_op touches a file holding the listen port under
+                # /tmp once it is ready to process RPC calls, so poll for that file.
+                os.stat("/tmp/paddle.%d.port" % pid)
+                return
+            except os.error as e:
+                sys.stderr.write('waiting for pserver: %s, left retry %d\n' %
+                                 (e, retry_times))
+                retry_times -= 1
+
+    def check_with_place(self, model_file, delta=1e-3):
+        """Train model_file locally and distributed, then compare the losses.
+
+        delta is the tolerance for the first and last reported loss.
+        """
+        # *ATTENTION* THIS TEST NEEDS AT LEAST 2 GPUS TO RUN
+        # Default unset variables to "": a None value makes Popen raise.
+        required_envs = {
+            "PATH": os.getenv("PATH", ""),
+            "PYTHONPATH": os.getenv("PYTHONPATH", ""),
+            "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),
+            "FLAGS_fraction_of_gpu_memory_to_use": "0.15"
+        }
+        # Run local to get a base line
+        env_local = {"CUDA_VISIBLE_DEVICES": "0"}
+        env_local.update(required_envs)
+        local_cmd = "%s %s trainer %s 0 %s %d FLASE" % \
+            (self._python_interp, model_file,
+             "127.0.0.1:1234", "127.0.0.1:1234", 1)
+        local_proc = subprocess.Popen(
+            local_cmd.split(" "),
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            env=env_local)
+        # communicate() drains the pipes while waiting; wait() could deadlock.
+        local_ret, err = local_proc.communicate()
+        sys.stderr.write('local_loss: %s\n' % local_ret)
+        sys.stderr.write('local_stderr: %s\n' % err)
+
+        # Run dist train to compare with local results
+        ps0, ps1 = self.start_pserver(model_file)
+        # Registered up front so the servers are killed even if a check fails.
+        self.addCleanup(os.kill, ps1.pid, signal.SIGKILL)
+        self.addCleanup(os.kill, ps0.pid, signal.SIGKILL)
+        self._wait_ps_ready(ps0.pid)
+        self._wait_ps_ready(ps1.pid)
+
+        ps0_ep, ps1_ep = self._ps_endpoints.split(",")
+        tr0_cmd = "%s %s trainer %s 0 %s %d TRUE" % \
+            (self._python_interp, model_file, self._ps_endpoints, ps0_ep,
+             self._trainers)
+        tr1_cmd = "%s %s trainer %s 1 %s %d TRUE" % \
+            (self._python_interp, model_file, self._ps_endpoints, ps1_ep,
+             self._trainers)
+
+        env0 = {"CUDA_VISIBLE_DEVICES": "0"}
+        env1 = {"CUDA_VISIBLE_DEVICES": "1"}
+        env0.update(required_envs)
+        env1.update(required_envs)
+        FNULL = open(os.devnull, 'w')
+        self.addCleanup(FNULL.close)
+
+        tr0_proc = subprocess.Popen(
+            tr0_cmd.split(" "),
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            env=env0)
+        tr1_proc = subprocess.Popen(
+            tr1_cmd.split(" "),
+            stdout=FNULL,
+            stderr=FNULL,
+            env=env1)
+
+        # Only trainer 0 is read; trainer 1 writes to devnull so it never blocks.
+        loss_data0, err = tr0_proc.communicate()
+        tr1_proc.wait()
+        sys.stderr.write('dist_stderr: %s\n' % err)
+        sys.stderr.write('dist_loss: %s\n' % loss_data0)
+        lines = loss_data0.split("\n")
+        dist_first_loss = eval(lines[0].replace(" ", ","))[0]
+        dist_last_loss = eval(lines[1].replace(" ", ","))[0]
+
+        local_lines = local_ret.split("\n")
+        local_first_loss = eval(local_lines[0])[0]
+        local_last_loss = eval(local_lines[1])[0]
+
+        self.assertAlmostEqual(local_first_loss, dist_first_loss, delta=delta)
+        self.assertAlmostEqual(local_last_loss, dist_last_loss, delta=delta)
diff --git a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py
index 3b67b3f5ccd67f86f87f292d83a6039ff46260bd..04671d079731ce414561b0ede6bc2b195b07d82a 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py
@@ -11,127 +11,14 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
-import numpy as np
-import argparse
-import time
-import math
-
 import unittest
-import os
-import sys
-import signal
-import subprocess
-
-
-class TestDistSeResneXt2x2(unittest.TestCase):
-    def setUp(self):
-        self._trainers = 2
-        self._pservers = 2
-        self._ps_endpoints = "127.0.0.1:9123,127.0.0.1:9124"
-        self._python_interp = "python"
-
-    def start_pserver(self):
-        ps0_ep, ps1_ep = self._ps_endpoints.split(",")
-        ps0_cmd = "%s dist_se_resnext.py pserver %s 0 %s %d TRUE" % \
-            (self._python_interp, self._ps_endpoints, ps0_ep, self._trainers)
-        ps1_cmd = "%s dist_se_resnext.py pserver %s 0 %s %d TRUE" % \
-            (self._python_interp, self._ps_endpoints, ps1_ep, self._trainers)
-
-        ps0_proc = subprocess.Popen(
-            ps0_cmd.split(" "), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-        ps1_proc = subprocess.Popen(
-            ps1_cmd.split(" "), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-        return ps0_proc, ps1_proc
-
-    def _wait_ps_ready(self, pid):
-        retry_times = 20
-        while True:
-            assert retry_times >= 0, "wait ps ready failed"
-            time.sleep(3)
-            try:
-                # the listen_and_serv_op would touch a file which contains the listen port
-                # on the /tmp directory until it was ready to process all the RPC call.
-                os.stat("/tmp/paddle.%d.port" % pid)
-                return
-            except os.error:
-                retry_times -= 1
-
-    def test_with_place(self):
-        # *ATTENTION* THIS TEST NEEDS AT LEAST 2GPUS TO RUN
-        required_envs = {
-            "PATH": os.getenv("PATH"),
-            "PYTHONPATH": os.getenv("PYTHONPATH"),
-            "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH"),
-            "FLAGS_fraction_of_gpu_memory_to_use": "0.15"
-        }
-        # Run local to get a base line
-        env_local = {"CUDA_VISIBLE_DEVICES": "0"}
-        env_local.update(required_envs)
-        local_cmd = "%s dist_se_resnext.py trainer %s 0 %s %d FLASE" % \
-            (self._python_interp, "127.0.0.1:1234", "127.0.0.1:1234", 1)
-        local_proc = subprocess.Popen(
-            local_cmd.split(" "),
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
-            env=env_local)
-        local_proc.wait()
-        out, err = local_proc.communicate()
-        local_ret = out
-        sys.stderr.write('local_loss: %s\n' % local_ret)
-        sys.stderr.write('local_stderr: %s\n' % err)
-
-        # Run dist train to compare with local results
-        ps0, ps1 = self.start_pserver()
-        self._wait_ps_ready(ps0.pid)
-        self._wait_ps_ready(ps1.pid)
-
-        ps0_ep, ps1_ep = self._ps_endpoints.split(",")
-        tr0_cmd = "%s dist_se_resnext.py trainer %s 0 %s %d TRUE" % \
-            (self._python_interp, self._ps_endpoints, ps0_ep, self._trainers)
-        tr1_cmd = "%s dist_se_resnext.py trainer %s 1 %s %d TRUE" % \
-            (self._python_interp, self._ps_endpoints, ps1_ep, self._trainers)
-
-        env0 = {"CUDA_VISIBLE_DEVICES": "0"}
-        env1 = {"CUDA_VISIBLE_DEVICES": "1"}
-        env0.update(required_envs)
-        env1.update(required_envs)
-        FNULL = open(os.devnull, 'w')
-
-        tr0_proc = subprocess.Popen(
-            tr0_cmd.split(" "),
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
-            env=env0)
-        tr1_proc = subprocess.Popen(
-            tr1_cmd.split(" "),
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
-            env=env1)
-
-        tr0_proc.wait()
-        tr1_proc.wait()
-        out, err = tr0_proc.communicate()
-        sys.stderr.write('dist_stderr: %s\n' % err)
-        loss_data0 = out
-        sys.stderr.write('dist_loss: %s\n' % loss_data0)
-        lines = loss_data0.split("\n")
-        dist_first_loss = eval(lines[0].replace(" ", ","))[0]
-        dist_last_loss = eval(lines[1].replace(" ", ","))[0]
-
-        local_lines = local_ret.split("\n")
-        local_first_loss = eval(local_lines[0])[0]
-        local_last_loss = eval(local_lines[1])[0]
+from test_dist_base import TestDistBase
 
-        self.assertAlmostEqual(local_first_loss, dist_first_loss)
-        self.assertAlmostEqual(local_last_loss, dist_last_loss)
 
-        # check tr0_out
-        # FIXME: ensure the server process is killed
-        # replace with ps0.terminate()
-        os.kill(ps0.pid, signal.SIGKILL)
-        os.kill(ps1.pid, signal.SIGKILL)
-        FNULL.close()
+class TestDistSeResneXt2x2(TestDistBase):
+    def test_se_resnext(self):
+        # TODO(paddle-dev): Is the delta too large?
+        self.check_with_place("dist_se_resnext.py", delta=0.2)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_dist_transformer.py b/python/paddle/fluid/tests/unittests/test_dist_transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..68cd35d751dbce7eef9919dc8678fc0dd117757b
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_dist_transformer.py
@@ -0,0 +1,27 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+from test_dist_base import TestDistBase
+
+
+class TestDistTransformer2x2(TestDistBase):
+    def test_transformer(self):
+        # TODO(paddle-dev): check if the delta is OK.
+        # Usually start around ~8000 and converge to ~5000
+        self.check_with_place("dist_transformer.py", delta=400)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py
index acf652d3fb9743d69b7f7e248ff7a3ee83fc4c50..1854232194963bcbe302010320a30d85747eea96 100644
--- a/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py
@@ -20,8 +20,8 @@ class TestElementwiseOp(OpTest):
     def setUp(self):
         self.op_type = "elementwise_sub"
         self.inputs = {
-            'X': np.random.uniform(0.1, 1, [13, 17]).astype("float32"),
-            'Y': np.random.uniform(0.1, 1, [13, 17]).astype("float32")
+            'X': np.random.uniform(0.1, 1, [2, 3]).astype("float32"),
+            'Y': np.random.uniform(0.1, 1, [2, 3]).astype("float32")
         }
         self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']}
 
diff --git a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py
index d090960c84e47da68a0ebea4609dfc3ed76e114e..daa5da8d95129af0305b326832a557daeb4c5c9c 100644
--- a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py
+++ b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py
@@ -17,6 +17,8 @@ import numpy as np
 import math
 from op_test import OpTest
 
+np.random.seed(100)
+
 
 def find_latest_set(num):
     return 1 + int(math.floor(math.log(num, 2)))
diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py
index ab2ab24f354c1fbdc8b5221061db56a8d8a48689..9e1b47643a554bc14170fc57ac05b21afdb8117a 100644
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -465,6 +465,15 @@ class TestBook(unittest.TestCase):
             self.assertIsNotNone(out)
         print(str(program))
 
+    def test_shape(self):
+        program = Program()
+        with program_guard(program):
+            input = layers.data(
+                name="input", shape=[3, 100, 100], dtype="float32")
+            out = layers.shape(input, name="shape")
+            self.assertIsNotNone(out)
+        print(str(program))
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
index 76389d916fc39f470a22aed4792bf7b754600436..c7a039d2589ef67bd1d3771a2f11084698ba909f 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
@@ -211,7 +211,8 @@ class TestMNIST(TestParallelExecutorBase):
         self.check_batchnorm_fc_convergence(False)
 
     def test_batchnorm_fc_with_new_strategy(self):
-        self._compare_reduce_and_allreduce(fc_with_batchnorm, True)
+        # FIXME(zcd): disable this test temporarily.
+        # self._compare_reduce_and_allreduce(fc_with_batchnorm, True)
         self._compare_reduce_and_allreduce(fc_with_batchnorm, False)
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py
index b6215fddb11bb6b3a76b5a6395e7254d21971c13..8203d5d1fce0950130ab71db40fb306f73c41bd4 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py
@@ -21,7 +21,7 @@ import paddle
 import paddle.dataset.wmt16 as wmt16
 import os
 
-WMT16_RECORDIO_FILE = "./wmt16_test_pe.recordio"
+WMT16_RECORDIO_FILE = "/tmp/wmt16.recordio"
 
 
 class ModelHyperParams(object):
@@ -167,10 +167,9 @@ class TestTransformer(TestParallelExecutorBase):
                     writer.append_tensor(t)
                 writer.complete_append_tensor()
 
-    @unittest.skip("transformer is buggy in multi gpu")
     def test_main(self):
         self.check_network_convergence(transformer, use_cuda=True)
-        self.check_network_convergence(transformer, use_cuda=False)
+        self.check_network_convergence(transformer, use_cuda=False, iter=5)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/testsuite.py b/python/paddle/fluid/tests/unittests/testsuite.py
index 55c6e54906e739ef0bc953fa5c9e9641ec575ccf..910d9538b009496813f40b82d62eb2b12964a99f 100644
--- a/python/paddle/fluid/tests/unittests/testsuite.py
+++ b/python/paddle/fluid/tests/unittests/testsuite.py
@@ -18,14 +18,6 @@ import paddle.fluid.core as core
 from paddle.fluid.op import Operator
 
 
-def as_lodtensor(np_array, lod, place):
-    tensor = core.LoDTensor()
-    tensor.set(np_value, place)
-    if lod is not None:
-        tensor.set_recursive_sequence_lengths(lod)
-    return tensor
-
-
 def create_op(scope, op_type, inputs, outputs, attrs):
     kwargs = dict()
 
@@ -69,6 +61,11 @@ def create_op(scope, op_type, inputs, outputs, attrs):
 
 
 def set_input(scope, op, inputs, place):
+    def np_value_to_fluid_value(input):
+        if input.dtype == np.float16:
+            input = input.view(np.uint16)
+        return input
+
     def __set_input__(var_name, var):
         if isinstance(var, tuple) or isinstance(var, np.ndarray):
             tensor = scope.find_var(var_name).get_tensor()
@@ -76,7 +73,7 @@ def set_input(scope, op, inputs, place):
                 tensor.set_recursive_sequence_lengths(var[1])
                 var = var[0]
             tensor._set_dims(var.shape)
-            tensor.set(var, place)
+            tensor.set(np_value_to_fluid_value(var), place)
         elif isinstance(var, float):
             scope.find_var(var_name).set_float(var)
         elif isinstance(var, int):
@@ -104,6 +101,7 @@ def append_input_output(block, op_proto, np_list, is_input, dtype):
         if name not in np_list:
             assert var_proto.intermediate, "{} not found".format(name)
         else:
+            # Infer the dtype from the numpy value.
             np_value = np_list[name]
             if isinstance(np_value, tuple):
                 dtype = np_value[0].dtype
@@ -116,6 +114,16 @@ def append_input_output(block, op_proto, np_list, is_input, dtype):
                 if is_input:
                     shape = list(np_value.shape)
                     lod_level = 0
+        # NOTE(dzhwinter): type hacking
+        # numpy float16 is bound to paddle::platform::float16
+        # in tensor_py.h with the help of the uint16 datatype, because
+        # the internal memory representation of float16 is
+        # actually uint16_t in paddle. So we use np.uint16 in numpy for
+        # raw memory so that it can pass through pybind. In the test case
+        # we feed data using data.view(uint16), but the dtype is in fact float16.
+        # data.view(uint16) does not cast the values; it reinterprets the bytes as uint16.
+        if dtype == np.uint16:
+            dtype = np.float16
         return block.create_var(
             dtype=dtype, shape=shape, lod_level=lod_level, name=name)
 
diff --git a/python/paddle/fluid/tests/unittests/transformer_model.py b/python/paddle/fluid/tests/unittests/transformer_model.py
index c62792face3c353db1f2e3c77eaf4bd32fbded69..d0eb3fd3724899aad39422983fd3cd0d00ff2a2d 100644
--- a/python/paddle/fluid/tests/unittests/transformer_model.py
+++ b/python/paddle/fluid/tests/unittests/transformer_model.py
@@ -403,7 +403,7 @@ def transformer(
         trg_pad_idx,
         pos_pad_idx, ):
     file_obj = fluid.layers.open_recordio_file(
-        filename='./wmt16.recordio',
+        filename='/tmp/wmt16.recordio',
         shapes=[
             [batch_size * max_length, 1],
             [batch_size * max_length, 1],