diff --git a/benchmark/paddle/image/run.sh b/benchmark/paddle/image/run.sh index 717ed487ba7657db6535efcb1128a355a0f15eaf..5b58a8d773aab795e5439b0f0e5d81bec66b5f56 100755 --- a/benchmark/paddle/image/run.sh +++ b/benchmark/paddle/image/run.sh @@ -1,3 +1,5 @@ +#!/bin/bash + set -e function train() { diff --git a/benchmark/paddle/image/run_mkl_infer.sh b/benchmark/paddle/image/run_mkl_infer.sh index 62c9bf6efd3810f506fd4592b2ba3a21b1b7f0e7..0fad5e04cc992a3ec97591d3833957bb7517a8f3 100755 --- a/benchmark/paddle/image/run_mkl_infer.sh +++ b/benchmark/paddle/image/run_mkl_infer.sh @@ -1,3 +1,5 @@ +#!/bin/bash + set -e function clock_to_seconds() { diff --git a/benchmark/paddle/image/run_mkl_train.sh b/benchmark/paddle/image/run_mkl_train.sh index 03d2d378fb72e36f765d89af788f6ee96fe21d4e..1583bf134a276a08aa2f8e84dc63adbb205a83d6 100755 --- a/benchmark/paddle/image/run_mkl_train.sh +++ b/benchmark/paddle/image/run_mkl_train.sh @@ -1,3 +1,5 @@ +#!/bin/bash + set -e function train() { diff --git a/benchmark/paddle/image/run_openblas_infer.sh b/benchmark/paddle/image/run_openblas_infer.sh index a9a7b8a66717c4be0543c3fe2db293fe199e3dc4..987381cabc2e793886099212660723c122b73bb0 100755 --- a/benchmark/paddle/image/run_openblas_infer.sh +++ b/benchmark/paddle/image/run_openblas_infer.sh @@ -1,3 +1,5 @@ +#!/bin/bash + set -e function clock_to_seconds() { diff --git a/benchmark/paddle/image/run_openblas_train.sh b/benchmark/paddle/image/run_openblas_train.sh index 935cff6f2c97d25d6de556cfee25e27dbe49b5b6..cc64e1d09da02087b1737190a0b75dc7758600a6 100755 --- a/benchmark/paddle/image/run_openblas_train.sh +++ b/benchmark/paddle/image/run_openblas_train.sh @@ -1,3 +1,5 @@ +#!/bin/bash + set -e function train() { diff --git a/benchmark/paddle/rnn/run.sh b/benchmark/paddle/rnn/run.sh index e9dfeb2e525979f47e4ef48f7610dc1007900f2c..f99a562b3f88a98560f4bf7aee98ceee9daefe67 100755 --- a/benchmark/paddle/rnn/run.sh +++ b/benchmark/paddle/rnn/run.sh @@ -1,3 +1,5 @@ +#!/bin/bash + set -e 
function train() { diff --git a/benchmark/tensorflow/image/run.sh b/benchmark/tensorflow/image/run.sh index eade36beb9df5f8d3978939216e058203e024c1a..cf894fe3f2dca24e3acf863d625b3a7008793b83 100755 --- a/benchmark/tensorflow/image/run.sh +++ b/benchmark/tensorflow/image/run.sh @@ -1,3 +1,5 @@ +#!/bin/bash + set -e function test() { diff --git a/benchmark/tensorflow/image/run_multi.sh b/benchmark/tensorflow/image/run_multi.sh index 69faa4331744f2276e7706185ae10bc507f95764..bf1435bc55b90669e0b8bd893b8ed7bbb99d51e2 100755 --- a/benchmark/tensorflow/image/run_multi.sh +++ b/benchmark/tensorflow/image/run_multi.sh @@ -1,3 +1,5 @@ +#!/bin/bash + set -e function test() { diff --git a/benchmark/tensorflow/rnn/run.sh b/benchmark/tensorflow/rnn/run.sh index bb4c69cb95f965eff35f1c5a60376bf1e84f841b..db10eefdea8676ad34fb84a161f0fc1309147824 100755 --- a/benchmark/tensorflow/rnn/run.sh +++ b/benchmark/tensorflow/rnn/run.sh @@ -1,3 +1,5 @@ +#!/bin/bash + set -e function test() { diff --git a/benchmark/tensorflow/rnn/run_multi.sh b/benchmark/tensorflow/rnn/run_multi.sh index c2d7dd597e6da54cd5c4cda311fbbd18486b4647..ec62fc26b51543f2f8ddfc5e73aa6ff7d611e4dd 100755 --- a/benchmark/tensorflow/rnn/run_multi.sh +++ b/benchmark/tensorflow/rnn/run_multi.sh @@ -1,3 +1,5 @@ +#!/bin/bash + set -e function test() { diff --git a/paddle/fluid/inference/analysis/analyzer.cc b/paddle/fluid/inference/analysis/analyzer.cc index a4625f008c15300b88ef0bce71cd7d8aa473c9a8..b3a1075e5adf4a24bf32017574c061f36c46ba8c 100644 --- a/paddle/fluid/inference/analysis/analyzer.cc +++ b/paddle/fluid/inference/analysis/analyzer.cc @@ -22,8 +22,6 @@ #include "paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h" namespace paddle { -namespace inference { -namespace analysis { DEFINE_bool(inference_analysis_enable_tensorrt_subgraph_engine, false, "Enable subgraph to TensorRT engine for acceleration"); @@ -31,6 +29,9 @@ DEFINE_bool(inference_analysis_enable_tensorrt_subgraph_engine, false, 
DEFINE_string(inference_analysis_graphviz_log_root, "./", "Graphviz debuger for data flow graphs."); +namespace inference { +namespace analysis { + class DfgPassManagerImpl final : public DfgPassManager { public: DfgPassManagerImpl() { diff --git a/paddle/fluid/inference/analysis/analyzer.h b/paddle/fluid/inference/analysis/analyzer.h index e9e14fb1947da059c8d126d3da182ce446f6421e..0132bf5b9c6552391aaa19542669487f42b685a7 100644 --- a/paddle/fluid/inference/analysis/analyzer.h +++ b/paddle/fluid/inference/analysis/analyzer.h @@ -45,14 +45,15 @@ limitations under the License. */ #include "paddle/fluid/inference/analysis/pass_manager.h" namespace paddle { -namespace inference { -namespace analysis { // TODO(Superjomn) add a definition flag like PADDLE_WITH_TENSORRT and hide this // flag if not available. DECLARE_bool(inference_analysis_enable_tensorrt_subgraph_engine); DECLARE_string(inference_analysis_graphviz_log_root); +namespace inference { +namespace analysis { + class Analyzer : public OrderedRegistry { public: // Register all the pass-managers. diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc index d7c1a72932a39f878add2bb884e280b91d3c38c0..25a440e7e71fddb38cc515f99d15231675a8172e 100644 --- a/paddle/fluid/inference/analysis/analyzer_tester.cc +++ b/paddle/fluid/inference/analysis/analyzer_tester.cc @@ -13,13 +13,21 @@ // limitations under the License. 
#include "paddle/fluid/inference/analysis/analyzer.h" +#include #include "paddle/fluid/inference/analysis/ut_helper.h" namespace paddle { namespace inference { namespace analysis { -TEST_F(DFG_Tester, main) { +TEST_F(DFG_Tester, analysis_without_tensorrt) { + FLAGS_inference_analysis_enable_tensorrt_subgraph_engine = false; + Analyzer analyser; + analyser.Run(&argument); +} + +TEST_F(DFG_Tester, analysis_with_tensorrt) { + FLAGS_inference_analysis_enable_tensorrt_subgraph_engine = true; Analyzer analyser; analyser.Run(&argument); } diff --git a/paddle/fluid/inference/analysis/data_flow_graph.cc b/paddle/fluid/inference/analysis/data_flow_graph.cc index bd24e8a7d9c20b8cd9c4e41a76ffc33a004a9a69..8a3af0a8ebd5bad7be7046fa399cca4920da3d71 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph.cc +++ b/paddle/fluid/inference/analysis/data_flow_graph.cc @@ -222,10 +222,19 @@ Node *GraphTraits::NodesDFSIterator::operator->() { return stack_.top(); } +inline bool CheckNodeIndegreeEquals(const Node &node, size_t n) { + return node.inlinks.size() == n; +} + GraphTraits::NodesTSIterator::NodesTSIterator( const std::vector &source) { PADDLE_ENFORCE(!source.empty(), "Start points of topological sorting should not be empty!"); + // CHECK all the inputs' in-degree is 0 + for (auto *node : source) { + PADDLE_ENFORCE(CheckNodeIndegreeEquals(*node, 0)); + } + std::unordered_set visited; std::unordered_set to_visit{source.begin(), source.end()}; @@ -233,6 +242,11 @@ GraphTraits::NodesTSIterator::NodesTSIterator( while (!to_visit.empty()) { std::vector queue(to_visit.begin(), to_visit.end()); for (auto *p : queue) { + if (p->deleted()) { + visited.insert(p); + to_visit.erase(p); + continue; + } inlink_visited.clear(); std::copy_if(p->inlinks.begin(), p->inlinks.end(), @@ -292,6 +306,37 @@ Node *GraphTraits::NodesTSIterator::operator->() { return sorted_[cursor_]; } +std::pair, std::vector> +ExtractInputAndOutputOfSubGraph(std::vector &graph) { // NOLINT + std::unordered_set 
nodes(graph.begin(), graph.end()); + std::unordered_set inputs; + std::unordered_set outputs; + // Input a Value, check whether its inlink is in the subgraph. + auto inlink_in_subgraph = [&](Node *n) { + for (auto *in : n->inlinks) { + if (nodes.count(in)) return true; + } + return false; + }; + for (auto &node : graph) { + for (auto *in : node->inlinks) { + // The Value that is written by nodes inside a sub-graph shouldn't be the + // input of the sub-graph. + if (!nodes.count(in) && in->type() == Node::Type::kValue && + !inlink_in_subgraph(in)) { + inputs.insert(in); + } + } + for (auto *out : node->outlinks) { + if (!nodes.count(out) && out->type() == Node::Type::kValue) { + outputs.insert(out); + } + } + } + return std::make_pair(std::vector(inputs.begin(), inputs.end()), + std::vector(outputs.begin(), outputs.end())); +} + } // namespace analysis } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/analysis/data_flow_graph.h b/paddle/fluid/inference/analysis/data_flow_graph.h index 5dd914d1971bfb5bcc0b1db41d73e2b67120bc06..1c60d5de21538043962cc58a6f508aea635fe8c4 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph.h +++ b/paddle/fluid/inference/analysis/data_flow_graph.h @@ -133,7 +133,7 @@ struct GraphTraits { private: std::vector sorted_; - int cursor_{0}; + size_t cursor_{0}; }; explicit GraphTraits(DataFlowGraph *graph) : graph_(graph) {} @@ -173,36 +173,8 @@ struct GraphTraits { // Extract the inputs and outputs of a graph. The inputs and outputs of a // sub-graph is the inputs nodes and output nodes that doesn't inside the // sub-graph. -static std::pair, std::vector> -ExtractInputAndOutputOfSubGraph(std::vector &graph) { // NOLINT - std::unordered_set nodes(graph.begin(), graph.end()); - std::unordered_set inputs; - std::unordered_set outputs; - // Input a Value, check whether its inlink is in the subgraph. 
- auto inlink_in_subgraph = [&](Node *n) { - for (auto *in : n->inlinks) { - if (nodes.count(in)) return true; - } - return false; - }; - for (auto &node : graph) { - for (auto *in : node->inlinks) { - // The Value that is written by nodes inside a sub-graph shouldn't be the - // input of the sub-graph. - if (!nodes.count(in) && in->type() == Node::Type::kValue && - !inlink_in_subgraph(in)) { - inputs.insert(in); - } - } - for (auto *out : node->outlinks) { - if (!nodes.count(out) && out->type() == Node::Type::kValue) { - outputs.insert(out); - } - } - } - return std::make_pair(std::vector(inputs.begin(), inputs.end()), - std::vector(outputs.begin(), outputs.end())); -} +std::pair, std::vector> +ExtractInputAndOutputOfSubGraph(std::vector &graph); } // namespace analysis } // namespace inference diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc index 29ca008123addf07959b965a4b54bf55b18c401d..2328d870422c5a31c22d7b09980aae35e01b2b25 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc +++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc @@ -22,14 +22,18 @@ namespace paddle { namespace inference { + +DEFINE_int32(tensorrt_max_batchsize, 300, "TensorRT maximum batch size"); +DEFINE_int32(tensorrt_workspace_size, 2048, "TensorRT workspace size"); + namespace analysis { using framework::proto::ProgramDesc; std::vector ExtractParameters( - const std::vector>& nodes); + const std::vector> &nodes); -bool DataFlowGraphToFluidPass::Initialize(Argument* argument) { +bool DataFlowGraphToFluidPass::Initialize(Argument *argument) { ANALYSIS_ARGUMENT_CHECK_FIELD(argument) ANALYSIS_ARGUMENT_CHECK_FIELD(argument->origin_program_desc) PADDLE_ENFORCE(!argument->transformed_program_desc); @@ -47,76 +51,77 @@ bool DataFlowGraphToFluidPass::Initialize(Argument* argument) { bool DataFlowGraphToFluidPass::Finalize() { return true; } -void 
DataFlowGraphToFluidPass::Run(DataFlowGraph* graph) { - auto traits = GraphTraits(graph); - for (auto it = traits.nodes().begin(); it != traits.nodes().end(); ++it) { - if (it->deleted()) continue; +void DataFlowGraphToFluidPass::Run(DataFlowGraph *graph) { + LOG(INFO) << "graph.inputs " << graph->inputs.size(); + for (auto &node : GraphTraits(graph).nodes_in_TS()) { + if (node.deleted()) continue; - switch (it->type()) { + switch (node.type()) { case Node::Type::kFunction: { - LOG(INFO) << "add function " << it->repr(); - AddFluidOp(&(*it)); + LOG(INFO) << "add function " << node.repr(); + AddFluidOp(&node); } break; case Node::Type::kFunctionBlock: { - LOG(INFO) << "add engine op " << it->repr() << " , " - << static_cast(&(*it))->subgraph.size(); - AddEngineOp(&(*it)); + LOG(INFO) << "add engine op " << node.repr() << " , " + << static_cast(&node)->subgraph.size(); + AddEngineOp(&node); } break; default: continue; } } + + PADDLE_ENFORCE(argument_->transformed_program_desc.get()); } -void DataFlowGraphToFluidPass::AddFluidOp(Node* node) { - auto* ori_op = static_cast(node->pb_desc()); +void DataFlowGraphToFluidPass::AddFluidOp(Node *node) { + auto *ori_op = static_cast(node->pb_desc()); // currently only the main block is analyzed. - auto* main_block = desc_->mutable_blocks(framework::kRootBlockIndex); - auto* op = main_block->add_ops(); + auto *main_block = desc_->mutable_blocks(framework::kRootBlockIndex); + auto *op = main_block->add_ops(); *op = *ori_op; // copy the attributes, by default, these will not be changed - // by analysis phrase. + // by analysis phrase. // The inputs and outputs of the existing ops are not changed by tensorrt // subgraph pass. // NOTE It might be changed by other passes in the long run. 
} -void CreateTrtEngineOp(Node* node, const DataFlowGraph& graph, - const framework::proto::BlockDesc& block) { +void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph, + const framework::proto::BlockDesc &block) { static int counter{0}; PADDLE_ENFORCE(node->IsFunctionBlock()); framework::OpDesc desc; - auto* func = static_cast(node); + auto *func = static_cast(node); // collect inputs std::vector io; - for (auto* x : func->inlinks) { + for (auto *x : func->inlinks) { io.push_back(x->name()); } desc.SetInput("Xs", io); // collect outputs io.clear(); - for (auto* x : func->outlinks) { + for (auto *x : func->outlinks) { io.push_back(x->name()); } desc.SetOutput("Ys", io); - desc.SetType("tensorrt_engine"); + + PADDLE_ENFORCE(!block.vars().empty(), "the block has no var-desc"); // Set attrs SetAttr(desc.Proto(), "subgraph", block.SerializeAsString()); - SetAttr(desc.Proto(), "engine_unique_key", - "trt-" + std::to_string(counter++)); - SetAttr(desc.Proto(), "max_batch", 100); // TODO(Superjomn) add config latter - SetAttr(desc.Proto(), "max_workspace", - 1024); // TODO(Superjomn) add config latter + SetAttr(desc.Proto(), "engine_uniq_key", "trt-" + std::to_string(counter++)); + SetAttr(desc.Proto(), "max_batch", FLAGS_tensorrt_max_batchsize); + SetAttr(desc.Proto(), "max_workspace", FLAGS_tensorrt_workspace_size); SetAttr(desc.Proto(), "parameters", ExtractParameters(graph.nodes.nodes())); node->SetPbMsg(desc.Proto()->SerializeAsString()); } std::vector ExtractParameters( - const std::vector>& nodes) { + const std::vector> &nodes) { std::vector parameters; - for (const auto& node : nodes) { + for (const auto &node : nodes) { if (!node->IsValue()) continue; PADDLE_ENFORCE(!node->pb_msg().empty(), "pb_msg should be set first"); framework::proto::VarDesc var; @@ -128,21 +133,30 @@ std::vector ExtractParameters( return parameters; } -void DataFlowGraphToFluidPass::AddEngineOp(Node* node) { +void DataFlowGraphToFluidPass::AddEngineOp(Node *node) { // TODO(Superjomn) 
Here need to expose some arguments for default setting. PADDLE_ENFORCE(node->IsFunctionBlock()); - auto* block_node = static_cast(node); + auto *block_node = static_cast(node); framework::proto::BlockDesc proto; framework::BlockDesc block_desc(nullptr, &proto); + block_desc.Proto()->set_parent_idx(-1); + block_desc.Proto()->set_idx(0); + LOG(INFO) << "origin variable size: " + << argument_->origin_program_desc->blocks(0).vars().size(); + LOG(INFO) << "transformed variable size: " + << block_desc.Proto()->vars().size(); // copy ops. - for (auto* node : block_node->subgraph) { - auto* op = block_desc.AppendOp(); + for (auto *node : block_node->subgraph) { + auto *op = block_desc.AppendOp(); PADDLE_ENFORCE(!node->pb_msg().empty()); op->Proto()->ParseFromString(node->pb_msg()); } + *block_desc.Proto()->mutable_vars() = + argument_->origin_program_desc->blocks(0).vars(); + PADDLE_ENFORCE(!block_desc.Proto()->vars().empty()); CreateTrtEngineOp(node, *argument_->main_dfg, *block_desc.Proto()); - auto* main_block = desc_->mutable_blocks(framework::kRootBlockIndex); - auto* op = main_block->add_ops(); + auto *main_block = desc_->mutable_blocks(framework::kRootBlockIndex); + auto *op = main_block->add_ops(); PADDLE_ENFORCE(!node->pb_msg().empty(), "failed to set desc for block"); op->ParseFromString(node->pb_msg()); } @@ -151,7 +165,7 @@ namespace { class DFG_DebuggerPass : public DFG_GraphvizDrawPass { public: using Config = DFG_GraphvizDrawPass::Config; - explicit DFG_DebuggerPass(const Config& config) + explicit DFG_DebuggerPass(const Config &config) : DFG_GraphvizDrawPass(config) {} std::string repr() const override { return "dfg-to-fluid-debuger-pass"; } @@ -160,7 +174,7 @@ class DFG_DebuggerPass : public DFG_GraphvizDrawPass { }; } // namespace -Pass* DataFlowGraphToFluidPass::CreateGraphvizDebugerPass() const { +Pass *DataFlowGraphToFluidPass::CreateGraphvizDebugerPass() const { return new DFG_DebuggerPass(DFG_GraphvizDrawPass::Config( 
FLAGS_inference_analysis_graphviz_log_root, "data_flow_graph_to_fluid_graphviz_debugger")); diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h index edc84b02ed20991e3e7c6c437d2b1fac169bae03..59c47365aa6c8ad5886c4515850d264f69cc4670 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h +++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h @@ -26,6 +26,10 @@ namespace paddle { namespace inference { + +DECLARE_int32(tensorrt_max_batchsize); +DECLARE_int32(tensorrt_workspace_size); + namespace analysis { class DataFlowGraphToFluidPass final : public DataFlowGraphPass { public: diff --git a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc index 162455b9c4e06b7fbb4bdede30444faf6a8a1509..65842b1e850953e77e3d4d28416609be271af9f1 100644 --- a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc +++ b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc @@ -40,7 +40,7 @@ TEST_F(DFG_Tester, dfg_graphviz_draw_pass_tester) { no++; } // DFG is sensitive to ProgramDesc, be careful to change the existing models. 
- ASSERT_EQ(no, 82); + ASSERT_EQ(no, 83); } } // namespace analysis diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc index e918622d74cfb11d83090555be2a768cc14e7742..496921db9eabce1b1e40c7cb13089446ca93321c 100644 --- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc +++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc @@ -28,7 +28,6 @@ bool FluidToDataFlowGraphPass::Initialize(Argument *argument) { ANALYSIS_ARGUMENT_CHECK_FIELD(argument->origin_program_desc); PADDLE_ENFORCE(argument); if (!argument->main_dfg) { - LOG(INFO) << "Init DFG"; argument->main_dfg.reset(new DataFlowGraph); } desc_ = argument->origin_program_desc.get(); @@ -51,6 +50,7 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) { v->SetPbMsg(var.SerializeAsString()); var2id[var.name()] = v->id(); } + for (int i = 0; i < main_block.ops_size(); i++) { const auto &op = main_block.ops(i); auto *o = graph->nodes.Create(Node::Type::kFunction); @@ -62,19 +62,31 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) { o->SetPbMsg(op.SerializeAsString()); // set inputs and outputs - // TODO(Superjomn) make sure the InputNames is the real variable name. + std::unordered_set inlinks; for (int j = 0; j < op.inputs_size(); j++) { auto &in_var = op.inputs(j); for (int k = 0; k < in_var.arguments_size(); k++) { auto *in = graph->nodes.GetMutable(var2id.at(in_var.arguments(k))); in->outlinks.push_back(o); o->inlinks.push_back(in); + inlinks.insert(in); } } for (int j = 0; j < op.outputs_size(); j++) { auto &out_var = op.outputs(j); for (int k = 0; k < out_var.arguments_size(); k++) { auto *out = graph->nodes.GetMutable(var2id[out_var.arguments(k)]); + if (inlinks.count(out)) { + // Loop found, for example, a = op(a), use SSA, change to a1 = op(a). 
+ auto *out_alias = graph->nodes.Create(Node::Type::kValue); + out_alias->SetName(out->name()); + out_alias->SetPbDesc(out->pb_desc()); + out_alias->SetPbMsg(out->pb_msg()); + var2id[out_alias->name()] = out_alias->id(); // update a -> a0 + LOG(INFO) << "loop found in graph, create SSA alias node [" + << out_alias->repr() << "] for [" << out->repr() << "]"; + out = out_alias; + } out->inlinks.push_back(o); o->outlinks.push_back(out); } diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc index cbca5abdd5fff1672ba5d47a8876489c54ad6947..dadb84059d21adab44159a6145b345460663cb96 100644 --- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc +++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc @@ -24,12 +24,12 @@ namespace analysis { TEST_F(DFG_Tester, Init) { FluidToDataFlowGraphPass pass; pass.Initialize(&argument); - DataFlowGraph graph; - pass.Run(&graph); + pass.Run(argument.main_dfg.get()); // Analysis is sensitive to ProgramDesc, careful to change the original model. 
- ASSERT_EQ(graph.nodes.size(), 37UL); + ASSERT_EQ(argument.main_dfg->nodes.size(), 38UL); pass.Finalize(); - LOG(INFO) << '\n' << graph.DotString(); + ASSERT_FALSE(argument.main_dfg->DotString().empty()); + EXPECT_FALSE(argument.main_dfg->inputs.empty()); } } // namespace analysis diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc index 9993de22800bc0aafdcbf46618e6b479ac1eb187..faf876de6d65d20cf7a084cd97392cfc8d791a42 100644 --- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc @@ -25,6 +25,9 @@ TensorRTSubGraphPass::TensorRTSubGraphPass( void TensorRTSubGraphPass::Run(DataFlowGraph *graph) { SubGraphFuse(graph, node_inside_subgraph_teller_)(); + VLOG(4) << "debug info " + << graph->HumanReadableInfo(false /*show_values*/, + true /*show_functions*/); } } // namespace analysis diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index 9d63d08dedf6a1bcdacc51bb83d2ed261bca4117..e28e144fd54cec06b0228ac9c478de7c641455a0 100644 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -82,7 +82,7 @@ inference_api_test(test_api_impl if(WITH_GPU AND TENSORRT_FOUND) cc_library(paddle_inference_tensorrt_subgraph_engine SRCS api_tensorrt_subgraph_engine.cc - DEPS paddle_inference_api analysis tensorrt_engine paddle_fluid_api) + DEPS paddle_inference_api analysis tensorrt_engine paddle_inference_api paddle_fluid_api tensorrt_converter) inference_api_test(test_api_tensorrt_subgraph_engine ARGS test_word2vec) endif() diff --git a/paddle/fluid/inference/api/api_anakin_engine.cc b/paddle/fluid/inference/api/api_anakin_engine.cc index f6f3cb335897b02905e24c229b92f3940a37dbf8..0206ac60103759deda91be741617bde63e003de6 100644 --- a/paddle/fluid/inference/api/api_anakin_engine.cc +++ b/paddle/fluid/inference/api/api_anakin_engine.cc @@ -39,7 +39,7 
@@ bool PaddleInferenceAnakinPredictor::Init(const AnakinConfig &config) { bool PaddleInferenceAnakinPredictor::Run( const std::vector &inputs, - std::vector *output_data) { + std::vector *output_data, int batch_size) { for (const auto &input : inputs) { if (input.dtype != PaddleDType::FLOAT32) { LOG(ERROR) << "Only support float type inputs. " << input.name diff --git a/paddle/fluid/inference/api/api_anakin_engine.h b/paddle/fluid/inference/api/api_anakin_engine.h index 85ca83cd00756cca04d7b92437e9955d8ab297e7..def096c867ec85624f5b221782ef8b6240923c05 100644 --- a/paddle/fluid/inference/api/api_anakin_engine.h +++ b/paddle/fluid/inference/api/api_anakin_engine.h @@ -37,7 +37,8 @@ class PaddleInferenceAnakinPredictor : public PaddlePredictor { // NOTE Unlike the native engine, the buffers of anakin engine's output_data // should be allocated first. bool Run(const std::vector& inputs, - std::vector* output_data) override; + std::vector* output_data, + int batch_size = -1) override; std::unique_ptr Clone() override; diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index 786dc8e827806a9cea9dc01788fada2fd754b930..3ae255e13fc4f3ca28a6af62a5d5944d84303fc7 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -108,7 +108,8 @@ NativePaddlePredictor::~NativePaddlePredictor() { } bool NativePaddlePredictor::Run(const std::vector &inputs, - std::vector *output_data) { + std::vector *output_data, + int batch_size) { VLOG(3) << "Predictor::predict"; Timer timer; timer.tic(); diff --git a/paddle/fluid/inference/api/api_impl.h b/paddle/fluid/inference/api/api_impl.h index 92e693578ab657004f3c40c09b979897afea1e1f..4f28c3cd34bade4189871210e6168c6c1c610c2c 100644 --- a/paddle/fluid/inference/api/api_impl.h +++ b/paddle/fluid/inference/api/api_impl.h @@ -38,7 +38,8 @@ class NativePaddlePredictor : public PaddlePredictor { bool Init(std::shared_ptr parent_scope); bool Run(const std::vector 
&inputs, - std::vector *output_data) override; + std::vector *output_data, + int batch_size = -1) override; std::unique_ptr Clone() override; diff --git a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc index 0cdc88fa1eaf3935ce0da143e1e91eb84cd70dcf..c0891e9c281961fa03d278a0f5c676f92672c419 100644 --- a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc +++ b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc @@ -16,6 +16,7 @@ #include "paddle/fluid/inference/api/api_impl.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/utils/singleton.h" +#include "paddle/fluid/operators/tensorrt_engine_op.h" namespace paddle { @@ -64,16 +65,7 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor { return false; } - // Analyze inference_program - Argument argument; - argument.origin_program_desc.reset( - new ProgramDesc(*inference_program_->Proto())); - Singleton::Global().Run(&argument); - CHECK(argument.transformed_program_desc); - VLOG(5) << "transformed program:\n" - << argument.transformed_program_desc->SerializeAsString(); - VLOG(5) << "to prepare executor"; - *inference_program_->Proto() = *argument.transformed_program_desc; + OptimizeInferenceProgram(); ctx_ = executor_->Prepare(*inference_program_, 0); VLOG(5) << "to create variables"; @@ -86,6 +78,29 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor { return true; } + bool Run(const std::vector& inputs, + std::vector* output_data, + int batch_size = -1) override { + PADDLE_ENFORCE_GT(batch_size, 0, + "TensorRT engine needs the argument batch_size set"); + FLAGS_tensorrt_engine_batch_size = batch_size; + return NativePaddlePredictor::Run(inputs, output_data, batch_size); + } + + void OptimizeInferenceProgram() { + // Analyze inference_program + Argument argument; + argument.origin_program_desc.reset( + new ProgramDesc(*inference_program_->Proto())); + 
Singleton::Global().Run(&argument); + CHECK(argument.transformed_program_desc); + VLOG(5) << "transformed program:\n" + << argument.transformed_program_desc->SerializeAsString(); + VLOG(5) << "to prepare executor"; + inference_program_.reset( + new framework::ProgramDesc(*argument.transformed_program_desc)); + } + private: TensorRTConfig config_; }; diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h index b8ba2d14a5c161d491d838888ea14b776f769f23..2f8b4f8596946988a728b5cf82de251bfda778a9 100644 --- a/paddle/fluid/inference/api/paddle_inference_api.h +++ b/paddle/fluid/inference/api/paddle_inference_api.h @@ -98,7 +98,8 @@ class PaddlePredictor { // responsible for the output tensor's buffer, either allocated or passed from // outside. virtual bool Run(const std::vector& inputs, - std::vector* output_data) = 0; + std::vector* output_data, + int batch_size = -1) = 0; // Clone a predictor that share the model weights, the Cloned predictor should // be thread-safe. 
diff --git a/paddle/fluid/inference/api/test_api.cc b/paddle/fluid/inference/api/test_api.cc index ac8a21a22be6f27311b8ae2507d04d9d1b510e76..7a579610eefda24c911edd28b5f3a178aa10ab1e 100644 --- a/paddle/fluid/inference/api/test_api.cc +++ b/paddle/fluid/inference/api/test_api.cc @@ -35,7 +35,8 @@ class DemoPredictor : public PaddlePredictor { LOG(INFO) << "I get other_config " << config.other_config; } bool Run(const std::vector &inputs, - std::vector *output_data) override { + std::vector *output_data, + int batch_size = 0) override { LOG(INFO) << "Run"; return false; } diff --git a/paddle/fluid/inference/api/test_api_tensorrt_subgraph_engine.cc b/paddle/fluid/inference/api/test_api_tensorrt_subgraph_engine.cc index 585f6d29376c3341c21ff76361d5335512c1b1b6..62d98a796708612e7d4ff8abfd85125978ce22c7 100644 --- a/paddle/fluid/inference/api/test_api_tensorrt_subgraph_engine.cc +++ b/paddle/fluid/inference/api/test_api_tensorrt_subgraph_engine.cc @@ -15,50 +15,79 @@ #include #include #include +#include "paddle/fluid/inference/analysis/analyzer.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" namespace paddle { DEFINE_string(dirname, "", "Directory of the inference model."); -void Main(bool use_gpu) { +void CompareTensorRTWithFluid(bool enable_tensorrt) { + FLAGS_inference_analysis_enable_tensorrt_subgraph_engine = enable_tensorrt; + //# 1. Create PaddlePredictor with a config. 
- TensorRTConfig config; - config.model_dir = FLAGS_dirname + "word2vec.inference.model"; - config.use_gpu = use_gpu; - config.fraction_of_gpu_memory = 0.15; - config.device = 0; - auto predictor = + NativeConfig config0; + config0.model_dir = FLAGS_dirname + "word2vec.inference.model"; + config0.use_gpu = true; + config0.fraction_of_gpu_memory = 0.3; + config0.device = 0; + + TensorRTConfig config1; + config1.model_dir = FLAGS_dirname + "word2vec.inference.model"; + config1.use_gpu = true; + config1.fraction_of_gpu_memory = 0.3; + config1.device = 0; + + auto predictor0 = + CreatePaddlePredictor(config0); + auto predictor1 = CreatePaddlePredictor(config); + PaddleEngineKind::kAutoMixedTensorRT>(config1); - for (int batch_id = 0; batch_id < 3; batch_id++) { + for (int batch_id = 0; batch_id < 1; batch_id++) { //# 2. Prepare input. - int64_t data[4] = {1, 2, 3, 4}; + std::vector data(20); + for (int i = 0; i < 20; i++) data[i] = i; - PaddleTensor tensor{.name = "", - .shape = std::vector({4, 1}), - .data = PaddleBuf(data, sizeof(data)), - .dtype = PaddleDType::INT64}; + PaddleTensor tensor{ + .name = "", + .shape = std::vector({10, 1}), + .data = PaddleBuf(data.data(), data.size() * sizeof(int64_t)), + .dtype = PaddleDType::INT64}; // For simplicity, we set all the slots with the same data. std::vector slots(4, tensor); //# 3. Run - std::vector outputs; - CHECK(predictor->Run(slots, &outputs)); + std::vector outputs0; + std::vector outputs1; + CHECK(predictor0->Run(slots, &outputs0)); + CHECK(predictor1->Run(slots, &outputs1, 10)); //# 4. Get output. - ASSERT_EQ(outputs.size(), 1UL); - LOG(INFO) << "output buffer size: " << outputs.front().data.length(); - const size_t num_elements = outputs.front().data.length() / sizeof(float); - // The outputs' buffers are in CPU memory. 
- for (size_t i = 0; i < std::min(5UL, num_elements); i++) { - LOG(INFO) << static_cast(outputs.front().data.data())[i]; + ASSERT_EQ(outputs0.size(), 1UL); + ASSERT_EQ(outputs1.size(), 1UL); + + const size_t num_elements = outputs0.front().data.length() / sizeof(float); + const size_t num_elements1 = outputs1.front().data.length() / sizeof(float); + EXPECT_EQ(num_elements, num_elements1); + + auto *data0 = static_cast(outputs0.front().data.data()); + auto *data1 = static_cast(outputs1.front().data.data()); + + ASSERT_GT(num_elements, 0UL); + for (size_t i = 0; i < std::min(num_elements, num_elements1); i++) { + EXPECT_NEAR(data0[i], data1[i], 1e-3); } } } -TEST(paddle_inference_api_tensorrt_subgraph_engine, main) { Main(true); } +TEST(paddle_inference_api_tensorrt_subgraph_engine, without_tensorrt) { + CompareTensorRTWithFluid(false); +} + +TEST(paddle_inference_api_tensorrt_subgraph_engine, with_tensorrt) { + CompareTensorRTWithFluid(true); +} } // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h index 6697952051c4b1997ca6b550da17a52e64cb3454..968f7eb99ce8519edaa585fd3cb642bd80cc63cc 100644 --- a/paddle/fluid/inference/tensorrt/convert/op_converter.h +++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h @@ -93,6 +93,10 @@ class OpConverter { framework::Scope* scope_{nullptr}; }; +} // namespace tensorrt +} // namespace inference +} // namespace paddle + #define REGISTER_TRT_OP_CONVERTER(op_type__, Converter__) \ struct trt_##op_type__##_converter : public ::paddle::framework::Registrar { \ trt_##op_type__##_converter() { \ @@ -111,7 +115,3 @@ class OpConverter { extern int TouchConverterRegister_##op_type__(); \ static int use_op_converter_trt_##op_type__ __attribute__((unused)) = \ TouchConverterRegister_##op_type__(); - -} // namespace tensorrt -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/engine.cc 
b/paddle/fluid/inference/tensorrt/engine.cc index 596e0fe9da3d272ecb1c0f8dbef09a75d08a4b1a..fefec0df6d03669a294ce9643b666d7416593708 100644 --- a/paddle/fluid/inference/tensorrt/engine.cc +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -26,18 +26,20 @@ namespace paddle { namespace inference { namespace tensorrt { -void TensorRTEngine::Build(const DescType& paddle_model) { +void TensorRTEngine::Build(const DescType &paddle_model) { PADDLE_ENFORCE(false, "not implemented"); } void TensorRTEngine::Execute(int batch_size) { - std::vector buffers; - for (auto& buf : buffers_) { + batch_size_ = batch_size; + std::vector buffers; + for (auto &buf : buffers_) { PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated"); PADDLE_ENFORCE_GT(buf.max_size, 0); PADDLE_ENFORCE(buf.device == DeviceType::GPU); buffers.push_back(buf.buffer); } + PADDLE_ENFORCE_NOT_NULL(stream_); infer_context_->enqueue(batch_size, buffers.data(), *stream_, nullptr); cudaStreamSynchronize(*stream_); } @@ -45,7 +47,7 @@ void TensorRTEngine::Execute(int batch_size) { TensorRTEngine::~TensorRTEngine() { cudaStreamSynchronize(*stream_); // clean buffer - for (auto& buf : buffers_) { + for (auto &buf : buffers_) { if (buf.device == DeviceType::GPU && buf.buffer != nullptr) { PADDLE_ENFORCE_EQ(0, cudaFree(buf.buffer)); buf.buffer = nullptr; @@ -70,32 +72,37 @@ void TensorRTEngine::FreezeNetwork() { // allocate GPU buffers. buffers_.resize(buffer_sizes_.size()); - for (auto& item : buffer_sizes_) { + for (auto &item : buffer_sizes_) { + // The output buffers are not set in the network building phrase, need to + // infer from the TesorRT network. 
if (item.second == 0) { auto slot_offset = infer_engine_->getBindingIndex(item.first.c_str()); auto dims = infer_engine_->getBindingDimensions(slot_offset); item.second = kDataTypeSize[static_cast( infer_engine_->getBindingDataType(slot_offset))] * analysis::AccuDims(dims.d, dims.nbDims); + PADDLE_ENFORCE_GT(item.second, 0); } - auto& buf = buffer(item.first); + + auto &buf = buffer(item.first); + buf.max_size = item.second * max_batch_; CHECK(buf.buffer == nullptr); // buffer should be allocated only once. - PADDLE_ENFORCE_EQ(0, cudaMalloc(&buf.buffer, item.second)); - VLOG(4) << "buffer malloc " << item.first << " " << item.second << " " - << buf.buffer; - buf.size = buf.max_size = item.second; + PADDLE_ENFORCE_EQ(0, cudaMalloc(&buf.buffer, buf.max_size)); + PADDLE_ENFORCE_LE(buf.max_size, 1 << 30); // 10G + // buf.size will changed in the runtime. + buf.size = 0; buf.device = DeviceType::GPU; } } -nvinfer1::ITensor* TensorRTEngine::DeclareInput(const std::string& name, +nvinfer1::ITensor *TensorRTEngine::DeclareInput(const std::string &name, nvinfer1::DataType dtype, - const nvinfer1::Dims& dims) { + const nvinfer1::Dims &dims) { PADDLE_ENFORCE_EQ(0, buffer_sizes_.count(name), "duplicate input name %s", name); PADDLE_ENFORCE(infer_network_ != nullptr, "should initnetwork first"); - auto* input = infer_network_->addInput(name.c_str(), dtype, dims); + auto *input = infer_network_->addInput(name.c_str(), dtype, dims); PADDLE_ENFORCE(input, "infer network add input %s failed", name); buffer_sizes_[name] = kDataTypeSize[static_cast(dtype)] * analysis::AccuDims(dims.d, dims.nbDims); @@ -104,12 +111,12 @@ nvinfer1::ITensor* TensorRTEngine::DeclareInput(const std::string& name, return input; } -void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer* layer, int offset, - const std::string& name) { +void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer *layer, int offset, + const std::string &name) { PADDLE_ENFORCE_EQ(0, buffer_sizes_.count(name), "duplicate output 
name %s", name); - auto* output = layer->getOutput(offset); + auto *output = layer->getOutput(offset); SetITensor(name, output); PADDLE_ENFORCE(output != nullptr); output->setName(name.c_str()); @@ -121,11 +128,11 @@ void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer* layer, int offset, buffer_sizes_[name] = 0; } -void TensorRTEngine::DeclareOutput(const std::string& name) { +void TensorRTEngine::DeclareOutput(const std::string &name) { PADDLE_ENFORCE_EQ(0, buffer_sizes_.count(name), "duplicate output name %s", name); - auto* output = TensorRTEngine::GetITensor(name); + auto *output = TensorRTEngine::GetITensor(name); PADDLE_ENFORCE(output != nullptr); output->setName(name.c_str()); PADDLE_ENFORCE(!output->isNetworkInput()); @@ -135,38 +142,45 @@ void TensorRTEngine::DeclareOutput(const std::string& name) { buffer_sizes_[name] = 0; } -void* TensorRTEngine::GetOutputInGPU(const std::string& name) { +void *TensorRTEngine::GetOutputInGPU(const std::string &name) { return buffer(name).buffer; } -void TensorRTEngine::GetOutputInGPU(const std::string& name, void* dst, +void TensorRTEngine::GetOutputInGPU(const std::string &name, void *dst, size_t max_size) { // determine data size auto it = buffer_sizes_.find(name); PADDLE_ENFORCE(it != buffer_sizes_.end()); PADDLE_ENFORCE_GT(it->second, 0); PADDLE_ENFORCE_GE(max_size, it->second); - auto& buf = buffer(name); + auto &buf = buffer(name); PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before"); PADDLE_ENFORCE_EQ(cudaMemcpyAsync(dst, buf.buffer, it->second, cudaMemcpyDeviceToDevice, *stream_), 0); } -void TensorRTEngine::GetOutputInCPU(const std::string& name, void* dst, +void TensorRTEngine::GetOutputInCPU(const std::string &name, void *dst, size_t max_size) { + VLOG(4) << "get output in cpu"; + auto &buf = buffer(name); + + // Update needed buffer size. 
+ auto slot_offset = infer_engine_->getBindingIndex(name.c_str()); + auto dims = infer_engine_->getBindingDimensions(slot_offset); + buf.size = kDataTypeSize[static_cast( + infer_engine_->getBindingDataType(slot_offset))] * + analysis::AccuDims(dims.d, dims.nbDims); + PADDLE_ENFORCE_LE(buf.size, buf.max_size); // determine data size - auto it = buffer_sizes_.find(name); - PADDLE_ENFORCE(it != buffer_sizes_.end()); - PADDLE_ENFORCE_GT(it->second, 0); - PADDLE_ENFORCE_GE(max_size, it->second); - auto& buf = buffer(name); PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before"); - PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(dst, buf.buffer, it->second, - cudaMemcpyDeviceToHost, *stream_)); + // DEBUG + memset(dst, 0, buf.size); + PADDLE_ENFORCE_EQ( + 0, cudaMemcpy(dst, buf.buffer, buf.size, cudaMemcpyDeviceToHost)); } -Buffer& TensorRTEngine::buffer(const std::string& name) { +Buffer &TensorRTEngine::buffer(const std::string &name) { PADDLE_ENFORCE(infer_engine_ != nullptr, "call FreezeNetwork first."); auto it = buffer_sizes_.find(name); PADDLE_ENFORCE(it != buffer_sizes_.end()); @@ -174,19 +188,23 @@ Buffer& TensorRTEngine::buffer(const std::string& name) { return buffers_[slot_offset]; } -void TensorRTEngine::SetInputFromCPU(const std::string& name, const void* data, +void TensorRTEngine::SetInputFromCPU(const std::string &name, const void *data, size_t size) { - auto& buf = buffer(name); + auto &buf = buffer(name); PADDLE_ENFORCE_NOT_NULL(buf.buffer); + PADDLE_ENFORCE_NOT_NULL(data); + PADDLE_ENFORCE_NOT_NULL(stream_); PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small"); PADDLE_ENFORCE(buf.device == DeviceType::GPU); + buf.size = size; PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(buf.buffer, data, size, cudaMemcpyHostToDevice, *stream_)); } -void TensorRTEngine::SetInputFromGPU(const std::string& name, const void* data, +void TensorRTEngine::SetInputFromGPU(const std::string &name, const void *data, size_t size) { - auto& buf = buffer(name); + auto &buf 
= buffer(name); + buf.size = size; PADDLE_ENFORCE_NOT_NULL(buf.buffer); PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small"); PADDLE_ENFORCE(buf.device == DeviceType::GPU); @@ -194,15 +212,15 @@ void TensorRTEngine::SetInputFromGPU(const std::string& name, const void* data, cudaMemcpyDeviceToDevice, *stream_)); } -void TensorRTEngine::SetITensor(const std::string& name, - nvinfer1::ITensor* tensor) { +void TensorRTEngine::SetITensor(const std::string &name, + nvinfer1::ITensor *tensor) { PADDLE_ENFORCE(tensor != nullptr); PADDLE_ENFORCE_EQ(0, itensor_map_.count(name), "duplicate ITensor name %s", name); itensor_map_[name] = tensor; } -nvinfer1::ITensor* TensorRTEngine::GetITensor(const std::string& name) { +nvinfer1::ITensor *TensorRTEngine::GetITensor(const std::string &name) { PADDLE_ENFORCE(itensor_map_.count(name), "no ITensor %s", name); return itensor_map_[name]; } diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index b06a9bbc6758ae9410b2fce99ef2b1a9e7ab98c0..7064d333f6db754f88c0ac6956a9527a48bf866c 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -57,7 +57,9 @@ class TensorRTEngine : public EngineBase { : max_batch_(max_batch), max_workspace_(max_workspace), stream_(stream ? stream : &default_stream_), - logger_(logger) {} + logger_(logger) { + cudaStreamCreate(&default_stream_); + } virtual ~TensorRTEngine(); @@ -121,6 +123,9 @@ class TensorRTEngine : public EngineBase { int max_batch_; // the max memory size the engine uses int max_workspace_; + + // batch size of the current data, will be updated each Executation. + int batch_size_{-1}; cudaStream_t* stream_; // If stream_ is not set from outside, hold its own stream. 
cudaStream_t default_stream_; diff --git a/paddle/fluid/inference/tensorrt/test_engine.cc b/paddle/fluid/inference/tensorrt/test_engine.cc index e635f0f87d577a1f1ac74687ee60f762be525418..fca3488008ed83418b5e28b8af42d8019aaaa2a4 100644 --- a/paddle/fluid/inference/tensorrt/test_engine.cc +++ b/paddle/fluid/inference/tensorrt/test_engine.cc @@ -103,6 +103,10 @@ TEST_F(TensorRTEngineTest, add_layer_multi_dim) { LOG(INFO) << "to get output"; float y_cpu[2] = {-1., -1.}; + auto dims = engine_->GetITensor("y")->getDimensions(); + ASSERT_EQ(dims.nbDims, 3); + ASSERT_EQ(dims.d[0], 2); + ASSERT_EQ(dims.d[1], 1); engine_->GetOutputInCPU("y", &y_cpu[0], sizeof(float) * 2); ASSERT_EQ(y_cpu[0], 4.5); ASSERT_EQ(y_cpu[1], 14.5); diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index d265150f25419509126028e36e629aee3ee6bd0f..4e2002ad24415437ae4f85eba0e90a6c689e2996 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -168,6 +168,8 @@ function(op_library TARGET) file(APPEND ${pybind_file} "USE_OP(relu);\n") elseif(${TARGET} STREQUAL "fake_dequantize") file(APPEND ${pybind_file} "USE_OP(fake_dequantize_max_abs);\n") + elseif(${TARGET} STREQUAL "tensorrt_engine_op") + message(STATUS "Pybind skips [tensorrt_engine_op], for this OP is only used in inference") else() file(APPEND ${pybind_file} "USE_OP(${TARGET});\n") endif() @@ -237,9 +239,9 @@ op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax) op_library(softmax_op DEPS softmax) op_library(sequence_softmax_op DEPS softmax) if (WITH_GPU AND TENSORRT_FOUND) - op_library(tensorrt_engine_op DEPS tensorrt_engine) + op_library(tensorrt_engine_op DEPS tensorrt_engine tensorrt_converter) nv_test(test_tensorrt_engine_op SRCS tensorrt_engine_op_test.cc - DEPS tensorrt_engine_op tensorrt_engine tensorrt_converter + DEPS tensorrt_engine_op analysis) else() set(DEPS_OPS ${DEPS_OPS} tensorrt_engine_op) diff --git 
a/paddle/fluid/operators/momentum_op.cc b/paddle/fluid/operators/momentum_op.cc index dcd73e3c3e40f80e07b73944d1f0cc57fea010d3..5f43c5810812260c4384349bdb709716c9a182f5 100644 --- a/paddle/fluid/operators/momentum_op.cc +++ b/paddle/fluid/operators/momentum_op.cc @@ -98,7 +98,7 @@ The update equations are as follows: $$ velocity = mu * velocity + gradient \\ if (use\_nesterov): \\ - param = param - gradient * learning\_rate + mu * velocity * learning\_rate \\ + param = param - (gradient + mu * velocity) * learning\_rate \\ else: \\ param = param - learning\_rate * velocity. \\ $$ diff --git a/paddle/fluid/operators/momentum_op.cu b/paddle/fluid/operators/momentum_op.cu index 5eb9d9950248bb50bb823f071c7fff0ddcc47234..a3932db1f3a50305d585cd3d5e86fa1b527df78b 100644 --- a/paddle/fluid/operators/momentum_op.cu +++ b/paddle/fluid/operators/momentum_op.cu @@ -30,7 +30,7 @@ __global__ void MomentumKernel(const T* p, const T* g, const T* v, T g_val = g[i]; T v_new = v[i] * mu + g_val; v_out[i] = v_new; - p_out[i] = p[i] - (g_val - v_new * mu) * lr; + p_out[i] = p[i] - (g_val + v_new * mu) * lr; } } else { for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num; diff --git a/paddle/fluid/operators/momentum_op.h b/paddle/fluid/operators/momentum_op.h index 04a1929b84a93af6465bacfe7974a1530296946d..264726040fb566a52b8c0cdee0a1524197d2a675 100644 --- a/paddle/fluid/operators/momentum_op.h +++ b/paddle/fluid/operators/momentum_op.h @@ -46,7 +46,7 @@ class MomentumOpKernel : public framework::OpKernel { v_out = v * mu + g; if (use_nesterov) { - p_out = p - (g - v_out * mu) * lr[0]; + p_out = p - (g + v_out * mu) * lr[0]; } else { p_out = p - lr[0] * v_out; } diff --git a/paddle/fluid/operators/tensorrt_engine_op.cc b/paddle/fluid/operators/tensorrt_engine_op.cc index 647cfc0a0af2be85e2868c6f68cab962c6631a8d..43672d6db92a981f0fbe6e8f7079dafc6ae4052e 100644 --- a/paddle/fluid/operators/tensorrt_engine_op.cc +++ b/paddle/fluid/operators/tensorrt_engine_op.cc @@ -24,6 +24,9 @@ 
#include "paddle/fluid/operators/tensorrt_engine_op.h" namespace paddle { + +DEFINE_int32(tensorrt_engine_batch_size, 1, "the batch_size of TensorRT"); + namespace operators { using inference::Singleton; @@ -52,7 +55,6 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector &shape) { "TensorRT' tensor input requires at least 2 dimensions"); PADDLE_ENFORCE_LE(shape.size(), 4UL, "TensorRT' tensor input requires at most 4 dimensions"); - switch (shape.size()) { case 2: return nvinfer1::Dims2(shape[0], shape[1]); @@ -90,27 +92,36 @@ void TensorRTEngineKernel::Prepare( engine->InitNetwork(); framework::BlockDesc block(nullptr /*programdesc*/, &block_desc); + VLOG(4) << "parsed var size " << block.AllVars().size(); // Add inputs VLOG(4) << "declare inputs"; for (auto &input : context.Inputs("Xs")) { VLOG(4) << "declare input " << input; auto *var = block.FindVar(input); + // TensorRT engine need to create parameters. The parameter's description + // should be set in + PADDLE_ENFORCE(var, "no variable called %s", input); PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR, "TensorRT engine only takes LoDTensor as input"); auto shape = var->GetShape(); + // For the special batch_size placeholder -1, drop it and pass the real + // shape of data. + // TODO(Superjomn) fix this with batch broadcast, or it can't handle + // variational batch size. 
+ if (shape[0] == -1) { + shape[0] = FLAGS_tensorrt_engine_batch_size; + } engine->DeclareInput( input, FluidDataType2TRT( var->Proto()->type().lod_tensor().tensor().data_type()), - Vec2TRT_Dims(var->GetShape())); + Vec2TRT_Dims(shape)); } inference::Singleton::Global().ConvertBlock( block_desc, parameters, context.scope(), engine); // Add outputs - VLOG(4) << "declare outputs"; for (auto &output : context.Outputs("Ys")) { - VLOG(4) << "declare output " << output; engine->DeclareOutput(output); } @@ -151,4 +162,7 @@ REGISTER_OP_CPU_KERNEL( ops::TensorRTEngineKernel, ops::TensorRTEngineKernel); +// A trick to compile with the needed TensorRT op converter. +USE_TRT_CONVERTER(mul) + #endif // PADDLE_WITH_CUDA diff --git a/paddle/fluid/operators/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt_engine_op.h index 1602a913aeebe43fabe2f9c9036edd18ac4c70fd..a332d70030ffa6a033f6b2b33487a4fd279b7016 100644 --- a/paddle/fluid/operators/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt_engine_op.h @@ -24,6 +24,9 @@ #include "paddle/fluid/inference/tensorrt/engine.h" namespace paddle { + +DECLARE_int32(tensorrt_engine_batch_size); + namespace operators { using inference::Singleton; @@ -53,7 +56,6 @@ template class TensorRTEngineKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - VLOG(4) << "TensorRTEngineKernel executing"; auto engine_name = context.Attr("engine_uniq_key"); if (!Singleton::Global().HasEngine(engine_name)) { Prepare(context); @@ -61,11 +63,8 @@ class TensorRTEngineKernel : public framework::OpKernel { auto* engine = Singleton::Global().Get(engine_name); auto input_names = context.op().Inputs("Xs"); PADDLE_ENFORCE(!input_names.empty(), "should pass more than one inputs"); - // Try to determine a batch_size - auto& tensor0 = inference::analysis::GetFromScope( - context.scope(), input_names.front()); - int batch_size = tensor0.dims()[0]; - PADDLE_ENFORCE_LE(batch_size, 
context.Attr("max_batch")); + PADDLE_ENFORCE_LE(FLAGS_tensorrt_engine_batch_size, + context.Attr("max_batch")); // Convert input tensor from fluid to engine. for (const auto& x : context.Inputs("Xs")) { @@ -81,8 +80,8 @@ class TensorRTEngineKernel : public framework::OpKernel { } } // Execute the engine. - PADDLE_ENFORCE_GT(batch_size, 0); - engine->Execute(batch_size); + PADDLE_ENFORCE_GT(FLAGS_tensorrt_engine_batch_size, 0); + engine->Execute(FLAGS_tensorrt_engine_batch_size); // Convert output tensor from engine to fluid for (const auto& y : context.Outputs("Ys")) { // convert output and copy to fluid. @@ -94,18 +93,21 @@ class TensorRTEngineKernel : public framework::OpKernel { auto* fluid_v = context.scope().FindVar(y); PADDLE_ENFORCE_NOT_NULL(fluid_v, "no output variable called %s", y); auto* fluid_t = fluid_v->GetMutable(); - fluid_t->Resize(framework::make_ddim(ddim)); auto size = inference::analysis::AccuDims(dims.d, dims.nbDims); - if (platform::is_cpu_place(fluid_t->place())) { - // TODO(Superjomn) change this float to dtype size. - engine->GetOutputInCPU( - y, fluid_t->mutable_data(platform::CPUPlace()), - size * sizeof(float)); - } else { - engine->GetOutputInGPU( - y, fluid_t->mutable_data(platform::CUDAPlace()), - size * sizeof(float)); - } + fluid_t->Resize(framework::make_ddim(ddim)); + + // TODO(Superjomn) find some way to determine which device to output the + // tensor. + // if (platform::is_cpu_place(fluid_t->place())) { + // TODO(Superjomn) change this float to dtype size. 
+ engine->GetOutputInCPU(y, + fluid_t->mutable_data(platform::CPUPlace()), + size * sizeof(float)); + //} else { + // engine->GetOutputInGPU( + // y, fluid_t->mutable_data(platform::CUDAPlace()), + // size * sizeof(float)); + //} } cudaStreamSynchronize(*engine->stream()); diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 65e7b10625583327c2919efd45ef1fe01fa9eb3c..9e58a39eb0939fa15e9c19e1e6fc89a6f99d9a0c 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -333,8 +333,7 @@ function assert_api_not_changed() { python ${PADDLE_ROOT}/tools/diff_api.py ${PADDLE_ROOT}/paddle/fluid/API.spec new.spec deactivate - # Use git diff --name-only HEAD^ may not get file changes for update commits in one PR - API_CHANGE=`echo $CHANGED_FILES | grep "paddle/fluid/API.spec" || true` + API_CHANGE=`git diff --name-only upstream/develop | grep "paddle/fluid/API.spec" || true` echo "checking API.spec change, PR: ${GIT_PR_ID}, changes: ${API_CHANGE}" if [ ${API_CHANGE} ] && [ "${GIT_PR_ID}" != "" ]; then # TODO: curl -H 'Authorization: token ${TOKEN}' diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 56124663929d1e33b7144ab57ae3b3c55e1652b3..ab40d0c217f565493b30d9a4cb3a600863122bc7 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -166,7 +166,8 @@ def fc(input, param_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for learnable parameters/weights of this layer. bias_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for the bias - of this layer. If it is set to None, no bias will be added to the output units. + of this layer. If it is set to False, no bias will be added to the output units. + If it is set to None, the bias is initialized zero. Default: None. act (str, default None): Activation to be applied to the output of this layer. is_test(bool): A flag indicating whether execution is in test phase. 
use_mkldnn(bool): Use mkldnn kernel or not, it is valid only when the mkldnn diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 7fc8e106fb43666be9c1ea245994dc1c7ac85d7d..3fe99f55011ab7f745c3ad98ec44dfe277a13e05 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -324,7 +324,7 @@ class MomentumOptimizer(Optimizer): & if (use\_nesterov): - &\quad param = param - gradient * learning\_rate + mu * velocity * learning\_rate + &\quad param = param - (gradient + mu * velocity) * learning\_rate & else: diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 8c564abf986d351b31e993b62bfe1f17c52a4e10..322d76515e76c3d322ac7c4f989bbc95875cb654 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -48,6 +48,7 @@ list(REMOVE_ITEM TEST_OPS test_warpctc_op) list(REMOVE_ITEM TEST_OPS test_dist_train) list(REMOVE_ITEM TEST_OPS test_parallel_executor_crf) list(REMOVE_ITEM TEST_OPS test_parallel_executor_fetch_feed) +list(REMOVE_ITEM TEST_OPS test_dist_se_resnext) foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) endforeach(TEST_OP) @@ -60,3 +61,4 @@ if(WITH_DISTRIBUTE) endif() py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL) py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL) +py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext SERIAL) diff --git a/python/paddle/fluid/tests/unittests/dist_se_resnext.py b/python/paddle/fluid/tests/unittests/dist_se_resnext.py new file mode 100644 index 0000000000000000000000000000000000000000..72bc1729b0f63b23ad7ecb5ad703b984a4c614ac --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dist_se_resnext.py @@ -0,0 +1,350 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import argparse +import time +import math + +import paddle +import paddle.fluid as fluid +import paddle.fluid.profiler as profiler +from paddle.fluid import core +import unittest +from multiprocessing import Process +import os +import sys +import signal + +# Fix seed for test +fluid.default_startup_program().random_seed = 1 +fluid.default_main_program().random_seed = 1 + +train_parameters = { + "input_size": [3, 224, 224], + "input_mean": [0.485, 0.456, 0.406], + "input_std": [0.229, 0.224, 0.225], + "learning_strategy": { + "name": "piecewise_decay", + "epochs": [30, 60, 90], + "steps": [0.1, 0.01, 0.001, 0.0001] + } +} + + +class SE_ResNeXt(): + def __init__(self, layers=50): + self.params = train_parameters + self.layers = layers + + def net(self, input, class_dim=1000): + layers = self.layers + supported_layers = [50, 101, 152] + assert layers in supported_layers, \ + "supported layers are {} but input layer is {}".format(supported_layers, layers) + if layers == 50: + cardinality = 32 + reduction_ratio = 16 + depth = [3, 4, 6, 3] + num_filters = [128, 256, 512, 1024] + + conv = self.conv_bn_layer( + input=input, + num_filters=64, + filter_size=7, + stride=2, + act='relu') + conv = fluid.layers.pool2d( + input=conv, + pool_size=3, + pool_stride=2, + pool_padding=1, + pool_type='max') + elif layers == 101: + cardinality = 32 + reduction_ratio = 16 + depth = [3, 4, 23, 3] + num_filters = [128, 256, 
512, 1024] + + conv = self.conv_bn_layer( + input=input, + num_filters=64, + filter_size=7, + stride=2, + act='relu') + conv = fluid.layers.pool2d( + input=conv, + pool_size=3, + pool_stride=2, + pool_padding=1, + pool_type='max') + elif layers == 152: + cardinality = 64 + reduction_ratio = 16 + depth = [3, 8, 36, 3] + num_filters = [128, 256, 512, 1024] + + conv = self.conv_bn_layer( + input=input, + num_filters=64, + filter_size=3, + stride=2, + act='relu') + conv = self.conv_bn_layer( + input=conv, num_filters=64, filter_size=3, stride=1, act='relu') + conv = self.conv_bn_layer( + input=conv, + num_filters=128, + filter_size=3, + stride=1, + act='relu') + conv = fluid.layers.pool2d( + input=conv, pool_size=3, pool_stride=2, pool_padding=1, \ + pool_type='max') + + for block in range(len(depth)): + for i in range(depth[block]): + conv = self.bottleneck_block( + input=conv, + num_filters=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + cardinality=cardinality, + reduction_ratio=reduction_ratio) + + pool = fluid.layers.pool2d( + input=conv, pool_size=7, pool_type='avg', global_pooling=True) + drop = fluid.layers.dropout(x=pool, dropout_prob=0.2) + stdv = 1.0 / math.sqrt(drop.shape[1] * 1.0) + out = fluid.layers.fc(input=drop, size=class_dim, act='softmax') + return out + + def shortcut(self, input, ch_out, stride): + ch_in = input.shape[1] + if ch_in != ch_out or stride != 1: + filter_size = 1 + return self.conv_bn_layer(input, ch_out, filter_size, stride) + else: + return input + + def bottleneck_block(self, input, num_filters, stride, cardinality, + reduction_ratio): + conv0 = self.conv_bn_layer( + input=input, num_filters=num_filters, filter_size=1, act='relu') + conv1 = self.conv_bn_layer( + input=conv0, + num_filters=num_filters, + filter_size=3, + stride=stride, + groups=cardinality, + act='relu') + conv2 = self.conv_bn_layer( + input=conv1, num_filters=num_filters * 2, filter_size=1, act=None) + scale = self.squeeze_excitation( + 
input=conv2, + num_channels=num_filters * 2, + reduction_ratio=reduction_ratio) + + short = self.shortcut(input, num_filters * 2, stride) + + return fluid.layers.elementwise_add(x=short, y=scale, act='relu') + + def conv_bn_layer(self, + input, + num_filters, + filter_size, + stride=1, + groups=1, + act=None): + conv = fluid.layers.conv2d( + input=input, + num_filters=num_filters, + filter_size=filter_size, + stride=stride, + padding=(filter_size - 1) / 2, + groups=groups, + act=None, + bias_attr=False) + return fluid.layers.batch_norm(input=conv, act=act) + + def squeeze_excitation(self, input, num_channels, reduction_ratio): + pool = fluid.layers.pool2d( + input=input, pool_size=0, pool_type='avg', global_pooling=True) + stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0) + squeeze = fluid.layers.fc(input=pool, + size=num_channels / reduction_ratio, + act='relu') + stdv = 1.0 / math.sqrt(squeeze.shape[1] * 1.0) + excitation = fluid.layers.fc(input=squeeze, + size=num_channels, + act='sigmoid') + scale = fluid.layers.elementwise_mul(x=input, y=excitation, axis=0) + return scale + + +def get_model(batch_size): + # Input data + image = fluid.layers.fill_constant( + shape=[batch_size, 3, 224, 224], dtype='float32', value=0.0) + label = fluid.layers.fill_constant( + shape=[batch_size, 1], dtype='int64', value=0.0) + + # Train program + model = SE_ResNeXt(layers=50) + out = model.net(input=image, class_dim=102) + cost = fluid.layers.cross_entropy(input=out, label=label) + + avg_cost = fluid.layers.mean(x=cost) + acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1) + acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5) + + # Evaluator + test_program = fluid.default_main_program().clone(for_test=True) + + # Optimization + total_images = 6149 # flowers + epochs = [30, 60, 90] + step = int(total_images / batch_size + 1) + + bd = [step * e for e in epochs] + base_lr = 0.1 + lr = [] + lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)] + + optimizer = 
fluid.optimizer.Momentum( + learning_rate=fluid.layers.piecewise_decay( + boundaries=bd, values=lr), + momentum=0.9, + regularization=fluid.regularizer.L2Decay(1e-4)) + optimizer.minimize(avg_cost) + + # Reader + train_reader = paddle.batch( + paddle.dataset.flowers.train(), batch_size=batch_size) + test_reader = paddle.batch( + paddle.dataset.flowers.test(), batch_size=batch_size) + + return test_program, avg_cost, train_reader, test_reader, acc_top1, out + + +def get_transpiler(trainer_id, main_program, pserver_endpoints, trainers): + t = fluid.DistributeTranspiler() + t.transpile( + trainer_id=trainer_id, + program=main_program, + pservers=pserver_endpoints, + trainers=trainers) + return t + + +class DistSeResneXt2x2: + def run_pserver(self, pserver_endpoints, trainers, current_endpoint, + trainer_id): + get_model(batch_size=2) + t = get_transpiler(trainer_id, + fluid.default_main_program(), pserver_endpoints, + trainers) + pserver_prog = t.get_pserver_program(current_endpoint) + startup_prog = t.get_startup_program(current_endpoint, pserver_prog) + + place = fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(startup_prog) + exe.run(pserver_prog) + + def _wait_ps_ready(self, pid): + retry_times = 20 + while True: + assert retry_times >= 0, "wait ps ready failed" + time.sleep(3) + print("waiting ps ready: ", pid) + try: + # the listen_and_serv_op would touch a file which contains the listen port + # on the /tmp directory until it was ready to process all the RPC call. 
+ os.stat("/tmp/paddle.%d.port" % pid) + return + except os.error: + retry_times -= 1 + + def run_trainer(self, place, endpoints, trainer_id, trainers, is_dist=True): + test_program, avg_cost, train_reader, test_reader, batch_acc, predict = get_model( + batch_size=20) + if is_dist: + t = get_transpiler(trainer_id, + fluid.default_main_program(), endpoints, + trainers) + trainer_prog = t.get_trainer_program() + else: + trainer_prog = fluid.default_main_program() + + startup_exe = fluid.Executor(place) + startup_exe.run(fluid.default_startup_program()) + + strategy = fluid.ExecutionStrategy() + strategy.num_threads = 1 + strategy.allow_op_delay = False + exe = fluid.ParallelExecutor( + True, + loss_name=avg_cost.name, + exec_strategy=strategy, + num_trainers=trainers, + trainer_id=trainer_id) + + feed_var_list = [ + var for var in trainer_prog.global_block().vars.itervalues() + if var.is_data + ] + + feeder = fluid.DataFeeder(feed_var_list, place) + reader_generator = train_reader() + first_loss, = exe.run(fetch_list=[avg_cost.name]) + print(first_loss) + for i in xrange(5): + loss, = exe.run(fetch_list=[avg_cost.name]) + last_loss, = exe.run(fetch_list=[avg_cost.name]) + print(last_loss) + + +def main(role="pserver", + endpoints="127.0.0.1:9123", + trainer_id=0, + current_endpoint="127.0.0.1:9123", + trainers=1, + is_dist=True): + model = DistSeResneXt2x2() + if role == "pserver": + model.run_pserver(endpoints, trainers, current_endpoint, trainer_id) + else: + p = fluid.CUDAPlace(0) if core.is_compiled_with_cuda( + ) else fluid.CPUPlace() + model.run_trainer(p, endpoints, trainer_id, trainers, is_dist) + + +if __name__ == "__main__": + if len(sys.argv) != 7: + print( + "Usage: python dist_se_resnext.py [pserver/trainer] [endpoints] [trainer_id] [current_endpoint] [trainers] [is_dist]" + ) + role = sys.argv[1] + endpoints = sys.argv[2] + trainer_id = int(sys.argv[3]) + current_endpoint = sys.argv[4] + trainers = int(sys.argv[5]) + is_dist = True if sys.argv[6] == 
"TRUE" else False + main( + role=role, + endpoints=endpoints, + trainer_id=trainer_id, + current_endpoint=current_endpoint, + trainers=trainers, + is_dist=is_dist) diff --git a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py new file mode 100644 index 0000000000000000000000000000000000000000..e3e7036f08cb88087ae45fe7d7c7565c102dab8a --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py @@ -0,0 +1,122 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import numpy as np +import argparse +import time +import math + +import unittest +import os +import signal +import subprocess + + +class TestDistSeResneXt2x2(unittest.TestCase): + def setUp(self): + self._trainers = 2 + self._pservers = 2 + self._ps_endpoints = "127.0.0.1:9123,127.0.0.1:9124" + self._python_interp = "python" + + def start_pserver(self): + ps0_ep, ps1_ep = self._ps_endpoints.split(",") + ps0_cmd = "%s dist_se_resnext.py pserver %s 0 %s %d TRUE" % \ + (self._python_interp, self._ps_endpoints, ps0_ep, self._trainers) + ps1_cmd = "%s dist_se_resnext.py pserver %s 0 %s %d TRUE" % \ + (self._python_interp, self._ps_endpoints, ps1_ep, self._trainers) + + ps0_proc = subprocess.Popen( + ps0_cmd.split(" "), stdout=subprocess.PIPE, stderr=subprocess.PIPE) + ps1_proc = subprocess.Popen( + ps1_cmd.split(" "), stdout=subprocess.PIPE, stderr=subprocess.PIPE) + return ps0_proc, ps1_proc + + def _wait_ps_ready(self, pid): + retry_times = 20 + while True: + assert retry_times >= 0, "wait ps ready failed" + time.sleep(3) + try: + # the listen_and_serv_op would touch a file which contains the listen port + # on the /tmp directory until it was ready to process all the RPC call. 
+ os.stat("/tmp/paddle.%d.port" % pid) + return + except os.error: + retry_times -= 1 + + def non_test_with_place(self): + # *ATTENTION* THIS TEST NEEDS AT LEAST 2GPUS TO RUN + required_envs = { + "PATH": os.getenv("PATH"), + "PYTHONPATH": os.getenv("PYTHONPATH"), + "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH"), + "FLAGS_fraction_of_gpu_memory_to_use": "0.15" + } + # Run local to get a base line + env_local = {"CUDA_VISIBLE_DEVICES": "0"} + env_local.update(required_envs) + local_cmd = "%s dist_se_resnext.py trainer %s 0 %s %d FALSE" % \ + (self._python_interp, "127.0.0.1:1234", "127.0.0.1:1234", 1) + local_proc = subprocess.Popen( + local_cmd.split(" "), stdout=subprocess.PIPE, env=env_local) + local_proc.wait() + local_ret = local_proc.stdout.read() + + # Run dist train to compare with local results + ps0, ps1 = self.start_pserver() + self._wait_ps_ready(ps0.pid) + self._wait_ps_ready(ps1.pid) + + ps0_ep, ps1_ep = self._ps_endpoints.split(",") + tr0_cmd = "%s dist_se_resnext.py trainer %s 0 %s %d TRUE" % \ + (self._python_interp, self._ps_endpoints, ps0_ep, self._trainers) + tr1_cmd = "%s dist_se_resnext.py trainer %s 1 %s %d TRUE" % \ + (self._python_interp, self._ps_endpoints, ps1_ep, self._trainers) + + env0 = {"CUDA_VISIBLE_DEVICES": "0"} + env1 = {"CUDA_VISIBLE_DEVICES": "1"} + env0.update(required_envs) + env1.update(required_envs) + FNULL = open(os.devnull, 'w') + + tr0_proc = subprocess.Popen( + tr0_cmd.split(" "), stdout=subprocess.PIPE, stderr=FNULL, env=env0) + tr1_proc = subprocess.Popen( + tr1_cmd.split(" "), stdout=subprocess.PIPE, stderr=FNULL, env=env1) + + tr0_proc.wait() + tr1_proc.wait() + loss_data0 = tr0_proc.stdout.read() + lines = loss_data0.split("\n") + dist_first_loss = eval(lines[0].replace(" ", ","))[0] + dist_last_loss = eval(lines[1].replace(" ", ","))[0] + + local_lines = local_ret.split("\n") + local_first_loss = eval(local_lines[0])[0] + local_last_loss = eval(local_lines[1])[0] + + self.assertAlmostEqual(local_first_loss,
dist_first_loss) + self.assertAlmostEqual(local_last_loss, dist_last_loss) + + # check tr0_out + # FIXME: ensure the server process is killed + # replace with ps0.terminate() + os.kill(ps0.pid, signal.SIGKILL) + os.kill(ps1.pid, signal.SIGKILL) + FNULL.close() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_momentum_op.py b/python/paddle/fluid/tests/unittests/test_momentum_op.py index aaea9c1809213c5707e8540eebbdd6f269836fdc..c75d3bd276a5b494090c1aa1fea0bb4f2c067173 100644 --- a/python/paddle/fluid/tests/unittests/test_momentum_op.py +++ b/python/paddle/fluid/tests/unittests/test_momentum_op.py @@ -39,7 +39,7 @@ class TestMomentumOp1(OpTest): velocity_out = mu * velocity + grad if use_nesterov: - param_out = param - grad * learning_rate + \ + param_out = param - grad * learning_rate - \ velocity_out * mu * learning_rate else: param_out = param - learning_rate * velocity_out @@ -75,7 +75,7 @@ class TestMomentumOp2(OpTest): velocity_out = mu * velocity + grad if use_nesterov: - param_out = param - grad * learning_rate + \ + param_out = param - grad * learning_rate - \ velocity_out * mu * learning_rate else: param_out = param - learning_rate * velocity_out