Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into update_py_reader

ea8a375f · fengjiayi · 060f4217 · 7c85a977 · ea8a375f · ea8a375f
47 changed file
--- a/benchmark/paddle/image/run.sh
+++ b/benchmark/paddle/image/run.sh
+#!/bin/bash
 set -e
 function train() {

--- a/benchmark/paddle/image/run_mkl_infer.sh
+++ b/benchmark/paddle/image/run_mkl_infer.sh
+#!/bin/bash
 set -e
 function clock_to_seconds() {

--- a/benchmark/paddle/image/run_mkl_train.sh
+++ b/benchmark/paddle/image/run_mkl_train.sh
+#!/bin/bash
 set -e
 function train() {

--- a/benchmark/paddle/image/run_openblas_infer.sh
+++ b/benchmark/paddle/image/run_openblas_infer.sh
+#!/bin/bash
 set -e
 function clock_to_seconds() {

--- a/benchmark/paddle/image/run_openblas_train.sh
+++ b/benchmark/paddle/image/run_openblas_train.sh
+#!/bin/bash
 set -e
 function train() {

--- a/benchmark/paddle/rnn/run.sh
+++ b/benchmark/paddle/rnn/run.sh
+#!/bin/bash
 set -e
 function train() {

--- a/benchmark/tensorflow/image/run.sh
+++ b/benchmark/tensorflow/image/run.sh
+#!/bin/bash
 set -e
 function test() {

--- a/benchmark/tensorflow/image/run_multi.sh
+++ b/benchmark/tensorflow/image/run_multi.sh
+#!/bin/bash
 set -e
 function test() {

--- a/benchmark/tensorflow/rnn/run.sh
+++ b/benchmark/tensorflow/rnn/run.sh
+#!/bin/bash
 set -e
 function test() {

--- a/benchmark/tensorflow/rnn/run_multi.sh
+++ b/benchmark/tensorflow/rnn/run_multi.sh
+#!/bin/bash
 set -e
 function test() {

--- a/paddle/fluid/inference/analysis/analyzer.cc
+++ b/paddle/fluid/inference/analysis/analyzer.cc
@@ -22,8 +22,6 @@
 #include "paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h"
 namespace paddle {
-namespace inference {
-namespace analysis {
 DEFINE_bool(inference_analysis_enable_tensorrt_subgraph_engine, false,
            "Enable subgraph to TensorRT engine for acceleration");
@@ -31,6 +29,9 @@ DEFINE_bool(inference_analysis_enable_tensorrt_subgraph_engine, false,
 DEFINE_string(inference_analysis_graphviz_log_root, "./",
              "Graphviz debuger for data flow graphs.");
+namespace inference {
+namespace analysis {
 class DfgPassManagerImpl final : public DfgPassManager {
 public:
  DfgPassManagerImpl() {

--- a/paddle/fluid/inference/analysis/analyzer.h
+++ b/paddle/fluid/inference/analysis/analyzer.h
@@ -45,14 +45,15 @@ limitations under the License. */
 #include "paddle/fluid/inference/analysis/pass_manager.h"
 namespace paddle {
-namespace inference {
-namespace analysis {
 // TODO(Superjomn) add a definition flag like PADDLE_WITH_TENSORRT and hide this
 // flag if not available.
 DECLARE_bool(inference_analysis_enable_tensorrt_subgraph_engine);
 DECLARE_string(inference_analysis_graphviz_log_root);
+namespace inference {
+namespace analysis {
 class Analyzer : public OrderedRegistry<PassManager> {
 public:
  // Register all the pass-managers.

--- a/paddle/fluid/inference/analysis/analyzer_tester.cc
+++ b/paddle/fluid/inference/analysis/analyzer_tester.cc
@@ -13,13 +13,21 @@
 // limitations under the License.
 #include "paddle/fluid/inference/analysis/analyzer.h"
+#include <google/protobuf/text_format.h>
 #include "paddle/fluid/inference/analysis/ut_helper.h"
 namespace paddle {
 namespace inference {
 namespace analysis {
-TEST_F(DFG_Tester, main) {
+TEST_F(DFG_Tester, analysis_without_tensorrt) {
+  FLAGS_inference_analysis_enable_tensorrt_subgraph_engine = false;
+  Analyzer analyser;
+  analyser.Run(&argument);
+}
+TEST_F(DFG_Tester, analysis_with_tensorrt) {
+  FLAGS_inference_analysis_enable_tensorrt_subgraph_engine = true;
  Analyzer analyser;
  analyser.Run(&argument);
 }

--- a/paddle/fluid/inference/analysis/data_flow_graph.cc
+++ b/paddle/fluid/inference/analysis/data_flow_graph.cc
@@ -222,10 +222,19 @@ Node *GraphTraits<DataFlowGraph>::NodesDFSIterator::operator->() {
  return stack_.top();
 }
+inline bool CheckNodeIndegreeEquals(const Node &node, size_t n) {
+  return node.inlinks.size() == n;
+}
 GraphTraits<DataFlowGraph>::NodesTSIterator::NodesTSIterator(
    const std::vector<Node *> &source) {
  PADDLE_ENFORCE(!source.empty(),
                 "Start points of topological sorting should not be empty!");
+  // CHECK all the inputs' in-degree is 0
+  for (auto *node : source) {
+    PADDLE_ENFORCE(CheckNodeIndegreeEquals(*node, 0));
+  }
  std::unordered_set<Node *> visited;
  std::unordered_set<Node *> to_visit{source.begin(), source.end()};
@@ -233,6 +242,11 @@ GraphTraits<DataFlowGraph>::NodesTSIterator::NodesTSIterator(
  while (!to_visit.empty()) {
    std::vector<Node *> queue(to_visit.begin(), to_visit.end());
    for (auto *p : queue) {
+      if (p->deleted()) {
+        visited.insert(p);
+        to_visit.erase(p);
+        continue;
+      }
      inlink_visited.clear();
      std::copy_if(p->inlinks.begin(), p->inlinks.end(),
@@ -292,6 +306,37 @@ Node *GraphTraits<DataFlowGraph>::NodesTSIterator::operator->() {
  return sorted_[cursor_];
 }
+std::pair<std::vector<Node *>, std::vector<Node *>>
+ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph) {  // NOLINT
+  std::unordered_set<Node *> nodes(graph.begin(), graph.end());
+  std::unordered_set<Node *> inputs;
+  std::unordered_set<Node *> outputs;
+  // Input a Value, check whether its inlink is in the subgraph.
+  auto inlink_in_subgraph = [&](Node *n) {
+    for (auto *in : n->inlinks) {
+      if (nodes.count(in)) return true;
+    }
+    return false;
+  };
+  for (auto &node : graph) {
+    for (auto *in : node->inlinks) {
+      // The Value that is written by nodes inside a sub-graph shouldn't be the
+      // input of the sub-graph.
+      if (!nodes.count(in) && in->type() == Node::Type::kValue &&
+          !inlink_in_subgraph(in)) {
+        inputs.insert(in);
+      }
+    }
+    for (auto *out : node->outlinks) {
+      if (!nodes.count(out) && out->type() == Node::Type::kValue) {
+        outputs.insert(out);
+      }
+    }
+  }
+  return std::make_pair(std::vector<Node *>(inputs.begin(), inputs.end()),
+                        std::vector<Node *>(outputs.begin(), outputs.end()));
+}
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
--- a/paddle/fluid/inference/analysis/data_flow_graph.h
+++ b/paddle/fluid/inference/analysis/data_flow_graph.h
@@ -133,7 +133,7 @@ struct GraphTraits<DataFlowGraph> {
   private:
    std::vector<Node *> sorted_;
-    int cursor_{0};
+    size_t cursor_{0};
  };
  explicit GraphTraits(DataFlowGraph *graph) : graph_(graph) {}
@@ -173,36 +173,8 @@ struct GraphTraits<DataFlowGraph> {
 // Extract the inputs and outputs of a graph. The inputs and outputs of a
 // sub-graph is the inputs nodes and output nodes that doesn't inside the
 // sub-graph.
-static std::pair<std::vector<Node *>, std::vector<Node *>>
+std::pair<std::vector<Node *>, std::vector<Node *>>
-ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph) {  // NOLINT
+ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph);
-  std::unordered_set<Node *> nodes(graph.begin(), graph.end());
-  std::unordered_set<Node *> inputs;
-  std::unordered_set<Node *> outputs;
-  // Input a Value, check whether its inlink is in the subgraph.
-  auto inlink_in_subgraph = [&](Node *n) {
-    for (auto *in : n->inlinks) {
-      if (nodes.count(in)) return true;
-    }
-    return false;
-  };
-  for (auto &node : graph) {
-    for (auto *in : node->inlinks) {
-      // The Value that is written by nodes inside a sub-graph shouldn't be the
-      // input of the sub-graph.
-      if (!nodes.count(in) && in->type() == Node::Type::kValue &&
-          !inlink_in_subgraph(in)) {
-        inputs.insert(in);
-      }
-    }
-    for (auto *out : node->outlinks) {
-      if (!nodes.count(out) && out->type() == Node::Type::kValue) {
-        outputs.insert(out);
-      }
-    }
-  }
-  return std::make_pair(std::vector<Node *>(inputs.begin(), inputs.end()),
-                        std::vector<Node *>(outputs.begin(), outputs.end()));
-}
 }  // namespace analysis
 }  // namespace inference

--- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
+++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
@@ -22,14 +22,18 @@
 namespace paddle {
 namespace inference {
+DEFINE_int32(tensorrt_max_batchsize, 300, "TensorRT maximum batch size");
+DEFINE_int32(tensorrt_workspace_size, 2048, "TensorRT workspace size");
 namespace analysis {
 using framework::proto::ProgramDesc;
 std::vector<std::string> ExtractParameters(
-    const std::vector<std::unique_ptr<Node>>& nodes);
+    const std::vector<std::unique_ptr<Node>> &nodes);
-bool DataFlowGraphToFluidPass::Initialize(Argument* argument) {
+bool DataFlowGraphToFluidPass::Initialize(Argument *argument) {
  ANALYSIS_ARGUMENT_CHECK_FIELD(argument)
  ANALYSIS_ARGUMENT_CHECK_FIELD(argument->origin_program_desc)
  PADDLE_ENFORCE(!argument->transformed_program_desc);
@@ -47,76 +51,77 @@ bool DataFlowGraphToFluidPass::Initialize(Argument* argument) {
 bool DataFlowGraphToFluidPass::Finalize() { return true; }
-void DataFlowGraphToFluidPass::Run(DataFlowGraph* graph) {
+void DataFlowGraphToFluidPass::Run(DataFlowGraph *graph) {
-  auto traits = GraphTraits<DataFlowGraph>(graph);
+  LOG(INFO) << "graph.inputs " << graph->inputs.size();
-  for (auto it = traits.nodes().begin(); it != traits.nodes().end(); ++it) {
+  for (auto &node : GraphTraits<DataFlowGraph>(graph).nodes_in_TS()) {
-    if (it->deleted()) continue;
+    if (node.deleted()) continue;
-    switch (it->type()) {
+    switch (node.type()) {
      case Node::Type::kFunction: {
-        LOG(INFO) << "add function " << it->repr();
+        LOG(INFO) << "add function " << node.repr();
-        AddFluidOp(&(*it));
+        AddFluidOp(&node);
      } break;
      case Node::Type::kFunctionBlock: {
-        LOG(INFO) << "add engine op " << it->repr() << " , "
+        LOG(INFO) << "add engine op " << node.repr() << " , "
-                  << static_cast<FunctionBlock*>(&(*it))->subgraph.size();
+                  << static_cast<FunctionBlock *>(&node)->subgraph.size();
-        AddEngineOp(&(*it));
+        AddEngineOp(&node);
      } break;
      default:
        continue;
    }
  }
+  PADDLE_ENFORCE(argument_->transformed_program_desc.get());
 }
-void DataFlowGraphToFluidPass::AddFluidOp(Node* node) {
+void DataFlowGraphToFluidPass::AddFluidOp(Node *node) {
-  auto* ori_op = static_cast<framework::proto::OpDesc*>(node->pb_desc());
+  auto *ori_op = static_cast<framework::proto::OpDesc *>(node->pb_desc());
  // currently only the main block is analyzed.
-  auto* main_block = desc_->mutable_blocks(framework::kRootBlockIndex);
+  auto *main_block = desc_->mutable_blocks(framework::kRootBlockIndex);
-  auto* op = main_block->add_ops();
+  auto *op = main_block->add_ops();
  *op = *ori_op;  // copy the attributes, by default, these will not be changed
-                  // by analysis phrase.
+  // by analysis phrase.
  // The inputs and outputs of the existing ops are not changed by tensorrt
  // subgraph pass.
  // NOTE It might be changed by other passes in the long run.
 }
-void CreateTrtEngineOp(Node* node, const DataFlowGraph& graph,
+void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph,
-                       const framework::proto::BlockDesc& block) {
+                       const framework::proto::BlockDesc &block) {
  static int counter{0};
  PADDLE_ENFORCE(node->IsFunctionBlock());
  framework::OpDesc desc;
-  auto* func = static_cast<FunctionBlock*>(node);
+  auto *func = static_cast<FunctionBlock *>(node);
  // collect inputs
  std::vector<std::string> io;
-  for (auto* x : func->inlinks) {
+  for (auto *x : func->inlinks) {
    io.push_back(x->name());
  }
  desc.SetInput("Xs", io);
  // collect outputs
  io.clear();
-  for (auto* x : func->outlinks) {
+  for (auto *x : func->outlinks) {
    io.push_back(x->name());
  }
  desc.SetOutput("Ys", io);
  desc.SetType("tensorrt_engine");
+  PADDLE_ENFORCE(!block.vars().empty(), "the block has no var-desc");
  // Set attrs
  SetAttr(desc.Proto(), "subgraph", block.SerializeAsString());
-  SetAttr(desc.Proto(), "engine_unique_key",
+  SetAttr(desc.Proto(), "engine_uniq_key", "trt-" + std::to_string(counter++));
-          "trt-" + std::to_string(counter++));
+  SetAttr(desc.Proto(), "max_batch", FLAGS_tensorrt_max_batchsize);
-  SetAttr(desc.Proto(), "max_batch", 100);  // TODO(Superjomn) add config latter
+  SetAttr(desc.Proto(), "max_workspace", FLAGS_tensorrt_workspace_size);
-  SetAttr(desc.Proto(), "max_workspace",
-          1024);  // TODO(Superjomn) add config latter
  SetAttr(desc.Proto(), "parameters", ExtractParameters(graph.nodes.nodes()));
  node->SetPbMsg(desc.Proto()->SerializeAsString());
 }
 std::vector<std::string> ExtractParameters(
-    const std::vector<std::unique_ptr<Node>>& nodes) {
+    const std::vector<std::unique_ptr<Node>> &nodes) {
  std::vector<std::string> parameters;
-  for (const auto& node : nodes) {
+  for (const auto &node : nodes) {
    if (!node->IsValue()) continue;
    PADDLE_ENFORCE(!node->pb_msg().empty(), "pb_msg should be set first");
    framework::proto::VarDesc var;
@@ -128,21 +133,30 @@ std::vector<std::string> ExtractParameters(
  return parameters;
 }
-void DataFlowGraphToFluidPass::AddEngineOp(Node* node) {
+void DataFlowGraphToFluidPass::AddEngineOp(Node *node) {
  // TODO(Superjomn) Here need to expose some arguments for default setting.
  PADDLE_ENFORCE(node->IsFunctionBlock());
-  auto* block_node = static_cast<FunctionBlock*>(node);
+  auto *block_node = static_cast<FunctionBlock *>(node);
  framework::proto::BlockDesc proto;
  framework::BlockDesc block_desc(nullptr, &proto);
+  block_desc.Proto()->set_parent_idx(-1);
+  block_desc.Proto()->set_idx(0);
+  LOG(INFO) << "origin variable size: "
+            << argument_->origin_program_desc->blocks(0).vars().size();
+  LOG(INFO) << "transformed variable size: "
+            << block_desc.Proto()->vars().size();
  // copy ops.
-  for (auto* node : block_node->subgraph) {
+  for (auto *node : block_node->subgraph) {
-    auto* op = block_desc.AppendOp();
+    auto *op = block_desc.AppendOp();
    PADDLE_ENFORCE(!node->pb_msg().empty());
    op->Proto()->ParseFromString(node->pb_msg());
  }
+  *block_desc.Proto()->mutable_vars() =
+      argument_->origin_program_desc->blocks(0).vars();
+  PADDLE_ENFORCE(!block_desc.Proto()->vars().empty());
  CreateTrtEngineOp(node, *argument_->main_dfg, *block_desc.Proto());
-  auto* main_block = desc_->mutable_blocks(framework::kRootBlockIndex);
+  auto *main_block = desc_->mutable_blocks(framework::kRootBlockIndex);
-  auto* op = main_block->add_ops();
+  auto *op = main_block->add_ops();
  PADDLE_ENFORCE(!node->pb_msg().empty(), "failed to set desc for block");
  op->ParseFromString(node->pb_msg());
 }
@@ -151,7 +165,7 @@ namespace {
 class DFG_DebuggerPass : public DFG_GraphvizDrawPass {
 public:
  using Config = DFG_GraphvizDrawPass::Config;
-  explicit DFG_DebuggerPass(const Config& config)
+  explicit DFG_DebuggerPass(const Config &config)
      : DFG_GraphvizDrawPass(config) {}
  std::string repr() const override { return "dfg-to-fluid-debuger-pass"; }
@@ -160,7 +174,7 @@ class DFG_DebuggerPass : public DFG_GraphvizDrawPass {
 };
 }  // namespace
-Pass* DataFlowGraphToFluidPass::CreateGraphvizDebugerPass() const {
+Pass *DataFlowGraphToFluidPass::CreateGraphvizDebugerPass() const {
  return new DFG_DebuggerPass(DFG_GraphvizDrawPass::Config(
      FLAGS_inference_analysis_graphviz_log_root,
      "data_flow_graph_to_fluid_graphviz_debugger"));

--- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h
+++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h
@@ -26,6 +26,10 @@
 namespace paddle {
 namespace inference {
+DECLARE_int32(tensorrt_max_batchsize);
+DECLARE_int32(tensorrt_workspace_size);
 namespace analysis {
 class DataFlowGraphToFluidPass final : public DataFlowGraphPass {
 public:

--- a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc
+++ b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc
@@ -40,7 +40,7 @@ TEST_F(DFG_Tester, dfg_graphviz_draw_pass_tester) {
    no++;
  }
  // DFG is sensitive to ProgramDesc, be careful to change the existing models.
-  ASSERT_EQ(no, 82);
+  ASSERT_EQ(no, 83);
 }
 }  // namespace analysis

--- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc
+++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc
@@ -28,7 +28,6 @@ bool FluidToDataFlowGraphPass::Initialize(Argument *argument) {
  ANALYSIS_ARGUMENT_CHECK_FIELD(argument->origin_program_desc);
  PADDLE_ENFORCE(argument);
  if (!argument->main_dfg) {
-    LOG(INFO) << "Init DFG";
    argument->main_dfg.reset(new DataFlowGraph);
  }
  desc_ = argument->origin_program_desc.get();
@@ -51,6 +50,7 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) {
    v->SetPbMsg(var.SerializeAsString());
    var2id[var.name()] = v->id();
  }
  for (int i = 0; i < main_block.ops_size(); i++) {
    const auto &op = main_block.ops(i);
    auto *o = graph->nodes.Create(Node::Type::kFunction);
@@ -62,19 +62,31 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) {
    o->SetPbMsg(op.SerializeAsString());
    // set inputs and outputs
-    // TODO(Superjomn) make sure the InputNames is the real variable name.
+    std::unordered_set<Node *> inlinks;
    for (int j = 0; j < op.inputs_size(); j++) {
      auto &in_var = op.inputs(j);
      for (int k = 0; k < in_var.arguments_size(); k++) {
        auto *in = graph->nodes.GetMutable(var2id.at(in_var.arguments(k)));
        in->outlinks.push_back(o);
        o->inlinks.push_back(in);
+        inlinks.insert(in);
      }
    }
    for (int j = 0; j < op.outputs_size(); j++) {
      auto &out_var = op.outputs(j);
      for (int k = 0; k < out_var.arguments_size(); k++) {
        auto *out = graph->nodes.GetMutable(var2id[out_var.arguments(k)]);
+        if (inlinks.count(out)) {
+          // Loop found, for example, a = op(a), use SSA, change to a1 = op(a).
+          auto *out_alias = graph->nodes.Create(Node::Type::kValue);
+          out_alias->SetName(out->name());
+          out_alias->SetPbDesc(out->pb_desc());
+          out_alias->SetPbMsg(out->pb_msg());
+          var2id[out_alias->name()] = out_alias->id();  // update a -> a0
+          LOG(INFO) << "loop found in graph, create SSA alias node ["
+                    << out_alias->repr() << "] for [" << out->repr() << "]";
+          out = out_alias;
+        }
        out->inlinks.push_back(o);
        o->outlinks.push_back(out);
      }

--- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc
+++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc
@@ -24,12 +24,12 @@ namespace analysis {
 TEST_F(DFG_Tester, Init) {
  FluidToDataFlowGraphPass pass;
  pass.Initialize(&argument);
-  DataFlowGraph graph;
+  pass.Run(argument.main_dfg.get());
-  pass.Run(&graph);
  // Analysis is sensitive to ProgramDesc, careful to change the original model.
-  ASSERT_EQ(graph.nodes.size(), 37UL);
+  ASSERT_EQ(argument.main_dfg->nodes.size(), 38UL);
  pass.Finalize();
-  LOG(INFO) << '\n' << graph.DotString();
+  ASSERT_FALSE(argument.main_dfg->DotString().empty());
+  EXPECT_FALSE(argument.main_dfg->inputs.empty());
 }
 }  // namespace analysis

--- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc
@@ -25,6 +25,9 @@ TensorRTSubGraphPass::TensorRTSubGraphPass(
 void TensorRTSubGraphPass::Run(DataFlowGraph *graph) {
  SubGraphFuse(graph, node_inside_subgraph_teller_)();
+  VLOG(4) << "debug info "
+          << graph->HumanReadableInfo(false /*show_values*/,
+                                      true /*show_functions*/);
 }
 }  // namespace analysis

--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
@@ -82,7 +82,7 @@ inference_api_test(test_api_impl
 if(WITH_GPU AND TENSORRT_FOUND)
 cc_library(paddle_inference_tensorrt_subgraph_engine
        SRCS api_tensorrt_subgraph_engine.cc
-        DEPS paddle_inference_api analysis tensorrt_engine paddle_fluid_api)
+        DEPS paddle_inference_api analysis tensorrt_engine paddle_inference_api paddle_fluid_api tensorrt_converter)
 inference_api_test(test_api_tensorrt_subgraph_engine ARGS test_word2vec)
 endif()

--- a/paddle/fluid/inference/api/api_anakin_engine.cc
+++ b/paddle/fluid/inference/api/api_anakin_engine.cc
@@ -39,7 +39,7 @@ bool PaddleInferenceAnakinPredictor::Init(const AnakinConfig &config) {
 bool PaddleInferenceAnakinPredictor::Run(
    const std::vector<PaddleTensor> &inputs,
-    std::vector<PaddleTensor> *output_data) {
+    std::vector<PaddleTensor> *output_data, int batch_size) {
  for (const auto &input : inputs) {
    if (input.dtype != PaddleDType::FLOAT32) {
      LOG(ERROR) << "Only support float type inputs. " << input.name

--- a/paddle/fluid/inference/api/api_anakin_engine.h
+++ b/paddle/fluid/inference/api/api_anakin_engine.h
@@ -37,7 +37,8 @@ class PaddleInferenceAnakinPredictor : public PaddlePredictor {
  // NOTE Unlike the native engine, the buffers of anakin engine's output_data
  // should be allocated first.
  bool Run(const std::vector<PaddleTensor>& inputs,
-           std::vector<PaddleTensor>* output_data) override;
+           std::vector<PaddleTensor>* output_data,
+           int batch_size = -1) override;
  std::unique_ptr<PaddlePredictor> Clone() override;

--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@@ -108,7 +108,8 @@ NativePaddlePredictor::~NativePaddlePredictor() {
 }
 bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
-                                std::vector<PaddleTensor> *output_data) {
+                                std::vector<PaddleTensor> *output_data,
+                                int batch_size) {
  VLOG(3) << "Predictor::predict";
  Timer timer;
  timer.tic();

--- a/paddle/fluid/inference/api/api_impl.h
+++ b/paddle/fluid/inference/api/api_impl.h
@@ -38,7 +38,8 @@ class NativePaddlePredictor : public PaddlePredictor {
  bool Init(std::shared_ptr<framework::Scope> parent_scope);
  bool Run(const std::vector<PaddleTensor> &inputs,
-           std::vector<PaddleTensor> *output_data) override;
+           std::vector<PaddleTensor> *output_data,
+           int batch_size = -1) override;
  std::unique_ptr<PaddlePredictor> Clone() override;

--- a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc
+++ b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc
@@ -16,6 +16,7 @@
 #include "paddle/fluid/inference/api/api_impl.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/inference/utils/singleton.h"
+#include "paddle/fluid/operators/tensorrt_engine_op.h"
 namespace paddle {
@@ -64,16 +65,7 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor {
      return false;
    }
-    // Analyze inference_program
+    OptimizeInferenceProgram();
-    Argument argument;
-    argument.origin_program_desc.reset(
-        new ProgramDesc(*inference_program_->Proto()));
-    Singleton<Analyzer>::Global().Run(&argument);
-    CHECK(argument.transformed_program_desc);
-    VLOG(5) << "transformed program:\n"
-            << argument.transformed_program_desc->SerializeAsString();
-    VLOG(5) << "to prepare executor";
-    *inference_program_->Proto() = *argument.transformed_program_desc;
    ctx_ = executor_->Prepare(*inference_program_, 0);
    VLOG(5) << "to create variables";
@@ -86,6 +78,29 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor {
    return true;
  }
+  bool Run(const std::vector<PaddleTensor>& inputs,
+           std::vector<PaddleTensor>* output_data,
+           int batch_size = -1) override {
+    PADDLE_ENFORCE_GT(batch_size, 0,
+                      "TensorRT engine needs the argument batch_size set");
+    FLAGS_tensorrt_engine_batch_size = batch_size;
+    return NativePaddlePredictor::Run(inputs, output_data, batch_size);
+  }
+  void OptimizeInferenceProgram() {
+    // Analyze inference_program
+    Argument argument;
+    argument.origin_program_desc.reset(
+        new ProgramDesc(*inference_program_->Proto()));
+    Singleton<Analyzer>::Global().Run(&argument);
+    CHECK(argument.transformed_program_desc);
+    VLOG(5) << "transformed program:\n"
+            << argument.transformed_program_desc->SerializeAsString();
+    VLOG(5) << "to prepare executor";
+    inference_program_.reset(
+        new framework::ProgramDesc(*argument.transformed_program_desc));
+  }
 private:
  TensorRTConfig config_;
 };

--- a/paddle/fluid/inference/api/paddle_inference_api.h
+++ b/paddle/fluid/inference/api/paddle_inference_api.h
@@ -98,7 +98,8 @@ class PaddlePredictor {
  // responsible for the output tensor's buffer, either allocated or passed from
  // outside.
  virtual bool Run(const std::vector<PaddleTensor>& inputs,
-                   std::vector<PaddleTensor>* output_data) = 0;
+                   std::vector<PaddleTensor>* output_data,
+                   int batch_size = -1) = 0;
  // Clone a predictor that share the model weights, the Cloned predictor should
  // be thread-safe.

--- a/paddle/fluid/inference/api/test_api.cc
+++ b/paddle/fluid/inference/api/test_api.cc
@@ -35,7 +35,8 @@ class DemoPredictor : public PaddlePredictor {
    LOG(INFO) << "I get other_config " << config.other_config;
  }
  bool Run(const std::vector<PaddleTensor> &inputs,
-           std::vector<PaddleTensor> *output_data) override {
+           std::vector<PaddleTensor> *output_data,
+           int batch_size = 0) override {
    LOG(INFO) << "Run";
    return false;
  }

--- a/paddle/fluid/inference/api/test_api_tensorrt_subgraph_engine.cc
+++ b/paddle/fluid/inference/api/test_api_tensorrt_subgraph_engine.cc
@@ -15,50 +15,79 @@
 #include <gflags/gflags.h>
 #include <glog/logging.h>
 #include <gtest/gtest.h>
+#include "paddle/fluid/inference/analysis/analyzer.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 namespace paddle {
 DEFINE_string(dirname, "", "Directory of the inference model.");
-void Main(bool use_gpu) {
+void CompareTensorRTWithFluid(bool enable_tensorrt) {
+  FLAGS_inference_analysis_enable_tensorrt_subgraph_engine = enable_tensorrt;
  //# 1. Create PaddlePredictor with a config.
-  TensorRTConfig config;
+  NativeConfig config0;
-  config.model_dir = FLAGS_dirname + "word2vec.inference.model";
+  config0.model_dir = FLAGS_dirname + "word2vec.inference.model";
-  config.use_gpu = use_gpu;
+  config0.use_gpu = true;
-  config.fraction_of_gpu_memory = 0.15;
+  config0.fraction_of_gpu_memory = 0.3;
-  config.device = 0;
+  config0.device = 0;
-  auto predictor =
+  TensorRTConfig config1;
+  config1.model_dir = FLAGS_dirname + "word2vec.inference.model";
+  config1.use_gpu = true;
+  config1.fraction_of_gpu_memory = 0.3;
+  config1.device = 0;
+  auto predictor0 =
+      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config0);
+  auto predictor1 =
      CreatePaddlePredictor<TensorRTConfig,
-                            PaddleEngineKind::kAutoMixedTensorRT>(config);
+                            PaddleEngineKind::kAutoMixedTensorRT>(config1);
-  for (int batch_id = 0; batch_id < 3; batch_id++) {
+  for (int batch_id = 0; batch_id < 1; batch_id++) {
    //# 2. Prepare input.
-    int64_t data[4] = {1, 2, 3, 4};
+    std::vector<int64_t> data(20);
+    for (int i = 0; i < 20; i++) data[i] = i;
-    PaddleTensor tensor{.name = "",
+    PaddleTensor tensor{
-                        .shape = std::vector<int>({4, 1}),
+        .name = "",
-                        .data = PaddleBuf(data, sizeof(data)),
+        .shape = std::vector<int>({10, 1}),
-                        .dtype = PaddleDType::INT64};
+        .data = PaddleBuf(data.data(), data.size() * sizeof(int64_t)),
+        .dtype = PaddleDType::INT64};
    // For simplicity, we set all the slots with the same data.
    std::vector<PaddleTensor> slots(4, tensor);
    //# 3. Run
-    std::vector<PaddleTensor> outputs;
+    std::vector<PaddleTensor> outputs0;
-    CHECK(predictor->Run(slots, &outputs));
+    std::vector<PaddleTensor> outputs1;
+    CHECK(predictor0->Run(slots, &outputs0));
+    CHECK(predictor1->Run(slots, &outputs1, 10));
    //# 4. Get output.
-    ASSERT_EQ(outputs.size(), 1UL);
+    ASSERT_EQ(outputs0.size(), 1UL);
-    LOG(INFO) << "output buffer size: " << outputs.front().data.length();
+    ASSERT_EQ(outputs1.size(), 1UL);
-    const size_t num_elements = outputs.front().data.length() / sizeof(float);
-    // The outputs' buffers are in CPU memory.
+    const size_t num_elements = outputs0.front().data.length() / sizeof(float);
-    for (size_t i = 0; i < std::min(5UL, num_elements); i++) {
+    const size_t num_elements1 = outputs1.front().data.length() / sizeof(float);
-      LOG(INFO) << static_cast<float*>(outputs.front().data.data())[i];
+    EXPECT_EQ(num_elements, num_elements1);
+    auto *data0 = static_cast<float *>(outputs0.front().data.data());
+    auto *data1 = static_cast<float *>(outputs1.front().data.data());
+    ASSERT_GT(num_elements, 0UL);
+    for (size_t i = 0; i < std::min(num_elements, num_elements1); i++) {
+      EXPECT_NEAR(data0[i], data1[i], 1e-3);
    }
  }
 }
-TEST(paddle_inference_api_tensorrt_subgraph_engine, main) { Main(true); }
+TEST(paddle_inference_api_tensorrt_subgraph_engine, without_tensorrt) {
+  CompareTensorRTWithFluid(false);
+}
+TEST(paddle_inference_api_tensorrt_subgraph_engine, with_tensorrt) {
+  CompareTensorRTWithFluid(true);
+}
 }  // namespace paddle
--- a/paddle/fluid/inference/tensorrt/convert/op_converter.h
+++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h
@@ -93,6 +93,10 @@ class OpConverter {
  framework::Scope* scope_{nullptr};
 };
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
 #define REGISTER_TRT_OP_CONVERTER(op_type__, Converter__)                      \
  struct trt_##op_type__##_converter : public ::paddle::framework::Registrar { \
    trt_##op_type__##_converter() {                                            \
@@ -111,7 +115,3 @@ class OpConverter {
  extern int TouchConverterRegister_##op_type__();                      \
  static int use_op_converter_trt_##op_type__ __attribute__((unused)) = \
      TouchConverterRegister_##op_type__();
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
--- a/paddle/fluid/inference/tensorrt/engine.cc
+++ b/paddle/fluid/inference/tensorrt/engine.cc
@@ -26,18 +26,20 @@ namespace paddle {
 namespace inference {
 namespace tensorrt {
-void TensorRTEngine::Build(const DescType& paddle_model) {
+void TensorRTEngine::Build(const DescType &paddle_model) {
  PADDLE_ENFORCE(false, "not implemented");
 }
 void TensorRTEngine::Execute(int batch_size) {
-  std::vector<void*> buffers;
+  batch_size_ = batch_size;
-  for (auto& buf : buffers_) {
+  std::vector<void *> buffers;
+  for (auto &buf : buffers_) {
    PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated");
    PADDLE_ENFORCE_GT(buf.max_size, 0);
    PADDLE_ENFORCE(buf.device == DeviceType::GPU);
    buffers.push_back(buf.buffer);
  }
+  PADDLE_ENFORCE_NOT_NULL(stream_);
  infer_context_->enqueue(batch_size, buffers.data(), *stream_, nullptr);
  cudaStreamSynchronize(*stream_);
 }
@@ -45,7 +47,7 @@ void TensorRTEngine::Execute(int batch_size) {
 TensorRTEngine::~TensorRTEngine() {
  cudaStreamSynchronize(*stream_);
  // clean buffer
-  for (auto& buf : buffers_) {
+  for (auto &buf : buffers_) {
    if (buf.device == DeviceType::GPU && buf.buffer != nullptr) {
      PADDLE_ENFORCE_EQ(0, cudaFree(buf.buffer));
      buf.buffer = nullptr;
@@ -70,32 +72,37 @@ void TensorRTEngine::FreezeNetwork() {
  // allocate GPU buffers.
  buffers_.resize(buffer_sizes_.size());
-  for (auto& item : buffer_sizes_) {
+  for (auto &item : buffer_sizes_) {
+    // The output buffers are not set in the network building phrase, need to
+    // infer from the TesorRT network.
    if (item.second == 0) {
      auto slot_offset = infer_engine_->getBindingIndex(item.first.c_str());
      auto dims = infer_engine_->getBindingDimensions(slot_offset);
      item.second = kDataTypeSize[static_cast<int>(
                        infer_engine_->getBindingDataType(slot_offset))] *
                    analysis::AccuDims(dims.d, dims.nbDims);
+      PADDLE_ENFORCE_GT(item.second, 0);
    }
-    auto& buf = buffer(item.first);
+    auto &buf = buffer(item.first);
+    buf.max_size = item.second * max_batch_;
    CHECK(buf.buffer == nullptr);  // buffer should be allocated only once.
-    PADDLE_ENFORCE_EQ(0, cudaMalloc(&buf.buffer, item.second));
+    PADDLE_ENFORCE_EQ(0, cudaMalloc(&buf.buffer, buf.max_size));
-    VLOG(4) << "buffer malloc " << item.first << " " << item.second << " "
+    PADDLE_ENFORCE_LE(buf.max_size, 1 << 30);  // 10G
-            << buf.buffer;
+    // buf.size will changed in the runtime.
-    buf.size = buf.max_size = item.second;
+    buf.size = 0;
    buf.device = DeviceType::GPU;
  }
 }
-nvinfer1::ITensor* TensorRTEngine::DeclareInput(const std::string& name,
+nvinfer1::ITensor *TensorRTEngine::DeclareInput(const std::string &name,
                                                nvinfer1::DataType dtype,
-                                                const nvinfer1::Dims& dims) {
+                                                const nvinfer1::Dims &dims) {
  PADDLE_ENFORCE_EQ(0, buffer_sizes_.count(name), "duplicate input name %s",
                    name);
  PADDLE_ENFORCE(infer_network_ != nullptr, "should initnetwork first");
-  auto* input = infer_network_->addInput(name.c_str(), dtype, dims);
+  auto *input = infer_network_->addInput(name.c_str(), dtype, dims);
  PADDLE_ENFORCE(input, "infer network add input %s failed", name);
  buffer_sizes_[name] = kDataTypeSize[static_cast<int>(dtype)] *
                        analysis::AccuDims(dims.d, dims.nbDims);
@@ -104,12 +111,12 @@ nvinfer1::ITensor* TensorRTEngine::DeclareInput(const std::string& name,
  return input;
 }
-void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer* layer, int offset,
+void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer *layer, int offset,
-                                   const std::string& name) {
+                                   const std::string &name) {
  PADDLE_ENFORCE_EQ(0, buffer_sizes_.count(name), "duplicate output name %s",
                    name);
-  auto* output = layer->getOutput(offset);
+  auto *output = layer->getOutput(offset);
  SetITensor(name, output);
  PADDLE_ENFORCE(output != nullptr);
  output->setName(name.c_str());
@@ -121,11 +128,11 @@ void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer* layer, int offset,
  buffer_sizes_[name] = 0;
 }
-void TensorRTEngine::DeclareOutput(const std::string& name) {
+void TensorRTEngine::DeclareOutput(const std::string &name) {
  PADDLE_ENFORCE_EQ(0, buffer_sizes_.count(name), "duplicate output name %s",
                    name);
-  auto* output = TensorRTEngine::GetITensor(name);
+  auto *output = TensorRTEngine::GetITensor(name);
  PADDLE_ENFORCE(output != nullptr);
  output->setName(name.c_str());
  PADDLE_ENFORCE(!output->isNetworkInput());
@@ -135,38 +142,45 @@ void TensorRTEngine::DeclareOutput(const std::string& name) {
  buffer_sizes_[name] = 0;
 }
-void* TensorRTEngine::GetOutputInGPU(const std::string& name) {
+void *TensorRTEngine::GetOutputInGPU(const std::string &name) {
  return buffer(name).buffer;
 }
-void TensorRTEngine::GetOutputInGPU(const std::string& name, void* dst,
+void TensorRTEngine::GetOutputInGPU(const std::string &name, void *dst,
                                    size_t max_size) {
  // determine data size
  auto it = buffer_sizes_.find(name);
  PADDLE_ENFORCE(it != buffer_sizes_.end());
  PADDLE_ENFORCE_GT(it->second, 0);
  PADDLE_ENFORCE_GE(max_size, it->second);
-  auto& buf = buffer(name);
+  auto &buf = buffer(name);
  PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before");
  PADDLE_ENFORCE_EQ(cudaMemcpyAsync(dst, buf.buffer, it->second,
                                    cudaMemcpyDeviceToDevice, *stream_),
                    0);
 }
-void TensorRTEngine::GetOutputInCPU(const std::string& name, void* dst,
+void TensorRTEngine::GetOutputInCPU(const std::string &name, void *dst,
                                    size_t max_size) {
+  VLOG(4) << "get output in cpu";
+  auto &buf = buffer(name);
+  // Update needed buffer size.
+  auto slot_offset = infer_engine_->getBindingIndex(name.c_str());
+  auto dims = infer_engine_->getBindingDimensions(slot_offset);
+  buf.size = kDataTypeSize[static_cast<int>(
+                 infer_engine_->getBindingDataType(slot_offset))] *
+             analysis::AccuDims(dims.d, dims.nbDims);
+  PADDLE_ENFORCE_LE(buf.size, buf.max_size);
  // determine data size
-  auto it = buffer_sizes_.find(name);
-  PADDLE_ENFORCE(it != buffer_sizes_.end());
-  PADDLE_ENFORCE_GT(it->second, 0);
-  PADDLE_ENFORCE_GE(max_size, it->second);
-  auto& buf = buffer(name);
  PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before");
-  PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(dst, buf.buffer, it->second,
+  // DEBUG
-                                       cudaMemcpyDeviceToHost, *stream_));
+  memset(dst, 0, buf.size);
+  PADDLE_ENFORCE_EQ(
+      0, cudaMemcpy(dst, buf.buffer, buf.size, cudaMemcpyDeviceToHost));
 }
-Buffer& TensorRTEngine::buffer(const std::string& name) {
+Buffer &TensorRTEngine::buffer(const std::string &name) {
  PADDLE_ENFORCE(infer_engine_ != nullptr, "call FreezeNetwork first.");
  auto it = buffer_sizes_.find(name);
  PADDLE_ENFORCE(it != buffer_sizes_.end());
@@ -174,19 +188,23 @@ Buffer& TensorRTEngine::buffer(const std::string& name) {
  return buffers_[slot_offset];
 }
-void TensorRTEngine::SetInputFromCPU(const std::string& name, const void* data,
+void TensorRTEngine::SetInputFromCPU(const std::string &name, const void *data,
                                     size_t size) {
-  auto& buf = buffer(name);
+  auto &buf = buffer(name);
  PADDLE_ENFORCE_NOT_NULL(buf.buffer);
+  PADDLE_ENFORCE_NOT_NULL(data);
+  PADDLE_ENFORCE_NOT_NULL(stream_);
  PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small");
  PADDLE_ENFORCE(buf.device == DeviceType::GPU);
+  buf.size = size;
  PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(buf.buffer, data, size,
                                       cudaMemcpyHostToDevice, *stream_));
 }
-void TensorRTEngine::SetInputFromGPU(const std::string& name, const void* data,
+void TensorRTEngine::SetInputFromGPU(const std::string &name, const void *data,
                                     size_t size) {
-  auto& buf = buffer(name);
+  auto &buf = buffer(name);
+  buf.size = size;
  PADDLE_ENFORCE_NOT_NULL(buf.buffer);
  PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small");
  PADDLE_ENFORCE(buf.device == DeviceType::GPU);
@@ -194,15 +212,15 @@ void TensorRTEngine::SetInputFromGPU(const std::string& name, const void* data,
                                       cudaMemcpyDeviceToDevice, *stream_));
 }
-void TensorRTEngine::SetITensor(const std::string& name,
+void TensorRTEngine::SetITensor(const std::string &name,
-                                nvinfer1::ITensor* tensor) {
+                                nvinfer1::ITensor *tensor) {
  PADDLE_ENFORCE(tensor != nullptr);
  PADDLE_ENFORCE_EQ(0, itensor_map_.count(name), "duplicate ITensor name %s",
                    name);
  itensor_map_[name] = tensor;
 }
-nvinfer1::ITensor* TensorRTEngine::GetITensor(const std::string& name) {
+nvinfer1::ITensor *TensorRTEngine::GetITensor(const std::string &name) {
  PADDLE_ENFORCE(itensor_map_.count(name), "no ITensor %s", name);
  return itensor_map_[name];
 }

--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
@@ -57,7 +57,9 @@ class TensorRTEngine : public EngineBase {
      : max_batch_(max_batch),
        max_workspace_(max_workspace),
        stream_(stream ? stream : &default_stream_),
-        logger_(logger) {}
+        logger_(logger) {
+    cudaStreamCreate(&default_stream_);
+  }
  virtual ~TensorRTEngine();
@@ -121,6 +123,9 @@ class TensorRTEngine : public EngineBase {
  int max_batch_;
  // the max memory size the engine uses
  int max_workspace_;
+  // batch size of the current data, will be updated each Executation.
+  int batch_size_{-1};
  cudaStream_t* stream_;
  // If stream_ is not set from outside, hold its own stream.
  cudaStream_t default_stream_;

--- a/paddle/fluid/inference/tensorrt/test_engine.cc
+++ b/paddle/fluid/inference/tensorrt/test_engine.cc
@@ -103,6 +103,10 @@ TEST_F(TensorRTEngineTest, add_layer_multi_dim) {
  LOG(INFO) << "to get output";
  float y_cpu[2] = {-1., -1.};
+  auto dims = engine_->GetITensor("y")->getDimensions();
+  ASSERT_EQ(dims.nbDims, 3);
+  ASSERT_EQ(dims.d[0], 2);
+  ASSERT_EQ(dims.d[1], 1);
  engine_->GetOutputInCPU("y", &y_cpu[0], sizeof(float) * 2);
  ASSERT_EQ(y_cpu[0], 4.5);
  ASSERT_EQ(y_cpu[1], 14.5);

--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -168,6 +168,8 @@ function(op_library TARGET)
        file(APPEND ${pybind_file} "USE_OP(relu);\n")
      elseif(${TARGET} STREQUAL "fake_dequantize")
        file(APPEND ${pybind_file} "USE_OP(fake_dequantize_max_abs);\n")
+      elseif(${TARGET} STREQUAL "tensorrt_engine_op")
+          message(STATUS "Pybind skips [tensorrt_engine_op], for this OP is only used in inference")
      else()
        file(APPEND ${pybind_file} "USE_OP(${TARGET});\n")
      endif()
@@ -237,9 +239,9 @@ op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax)
 op_library(softmax_op DEPS softmax)
 op_library(sequence_softmax_op DEPS softmax)
 if (WITH_GPU AND TENSORRT_FOUND)
-    op_library(tensorrt_engine_op DEPS tensorrt_engine)
+    op_library(tensorrt_engine_op DEPS tensorrt_engine tensorrt_converter)
    nv_test(test_tensorrt_engine_op SRCS tensorrt_engine_op_test.cc
-      DEPS tensorrt_engine_op tensorrt_engine tensorrt_converter
+      DEPS tensorrt_engine_op
      analysis)
 else()
    set(DEPS_OPS ${DEPS_OPS} tensorrt_engine_op)

--- a/paddle/fluid/operators/momentum_op.cc
+++ b/paddle/fluid/operators/momentum_op.cc
@@ -98,7 +98,7 @@ The update equations are as follows:
 $$
 velocity = mu * velocity + gradient \\
 if (use\_nesterov):   \\
-  param = param - gradient * learning\_rate + mu * velocity * learning\_rate \\
+  param = param - (gradient + mu * velocity) * learning\_rate \\
 else:   \\
  param = param - learning\_rate * velocity. \\
 $$

--- a/paddle/fluid/operators/momentum_op.cu
+++ b/paddle/fluid/operators/momentum_op.cu
@@ -30,7 +30,7 @@ __global__ void MomentumKernel(const T* p, const T* g, const T* v,
      T g_val = g[i];
      T v_new = v[i] * mu + g_val;
      v_out[i] = v_new;
-      p_out[i] = p[i] - (g_val - v_new * mu) * lr;
+      p_out[i] = p[i] - (g_val + v_new * mu) * lr;
    }
  } else {
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;

--- a/paddle/fluid/operators/momentum_op.h
+++ b/paddle/fluid/operators/momentum_op.h
@@ -46,7 +46,7 @@ class MomentumOpKernel : public framework::OpKernel<T> {
    v_out = v * mu + g;
    if (use_nesterov) {
-      p_out = p - (g - v_out * mu) * lr[0];
+      p_out = p - (g + v_out * mu) * lr[0];
    } else {
      p_out = p - lr[0] * v_out;
    }

--- a/paddle/fluid/operators/tensorrt_engine_op.cc
+++ b/paddle/fluid/operators/tensorrt_engine_op.cc
@@ -24,6 +24,9 @@
 #include "paddle/fluid/operators/tensorrt_engine_op.h"
 namespace paddle {
+DEFINE_int32(tensorrt_engine_batch_size, 1, "the batch_size of TensorRT");
 namespace operators {
 using inference::Singleton;
@@ -52,7 +55,6 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t> &shape) {
                    "TensorRT' tensor input requires at least 2 dimensions");
  PADDLE_ENFORCE_LE(shape.size(), 4UL,
                    "TensorRT' tensor input requires at most 4 dimensions");
  switch (shape.size()) {
    case 2:
      return nvinfer1::Dims2(shape[0], shape[1]);
@@ -90,27 +92,36 @@ void TensorRTEngineKernel<DeviceContext, T>::Prepare(
  engine->InitNetwork();
  framework::BlockDesc block(nullptr /*programdesc*/, &block_desc);
+  VLOG(4) << "parsed var size " << block.AllVars().size();
  // Add inputs
  VLOG(4) << "declare inputs";
  for (auto &input : context.Inputs("Xs")) {
    VLOG(4) << "declare input " << input;
    auto *var = block.FindVar(input);
+    // TensorRT engine need to create parameters. The parameter's description
+    // should be set in
+    PADDLE_ENFORCE(var, "no variable called %s", input);
    PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR,
                      "TensorRT engine only takes LoDTensor as input");
    auto shape = var->GetShape();
+    // For the special batch_size placeholder -1, drop it and pass the real
+    // shape of data.
+    // TODO(Superjomn) fix this with batch broadcast, or it can't handle
+    // variational batch size.
+    if (shape[0] == -1) {
+      shape[0] = FLAGS_tensorrt_engine_batch_size;
+    }
    engine->DeclareInput(
        input, FluidDataType2TRT(
                   var->Proto()->type().lod_tensor().tensor().data_type()),
-        Vec2TRT_Dims(var->GetShape()));
+        Vec2TRT_Dims(shape));
  }
  inference::Singleton<inference::tensorrt::OpConverter>::Global().ConvertBlock(
      block_desc, parameters, context.scope(), engine);
  // Add outputs
-  VLOG(4) << "declare outputs";
  for (auto &output : context.Outputs("Ys")) {
-    VLOG(4) << "declare output " << output;
    engine->DeclareOutput(output);
  }
@@ -151,4 +162,7 @@ REGISTER_OP_CPU_KERNEL(
    ops::TensorRTEngineKernel<paddle::platform::CPUDeviceContext, int>,
    ops::TensorRTEngineKernel<paddle::platform::CPUDeviceContext, int64_t>);
+// A trick to compile with the needed TensorRT op converter.
+USE_TRT_CONVERTER(mul)
 #endif  // PADDLE_WITH_CUDA
--- a/paddle/fluid/operators/tensorrt_engine_op.h
+++ b/paddle/fluid/operators/tensorrt_engine_op.h
@@ -24,6 +24,9 @@
 #include "paddle/fluid/inference/tensorrt/engine.h"
 namespace paddle {
+DECLARE_int32(tensorrt_engine_batch_size);
 namespace operators {
 using inference::Singleton;
@@ -53,7 +56,6 @@ template <typename DeviceContext, typename T>
 class TensorRTEngineKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
-    VLOG(4) << "TensorRTEngineKernel executing";
    auto engine_name = context.Attr<std::string>("engine_uniq_key");
    if (!Singleton<TRT_EngineManager>::Global().HasEngine(engine_name)) {
      Prepare(context);
@@ -61,11 +63,8 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
    auto* engine = Singleton<TRT_EngineManager>::Global().Get(engine_name);
    auto input_names = context.op().Inputs("Xs");
    PADDLE_ENFORCE(!input_names.empty(), "should pass more than one inputs");
-    // Try to determine a batch_size
+    PADDLE_ENFORCE_LE(FLAGS_tensorrt_engine_batch_size,
-    auto& tensor0 = inference::analysis::GetFromScope<framework::LoDTensor>(
+                      context.Attr<int>("max_batch"));
-        context.scope(), input_names.front());
-    int batch_size = tensor0.dims()[0];
-    PADDLE_ENFORCE_LE(batch_size, context.Attr<int>("max_batch"));
    // Convert input tensor from fluid to engine.
    for (const auto& x : context.Inputs("Xs")) {
@@ -81,8 +80,8 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
      }
    }
    // Execute the engine.
-    PADDLE_ENFORCE_GT(batch_size, 0);
+    PADDLE_ENFORCE_GT(FLAGS_tensorrt_engine_batch_size, 0);
-    engine->Execute(batch_size);
+    engine->Execute(FLAGS_tensorrt_engine_batch_size);
    // Convert output tensor from engine to fluid
    for (const auto& y : context.Outputs("Ys")) {
      // convert output and copy to fluid.
@@ -94,18 +93,21 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
      auto* fluid_v = context.scope().FindVar(y);
      PADDLE_ENFORCE_NOT_NULL(fluid_v, "no output variable called %s", y);
      auto* fluid_t = fluid_v->GetMutable<framework::LoDTensor>();
-      fluid_t->Resize(framework::make_ddim(ddim));
      auto size = inference::analysis::AccuDims(dims.d, dims.nbDims);
-      if (platform::is_cpu_place(fluid_t->place())) {
+      fluid_t->Resize(framework::make_ddim(ddim));
-        // TODO(Superjomn) change this float to dtype size.
-        engine->GetOutputInCPU(
+      // TODO(Superjomn) find some way to determine which device to output the
-            y, fluid_t->mutable_data<float>(platform::CPUPlace()),
+      // tensor.
-            size * sizeof(float));
+      // if (platform::is_cpu_place(fluid_t->place())) {
-      } else {
+      // TODO(Superjomn) change this float to dtype size.
-        engine->GetOutputInGPU(
+      engine->GetOutputInCPU(y,
-            y, fluid_t->mutable_data<float>(platform::CUDAPlace()),
+                             fluid_t->mutable_data<float>(platform::CPUPlace()),
-            size * sizeof(float));
+                             size * sizeof(float));
-      }
+      //} else {
+      // engine->GetOutputInGPU(
+      // y, fluid_t->mutable_data<float>(platform::CUDAPlace()),
+      // size * sizeof(float));
+      //}
    }
    cudaStreamSynchronize(*engine->stream());

--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -333,8 +333,7 @@ function assert_api_not_changed() {
    python ${PADDLE_ROOT}/tools/diff_api.py ${PADDLE_ROOT}/paddle/fluid/API.spec new.spec
    deactivate
-    # Use git diff --name-only HEAD^ may not get file changes for update commits in one PR
+    API_CHANGE=`git diff --name-only upstream/develop | grep "paddle/fluid/API.spec" || true`
-    API_CHANGE=`echo $CHANGED_FILES | grep "paddle/fluid/API.spec" || true`
    echo "checking API.spec change, PR: ${GIT_PR_ID}, changes: ${API_CHANGE}"
    if [ ${API_CHANGE} ] && [ "${GIT_PR_ID}" != "" ]; then
        # TODO: curl -H 'Authorization: token ${TOKEN}'

--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -166,7 +166,8 @@ def fc(input,
        param_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for learnable
            parameters/weights of this layer.
        bias_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for the bias
-            of this layer. If it is set to None, no bias will be added to the output units.
+            of this layer. If it is set to False, no bias will be added to the output units.
+            If it is set to None, the bias is initialized zero. Default: None.
        act (str, default None): Activation to be applied to the output of this layer.
        is_test(bool): A flag indicating whether execution is in test phase.
        use_mkldnn(bool): Use mkldnn kernel or not, it is valid only when the mkldnn

--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -324,7 +324,7 @@ class MomentumOptimizer(Optimizer):
        & if (use\_nesterov):
-        &\quad   param = param - gradient * learning\_rate + mu * velocity * learning\_rate
+        &\quad   param = param - (gradient + mu * velocity) * learning\_rate
        & else:

--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -48,6 +48,7 @@ list(REMOVE_ITEM TEST_OPS test_warpctc_op)
 list(REMOVE_ITEM TEST_OPS test_dist_train)
 list(REMOVE_ITEM TEST_OPS test_parallel_executor_crf)
 list(REMOVE_ITEM TEST_OPS test_parallel_executor_fetch_feed)
+list(REMOVE_ITEM TEST_OPS test_dist_se_resnext)
 foreach(TEST_OP ${TEST_OPS})
    py_test_modules(${TEST_OP} MODULES ${TEST_OP})
 endforeach(TEST_OP)
@@ -60,3 +61,4 @@ if(WITH_DISTRIBUTE)
 endif()
 py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL)
 py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL)
+py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext SERIAL)
--- a/python/paddle/fluid/tests/unittests/dist_se_resnext.py
+++ b/python/paddle/fluid/tests/unittests/dist_se_resnext.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import numpy as np
+import argparse
+import time
+import math
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.profiler as profiler
+from paddle.fluid import core
+import unittest
+from multiprocessing import Process
+import os
+import sys
+import signal
+# Fix seed for test
+fluid.default_startup_program().random_seed = 1
+fluid.default_main_program().random_seed = 1
+train_parameters = {
+    "input_size": [3, 224, 224],
+    "input_mean": [0.485, 0.456, 0.406],
+    "input_std": [0.229, 0.224, 0.225],
+    "learning_strategy": {
+        "name": "piecewise_decay",
+        "epochs": [30, 60, 90],
+        "steps": [0.1, 0.01, 0.001, 0.0001]
+    }
+}
+class SE_ResNeXt():
+    def __init__(self, layers=50):
+        self.params = train_parameters
+        self.layers = layers
+    def net(self, input, class_dim=1000):
+        layers = self.layers
+        supported_layers = [50, 101, 152]
+        assert layers in supported_layers, \
+            "supported layers are {} but input layer is {}".format(supported_layers, layers)
+        if layers == 50:
+            cardinality = 32
+            reduction_ratio = 16
+            depth = [3, 4, 6, 3]
+            num_filters = [128, 256, 512, 1024]
+            conv = self.conv_bn_layer(
+                input=input,
+                num_filters=64,
+                filter_size=7,
+                stride=2,
+                act='relu')
+            conv = fluid.layers.pool2d(
+                input=conv,
+                pool_size=3,
+                pool_stride=2,
+                pool_padding=1,
+                pool_type='max')
+        elif layers == 101:
+            cardinality = 32
+            reduction_ratio = 16
+            depth = [3, 4, 23, 3]
+            num_filters = [128, 256, 512, 1024]
+            conv = self.conv_bn_layer(
+                input=input,
+                num_filters=64,
+                filter_size=7,
+                stride=2,
+                act='relu')
+            conv = fluid.layers.pool2d(
+                input=conv,
+                pool_size=3,
+                pool_stride=2,
+                pool_padding=1,
+                pool_type='max')
+        elif layers == 152:
+            cardinality = 64
+            reduction_ratio = 16
+            depth = [3, 8, 36, 3]
+            num_filters = [128, 256, 512, 1024]
+            conv = self.conv_bn_layer(
+                input=input,
+                num_filters=64,
+                filter_size=3,
+                stride=2,
+                act='relu')
+            conv = self.conv_bn_layer(
+                input=conv, num_filters=64, filter_size=3, stride=1, act='relu')
+            conv = self.conv_bn_layer(
+                input=conv,
+                num_filters=128,
+                filter_size=3,
+                stride=1,
+                act='relu')
+            conv = fluid.layers.pool2d(
+                input=conv, pool_size=3, pool_stride=2, pool_padding=1, \
+                pool_type='max')
+        for block in range(len(depth)):
+            for i in range(depth[block]):
+                conv = self.bottleneck_block(
+                    input=conv,
+                    num_filters=num_filters[block],
+                    stride=2 if i == 0 and block != 0 else 1,
+                    cardinality=cardinality,
+                    reduction_ratio=reduction_ratio)
+        pool = fluid.layers.pool2d(
+            input=conv, pool_size=7, pool_type='avg', global_pooling=True)
+        drop = fluid.layers.dropout(x=pool, dropout_prob=0.2)
+        stdv = 1.0 / math.sqrt(drop.shape[1] * 1.0)
+        out = fluid.layers.fc(input=drop, size=class_dim, act='softmax')
+        return out
+    def shortcut(self, input, ch_out, stride):
+        ch_in = input.shape[1]
+        if ch_in != ch_out or stride != 1:
+            filter_size = 1
+            return self.conv_bn_layer(input, ch_out, filter_size, stride)
+        else:
+            return input
+    def bottleneck_block(self, input, num_filters, stride, cardinality,
+                         reduction_ratio):
+        conv0 = self.conv_bn_layer(
+            input=input, num_filters=num_filters, filter_size=1, act='relu')
+        conv1 = self.conv_bn_layer(
+            input=conv0,
+            num_filters=num_filters,
+            filter_size=3,
+            stride=stride,
+            groups=cardinality,
+            act='relu')
+        conv2 = self.conv_bn_layer(
+            input=conv1, num_filters=num_filters * 2, filter_size=1, act=None)
+        scale = self.squeeze_excitation(
+            input=conv2,
+            num_channels=num_filters * 2,
+            reduction_ratio=reduction_ratio)
+        short = self.shortcut(input, num_filters * 2, stride)
+        return fluid.layers.elementwise_add(x=short, y=scale, act='relu')
+    def conv_bn_layer(self,
+                      input,
+                      num_filters,
+                      filter_size,
+                      stride=1,
+                      groups=1,
+                      act=None):
+        conv = fluid.layers.conv2d(
+            input=input,
+            num_filters=num_filters,
+            filter_size=filter_size,
+            stride=stride,
+            padding=(filter_size - 1) / 2,
+            groups=groups,
+            act=None,
+            bias_attr=False)
+        return fluid.layers.batch_norm(input=conv, act=act)
+    def squeeze_excitation(self, input, num_channels, reduction_ratio):
+        pool = fluid.layers.pool2d(
+            input=input, pool_size=0, pool_type='avg', global_pooling=True)
+        stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
+        squeeze = fluid.layers.fc(input=pool,
+                                  size=num_channels / reduction_ratio,
+                                  act='relu')
+        stdv = 1.0 / math.sqrt(squeeze.shape[1] * 1.0)
+        excitation = fluid.layers.fc(input=squeeze,
+                                     size=num_channels,
+                                     act='sigmoid')
+        scale = fluid.layers.elementwise_mul(x=input, y=excitation, axis=0)
+        return scale
+def get_model(batch_size):
+    # Input data
+    image = fluid.layers.fill_constant(
+        shape=[batch_size, 3, 224, 224], dtype='float32', value=0.0)
+    label = fluid.layers.fill_constant(
+        shape=[batch_size, 1], dtype='int64', value=0.0)
+    # Train program
+    model = SE_ResNeXt(layers=50)
+    out = model.net(input=image, class_dim=102)
+    cost = fluid.layers.cross_entropy(input=out, label=label)
+    avg_cost = fluid.layers.mean(x=cost)
+    acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
+    acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
+    # Evaluator
+    test_program = fluid.default_main_program().clone(for_test=True)
+    # Optimization
+    total_images = 6149  # flowers
+    epochs = [30, 60, 90]
+    step = int(total_images / batch_size + 1)
+    bd = [step * e for e in epochs]
+    base_lr = 0.1
+    lr = []
+    lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
+    optimizer = fluid.optimizer.Momentum(
+        learning_rate=fluid.layers.piecewise_decay(
+            boundaries=bd, values=lr),
+        momentum=0.9,
+        regularization=fluid.regularizer.L2Decay(1e-4))
+    optimizer.minimize(avg_cost)
+    # Reader
+    train_reader = paddle.batch(
+        paddle.dataset.flowers.train(), batch_size=batch_size)
+    test_reader = paddle.batch(
+        paddle.dataset.flowers.test(), batch_size=batch_size)
+    return test_program, avg_cost, train_reader, test_reader, acc_top1, out
+def get_transpiler(trainer_id, main_program, pserver_endpoints, trainers):
+    t = fluid.DistributeTranspiler()
+    t.transpile(
+        trainer_id=trainer_id,
+        program=main_program,
+        pservers=pserver_endpoints,
+        trainers=trainers)
+    return t
+class DistSeResneXt2x2:
+    def run_pserver(self, pserver_endpoints, trainers, current_endpoint,
+                    trainer_id):
+        get_model(batch_size=2)
+        t = get_transpiler(trainer_id,
+                           fluid.default_main_program(), pserver_endpoints,
+                           trainers)
+        pserver_prog = t.get_pserver_program(current_endpoint)
+        startup_prog = t.get_startup_program(current_endpoint, pserver_prog)
+        place = fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        exe.run(startup_prog)
+        exe.run(pserver_prog)
+    def _wait_ps_ready(self, pid):
+        retry_times = 20
+        while True:
+            assert retry_times >= 0, "wait ps ready failed"
+            time.sleep(3)
+            print("waiting ps ready: ", pid)
+            try:
+                # the listen_and_serv_op would touch a file which contains the listen port
+                # on the /tmp directory until it was ready to process all the RPC call.
+                os.stat("/tmp/paddle.%d.port" % pid)
+                return
+            except os.error:
+                retry_times -= 1
+    def run_trainer(self, place, endpoints, trainer_id, trainers, is_dist=True):
+        test_program, avg_cost, train_reader, test_reader, batch_acc, predict = get_model(
+            batch_size=20)
+        if is_dist:
+            t = get_transpiler(trainer_id,
+                               fluid.default_main_program(), endpoints,
+                               trainers)
+            trainer_prog = t.get_trainer_program()
+        else:
+            trainer_prog = fluid.default_main_program()
+        startup_exe = fluid.Executor(place)
+        startup_exe.run(fluid.default_startup_program())
+        strategy = fluid.ExecutionStrategy()
+        strategy.num_threads = 1
+        strategy.allow_op_delay = False
+        exe = fluid.ParallelExecutor(
+            True,
+            loss_name=avg_cost.name,
+            exec_strategy=strategy,
+            num_trainers=trainers,
+            trainer_id=trainer_id)
+        feed_var_list = [
+            var for var in trainer_prog.global_block().vars.itervalues()
+            if var.is_data
+        ]
+        feeder = fluid.DataFeeder(feed_var_list, place)
+        reader_generator = train_reader()
+        first_loss, = exe.run(fetch_list=[avg_cost.name])
+        print(first_loss)
+        for i in xrange(5):
+            loss, = exe.run(fetch_list=[avg_cost.name])
+        last_loss, = exe.run(fetch_list=[avg_cost.name])
+        print(last_loss)
+def main(role="pserver",
+         endpoints="127.0.0.1:9123",
+         trainer_id=0,
+         current_endpoint="127.0.0.1:9123",
+         trainers=1,
+         is_dist=True):
+    model = DistSeResneXt2x2()
+    if role == "pserver":
+        model.run_pserver(endpoints, trainers, current_endpoint, trainer_id)
+    else:
+        p = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
+        model.run_trainer(p, endpoints, trainer_id, trainers, is_dist)
+if __name__ == "__main__":
+    if len(sys.argv) != 7:
+        print(
+            "Usage: python dist_se_resnext.py [pserver/trainer] [endpoints] [trainer_id] [current_endpoint] [trainers] [is_dist]"
+        )
+    role = sys.argv[1]
+    endpoints = sys.argv[2]
+    trainer_id = int(sys.argv[3])
+    current_endpoint = sys.argv[4]
+    trainers = int(sys.argv[5])
+    is_dist = True if sys.argv[6] == "TRUE" else False
+    main(
+        role=role,
+        endpoints=endpoints,
+        trainer_id=trainer_id,
+        current_endpoint=current_endpoint,
+        trainers=trainers,
+        is_dist=is_dist)
--- a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import numpy as np
+import argparse
+import time
+import math
+import unittest
+import os
+import signal
+import subprocess
+class TestDistSeResneXt2x2(unittest.TestCase):
+    def setUp(self):
+        self._trainers = 2
+        self._pservers = 2
+        self._ps_endpoints = "127.0.0.1:9123,127.0.0.1:9124"
+        self._python_interp = "python"
+    def start_pserver(self):
+        ps0_ep, ps1_ep = self._ps_endpoints.split(",")
+        ps0_cmd = "%s dist_se_resnext.py pserver %s 0 %s %d TRUE" % \
+            (self._python_interp, self._ps_endpoints, ps0_ep, self._trainers)
+        ps1_cmd = "%s dist_se_resnext.py pserver %s 0 %s %d TRUE" % \
+            (self._python_interp, self._ps_endpoints, ps1_ep, self._trainers)
+        ps0_proc = subprocess.Popen(
+            ps0_cmd.split(" "), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        ps1_proc = subprocess.Popen(
+            ps1_cmd.split(" "), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        return ps0_proc, ps1_proc
+    def _wait_ps_ready(self, pid):
+        retry_times = 20
+        while True:
+            assert retry_times >= 0, "wait ps ready failed"
+            time.sleep(3)
+            try:
+                # the listen_and_serv_op would touch a file which contains the listen port
+                # on the /tmp directory until it was ready to process all the RPC call.
+                os.stat("/tmp/paddle.%d.port" % pid)
+                return
+            except os.error:
+                retry_times -= 1
+    def non_test_with_place(self):
+        # *ATTENTION* THIS TEST NEEDS AT LEAST 2GPUS TO RUN
+        required_envs = {
+            "PATH": os.getenv("PATH"),
+            "PYTHONPATH": os.getenv("PYTHONPATH"),
+            "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH"),
+            "FLAGS_fraction_of_gpu_memory_to_use": "0.15"
+        }
+        # Run local to get a base line
+        env_local = {"CUDA_VISIBLE_DEVICES": "0"}
+        env_local.update(required_envs)
+        local_cmd = "%s dist_se_resnext.py trainer %s 0 %s %d FLASE" % \
+            (self._python_interp, "127.0.0.1:1234", "127.0.0.1:1234", 1)
+        local_proc = subprocess.Popen(
+            local_cmd.split(" "), stdout=subprocess.PIPE, env=env_local)
+        local_proc.wait()
+        local_ret = local_proc.stdout.read()
+        # Run dist train to compare with local results
+        ps0, ps1 = self.start_pserver()
+        self._wait_ps_ready(ps0.pid)
+        self._wait_ps_ready(ps1.pid)
+        ps0_ep, ps1_ep = self._ps_endpoints.split(",")
+        tr0_cmd = "%s dist_se_resnext.py trainer %s 0 %s %d TRUE" % \
+            (self._python_interp, self._ps_endpoints, ps0_ep, self._trainers)
+        tr1_cmd = "%s dist_se_resnext.py trainer %s 1 %s %d TRUE" % \
+            (self._python_interp, self._ps_endpoints, ps1_ep, self._trainers)
+        env0 = {"CUDA_VISIBLE_DEVICES": "0"}
+        env1 = {"CUDA_VISIBLE_DEVICES": "1"}
+        env0.update(required_envs)
+        env1.update(required_envs)
+        FNULL = open(os.devnull, 'w')
+        tr0_proc = subprocess.Popen(
+            tr0_cmd.split(" "), stdout=subprocess.PIPE, stderr=FNULL, env=env0)
+        tr1_proc = subprocess.Popen(
+            tr1_cmd.split(" "), stdout=subprocess.PIPE, stderr=FNULL, env=env1)
+        tr0_proc.wait()
+        tr1_proc.wait()
+        loss_data0 = tr0_proc.stdout.read()
+        lines = loss_data0.split("\n")
+        dist_first_loss = eval(lines[0].replace(" ", ","))[0]
+        dist_last_loss = eval(lines[1].replace(" ", ","))[0]
+        local_lines = local_ret.split("\n")
+        local_first_loss = eval(local_lines[0])[0]
+        local_last_loss = eval(local_lines[1])[0]
+        self.assertAlmostEqual(local_first_loss, dist_first_loss)
+        self.assertAlmostEqual(local_last_loss, dist_last_loss)
+        # check tr0_out
+        # FIXME: ensure the server process is killed
+        # replace with ps0.terminate()
+        os.kill(ps0.pid, signal.SIGKILL)
+        os.kill(ps1.pid, signal.SIGKILL)
+        FNULL.close()
+if __name__ == "__main__":
+    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_momentum_op.py
+++ b/python/paddle/fluid/tests/unittests/test_momentum_op.py
@@ -39,7 +39,7 @@ class TestMomentumOp1(OpTest):
        velocity_out = mu * velocity + grad
        if use_nesterov:
-            param_out = param - grad * learning_rate + \
+            param_out = param - grad * learning_rate - \
                        velocity_out * mu * learning_rate
        else:
            param_out = param - learning_rate * velocity_out
@@ -75,7 +75,7 @@ class TestMomentumOp2(OpTest):
        velocity_out = mu * velocity + grad
        if use_nesterov:
-            param_out = param - grad * learning_rate + \
+            param_out = param - grad * learning_rate - \
                        velocity_out * mu * learning_rate
        else:
            param_out = param - learning_rate * velocity_out