diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0ab80987b3ad6c4793ceeac1bf3808d2e87fbd5b..231224f9249848b6e4981a98e0538794bf5d3c08 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -136,6 +136,12 @@ else()
     set(THIRD_PARTY_BUILD_TYPE Release)
 endif()
 
+if(WITH_MKL)  # MKL_SPLIT_GEMM is only offered when building with MKL
+  option(MKL_SPLIT_GEMM "PaddlePaddle MKL gemm would split to small ones" OFF)
+  if (MKL_SPLIT_GEMM)
+    add_definitions(-DPADDLE_MKL_SPLIT_GEMM)  # directory-scoped definition
+  endif()
+endif()
 set(WITH_MKLML ${WITH_MKL})
 if (NOT DEFINED WITH_MKLDNN)
     if (WITH_MKL AND AVX2_FOUND)
diff --git a/benchmark/paddle/image/run.sh b/benchmark/paddle/image/run.sh
index 717ed487ba7657db6535efcb1128a355a0f15eaf..5b58a8d773aab795e5439b0f0e5d81bec66b5f56 100755
--- a/benchmark/paddle/image/run.sh
+++ b/benchmark/paddle/image/run.sh
@@ -1,3 +1,5 @@
+#!/bin/bash
+
 set -e
 
 function train() {
diff --git a/benchmark/paddle/image/run_mkl_infer.sh b/benchmark/paddle/image/run_mkl_infer.sh
index 62c9bf6efd3810f506fd4592b2ba3a21b1b7f0e7..0fad5e04cc992a3ec97591d3833957bb7517a8f3 100755
--- a/benchmark/paddle/image/run_mkl_infer.sh
+++ b/benchmark/paddle/image/run_mkl_infer.sh
@@ -1,3 +1,5 @@
+#!/bin/bash
+
 set -e
 
 function clock_to_seconds() {
diff --git a/benchmark/paddle/image/run_mkl_train.sh b/benchmark/paddle/image/run_mkl_train.sh
index 03d2d378fb72e36f765d89af788f6ee96fe21d4e..1583bf134a276a08aa2f8e84dc63adbb205a83d6 100755
--- a/benchmark/paddle/image/run_mkl_train.sh
+++ b/benchmark/paddle/image/run_mkl_train.sh
@@ -1,3 +1,5 @@
+#!/bin/bash
+
 set -e
 
 function train() {
diff --git a/benchmark/paddle/image/run_openblas_infer.sh b/benchmark/paddle/image/run_openblas_infer.sh
index a9a7b8a66717c4be0543c3fe2db293fe199e3dc4..987381cabc2e793886099212660723c122b73bb0 100755
--- a/benchmark/paddle/image/run_openblas_infer.sh
+++ b/benchmark/paddle/image/run_openblas_infer.sh
@@ -1,3 +1,5 @@
+#!/bin/bash
+
 set -e
 
 function clock_to_seconds() {
diff --git a/benchmark/paddle/image/run_openblas_train.sh b/benchmark/paddle/image/run_openblas_train.sh
index 935cff6f2c97d25d6de556cfee25e27dbe49b5b6..cc64e1d09da02087b1737190a0b75dc7758600a6 100755
--- a/benchmark/paddle/image/run_openblas_train.sh
+++ b/benchmark/paddle/image/run_openblas_train.sh
@@ -1,3 +1,5 @@
+#!/bin/bash
+
 set -e
 
 function train() {
diff --git a/benchmark/paddle/rnn/run.sh b/benchmark/paddle/rnn/run.sh
index e9dfeb2e525979f47e4ef48f7610dc1007900f2c..f99a562b3f88a98560f4bf7aee98ceee9daefe67 100755
--- a/benchmark/paddle/rnn/run.sh
+++ b/benchmark/paddle/rnn/run.sh
@@ -1,3 +1,5 @@
+#!/bin/bash
+
 set -e
 
 function train() {
diff --git a/benchmark/tensorflow/image/run.sh b/benchmark/tensorflow/image/run.sh
index eade36beb9df5f8d3978939216e058203e024c1a..cf894fe3f2dca24e3acf863d625b3a7008793b83 100755
--- a/benchmark/tensorflow/image/run.sh
+++ b/benchmark/tensorflow/image/run.sh
@@ -1,3 +1,5 @@
+#!/bin/bash
+
 set -e
 
 function test() {
diff --git a/benchmark/tensorflow/image/run_multi.sh b/benchmark/tensorflow/image/run_multi.sh
index 69faa4331744f2276e7706185ae10bc507f95764..bf1435bc55b90669e0b8bd893b8ed7bbb99d51e2 100755
--- a/benchmark/tensorflow/image/run_multi.sh
+++ b/benchmark/tensorflow/image/run_multi.sh
@@ -1,3 +1,5 @@
+#!/bin/bash
+
 set -e
 
 function test() {
diff --git a/benchmark/tensorflow/rnn/run.sh b/benchmark/tensorflow/rnn/run.sh
index bb4c69cb95f965eff35f1c5a60376bf1e84f841b..db10eefdea8676ad34fb84a161f0fc1309147824 100755
--- a/benchmark/tensorflow/rnn/run.sh
+++ b/benchmark/tensorflow/rnn/run.sh
@@ -1,3 +1,5 @@
+#!/bin/bash
+
 set -e
 
 function test() {
diff --git a/benchmark/tensorflow/rnn/run_multi.sh b/benchmark/tensorflow/rnn/run_multi.sh
index c2d7dd597e6da54cd5c4cda311fbbd18486b4647..ec62fc26b51543f2f8ddfc5e73aa6ff7d611e4dd 100755
--- a/benchmark/tensorflow/rnn/run_multi.sh
+++ b/benchmark/tensorflow/rnn/run_multi.sh
@@ -1,3 +1,5 @@
+#!/bin/bash
+
 set -e
 
 function test() {
diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index 8c0d87db7a353eaa40e3eb31b4f0181ea3d1d33f..69d665b80fde22c10d9d57687b0e45dae7291969 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -180,13 +180,13 @@ paddle.fluid.layers.log ArgSpec(args=['x'], varargs=None, keywords=None, default
 paddle.fluid.layers.crop ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
 paddle.fluid.layers.open_recordio_file ArgSpec(args=['filename', 'shapes', 'lod_levels', 'dtypes', 'pass_num', 'for_parallel'], varargs=None, keywords=None, defaults=(1, True))
-paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'for_parallel'], varargs=None, keywords=None, defaults=(1, None, 1, True))
+paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
 paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.shuffle ArgSpec(args=['reader', 'buffer_size'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.batch ArgSpec(args=['reader', 'batch_size'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.double_buffer ArgSpec(args=['reader', 'place', 'name'], varargs=None, keywords=None, defaults=(None, None))
 paddle.fluid.layers.random_data_generator ArgSpec(args=['low', 'high', 'shapes', 'lod_levels', 'for_parallel'], varargs=None, keywords=None, defaults=(True,))
-paddle.fluid.layers.py_reader ArgSpec(args=['capacity', 'shapes', 'dtypes', 'lod_levels'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.py_reader ArgSpec(args=['capacity', 'shapes', 'dtypes', 'lod_levels', 'name', 'use_double_buffer'], varargs=None, keywords=None, defaults=(None, None, True))
 paddle.fluid.layers.Preprocessor.__init__ ArgSpec(args=['self', 'reader', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.Preprocessor.block ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
 paddle.fluid.layers.Preprocessor.inputs ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
@@ -209,9 +209,6 @@ paddle.fluid.layers.zeros ArgSpec(args=['shape', 'dtype', 'force_cpu'], varargs=
 paddle.fluid.layers.reverse ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.split_lod_tensor ArgSpec(args=['input', 'mask', 'level'], varargs=None, keywords=None, defaults=(0,))
 paddle.fluid.layers.merge_lod_tensor ArgSpec(args=['in_true', 'in_false', 'x', 'mask', 'level'], varargs=None, keywords=None, defaults=(0,))
-paddle.fluid.layers.BlockGuard.__init__ ArgSpec(args=['self', 'main_program'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.BlockGuardWithCompletion.__init__ ArgSpec(args=['self', 'rnn'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.WhileGuard.__init__ ArgSpec(args=['self', 'while_op'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.While.__init__ ArgSpec(args=['self', 'cond', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.While.block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.While.complete ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
index e8057c35e8b957cb43e66937a5073a085c6e7708..df55b3d05402f1aeecfd8d4218a637a81d58ed87 100644
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -1,4 +1,4 @@
-cc_library(var_handle SRCS var_handle.cc DEPS place)
+cc_library(var_handle SRCS var_handle.cc DEPS place framework_proto)
 cc_library(op_handle_base SRCS op_handle_base.cc DEPS var_handle device_context lod_tensor)
 cc_library(scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
 cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
index c52980472de8d48e8c21e7c1e53813aa4847cece..f1f8674caf663ce38df5a2eecbcf690b5ca87dc4 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
@@ -333,7 +333,7 @@ std::unique_ptr<Graph> MultiDevSSAGraphBuilder::Apply(
    * Only variables should be the leaves of graph.
    */
   AddOutputToLeafOps(&result);
-  return std::move(graph);
+  return graph;
 }
 
 bool MultiDevSSAGraphBuilder::IsSparseGradient(const std::string &og) const {
diff --git a/paddle/fluid/framework/details/reduce_and_gather.h b/paddle/fluid/framework/details/reduce_and_gather.h
index c0cd873a1d83fa8c2c7b7cd5acfaad9949bcff7d..e28264eb32756f77ef5baed3dff77ba9f0943160 100644
--- a/paddle/fluid/framework/details/reduce_and_gather.h
+++ b/paddle/fluid/framework/details/reduce_and_gather.h
@@ -35,14 +35,16 @@ struct ReduceLoDTensor {
     PADDLE_ENFORCE(!src_tensors_.empty());
     auto &t0 = *src_tensors_[0];
     PADDLE_ENFORCE_NE(t0.numel(), 0);
+
     dst_tensor_.Resize(t0.dims());
     T *dst = dst_tensor_.mutable_data<T>(platform::CPUPlace());
-    if (dst != t0.data<T>()) {
-      std::copy(t0.data<T>(), t0.data<T>() + t0.numel(), dst);
-    }
 
-    for (size_t i = 1; i < src_tensors_.size(); ++i) {
+    for (size_t i = 0; i < src_tensors_.size(); ++i) {
       auto &t = *src_tensors_[i];
+      if (dst == t.data<T>()) {
+        continue;
+      }
+
       PADDLE_ENFORCE_EQ(t.dims(), t0.dims());
       PADDLE_ENFORCE_EQ(t.type(), t0.type());
       std::transform(t.data<T>(), t.data<T>() + t.numel(), dst, dst,
diff --git a/paddle/fluid/framework/details/ssa_graph_checker.h b/paddle/fluid/framework/details/ssa_graph_checker.h
index f1080610381128325ea0affba760ac66798fd948..2e397e86825a41765a360d31fa8595d17027f3ec 100644
--- a/paddle/fluid/framework/details/ssa_graph_checker.h
+++ b/paddle/fluid/framework/details/ssa_graph_checker.h
@@ -31,7 +31,7 @@ class SSAGraghBuilderWithChecker : public SSAGraphBuilder {
   std::unique_ptr<Graph> Apply(std::unique_ptr<Graph> graph) const override {
     auto new_graph = builder_->Apply(std::move(graph));
     PADDLE_ENFORCE(IsValidGraph(new_graph.get()));
-    return std::move(new_graph);
+    return new_graph;
   }
 
   int GetVarDeviceID(const std::string& var_name) const override {
diff --git a/paddle/fluid/framework/details/ssa_graph_printer.h b/paddle/fluid/framework/details/ssa_graph_printer.h
index 411be02988a82b3e35d56833f92fc6fe405a2c3d..cd72162f44ca76aa6340606cf79a73601eae89af 100644
--- a/paddle/fluid/framework/details/ssa_graph_printer.h
+++ b/paddle/fluid/framework/details/ssa_graph_printer.h
@@ -53,7 +53,7 @@ class SSAGraghBuilderWithPrinter : public SSAGraphBuilder {
   std::unique_ptr<Graph> Apply(std::unique_ptr<Graph> graph) const override {
     auto new_graph = builder_->Apply(std::move(graph));
     printer_->Print(*new_graph, stream_ref_);
-    return std::move(new_graph);
+    return new_graph;
   }
 
   int GetVarDeviceID(const std::string& var_name) const override {
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
index 38cde13fe279d264c51baff71cffcab7b6ebb227..f85c62dd6c4a8033a037b1e001ece6a9cc90ca98 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -171,7 +171,12 @@ void ThreadedSSAGraphExecutor::InsertFetchOps(
 
   for (size_t i = 0; i < fetch_tensors.size(); ++i) {
     auto &var_name = fetch_tensors[i];
-    auto &vars = fetched_vars.at(var_name);
+    auto fetched_var_it = fetched_vars.find(var_name);
+    PADDLE_ENFORCE(fetched_var_it != fetched_vars.end(),
+                   "Cannot find fetched variable.(Perhaps the main_program "
+                   "is not set to ParallelExecutor)");
+
+    auto &vars = fetched_var_it->second;
 
     temp_nodes->emplace_back(new ir::Node("fetch", ir::Node::Type::kOperation));
     auto *op = new FetchOpHandle(temp_nodes->back().get(), fetch_data, i,
diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt
index d5ca74cb679b499da725a93b36384ca216ff4957..ee0604383ec9df826fa2abaef1f643ba0da6a096 100644
--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -1,5 +1,5 @@
-cc_library(graph SRCS graph.cc DEPS node)
 cc_library(node SRCS node.cc DEPS proto_desc)
+cc_library(graph SRCS graph.cc DEPS node)
 cc_library(pass SRCS pass.cc DEPS graph node)
 
 cc_test(graph_test SRCS graph_test.cc DEPS graph proto_desc op_registry)
diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc
index 688f7ba5825bf1a1ab65a0912663481913223e80..702baf932593e14ca4d070a655aab0188075f2e9 100644
--- a/paddle/fluid/framework/ir/graph.cc
+++ b/paddle/fluid/framework/ir/graph.cc
@@ -21,6 +21,7 @@ namespace framework {
 
 // NOTE(paddle-dev): This graph contains circle.
 Graph::Graph(const ProgramDesc &program) : program_(program) {
+  VLOG(3) << "block in program:" << program_.Size();
   std::unordered_map<std::string, VarDesc *> all_vars;
   for (auto *var : program.Block(0).AllVars()) {
     all_vars.emplace(var->Name(), var);
diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc
index cba0064f38f89c1dd27cfac1ddb2339a5ee6c93f..919029c38f2f26a6f5e02da645c4f7718044cdae 100644
--- a/paddle/fluid/framework/lod_tensor.cc
+++ b/paddle/fluid/framework/lod_tensor.cc
@@ -312,19 +312,35 @@ void WriteToRecordIO(recordio::Writer *writer,
   writer->Write(buffer.str());
 }
 
-std::vector<LoDTensor> ReadFromRecordIO(
-    recordio::Scanner *scanner, const platform::DeviceContext &dev_ctx) {
-  std::vector<LoDTensor> result;
-  if (scanner->HasNext()) {
-    std::istringstream sin(scanner->Next());
-    uint32_t sz;
-    sin.read(reinterpret_cast<char *>(&sz), sizeof(uint32_t));
-    result.resize(sz);
-    for (uint32_t i = 0; i < sz; ++i) {
-      DeserializeFromStream(sin, &result[i], dev_ctx);
-    }
+// Reads the next record from `scanner` and deserializes it into
+// `*result_ptr`.
+//
+// The record is expected to start with a uint32_t tensor count followed by
+// that many serialized LoDTensors. `*result_ptr` is resized to the count
+// and each element is filled by DeserializeFromStream.
+//
+// Returns false, without touching `*result_ptr`, when the scanner has no
+// more records, and true once a record has been read. Fails through
+// PADDLE_ENFORCE if the record is too short to hold the tensor count.
+bool ReadFromRecordIO(recordio::Scanner *scanner,
+                      const platform::DeviceContext &dev_ctx,
+                      std::vector<LoDTensor> *result_ptr) {
+  if (!scanner->HasNext()) {
+    return false;
   }
-  return result;
+  std::istringstream sin(scanner->Next());
+  // Zero-initialize so a short read can never leave `sz` indeterminate.
+  uint32_t sz = 0;
+  sin.read(reinterpret_cast<char *>(&sz), sizeof(uint32_t));
+  PADDLE_ENFORCE(!sin.fail(),
+                 "Cannot read the tensor count from a RecordIO record.");
+  auto &result = *result_ptr;
+  result.resize(sz);
+  for (uint32_t i = 0; i < sz; ++i) {
+    DeserializeFromStream(sin, &result[i], dev_ctx);
+  }
+
+  return true;
 }
 
 std::vector<LoDTensor> LoDTensor::SplitLoDTensor(
diff --git a/paddle/fluid/framework/lod_tensor.h b/paddle/fluid/framework/lod_tensor.h
index 4a2729373b5c63176ed1e856f4acf29fd1e73254..e9b473d547252e80ed26ec61e1a33fbe1742dbe0 100644
--- a/paddle/fluid/framework/lod_tensor.h
+++ b/paddle/fluid/framework/lod_tensor.h
@@ -223,8 +223,9 @@ extern void WriteToRecordIO(recordio::Writer* writer,
                             const std::vector<LoDTensor>& tensor,
                             const platform::DeviceContext& dev_ctx);
 
-extern std::vector<LoDTensor> ReadFromRecordIO(
-    recordio::Scanner* scanner, const platform::DeviceContext& dev_ctx);
+extern bool ReadFromRecordIO(recordio::Scanner* scanner,
+                             const platform::DeviceContext& dev_ctx,
+                             std::vector<LoDTensor>* result_ptr);
 
 /*
  * Convert between length-based LoD and offset-based LoD.
diff --git a/paddle/fluid/framework/lod_tensor_test.cc b/paddle/fluid/framework/lod_tensor_test.cc
index 38d3cd96d65f0a54b0ea87b4c677013f3802adfb..cd50aaa26054b78f1b1e8f0d470b397892155a2b 100644
--- a/paddle/fluid/framework/lod_tensor_test.cc
+++ b/paddle/fluid/framework/lod_tensor_test.cc
@@ -301,11 +301,12 @@ static void TestRecordIO() {
   {
     std::unique_ptr<std::istream> stream_ptr(stream);
     recordio::Scanner scanner(std::move(stream_ptr));
-    auto tensors = ReadFromRecordIO(&scanner, ctx);
+    std::vector<framework::LoDTensor> tensors;
+    ASSERT_TRUE(ReadFromRecordIO(&scanner, ctx, &tensors));
     ASSERT_EQ(tensors.size(), static_cast<size_t>(2));
     assert_tensor_ok(tensors[0]);
     assert_tensor_ok(tensors[1]);
-    tensors = ReadFromRecordIO(&scanner, ctx);
+    ASSERT_TRUE(ReadFromRecordIO(&scanner, ctx, &tensors));
     ASSERT_EQ(tensors.size(), static_cast<size_t>(2));
     assert_tensor_ok(tensors[0]);
     assert_tensor_ok(tensors[1]);
diff --git a/paddle/fluid/framework/reader.cc b/paddle/fluid/framework/reader.cc
index 5897d320a8b7e5af541098cadff8e78f8324949c..40eafda9bf294f7e8ddd067e9014447f4de1cc0e 100644
--- a/paddle/fluid/framework/reader.cc
+++ b/paddle/fluid/framework/reader.cc
@@ -67,7 +67,8 @@ void ReaderBase::Start() {
   }
 }
 
-ReaderBase::~ReaderBase() { Shutdown(); }
+ReaderBase::~ReaderBase() {}
 
+DecoratedReader::~DecoratedReader() { reader_->Shutdown(); }
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/reader.h b/paddle/fluid/framework/reader.h
index a8d04feb42456607159bcbede0574fe90dfe995c..82562bf883d88787858912f7039cf8fef003eccf 100644
--- a/paddle/fluid/framework/reader.h
+++ b/paddle/fluid/framework/reader.h
@@ -25,8 +25,6 @@
 namespace paddle {
 namespace framework {
 
-enum ReaderStatus { kRunning, kStopped };
-
 class ReaderBase {
  public:
   virtual void ReadNext(std::vector<LoDTensor>* out);
@@ -48,6 +46,8 @@ class ReaderBase {
 
   virtual void StartImpl() {}
 
+  enum ReaderStatus { kRunning, kStopped };
+
   ReaderStatus status_{kRunning};
 
   mutable std::mutex mu_;
@@ -74,6 +74,8 @@ class DecoratedReader : public ReaderBase,
     reader_->InsertDecoratedReader(shared_from_this());
   }
 
+  ~DecoratedReader();
+
  protected:
   void ShutdownImpl() override { reader_->Shutdown(); }
 
diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc
index f98011e896f4033ef210e0eb69f93ce7800a3cd6..ab693004cfb038fd92afd9c60e0fcb4e16b9f8a9 100644
--- a/paddle/fluid/framework/tensor_util.cc
+++ b/paddle/fluid/framework/tensor_util.cc
@@ -15,6 +15,7 @@
 #include <algorithm>
 #include <limits>
 #include <vector>
+#include "paddle/fluid/framework/data_type.h"
 
 namespace paddle {
 namespace framework {
@@ -261,7 +262,8 @@ void TensorToStream(std::ostream& os, const Tensor& tensor,
     os.write(out.data(), size);
   }
   {  // the 3rd field, tensor data
-    uint64_t size = tensor.memory_size();
+    uint64_t size = tensor.numel() * framework::SizeOfType(tensor.type());
+
     auto* data_ptr = tensor.data<void>();
     PADDLE_ENFORCE(size < std::numeric_limits<std::streamsize>::max(),
                    "Index overflow when writing tensor");
@@ -331,6 +333,9 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
     tensor->Resize(framework::make_ddim(dims));
     void* buf;
     auto ctx = platform::CPUDeviceContext();
+    size_t size =
+        tensor->numel() *
+        framework::SizeOfType(framework::ToTypeIndex(desc.data_type()));
     if (platform::is_gpu_place(dev_ctx.GetPlace())) {
 #ifdef PADDLE_WITH_CUDA
       Tensor cpu_tensor;
@@ -338,7 +343,7 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
       framework::VisitDataType(
           desc.data_type(),
           DeserializedDataFunctor(&buf, &cpu_tensor, ctx.GetPlace()));
-      is.read(static_cast<char*>(buf), cpu_tensor.memory_size());
+      is.read(static_cast<char*>(buf), size);
       auto dst_place = dev_ctx.GetPlace();
       framework::TensorCopy(cpu_tensor, dst_place, dev_ctx, tensor);
 #else
@@ -348,7 +353,7 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
       framework::VisitDataType(
           desc.data_type(),
           DeserializedDataFunctor(&buf, tensor, ctx.GetPlace()));
-      is.read(static_cast<char*>(buf), tensor->memory_size());
+      is.read(static_cast<char*>(buf), size);
     }
   }
 }
diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt
index c9eff0fc28c5ff52f902d3d5a0ebb37fa7619e9c..16c9c81258a9fdb7730b9b3e34be990798c91639 100644
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -38,4 +38,6 @@ if(WITH_TESTING)
   # both tests/book and analysis depends the models that generated by python/paddle/fluid/tests/book
   add_subdirectory(tests/book)
 endif()
-add_subdirectory(api)
+if(NOT APPLE)
+  add_subdirectory(api)
+endif()
diff --git a/paddle/fluid/inference/analysis/analyzer.cc b/paddle/fluid/inference/analysis/analyzer.cc
index a4625f008c15300b88ef0bce71cd7d8aa473c9a8..b3a1075e5adf4a24bf32017574c061f36c46ba8c 100644
--- a/paddle/fluid/inference/analysis/analyzer.cc
+++ b/paddle/fluid/inference/analysis/analyzer.cc
@@ -22,8 +22,6 @@
 #include "paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h"
 
 namespace paddle {
-namespace inference {
-namespace analysis {
 
 DEFINE_bool(inference_analysis_enable_tensorrt_subgraph_engine, false,
             "Enable subgraph to TensorRT engine for acceleration");
@@ -31,6 +29,9 @@ DEFINE_bool(inference_analysis_enable_tensorrt_subgraph_engine, false,
 DEFINE_string(inference_analysis_graphviz_log_root, "./",
               "Graphviz debuger for data flow graphs.");
 
+namespace inference {
+namespace analysis {
+
 class DfgPassManagerImpl final : public DfgPassManager {
  public:
   DfgPassManagerImpl() {
diff --git a/paddle/fluid/inference/analysis/analyzer.h b/paddle/fluid/inference/analysis/analyzer.h
index e9e14fb1947da059c8d126d3da182ce446f6421e..0132bf5b9c6552391aaa19542669487f42b685a7 100644
--- a/paddle/fluid/inference/analysis/analyzer.h
+++ b/paddle/fluid/inference/analysis/analyzer.h
@@ -45,14 +45,15 @@ limitations under the License. */
 #include "paddle/fluid/inference/analysis/pass_manager.h"
 
 namespace paddle {
-namespace inference {
-namespace analysis {
 
 // TODO(Superjomn) add a definition flag like PADDLE_WITH_TENSORRT and hide this
 // flag if not available.
 DECLARE_bool(inference_analysis_enable_tensorrt_subgraph_engine);
 DECLARE_string(inference_analysis_graphviz_log_root);
 
+namespace inference {
+namespace analysis {
+
 class Analyzer : public OrderedRegistry<PassManager> {
  public:
   // Register all the pass-managers.
diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc
index d7c1a72932a39f878add2bb884e280b91d3c38c0..25a440e7e71fddb38cc515f99d15231675a8172e 100644
--- a/paddle/fluid/inference/analysis/analyzer_tester.cc
+++ b/paddle/fluid/inference/analysis/analyzer_tester.cc
@@ -13,13 +13,21 @@
 // limitations under the License.
 
 #include "paddle/fluid/inference/analysis/analyzer.h"
+#include <google/protobuf/text_format.h>
 #include "paddle/fluid/inference/analysis/ut_helper.h"
 
 namespace paddle {
 namespace inference {
 namespace analysis {
 
-TEST_F(DFG_Tester, main) {
+TEST_F(DFG_Tester, analysis_without_tensorrt) {
+  FLAGS_inference_analysis_enable_tensorrt_subgraph_engine = false;
+  Analyzer analyser;
+  analyser.Run(&argument);
+}
+
+TEST_F(DFG_Tester, analysis_with_tensorrt) {
+  FLAGS_inference_analysis_enable_tensorrt_subgraph_engine = true;
   Analyzer analyser;
   analyser.Run(&argument);
 }
diff --git a/paddle/fluid/inference/analysis/data_flow_graph.cc b/paddle/fluid/inference/analysis/data_flow_graph.cc
index bd24e8a7d9c20b8cd9c4e41a76ffc33a004a9a69..8a3af0a8ebd5bad7be7046fa399cca4920da3d71 100644
--- a/paddle/fluid/inference/analysis/data_flow_graph.cc
+++ b/paddle/fluid/inference/analysis/data_flow_graph.cc
@@ -222,10 +222,19 @@ Node *GraphTraits<DataFlowGraph>::NodesDFSIterator::operator->() {
   return stack_.top();
 }
 
+inline bool CheckNodeIndegreeEquals(const Node &node, size_t n) {
+  return node.inlinks.size() == n;
+}
+
 GraphTraits<DataFlowGraph>::NodesTSIterator::NodesTSIterator(
     const std::vector<Node *> &source) {
   PADDLE_ENFORCE(!source.empty(),
                  "Start points of topological sorting should not be empty!");
+  // Every start node of the topological sort must have an in-degree of 0.
+  for (auto *node : source) {
+    PADDLE_ENFORCE(CheckNodeIndegreeEquals(*node, 0));
+  }
+
   std::unordered_set<Node *> visited;
   std::unordered_set<Node *> to_visit{source.begin(), source.end()};
 
@@ -233,6 +242,11 @@ GraphTraits<DataFlowGraph>::NodesTSIterator::NodesTSIterator(
   while (!to_visit.empty()) {
     std::vector<Node *> queue(to_visit.begin(), to_visit.end());
     for (auto *p : queue) {
+      if (p->deleted()) {
+        visited.insert(p);
+        to_visit.erase(p);
+        continue;
+      }
       inlink_visited.clear();
 
       std::copy_if(p->inlinks.begin(), p->inlinks.end(),
@@ -292,6 +306,37 @@ Node *GraphTraits<DataFlowGraph>::NodesTSIterator::operator->() {
   return sorted_[cursor_];
 }
 
+std::pair<std::vector<Node *>, std::vector<Node *>>
+ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph) {  // NOLINT
+  std::unordered_set<Node *> nodes(graph.begin(), graph.end());
+  std::unordered_set<Node *> inputs;   // first element of the returned pair
+  std::unordered_set<Node *> outputs;  // second element of the returned pair
+  // Returns true if any inlink of the given node belongs to the sub-graph.
+  auto inlink_in_subgraph = [&](Node *n) {
+    for (auto *in : n->inlinks) {
+      if (nodes.count(in)) return true;
+    }
+    return false;
+  };
+  for (auto &node : graph) {
+    for (auto *in : node->inlinks) {
+      // A Value that is written by a node inside the sub-graph shouldn't be
+      // an input of the sub-graph.
+      if (!nodes.count(in) && in->type() == Node::Type::kValue &&
+          !inlink_in_subgraph(in)) {
+        inputs.insert(in);
+      }
+    }
+    for (auto *out : node->outlinks) {  // kValue outlinks outside the set
+      if (!nodes.count(out) && out->type() == Node::Type::kValue) {
+        outputs.insert(out);
+      }
+    }
+  }
+  return std::make_pair(std::vector<Node *>(inputs.begin(), inputs.end()),
+                        std::vector<Node *>(outputs.begin(), outputs.end()));
+}
+
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/data_flow_graph.h b/paddle/fluid/inference/analysis/data_flow_graph.h
index 5dd914d1971bfb5bcc0b1db41d73e2b67120bc06..1c60d5de21538043962cc58a6f508aea635fe8c4 100644
--- a/paddle/fluid/inference/analysis/data_flow_graph.h
+++ b/paddle/fluid/inference/analysis/data_flow_graph.h
@@ -133,7 +133,7 @@ struct GraphTraits<DataFlowGraph> {
 
    private:
     std::vector<Node *> sorted_;
-    int cursor_{0};
+    size_t cursor_{0};
   };
 
   explicit GraphTraits(DataFlowGraph *graph) : graph_(graph) {}
@@ -173,36 +173,8 @@ struct GraphTraits<DataFlowGraph> {
 // Extract the inputs and outputs of a graph. The inputs and outputs of a
 // sub-graph is the inputs nodes and output nodes that doesn't inside the
 // sub-graph.
-static std::pair<std::vector<Node *>, std::vector<Node *>>
-ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph) {  // NOLINT
-  std::unordered_set<Node *> nodes(graph.begin(), graph.end());
-  std::unordered_set<Node *> inputs;
-  std::unordered_set<Node *> outputs;
-  // Input a Value, check whether its inlink is in the subgraph.
-  auto inlink_in_subgraph = [&](Node *n) {
-    for (auto *in : n->inlinks) {
-      if (nodes.count(in)) return true;
-    }
-    return false;
-  };
-  for (auto &node : graph) {
-    for (auto *in : node->inlinks) {
-      // The Value that is written by nodes inside a sub-graph shouldn't be the
-      // input of the sub-graph.
-      if (!nodes.count(in) && in->type() == Node::Type::kValue &&
-          !inlink_in_subgraph(in)) {
-        inputs.insert(in);
-      }
-    }
-    for (auto *out : node->outlinks) {
-      if (!nodes.count(out) && out->type() == Node::Type::kValue) {
-        outputs.insert(out);
-      }
-    }
-  }
-  return std::make_pair(std::vector<Node *>(inputs.begin(), inputs.end()),
-                        std::vector<Node *>(outputs.begin(), outputs.end()));
-}
+std::pair<std::vector<Node *>, std::vector<Node *>>
+ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph);
 
 }  // namespace analysis
 }  // namespace inference
diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
index 29ca008123addf07959b965a4b54bf55b18c401d..2328d870422c5a31c22d7b09980aae35e01b2b25 100644
--- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
+++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
@@ -22,14 +22,18 @@
 
 namespace paddle {
 namespace inference {
+
+DEFINE_int32(tensorrt_max_batchsize, 300, "TensorRT maximum batch size");
+DEFINE_int32(tensorrt_workspace_size, 2048, "TensorRT workspace size");
+
 namespace analysis {
 
 using framework::proto::ProgramDesc;
 
 std::vector<std::string> ExtractParameters(
-    const std::vector<std::unique_ptr<Node>>& nodes);
+    const std::vector<std::unique_ptr<Node>> &nodes);
 
-bool DataFlowGraphToFluidPass::Initialize(Argument* argument) {
+bool DataFlowGraphToFluidPass::Initialize(Argument *argument) {
   ANALYSIS_ARGUMENT_CHECK_FIELD(argument)
   ANALYSIS_ARGUMENT_CHECK_FIELD(argument->origin_program_desc)
   PADDLE_ENFORCE(!argument->transformed_program_desc);
@@ -47,76 +51,77 @@ bool DataFlowGraphToFluidPass::Initialize(Argument* argument) {
 
 bool DataFlowGraphToFluidPass::Finalize() { return true; }
 
-void DataFlowGraphToFluidPass::Run(DataFlowGraph* graph) {
-  auto traits = GraphTraits<DataFlowGraph>(graph);
-  for (auto it = traits.nodes().begin(); it != traits.nodes().end(); ++it) {
-    if (it->deleted()) continue;
+void DataFlowGraphToFluidPass::Run(DataFlowGraph *graph) {
+  LOG(INFO) << "graph.inputs " << graph->inputs.size();
+  for (auto &node : GraphTraits<DataFlowGraph>(graph).nodes_in_TS()) {
+    if (node.deleted()) continue;
 
-    switch (it->type()) {
+    switch (node.type()) {
       case Node::Type::kFunction: {
-        LOG(INFO) << "add function " << it->repr();
-        AddFluidOp(&(*it));
+        LOG(INFO) << "add function " << node.repr();
+        AddFluidOp(&node);
       } break;
       case Node::Type::kFunctionBlock: {
-        LOG(INFO) << "add engine op " << it->repr() << " , "
-                  << static_cast<FunctionBlock*>(&(*it))->subgraph.size();
-        AddEngineOp(&(*it));
+        LOG(INFO) << "add engine op " << node.repr() << " , "
+                  << static_cast<FunctionBlock *>(&node)->subgraph.size();
+        AddEngineOp(&node);
       } break;
       default:
         continue;
     }
   }
+
+  PADDLE_ENFORCE(argument_->transformed_program_desc.get());
 }
 
-void DataFlowGraphToFluidPass::AddFluidOp(Node* node) {
-  auto* ori_op = static_cast<framework::proto::OpDesc*>(node->pb_desc());
+void DataFlowGraphToFluidPass::AddFluidOp(Node *node) {
+  auto *ori_op = static_cast<framework::proto::OpDesc *>(node->pb_desc());
   // currently only the main block is analyzed.
-  auto* main_block = desc_->mutable_blocks(framework::kRootBlockIndex);
-  auto* op = main_block->add_ops();
+  auto *main_block = desc_->mutable_blocks(framework::kRootBlockIndex);
+  auto *op = main_block->add_ops();
   *op = *ori_op;  // copy the attributes, by default, these will not be changed
-                  // by analysis phrase.
+  // by analysis phase.
   // The inputs and outputs of the existing ops are not changed by tensorrt
   // subgraph pass.
   // NOTE It might be changed by other passes in the long run.
 }
 
-void CreateTrtEngineOp(Node* node, const DataFlowGraph& graph,
-                       const framework::proto::BlockDesc& block) {
+void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph,
+                       const framework::proto::BlockDesc &block) {
   static int counter{0};
   PADDLE_ENFORCE(node->IsFunctionBlock());
   framework::OpDesc desc;
-  auto* func = static_cast<FunctionBlock*>(node);
+  auto *func = static_cast<FunctionBlock *>(node);
 
   // collect inputs
   std::vector<std::string> io;
-  for (auto* x : func->inlinks) {
+  for (auto *x : func->inlinks) {
     io.push_back(x->name());
   }
   desc.SetInput("Xs", io);
 
   // collect outputs
   io.clear();
-  for (auto* x : func->outlinks) {
+  for (auto *x : func->outlinks) {
     io.push_back(x->name());
   }
   desc.SetOutput("Ys", io);
-
   desc.SetType("tensorrt_engine");
+
+  PADDLE_ENFORCE(!block.vars().empty(), "the block has no var-desc");
   // Set attrs
   SetAttr(desc.Proto(), "subgraph", block.SerializeAsString());
-  SetAttr(desc.Proto(), "engine_unique_key",
-          "trt-" + std::to_string(counter++));
-  SetAttr(desc.Proto(), "max_batch", 100);  // TODO(Superjomn) add config latter
-  SetAttr(desc.Proto(), "max_workspace",
-          1024);  // TODO(Superjomn) add config latter
+  SetAttr(desc.Proto(), "engine_uniq_key", "trt-" + std::to_string(counter++));
+  SetAttr(desc.Proto(), "max_batch", FLAGS_tensorrt_max_batchsize);
+  SetAttr(desc.Proto(), "max_workspace", FLAGS_tensorrt_workspace_size);
   SetAttr(desc.Proto(), "parameters", ExtractParameters(graph.nodes.nodes()));
   node->SetPbMsg(desc.Proto()->SerializeAsString());
 }
 
 std::vector<std::string> ExtractParameters(
-    const std::vector<std::unique_ptr<Node>>& nodes) {
+    const std::vector<std::unique_ptr<Node>> &nodes) {
   std::vector<std::string> parameters;
-  for (const auto& node : nodes) {
+  for (const auto &node : nodes) {
     if (!node->IsValue()) continue;
     PADDLE_ENFORCE(!node->pb_msg().empty(), "pb_msg should be set first");
     framework::proto::VarDesc var;
@@ -128,21 +133,30 @@ std::vector<std::string> ExtractParameters(
   return parameters;
 }
 
-void DataFlowGraphToFluidPass::AddEngineOp(Node* node) {
+void DataFlowGraphToFluidPass::AddEngineOp(Node *node) {
   // TODO(Superjomn) Here need to expose some arguments for default setting.
   PADDLE_ENFORCE(node->IsFunctionBlock());
-  auto* block_node = static_cast<FunctionBlock*>(node);
+  auto *block_node = static_cast<FunctionBlock *>(node);
   framework::proto::BlockDesc proto;
   framework::BlockDesc block_desc(nullptr, &proto);
+  block_desc.Proto()->set_parent_idx(-1);
+  block_desc.Proto()->set_idx(0);
+  LOG(INFO) << "origin variable size: "
+            << argument_->origin_program_desc->blocks(0).vars().size();
+  LOG(INFO) << "transformed variable size: "
+            << block_desc.Proto()->vars().size();
   // copy ops.
-  for (auto* node : block_node->subgraph) {
-    auto* op = block_desc.AppendOp();
+  for (auto *node : block_node->subgraph) {
+    auto *op = block_desc.AppendOp();
     PADDLE_ENFORCE(!node->pb_msg().empty());
     op->Proto()->ParseFromString(node->pb_msg());
   }
+  *block_desc.Proto()->mutable_vars() =
+      argument_->origin_program_desc->blocks(0).vars();
+  PADDLE_ENFORCE(!block_desc.Proto()->vars().empty());
   CreateTrtEngineOp(node, *argument_->main_dfg, *block_desc.Proto());
-  auto* main_block = desc_->mutable_blocks(framework::kRootBlockIndex);
-  auto* op = main_block->add_ops();
+  auto *main_block = desc_->mutable_blocks(framework::kRootBlockIndex);
+  auto *op = main_block->add_ops();
   PADDLE_ENFORCE(!node->pb_msg().empty(), "failed to set desc for block");
   op->ParseFromString(node->pb_msg());
 }
@@ -151,7 +165,7 @@ namespace {
 class DFG_DebuggerPass : public DFG_GraphvizDrawPass {
  public:
   using Config = DFG_GraphvizDrawPass::Config;
-  explicit DFG_DebuggerPass(const Config& config)
+  explicit DFG_DebuggerPass(const Config &config)
       : DFG_GraphvizDrawPass(config) {}
 
   std::string repr() const override { return "dfg-to-fluid-debuger-pass"; }
@@ -160,7 +174,7 @@ class DFG_DebuggerPass : public DFG_GraphvizDrawPass {
 };
 }  // namespace
 
-Pass* DataFlowGraphToFluidPass::CreateGraphvizDebugerPass() const {
+Pass *DataFlowGraphToFluidPass::CreateGraphvizDebugerPass() const {
   return new DFG_DebuggerPass(DFG_GraphvizDrawPass::Config(
       FLAGS_inference_analysis_graphviz_log_root,
       "data_flow_graph_to_fluid_graphviz_debugger"));
diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h
index edc84b02ed20991e3e7c6c437d2b1fac169bae03..59c47365aa6c8ad5886c4515850d264f69cc4670 100644
--- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h
+++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h
@@ -26,6 +26,10 @@
 
 namespace paddle {
 namespace inference {
+
+DECLARE_int32(tensorrt_max_batchsize);
+DECLARE_int32(tensorrt_workspace_size);
+
 namespace analysis {
 class DataFlowGraphToFluidPass final : public DataFlowGraphPass {
  public:
diff --git a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc
index 162455b9c4e06b7fbb4bdede30444faf6a8a1509..65842b1e850953e77e3d4d28416609be271af9f1 100644
--- a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc
+++ b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc
@@ -40,7 +40,7 @@ TEST_F(DFG_Tester, dfg_graphviz_draw_pass_tester) {
     no++;
   }
   // DFG is sensitive to ProgramDesc, be careful to change the existing models.
-  ASSERT_EQ(no, 82);
+  ASSERT_EQ(no, 83);
 }
 
 }  // namespace analysis
diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc
index e918622d74cfb11d83090555be2a768cc14e7742..496921db9eabce1b1e40c7cb13089446ca93321c 100644
--- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc
+++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc
@@ -28,7 +28,6 @@ bool FluidToDataFlowGraphPass::Initialize(Argument *argument) {
   ANALYSIS_ARGUMENT_CHECK_FIELD(argument->origin_program_desc);
   PADDLE_ENFORCE(argument);
   if (!argument->main_dfg) {
-    LOG(INFO) << "Init DFG";
     argument->main_dfg.reset(new DataFlowGraph);
   }
   desc_ = argument->origin_program_desc.get();
@@ -51,6 +50,7 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) {
     v->SetPbMsg(var.SerializeAsString());
     var2id[var.name()] = v->id();
   }
+
   for (int i = 0; i < main_block.ops_size(); i++) {
     const auto &op = main_block.ops(i);
     auto *o = graph->nodes.Create(Node::Type::kFunction);
@@ -62,19 +62,31 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) {
     o->SetPbMsg(op.SerializeAsString());
 
     // set inputs and outputs
-    // TODO(Superjomn) make sure the InputNames is the real variable name.
+    std::unordered_set<Node *> inlinks;
     for (int j = 0; j < op.inputs_size(); j++) {
       auto &in_var = op.inputs(j);
       for (int k = 0; k < in_var.arguments_size(); k++) {
         auto *in = graph->nodes.GetMutable(var2id.at(in_var.arguments(k)));
         in->outlinks.push_back(o);
         o->inlinks.push_back(in);
+        inlinks.insert(in);
       }
     }
     for (int j = 0; j < op.outputs_size(); j++) {
       auto &out_var = op.outputs(j);
       for (int k = 0; k < out_var.arguments_size(); k++) {
         auto *out = graph->nodes.GetMutable(var2id[out_var.arguments(k)]);
+        if (inlinks.count(out)) {
+          // Loop found, for example, a = op(a), use SSA, change to a1 = op(a).
+          auto *out_alias = graph->nodes.Create(Node::Type::kValue);
+          out_alias->SetName(out->name());
+          out_alias->SetPbDesc(out->pb_desc());
+          out_alias->SetPbMsg(out->pb_msg());
+          var2id[out_alias->name()] = out_alias->id();  // update a -> a1
+          LOG(INFO) << "loop found in graph, create SSA alias node ["
+                    << out_alias->repr() << "] for [" << out->repr() << "]";
+          out = out_alias;
+        }
         out->inlinks.push_back(o);
         o->outlinks.push_back(out);
       }
diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc
index cbca5abdd5fff1672ba5d47a8876489c54ad6947..dadb84059d21adab44159a6145b345460663cb96 100644
--- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc
+++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc
@@ -24,12 +24,12 @@ namespace analysis {
 TEST_F(DFG_Tester, Init) {
   FluidToDataFlowGraphPass pass;
   pass.Initialize(&argument);
-  DataFlowGraph graph;
-  pass.Run(&graph);
+  pass.Run(argument.main_dfg.get());
   // Analysis is sensitive to ProgramDesc, careful to change the original model.
-  ASSERT_EQ(graph.nodes.size(), 37UL);
+  ASSERT_EQ(argument.main_dfg->nodes.size(), 38UL);
   pass.Finalize();
-  LOG(INFO) << '\n' << graph.DotString();
+  ASSERT_FALSE(argument.main_dfg->DotString().empty());
+  EXPECT_FALSE(argument.main_dfg->inputs.empty());
 }
 
 }  // namespace analysis
diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc
index 9993de22800bc0aafdcbf46618e6b479ac1eb187..faf876de6d65d20cf7a084cd97392cfc8d791a42 100644
--- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc
@@ -25,6 +25,9 @@ TensorRTSubGraphPass::TensorRTSubGraphPass(
 
 void TensorRTSubGraphPass::Run(DataFlowGraph *graph) {
   SubGraphFuse(graph, node_inside_subgraph_teller_)();
+  VLOG(4) << "debug info "
+          << graph->HumanReadableInfo(false /*show_values*/,
+                                      true /*show_functions*/);
 }
 
 }  // namespace analysis
diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt
index 9d63d08dedf6a1bcdacc51bb83d2ed261bca4117..e28e144fd54cec06b0228ac9c478de7c641455a0 100644
--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
@@ -82,7 +82,7 @@ inference_api_test(test_api_impl
 if(WITH_GPU AND TENSORRT_FOUND)
 cc_library(paddle_inference_tensorrt_subgraph_engine
         SRCS api_tensorrt_subgraph_engine.cc
-        DEPS paddle_inference_api analysis tensorrt_engine paddle_fluid_api)
+        DEPS paddle_inference_api analysis tensorrt_engine paddle_inference_api paddle_fluid_api tensorrt_converter)
 
 inference_api_test(test_api_tensorrt_subgraph_engine ARGS test_word2vec)
 endif()
diff --git a/paddle/fluid/inference/api/api_anakin_engine.cc b/paddle/fluid/inference/api/api_anakin_engine.cc
index f6f3cb335897b02905e24c229b92f3940a37dbf8..0206ac60103759deda91be741617bde63e003de6 100644
--- a/paddle/fluid/inference/api/api_anakin_engine.cc
+++ b/paddle/fluid/inference/api/api_anakin_engine.cc
@@ -39,7 +39,7 @@ bool PaddleInferenceAnakinPredictor::Init(const AnakinConfig &config) {
 
 bool PaddleInferenceAnakinPredictor::Run(
     const std::vector<PaddleTensor> &inputs,
-    std::vector<PaddleTensor> *output_data) {
+    std::vector<PaddleTensor> *output_data, int batch_size) {
   for (const auto &input : inputs) {
     if (input.dtype != PaddleDType::FLOAT32) {
       LOG(ERROR) << "Only support float type inputs. " << input.name
diff --git a/paddle/fluid/inference/api/api_anakin_engine.h b/paddle/fluid/inference/api/api_anakin_engine.h
index 85ca83cd00756cca04d7b92437e9955d8ab297e7..def096c867ec85624f5b221782ef8b6240923c05 100644
--- a/paddle/fluid/inference/api/api_anakin_engine.h
+++ b/paddle/fluid/inference/api/api_anakin_engine.h
@@ -37,7 +37,8 @@ class PaddleInferenceAnakinPredictor : public PaddlePredictor {
   // NOTE Unlike the native engine, the buffers of anakin engine's output_data
   // should be allocated first.
   bool Run(const std::vector<PaddleTensor>& inputs,
-           std::vector<PaddleTensor>* output_data) override;
+           std::vector<PaddleTensor>* output_data,
+           int batch_size = -1) override;
 
   std::unique_ptr<PaddlePredictor> Clone() override;
 
diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc
index 786dc8e827806a9cea9dc01788fada2fd754b930..3ae255e13fc4f3ca28a6af62a5d5944d84303fc7 100644
--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@@ -108,7 +108,8 @@ NativePaddlePredictor::~NativePaddlePredictor() {
 }
 
 bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
-                                std::vector<PaddleTensor> *output_data) {
+                                std::vector<PaddleTensor> *output_data,
+                                int batch_size) {
   VLOG(3) << "Predictor::predict";
   Timer timer;
   timer.tic();
diff --git a/paddle/fluid/inference/api/api_impl.h b/paddle/fluid/inference/api/api_impl.h
index 92e693578ab657004f3c40c09b979897afea1e1f..4f28c3cd34bade4189871210e6168c6c1c610c2c 100644
--- a/paddle/fluid/inference/api/api_impl.h
+++ b/paddle/fluid/inference/api/api_impl.h
@@ -38,7 +38,8 @@ class NativePaddlePredictor : public PaddlePredictor {
   bool Init(std::shared_ptr<framework::Scope> parent_scope);
 
   bool Run(const std::vector<PaddleTensor> &inputs,
-           std::vector<PaddleTensor> *output_data) override;
+           std::vector<PaddleTensor> *output_data,
+           int batch_size = -1) override;
 
   std::unique_ptr<PaddlePredictor> Clone() override;
 
diff --git a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc
index 0cdc88fa1eaf3935ce0da143e1e91eb84cd70dcf..c0891e9c281961fa03d278a0f5c676f92672c419 100644
--- a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc
+++ b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc
@@ -16,6 +16,7 @@
 #include "paddle/fluid/inference/api/api_impl.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/inference/utils/singleton.h"
+#include "paddle/fluid/operators/tensorrt_engine_op.h"
 
 namespace paddle {
 
@@ -64,16 +65,7 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor {
       return false;
     }
 
-    // Analyze inference_program
-    Argument argument;
-    argument.origin_program_desc.reset(
-        new ProgramDesc(*inference_program_->Proto()));
-    Singleton<Analyzer>::Global().Run(&argument);
-    CHECK(argument.transformed_program_desc);
-    VLOG(5) << "transformed program:\n"
-            << argument.transformed_program_desc->SerializeAsString();
-    VLOG(5) << "to prepare executor";
-    *inference_program_->Proto() = *argument.transformed_program_desc;
+    OptimizeInferenceProgram();
     ctx_ = executor_->Prepare(*inference_program_, 0);
 
     VLOG(5) << "to create variables";
@@ -86,6 +78,29 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor {
     return true;
   }
 
+  bool Run(const std::vector<PaddleTensor>& inputs,
+           std::vector<PaddleTensor>* output_data,
+           int batch_size = -1) override {
+    PADDLE_ENFORCE_GT(batch_size, 0,
+                      "TensorRT engine needs the argument batch_size set");
+    FLAGS_tensorrt_engine_batch_size = batch_size;
+    return NativePaddlePredictor::Run(inputs, output_data, batch_size);
+  }
+
+  void OptimizeInferenceProgram() {
+    // Analyze inference_program
+    Argument argument;
+    argument.origin_program_desc.reset(
+        new ProgramDesc(*inference_program_->Proto()));
+    Singleton<Analyzer>::Global().Run(&argument);
+    CHECK(argument.transformed_program_desc);
+    VLOG(5) << "transformed program:\n"
+            << argument.transformed_program_desc->SerializeAsString();
+    VLOG(5) << "to prepare executor";
+    inference_program_.reset(
+        new framework::ProgramDesc(*argument.transformed_program_desc));
+  }
+
  private:
   TensorRTConfig config_;
 };
diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h
index b8ba2d14a5c161d491d838888ea14b776f769f23..2f8b4f8596946988a728b5cf82de251bfda778a9 100644
--- a/paddle/fluid/inference/api/paddle_inference_api.h
+++ b/paddle/fluid/inference/api/paddle_inference_api.h
@@ -98,7 +98,8 @@ class PaddlePredictor {
   // responsible for the output tensor's buffer, either allocated or passed from
   // outside.
   virtual bool Run(const std::vector<PaddleTensor>& inputs,
-                   std::vector<PaddleTensor>* output_data) = 0;
+                   std::vector<PaddleTensor>* output_data,
+                   int batch_size = -1) = 0;
 
   // Clone a predictor that share the model weights, the Cloned predictor should
   // be thread-safe.
diff --git a/paddle/fluid/inference/api/test_api.cc b/paddle/fluid/inference/api/test_api.cc
index ac8a21a22be6f27311b8ae2507d04d9d1b510e76..7a579610eefda24c911edd28b5f3a178aa10ab1e 100644
--- a/paddle/fluid/inference/api/test_api.cc
+++ b/paddle/fluid/inference/api/test_api.cc
@@ -35,7 +35,8 @@ class DemoPredictor : public PaddlePredictor {
     LOG(INFO) << "I get other_config " << config.other_config;
   }
   bool Run(const std::vector<PaddleTensor> &inputs,
-           std::vector<PaddleTensor> *output_data) override {
+           std::vector<PaddleTensor> *output_data,
+           int batch_size = -1) override {  // default matches the base class
     LOG(INFO) << "Run";
     return false;
   }
diff --git a/paddle/fluid/inference/api/test_api_tensorrt_subgraph_engine.cc b/paddle/fluid/inference/api/test_api_tensorrt_subgraph_engine.cc
index 585f6d29376c3341c21ff76361d5335512c1b1b6..62d98a796708612e7d4ff8abfd85125978ce22c7 100644
--- a/paddle/fluid/inference/api/test_api_tensorrt_subgraph_engine.cc
+++ b/paddle/fluid/inference/api/test_api_tensorrt_subgraph_engine.cc
@@ -15,50 +15,79 @@
 #include <gflags/gflags.h>
 #include <glog/logging.h>
 #include <gtest/gtest.h>
+#include "paddle/fluid/inference/analysis/analyzer.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 
 namespace paddle {
 
 DEFINE_string(dirname, "", "Directory of the inference model.");
 
-void Main(bool use_gpu) {
+void CompareTensorRTWithFluid(bool enable_tensorrt) {
+  FLAGS_inference_analysis_enable_tensorrt_subgraph_engine = enable_tensorrt;
+
   //# 1. Create PaddlePredictor with a config.
-  TensorRTConfig config;
-  config.model_dir = FLAGS_dirname + "word2vec.inference.model";
-  config.use_gpu = use_gpu;
-  config.fraction_of_gpu_memory = 0.15;
-  config.device = 0;
-  auto predictor =
+  NativeConfig config0;
+  config0.model_dir = FLAGS_dirname + "word2vec.inference.model";
+  config0.use_gpu = true;
+  config0.fraction_of_gpu_memory = 0.3;
+  config0.device = 0;
+
+  TensorRTConfig config1;
+  config1.model_dir = FLAGS_dirname + "word2vec.inference.model";
+  config1.use_gpu = true;
+  config1.fraction_of_gpu_memory = 0.3;
+  config1.device = 0;
+
+  auto predictor0 =
+      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config0);
+  auto predictor1 =
       CreatePaddlePredictor<TensorRTConfig,
-                            PaddleEngineKind::kAutoMixedTensorRT>(config);
+                            PaddleEngineKind::kAutoMixedTensorRT>(config1);
 
-  for (int batch_id = 0; batch_id < 3; batch_id++) {
+  for (int batch_id = 0; batch_id < 1; batch_id++) {
     //# 2. Prepare input.
-    int64_t data[4] = {1, 2, 3, 4};
+    std::vector<int64_t> data(20);
+    for (int i = 0; i < 20; i++) data[i] = i;
 
-    PaddleTensor tensor{.name = "",
-                        .shape = std::vector<int>({4, 1}),
-                        .data = PaddleBuf(data, sizeof(data)),
-                        .dtype = PaddleDType::INT64};
+    PaddleTensor tensor{
+        .name = "",
+        .shape = std::vector<int>({10, 1}),
+        .data = PaddleBuf(data.data(), data.size() * sizeof(int64_t)),
+        .dtype = PaddleDType::INT64};
 
     // For simplicity, we set all the slots with the same data.
     std::vector<PaddleTensor> slots(4, tensor);
 
     //# 3. Run
-    std::vector<PaddleTensor> outputs;
-    CHECK(predictor->Run(slots, &outputs));
+    std::vector<PaddleTensor> outputs0;
+    std::vector<PaddleTensor> outputs1;
+    CHECK(predictor0->Run(slots, &outputs0));
+    CHECK(predictor1->Run(slots, &outputs1, 10));
 
     //# 4. Get output.
-    ASSERT_EQ(outputs.size(), 1UL);
-    LOG(INFO) << "output buffer size: " << outputs.front().data.length();
-    const size_t num_elements = outputs.front().data.length() / sizeof(float);
-    // The outputs' buffers are in CPU memory.
-    for (size_t i = 0; i < std::min(5UL, num_elements); i++) {
-      LOG(INFO) << static_cast<float*>(outputs.front().data.data())[i];
+    ASSERT_EQ(outputs0.size(), 1UL);
+    ASSERT_EQ(outputs1.size(), 1UL);
+
+    const size_t num_elements = outputs0.front().data.length() / sizeof(float);
+    const size_t num_elements1 = outputs1.front().data.length() / sizeof(float);
+    EXPECT_EQ(num_elements, num_elements1);
+
+    auto *data0 = static_cast<float *>(outputs0.front().data.data());
+    auto *data1 = static_cast<float *>(outputs1.front().data.data());
+
+    ASSERT_GT(num_elements, 0UL);
+    for (size_t i = 0; i < std::min(num_elements, num_elements1); i++) {
+      EXPECT_NEAR(data0[i], data1[i], 1e-3);
     }
   }
 }
 
-TEST(paddle_inference_api_tensorrt_subgraph_engine, main) { Main(true); }
+TEST(paddle_inference_api_tensorrt_subgraph_engine, without_tensorrt) {
+  CompareTensorRTWithFluid(false);
+}
+
+TEST(paddle_inference_api_tensorrt_subgraph_engine, with_tensorrt) {
+  CompareTensorRTWithFluid(true);
+}
 
 }  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h
index 6697952051c4b1997ca6b550da17a52e64cb3454..968f7eb99ce8519edaa585fd3cb642bd80cc63cc 100644
--- a/paddle/fluid/inference/tensorrt/convert/op_converter.h
+++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h
@@ -93,6 +93,10 @@ class OpConverter {
   framework::Scope* scope_{nullptr};
 };
 
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
 #define REGISTER_TRT_OP_CONVERTER(op_type__, Converter__)                      \
   struct trt_##op_type__##_converter : public ::paddle::framework::Registrar { \
     trt_##op_type__##_converter() {                                            \
@@ -111,7 +115,3 @@ class OpConverter {
   extern int TouchConverterRegister_##op_type__();                      \
   static int use_op_converter_trt_##op_type__ __attribute__((unused)) = \
       TouchConverterRegister_##op_type__();
-
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc
index 596e0fe9da3d272ecb1c0f8dbef09a75d08a4b1a..fefec0df6d03669a294ce9643b666d7416593708 100644
--- a/paddle/fluid/inference/tensorrt/engine.cc
+++ b/paddle/fluid/inference/tensorrt/engine.cc
@@ -26,18 +26,20 @@ namespace paddle {
 namespace inference {
 namespace tensorrt {
 
-void TensorRTEngine::Build(const DescType& paddle_model) {
+void TensorRTEngine::Build(const DescType &paddle_model) {
   PADDLE_ENFORCE(false, "not implemented");
 }
 
 void TensorRTEngine::Execute(int batch_size) {
-  std::vector<void*> buffers;
-  for (auto& buf : buffers_) {
+  batch_size_ = batch_size;
+  std::vector<void *> buffers;
+  for (auto &buf : buffers_) {
     PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated");
     PADDLE_ENFORCE_GT(buf.max_size, 0);
     PADDLE_ENFORCE(buf.device == DeviceType::GPU);
     buffers.push_back(buf.buffer);
   }
+  PADDLE_ENFORCE_NOT_NULL(stream_);
   infer_context_->enqueue(batch_size, buffers.data(), *stream_, nullptr);
   cudaStreamSynchronize(*stream_);
 }
@@ -45,7 +47,7 @@ void TensorRTEngine::Execute(int batch_size) {
 TensorRTEngine::~TensorRTEngine() {
   cudaStreamSynchronize(*stream_);
   // clean buffer
-  for (auto& buf : buffers_) {
+  for (auto &buf : buffers_) {
     if (buf.device == DeviceType::GPU && buf.buffer != nullptr) {
       PADDLE_ENFORCE_EQ(0, cudaFree(buf.buffer));
       buf.buffer = nullptr;
@@ -70,32 +72,37 @@ void TensorRTEngine::FreezeNetwork() {
 
   // allocate GPU buffers.
   buffers_.resize(buffer_sizes_.size());
-  for (auto& item : buffer_sizes_) {
+  for (auto &item : buffer_sizes_) {
+    // The output buffers are not set in the network building phase, need to
+    // infer from the TensorRT network.
     if (item.second == 0) {
       auto slot_offset = infer_engine_->getBindingIndex(item.first.c_str());
       auto dims = infer_engine_->getBindingDimensions(slot_offset);
       item.second = kDataTypeSize[static_cast<int>(
                         infer_engine_->getBindingDataType(slot_offset))] *
                     analysis::AccuDims(dims.d, dims.nbDims);
+      PADDLE_ENFORCE_GT(item.second, 0);
     }
-    auto& buf = buffer(item.first);
+
+    auto &buf = buffer(item.first);
+    buf.max_size = item.second * max_batch_;
     CHECK(buf.buffer == nullptr);  // buffer should be allocated only once.
-    PADDLE_ENFORCE_EQ(0, cudaMalloc(&buf.buffer, item.second));
-    VLOG(4) << "buffer malloc " << item.first << " " << item.second << " "
-            << buf.buffer;
-    buf.size = buf.max_size = item.second;
+    PADDLE_ENFORCE_LE(buf.max_size, 1 << 30);  // at most 1 GiB
+    PADDLE_ENFORCE_EQ(0, cudaMalloc(&buf.buffer, buf.max_size));
+    // buf.size is set at runtime (see SetInputFrom*/GetOutputInCPU).
+    buf.size = 0;
     buf.device = DeviceType::GPU;
   }
 }
 
-nvinfer1::ITensor* TensorRTEngine::DeclareInput(const std::string& name,
+nvinfer1::ITensor *TensorRTEngine::DeclareInput(const std::string &name,
                                                 nvinfer1::DataType dtype,
-                                                const nvinfer1::Dims& dims) {
+                                                const nvinfer1::Dims &dims) {
   PADDLE_ENFORCE_EQ(0, buffer_sizes_.count(name), "duplicate input name %s",
                     name);
 
   PADDLE_ENFORCE(infer_network_ != nullptr, "should initnetwork first");
-  auto* input = infer_network_->addInput(name.c_str(), dtype, dims);
+  auto *input = infer_network_->addInput(name.c_str(), dtype, dims);
   PADDLE_ENFORCE(input, "infer network add input %s failed", name);
   buffer_sizes_[name] = kDataTypeSize[static_cast<int>(dtype)] *
                         analysis::AccuDims(dims.d, dims.nbDims);
@@ -104,12 +111,12 @@ nvinfer1::ITensor* TensorRTEngine::DeclareInput(const std::string& name,
   return input;
 }
 
-void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer* layer, int offset,
-                                   const std::string& name) {
+void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer *layer, int offset,
+                                   const std::string &name) {
   PADDLE_ENFORCE_EQ(0, buffer_sizes_.count(name), "duplicate output name %s",
                     name);
 
-  auto* output = layer->getOutput(offset);
+  auto *output = layer->getOutput(offset);
   SetITensor(name, output);
   PADDLE_ENFORCE(output != nullptr);
   output->setName(name.c_str());
@@ -121,11 +128,11 @@ void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer* layer, int offset,
   buffer_sizes_[name] = 0;
 }
 
-void TensorRTEngine::DeclareOutput(const std::string& name) {
+void TensorRTEngine::DeclareOutput(const std::string &name) {
   PADDLE_ENFORCE_EQ(0, buffer_sizes_.count(name), "duplicate output name %s",
                     name);
 
-  auto* output = TensorRTEngine::GetITensor(name);
+  auto *output = TensorRTEngine::GetITensor(name);
   PADDLE_ENFORCE(output != nullptr);
   output->setName(name.c_str());
   PADDLE_ENFORCE(!output->isNetworkInput());
@@ -135,38 +142,45 @@ void TensorRTEngine::DeclareOutput(const std::string& name) {
   buffer_sizes_[name] = 0;
 }
 
-void* TensorRTEngine::GetOutputInGPU(const std::string& name) {
+void *TensorRTEngine::GetOutputInGPU(const std::string &name) {
   return buffer(name).buffer;
 }
 
-void TensorRTEngine::GetOutputInGPU(const std::string& name, void* dst,
+void TensorRTEngine::GetOutputInGPU(const std::string &name, void *dst,
                                     size_t max_size) {
   // determine data size
   auto it = buffer_sizes_.find(name);
   PADDLE_ENFORCE(it != buffer_sizes_.end());
   PADDLE_ENFORCE_GT(it->second, 0);
   PADDLE_ENFORCE_GE(max_size, it->second);
-  auto& buf = buffer(name);
+  auto &buf = buffer(name);
   PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before");
   PADDLE_ENFORCE_EQ(cudaMemcpyAsync(dst, buf.buffer, it->second,
                                     cudaMemcpyDeviceToDevice, *stream_),
                     0);
 }
 
-void TensorRTEngine::GetOutputInCPU(const std::string& name, void* dst,
+void TensorRTEngine::GetOutputInCPU(const std::string &name, void *dst,
                                     size_t max_size) {
+  VLOG(4) << "get output in cpu";
+  auto &buf = buffer(name);
+
+  // Update needed buffer size.
+  auto slot_offset = infer_engine_->getBindingIndex(name.c_str());
+  auto dims = infer_engine_->getBindingDimensions(slot_offset);
+  buf.size = kDataTypeSize[static_cast<int>(
+                 infer_engine_->getBindingDataType(slot_offset))] *
+             analysis::AccuDims(dims.d, dims.nbDims);
+  PADDLE_ENFORCE_LE(buf.size, buf.max_size);
   // determine data size
-  auto it = buffer_sizes_.find(name);
-  PADDLE_ENFORCE(it != buffer_sizes_.end());
-  PADDLE_ENFORCE_GT(it->second, 0);
-  PADDLE_ENFORCE_GE(max_size, it->second);
-  auto& buf = buffer(name);
   PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before");
-  PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(dst, buf.buffer, it->second,
-                                       cudaMemcpyDeviceToHost, *stream_));
+  // dst must be able to hold the whole output; fail instead of overflowing.
+  PADDLE_ENFORCE_GE(max_size, buf.size);
+  PADDLE_ENFORCE_EQ(
+      0, cudaMemcpy(dst, buf.buffer, buf.size, cudaMemcpyDeviceToHost));
 }
 
-Buffer& TensorRTEngine::buffer(const std::string& name) {
+Buffer &TensorRTEngine::buffer(const std::string &name) {
   PADDLE_ENFORCE(infer_engine_ != nullptr, "call FreezeNetwork first.");
   auto it = buffer_sizes_.find(name);
   PADDLE_ENFORCE(it != buffer_sizes_.end());
@@ -174,19 +188,23 @@ Buffer& TensorRTEngine::buffer(const std::string& name) {
   return buffers_[slot_offset];
 }
 
-void TensorRTEngine::SetInputFromCPU(const std::string& name, const void* data,
+void TensorRTEngine::SetInputFromCPU(const std::string &name, const void *data,
                                      size_t size) {
-  auto& buf = buffer(name);
+  auto &buf = buffer(name);
   PADDLE_ENFORCE_NOT_NULL(buf.buffer);
+  PADDLE_ENFORCE_NOT_NULL(data);
+  PADDLE_ENFORCE_NOT_NULL(stream_);
   PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small");
   PADDLE_ENFORCE(buf.device == DeviceType::GPU);
+  buf.size = size;
   PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(buf.buffer, data, size,
                                        cudaMemcpyHostToDevice, *stream_));
 }
 
-void TensorRTEngine::SetInputFromGPU(const std::string& name, const void* data,
+void TensorRTEngine::SetInputFromGPU(const std::string &name, const void *data,
                                      size_t size) {
-  auto& buf = buffer(name);
+  auto &buf = buffer(name);
+  buf.size = size;
   PADDLE_ENFORCE_NOT_NULL(buf.buffer);
   PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small");
   PADDLE_ENFORCE(buf.device == DeviceType::GPU);
@@ -194,15 +212,15 @@ void TensorRTEngine::SetInputFromGPU(const std::string& name, const void* data,
                                        cudaMemcpyDeviceToDevice, *stream_));
 }
 
-void TensorRTEngine::SetITensor(const std::string& name,
-                                nvinfer1::ITensor* tensor) {
+void TensorRTEngine::SetITensor(const std::string &name,
+                                nvinfer1::ITensor *tensor) {
   PADDLE_ENFORCE(tensor != nullptr);
   PADDLE_ENFORCE_EQ(0, itensor_map_.count(name), "duplicate ITensor name %s",
                     name);
   itensor_map_[name] = tensor;
 }
 
-nvinfer1::ITensor* TensorRTEngine::GetITensor(const std::string& name) {
+nvinfer1::ITensor *TensorRTEngine::GetITensor(const std::string &name) {
   PADDLE_ENFORCE(itensor_map_.count(name), "no ITensor %s", name);
   return itensor_map_[name];
 }
diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h
index b06a9bbc6758ae9410b2fce99ef2b1a9e7ab98c0..7064d333f6db754f88c0ac6956a9527a48bf866c 100644
--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
@@ -57,7 +57,9 @@ class TensorRTEngine : public EngineBase {
       : max_batch_(max_batch),
         max_workspace_(max_workspace),
         stream_(stream ? stream : &default_stream_),
-        logger_(logger) {}
+        logger_(logger) {
+    cudaStreamCreate(&default_stream_);
+  }
 
   virtual ~TensorRTEngine();
 
@@ -121,6 +123,9 @@ class TensorRTEngine : public EngineBase {
   int max_batch_;
   // the max memory size the engine uses
   int max_workspace_;
+
+  // Batch size of the current data; updated on every execution.
+  int batch_size_{-1};
   cudaStream_t* stream_;
   // If stream_ is not set from outside, hold its own stream.
   cudaStream_t default_stream_;
diff --git a/paddle/fluid/inference/tensorrt/test_engine.cc b/paddle/fluid/inference/tensorrt/test_engine.cc
index e635f0f87d577a1f1ac74687ee60f762be525418..fca3488008ed83418b5e28b8af42d8019aaaa2a4 100644
--- a/paddle/fluid/inference/tensorrt/test_engine.cc
+++ b/paddle/fluid/inference/tensorrt/test_engine.cc
@@ -103,6 +103,10 @@ TEST_F(TensorRTEngineTest, add_layer_multi_dim) {
 
   LOG(INFO) << "to get output";
   float y_cpu[2] = {-1., -1.};
+  auto dims = engine_->GetITensor("y")->getDimensions();
+  ASSERT_EQ(dims.nbDims, 3);
+  ASSERT_EQ(dims.d[0], 2);
+  ASSERT_EQ(dims.d[1], 1);
   engine_->GetOutputInCPU("y", &y_cpu[0], sizeof(float) * 2);
   ASSERT_EQ(y_cpu[0], 4.5);
   ASSERT_EQ(y_cpu[1], 14.5);
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index d265150f25419509126028e36e629aee3ee6bd0f..4e2002ad24415437ae4f85eba0e90a6c689e2996 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -168,6 +168,8 @@ function(op_library TARGET)
         file(APPEND ${pybind_file} "USE_OP(relu);\n")
       elseif(${TARGET} STREQUAL "fake_dequantize")
         file(APPEND ${pybind_file} "USE_OP(fake_dequantize_max_abs);\n")
+      elseif(${TARGET} STREQUAL "tensorrt_engine_op")
+          message(STATUS "Pybind skips [tensorrt_engine_op], for this OP is only used in inference")
       else()
         file(APPEND ${pybind_file} "USE_OP(${TARGET});\n")
       endif()
@@ -237,9 +239,9 @@ op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax)
 op_library(softmax_op DEPS softmax)
 op_library(sequence_softmax_op DEPS softmax)
 if (WITH_GPU AND TENSORRT_FOUND)
-    op_library(tensorrt_engine_op DEPS tensorrt_engine)
+    op_library(tensorrt_engine_op DEPS tensorrt_engine tensorrt_converter)
     nv_test(test_tensorrt_engine_op SRCS tensorrt_engine_op_test.cc
-      DEPS tensorrt_engine_op tensorrt_engine tensorrt_converter
+      DEPS tensorrt_engine_op
       analysis)
 else()
     set(DEPS_OPS ${DEPS_OPS} tensorrt_engine_op)
diff --git a/paddle/fluid/operators/auc_op.cc b/paddle/fluid/operators/auc_op.cc
index 6bd3e491bccb037406b784147dc9f91049b34d53..5edecd18e673da326ec119cf9a383f24f8045089 100644
--- a/paddle/fluid/operators/auc_op.cc
+++ b/paddle/fluid/operators/auc_op.cc
@@ -24,15 +24,16 @@ class AucOp : public framework::OperatorWithKernel {
 
  protected:
   void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Out"), "Input of Out should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Indices"),
-                   "Input of Indices should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Predict"),
+                   "Input of Predict should not be null.");
     PADDLE_ENFORCE(ctx->HasInput("Label"),
                    "Input of Label should not be null.");
-    auto inference_height = ctx->GetInputDim("Out")[0];
+    auto predict_width = ctx->GetInputDim("Predict")[1];
+    PADDLE_ENFORCE_EQ(predict_width, 2, "Only support binary classification");
+    auto predict_height = ctx->GetInputDim("Predict")[0];
     auto label_height = ctx->GetInputDim("Label")[0];
 
-    PADDLE_ENFORCE_EQ(inference_height, label_height,
-                      "Out and Label should have same height.");
+    PADDLE_ENFORCE_EQ(predict_height, label_height,
+                      "Predict and Label should have same height.");
 
     int num_thres = ctx->Attrs().Get<int>("num_thresholds");
@@ -43,14 +44,14 @@ class AucOp : public framework::OperatorWithKernel {
     ctx->SetOutputDim("FPOut", {num_thres});
     ctx->SetOutputDim("FNOut", {num_thres});
 
-    ctx->ShareLoD("Out", /*->*/ "AUC");
+    ctx->ShareLoD("Predict", /*->*/ "AUC");
   }
 
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
     return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<Tensor>("Out")->type()),
+        framework::ToDataType(ctx.Input<Tensor>("Predict")->type()),
         ctx.device_context());
   }
 };
@@ -58,18 +59,13 @@ class AucOp : public framework::OperatorWithKernel {
 class AucOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
-    AddInput("Out",
-             "A floating point 2D tensor, values are in the range [0, 1]."
-             "Each row is sorted in descending order. This input should be the"
-             "output of topk."
+    AddInput("Predict",
+             "A floating point 2D tensor with shape [batch_size, 2], values "
+             "are in the range [0, 1]."
              "Typically, this tensor indicates the probability of each label");
-    AddInput("Indices",
-             "An int 2D tensor, indicating the indices of original"
-             "tensor before sorting. Typically, this tensor indicates which "
-             "label the probability stands for.");
     AddInput("Label",
-             "A 2D int tensor indicating the label of the training data."
-             "The height is batch size and width is always 1.");
+             "A 2D int tensor indicating the label of the training data. "
+             "shape: [batch_size, 1]");
     AddInput("TP", "True-Positive value.");
     AddInput("FP", "False-Positive value.");
     AddInput("TN", "True-Negative value.");
diff --git a/paddle/fluid/operators/auc_op.h b/paddle/fluid/operators/auc_op.h
index 58fefc1600dfb7df3e3d71959c047865ed5e2e39..0a18585edb54a76aff5ae72ecc71e0eebb9f9361 100644
--- a/paddle/fluid/operators/auc_op.h
+++ b/paddle/fluid/operators/auc_op.h
@@ -31,7 +31,7 @@ template <typename DeviceContext, typename T>
 class AucKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* inference = ctx.Input<Tensor>("Out");
+    auto* predict = ctx.Input<Tensor>("Predict");
     auto* label = ctx.Input<Tensor>("Label");
     auto* auc = ctx.Output<Tensor>("AUC");
     // Only use output var for now, make sure it's persistable and
@@ -41,24 +41,24 @@ class AucKernel : public framework::OpKernel<T> {
     auto* true_negative = ctx.Output<Tensor>("TNOut");
     auto* false_negative = ctx.Output<Tensor>("FNOut");
 
-    float* auc_data = auc->mutable_data<float>(ctx.GetPlace());
+    auto* auc_data = auc->mutable_data<double>(ctx.GetPlace());
 
     std::string curve = ctx.Attr<std::string>("curve");
     int num_thresholds = ctx.Attr<int>("num_thresholds");
-    std::vector<float> thresholds_list;
-    thresholds_list.reserve(num_thresholds);
+    std::vector<double> thresholds_list;
+    thresholds_list.resize(num_thresholds);  // entries are set via [] below
     for (int i = 1; i < num_thresholds - 1; i++) {
-      thresholds_list[i] = static_cast<float>(i) / (num_thresholds - 1);
+      thresholds_list[i] = static_cast<double>(i) / (num_thresholds - 1);
     }
-    const float kEpsilon = 1e-7;
+    const double kEpsilon = 1e-7;
     thresholds_list[0] = 0.0f - kEpsilon;
     thresholds_list[num_thresholds - 1] = 1.0f + kEpsilon;
 
-    size_t batch_size = inference->dims()[0];
-    size_t inference_width = inference->dims()[1];
+    size_t batch_size = predict->dims()[0];
+    size_t inference_width = predict->dims()[1];
 
-    const T* inference_data = inference->data<T>();
-    const int64_t* label_data = label->data<int64_t>();
+    const T* inference_data = predict->data<T>();
+    const auto* label_data = label->data<int64_t>();
 
     auto* tp_data = true_positive->mutable_data<int64_t>(ctx.GetPlace());
     auto* fn_data = false_negative->mutable_data<int64_t>(ctx.GetPlace());
@@ -66,20 +66,19 @@ class AucKernel : public framework::OpKernel<T> {
     auto* fp_data = false_positive->mutable_data<int64_t>(ctx.GetPlace());
 
     for (int idx_thresh = 0; idx_thresh < num_thresholds; idx_thresh++) {
-      // caculate TP, FN, TN, FP for current thresh
+      // calculate TP, FN, TN, FP for current thresh
       int64_t tp = 0, fn = 0, tn = 0, fp = 0;
       for (size_t i = 0; i < batch_size; i++) {
-        // NOTE: label_data used as bool, labels >0 will be treated as true.
+        // NOTE: label_data used as bool, labels > 0 will be treated as true.
         if (label_data[i]) {
-          // use first(max) data in each row
-          if (inference_data[i * inference_width] >=
+          if (inference_data[i * inference_width + 1] >=
               (thresholds_list[idx_thresh])) {
             tp++;
           } else {
             fn++;
           }
         } else {
-          if (inference_data[i * inference_width] >=
+          if (inference_data[i * inference_width + 1] >=
               (thresholds_list[idx_thresh])) {
             fp++;
           } else {
@@ -94,21 +93,21 @@ class AucKernel : public framework::OpKernel<T> {
       fp_data[idx_thresh] += fp;
     }
     // epsilon to avoid divide by zero.
-    float epsilon = 1e-6;
+    double epsilon = 1e-6;
     // Riemann sum to caculate auc.
     Tensor tp_rate, fp_rate, rec_rate;
     tp_rate.Resize({num_thresholds});
     fp_rate.Resize({num_thresholds});
     rec_rate.Resize({num_thresholds});
-    float* tp_rate_data = tp_rate.mutable_data<float>(ctx.GetPlace());
-    float* fp_rate_data = fp_rate.mutable_data<float>(ctx.GetPlace());
-    float* rec_rate_data = rec_rate.mutable_data<float>(ctx.GetPlace());
+    auto* tp_rate_data = tp_rate.mutable_data<double>(ctx.GetPlace());
+    auto* fp_rate_data = fp_rate.mutable_data<double>(ctx.GetPlace());
+    auto* rec_rate_data = rec_rate.mutable_data<double>(ctx.GetPlace());
     for (int i = 0; i < num_thresholds; i++) {
-      tp_rate_data[i] = (static_cast<float>(tp_data[i]) + epsilon) /
+      tp_rate_data[i] = (static_cast<double>(tp_data[i]) + epsilon) /
                         (tp_data[i] + fn_data[i] + epsilon);
       fp_rate_data[i] =
-          static_cast<float>(fp_data[i]) / (fp_data[i] + tn_data[i] + epsilon);
-      rec_rate_data[i] = (static_cast<float>(tp_data[i]) + epsilon) /
+          static_cast<double>(fp_data[i]) / (fp_data[i] + tn_data[i] + epsilon);
+      rec_rate_data[i] = (static_cast<double>(tp_data[i]) + epsilon) /
                          (tp_data[i] + fp_data[i] + epsilon);
     }
     *auc_data = 0.0f;
diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt
index 675ca36774beb72cc1e9b136ad0b18ce061689ac..6555b8101a90bba8351d2c82313ab12e572a01ee 100644
--- a/paddle/fluid/operators/distributed/CMakeLists.txt
+++ b/paddle/fluid/operators/distributed/CMakeLists.txt
@@ -1,33 +1,43 @@
+if(NOT WITH_DISTRIBUTE)
+    return()
+endif()
+
+if(WITH_GRPC)
+    set(cc_generic_services "false")
+else()
+    set(cc_generic_services "true")
+endif()
+configure_file(send_recv.proto.in ${CMAKE_CURRENT_SOURCE_DIR}/send_recv.proto @ONLY)
+
 if(WITH_GRPC)
-  grpc_library(sendrecvop_grpc SRCS bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc
-      request_handler_impl.cc rpc_client.cc rpc_server.cc grpc_server.cc variable_response.cc PROTO send_recv.proto DEPS lod_tensor
-      selected_rows memory)
+  grpc_library(sendrecvop_grpc SRCS grpc_bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc
+        request_handler_impl.cc rpc_client.cc rpc_server.cc grpc_server.cc variable_response.cc grpc_variable_response.cc grpc_serde.cc
+      PROTO send_recv.proto 
+      DEPS lod_tensor selected_rows memory)
   set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
   set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-  cc_test(serde_test SRCS grpc_serde_test.cc variable_response.cc DEPS grpc++_unsecure grpc_unsecure gpr
-          cares zlib protobuf sendrecvop_grpc scope profiler math_function SERIAL)
-  cc_test(grpc_server_test SRCS rpc_server_test.cc DEPS sendrecvop_grpc
-          grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor
-          proto_desc lookup_table_op SERIAL)
+  cc_test(grpc_serde_test SRCS grpc_serde_test.cc 
+     DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc scope profiler math_function SERIAL)
+  cc_test(grpc_server_test SRCS rpc_server_test.cc 
+    DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor  proto_desc lookup_table_op SERIAL)
   return()
 endif()
 
 
 set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
-set_source_files_properties(brpc_server.cc brpc_client.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-brpc_library(sendrecvop_brpc SRCS brpc_client.cc brpc_server.cc rpc_server.cc  rpc_client.cc request_handler_impl.cc
+
+set_source_files_properties(brpc_server.cc brpc_client.cc rpc_server_test.cc brpc_serde_test.cc
+    brpc_variable_response.cc brpc_sendrecvop_utils.cc brpc_rdma_pool.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+
+brpc_library(sendrecvop_brpc SRCS brpc_client.cc brpc_server.cc rpc_server.cc rpc_client.cc request_handler_impl.cc brpc_sendrecvop_utils.cc 
+    brpc_variable_response.cc variable_response.cc sendrecvop_utils.cc brpc_rdma_pool.cc
   PROTO send_recv.proto
   DEPS lod_tensor selected_rows memory)
 
-find_library(OPENSSL_CRYPTO_LIBRARY_STATIC NAMES libcrypto.so)
-ADD_LIBRARY(crypto SHARED IMPORTED GLOBAL)
-SET_PROPERTY(TARGET crypto PROPERTY IMPORTED_LOCATION ${OPENSSL_CRYPTO_LIBRARY_STATIC})
-
+set(brpc_test_depends sendrecvop_brpc brpc ssl crypto protobuf leveldb gflags glog executor proto_desc lookup_table_op snappystream snappy)
 
-find_library(OPENSSL_SSL_LIBRARY_STATIC NAMES libssl.so)
-ADD_LIBRARY(ssl SHARED IMPORTED GLOBAL)
-SET_PROPERTY(TARGET ssl PROPERTY IMPORTED_LOCATION ${OPENSSL_SSL_LIBRARY_STATIC})
+cc_test(brpc_server_test SRCS rpc_server_test.cc 
+    DEPS ${brpc_test_depends} SERIAL)
 
-cc_test(brpc_server_test SRCS rpc_server_test.cc DEPS sendrecvop_brpc 
-       brpc protobuf leveldb gflags glog
-       protobuf executor proto_desc lookup_table_op snappystream snappy ssl crypto SERIAL)
+cc_test(brpc_serde_test SRCS brpc_serde_test.cc 
+    DEPS ${brpc_test_depends} SERIAL)
diff --git a/paddle/fluid/operators/distributed/bytebuffer_stream.cc b/paddle/fluid/operators/distributed/grpc_bytebuffer_stream.cc
similarity index 96%
rename from paddle/fluid/operators/distributed/bytebuffer_stream.cc
rename to paddle/fluid/operators/distributed/grpc_bytebuffer_stream.cc
index 6e91b447db838c9095432eda22e9e1171e938d31..d192f54ee0c924b772045d9b6a01701f640e07c7 100644
--- a/paddle/fluid/operators/distributed/bytebuffer_stream.cc
+++ b/paddle/fluid/operators/distributed/grpc_bytebuffer_stream.cc
@@ -17,7 +17,7 @@ limitations under the License. */
 //       file and did some modifications so that we can send gRPC
 //       requests without too much copying of the tensor data.
 
-#include "paddle/fluid/operators/distributed/bytebuffer_stream.h"
+#include "paddle/fluid/operators/distributed/grpc_bytebuffer_stream.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/distributed/bytebuffer_stream.h b/paddle/fluid/operators/distributed/grpc_bytebuffer_stream.h
similarity index 87%
rename from paddle/fluid/operators/distributed/bytebuffer_stream.h
rename to paddle/fluid/operators/distributed/grpc_bytebuffer_stream.h
index e7de172c79c30761483b5d96f5bad19860208832..e9074574cdd163bbf7e62939df9283352706f840 100644
--- a/paddle/fluid/operators/distributed/bytebuffer_stream.h
+++ b/paddle/fluid/operators/distributed/grpc_bytebuffer_stream.h
@@ -24,6 +24,7 @@ limitations under the License. */
 #include "google/protobuf/io/coded_stream.h"
 #include "google/protobuf/io/zero_copy_stream.h"
 #include "grpc++/grpc++.h"
+#include "paddle/fluid/operators/distributed/variable_response.h"
 
 namespace grpc {
 // A ZeroCopyInputStream that reads from grpc_byte_buffer
@@ -107,25 +108,6 @@ class GrpcBufferReader final
 namespace paddle {
 namespace operators {
 namespace distributed {
-// Source provides a way for a particular RPC implementation to provide
-// received data to ParseFrom.
-class Source {
- public:
-  virtual ~Source() {}
-
-  // Return the stream that contains the data to be parsed.
-  // Note that this method might be invoked more than once if
-  // ParseFrom needs to fall back to a more expensive parsing method.
-  // Every call must return a stream pointing at the beginning of
-  // the serialized RecvTensorResponse.
-  //
-  // Note that a subsequent call to contents() invalidates previous
-  // results of contents().
-  //
-  // Ownership of the returned stream is retained by the Source and
-  // should not be deleted by the caller.
-  virtual ::google::protobuf::io::ZeroCopyInputStream* contents() = 0;
-};
 
 // A ZeroCopyInputStream that reads from a grpc::ByteBuffer.
 class GrpcByteBufferSource
diff --git a/paddle/fluid/operators/distributed/grpc_client.cc b/paddle/fluid/operators/distributed/grpc_client.cc
index 4d60801b6a6ecaabf1165321e0cb19044d27aa34..52c4bc1e7965323438de959d5eb1f3b4ef4f4cfe 100644
--- a/paddle/fluid/operators/distributed/grpc_client.cc
+++ b/paddle/fluid/operators/distributed/grpc_client.cc
@@ -20,6 +20,7 @@ limitations under the License. */
 
 #include "glog/logging.h"  // For VLOG
 #include "paddle/fluid/framework/threadpool.h"
+#include "paddle/fluid/operators/distributed/grpc_serde.h"
 #include "paddle/fluid/operators/distributed/request_handler.h"
 #include "paddle/fluid/platform/profiler.h"
 
diff --git a/paddle/fluid/operators/distributed/grpc_client.h b/paddle/fluid/operators/distributed/grpc_client.h
index d03a3e56aedbe4a008ee9ff187111f7635d14b58..11de84d9e265b2ca75d6d72a1d1e8797763f96a5 100644
--- a/paddle/fluid/operators/distributed/grpc_client.h
+++ b/paddle/fluid/operators/distributed/grpc_client.h
@@ -38,7 +38,10 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/operators/distributed/request_handler.h"
 #include "paddle/fluid/operators/distributed/rpc_client.h"
+#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h"
+#include "paddle/fluid/operators/distributed/send_recv.pb.h"
 #include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
 #include "paddle/fluid/platform/macros.h"  // for DISABLE_COPY_AND_ASSIGN
 
@@ -46,23 +49,6 @@ namespace paddle {
 namespace operators {
 namespace distributed {
 
-struct VarHandle {
-  // RPC endpoint.
-  std::string ep;
-  const platform::DeviceContext* ctx;
-  const framework::Scope* scope;
-  // Variable name.
-  std::string name;
-  // RPC method name.
-  std::string method;
-
-  std::string String() const {
-    std::ostringstream s;
-    s << method << " name:[" << name << "], ep:[" << ep << "]";
-    return s.str();
-  }
-};
-
 void ProcGetResponse(const VarHandle& var_h, const grpc::ByteBuffer& msg);
 
 class BaseProcessor {
diff --git a/paddle/fluid/operators/distributed/grpc_serde.cc b/paddle/fluid/operators/distributed/grpc_serde.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3f8796713a6b89a308113981614673e07e8d367f
--- /dev/null
+++ b/paddle/fluid/operators/distributed/grpc_serde.cc
@@ -0,0 +1,157 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_CUDA
+#include <nccl.h>
+#endif
+#include <sys/time.h>
+#include <thread>  // NOLINT
+
+#include "google/protobuf/io/coded_stream.h"
+#include "google/protobuf/io/zero_copy_stream.h"
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/operators/distributed/grpc_bytebuffer_stream.h"
+#include "paddle/fluid/operators/distributed/grpc_serde.h"
+#include "paddle/fluid/operators/distributed/grpc_variable_response.h"
+#include "paddle/fluid/operators/distributed/proto_encoder_helper.h"
+#include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
+#include "paddle/fluid/platform/profiler.h"
+
+namespace paddle {
+namespace operators {
+namespace distributed {
+
+void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
+                           const platform::DeviceContext& ctx,
+                           ::grpc::ByteBuffer* msg,
+                           const std::string& out_name) {
+  // Default DestroyCallback does nothing, When using GPU
+  // the CPU buffer need to be freed.
+  DestroyCallback destroy_callback = [](void* backing) {};
+  VarMsg request;
+  void* payload = nullptr;
+  size_t payload_size;
+
+  request.set_varname(name);
+  // Note: normally the profiler is enabled in 1 trainer, hence only
+  // 1 trainer returns true for ShouldSendProfileState(). It tells PS
+  // servers the trainer's profiling state so that PS can follow the
+  // trainer.
+  if (platform::ShouldSendProfileState()) {
+    if (platform::IsProfileEnabled()) {
+      request.set_profile(platform::kEnableProfiler);
+    } else {
+      request.set_profile(platform::kDisableProfiler);
+    }
+  }
+  if (!out_name.empty()) {
+    request.set_out_varname(out_name);
+  }
+  if (var->IsType<framework::LoDTensor>()) {
+    request.set_type(::sendrecv::LOD_TENSOR);
+    GetTensorPayload(var, ctx, &request, &payload, &payload_size);
+  } else if (var->IsType<framework::SelectedRows>()) {
+    request.set_type(::sendrecv::SELECTED_ROWS);
+    GetSelectedRowsPayload(var, ctx, &request, &payload, &payload_size);
+#ifdef PADDLE_WITH_CUDA
+  } else if (var->IsType<ncclUniqueId>()) {
+    request.set_type(::sendrecv::NCCL_ID);
+#endif
+  } else {
+    // Report the variable's own type name, not typeid of Type()'s result.
+    PADDLE_THROW("Serialize does not support type: %s", var->Type().name());
+  }
+
+  if (platform::is_gpu_place(ctx.GetPlace())) {
+#ifdef PADDLE_WITH_CUDA
+    // GPU data is copied to CPU buffer when sending,
+    // free the buffer when possible.
+    destroy_callback = [](void* backing) {
+      platform::CUDAPinnedPlace cuda_pinned;
+      memory::Free(cuda_pinned, backing);
+    };
+#endif
+  }
+
+  std::string header;
+  request.AppendToString(&header);
+  auto buffer = std::unique_ptr<char[]>(new char[1024]);
+  void* buf = buffer.get();
+  ProtoEncodeHelper e(static_cast<char*>(buf), 1024);
+  e.WriteRawBytes(std::string(header.data(), header.size()));
+// NCCLID is copied directly to the message, return bytebuffer
+// with only one slice if serializing NCCLID.
+#ifdef PADDLE_WITH_CUDA
+  if (var->IsType<ncclUniqueId>()) {
+    e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber,
+                              NCCL_UNIQUE_ID_BYTES);
+    const ncclUniqueId& uid = var->Get<ncclUniqueId>();
+    e.WriteRawBytes(std::string(uid.internal, NCCL_UNIQUE_ID_BYTES));
+
+    // for serialize NCCL_ID
+    ::grpc::Slice slices(e.size());
+    memcpy(const_cast<uint8_t*>(slices.begin()), e.data(), e.size());
+    ::grpc::ByteBuffer tmp(&slices, 1);
+    msg->Swap(&tmp);
+    return;
+  }
+#endif
+
+  e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, payload_size);
+  // steal reference of tensor data
+  ::grpc::Slice slices[4];  // metadata, tensor, rows meta, rows
+  int num_slices = 2;       // only SelectedRows have rows buffer
+  slices[0] = ::grpc::Slice(e.size());
+  memcpy(const_cast<uint8_t*>(slices[0].begin()), e.data(), e.size());
+  slices[1] = ::grpc::Slice(
+      grpc_slice_new_with_user_data(payload, payload_size, destroy_callback,
+                                    static_cast<char*>(payload)),
+      ::grpc::Slice::STEAL_REF);
+
+  if (var->IsType<framework::SelectedRows>()) {
+    auto* slr = var->GetMutable<framework::SelectedRows>();
+    ProtoEncodeHelper e2(static_cast<char*>(buf), 128);
+    size_t rows_memory_size =
+        slr->rows().size() * framework::SizeOfType(typeid(int64_t));
+    e2.WriteVarlengthBeginning(VarMsg::kRowsFieldNumber, rows_memory_size);
+    slices[2] = ::grpc::Slice(e2.size());
+    memcpy(const_cast<uint8_t*>(slices[2].begin()), e2.data(), e2.size());
+
+    slices[3] = ::grpc::Slice(
+        grpc_slice_new_with_user_data(
+            const_cast<void*>(
+                reinterpret_cast<const void*>(slr->rows().data())),
+            rows_memory_size, [](void* backing) {},
+            const_cast<char*>(
+                reinterpret_cast<const char*>(slr->rows().data()))),
+        ::grpc::Slice::STEAL_REF);
+    num_slices = 4;
+  }
+
+  ::grpc::ByteBuffer tmp(&slices[0], num_slices);
+  msg->Swap(&tmp);
+}
+
+void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
+                               const platform::DeviceContext& ctx,
+                               const framework::Scope* scope,
+                               framework::Variable** var) {
+  operators::distributed::GRPCVariableResponse resp(scope, &ctx);
+  PADDLE_ENFORCE(resp.Parse(msg) == 0, "parse bytebuffer to tensor error!");
+  *var = resp.GetVar();
+}
+
+}  // namespace distributed
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/grpc_serde.h b/paddle/fluid/operators/distributed/grpc_serde.h
new file mode 100644
index 0000000000000000000000000000000000000000..450c41dcd6b1bf9a33d3bbef3a1c94a2f83ff322
--- /dev/null
+++ b/paddle/fluid/operators/distributed/grpc_serde.h
@@ -0,0 +1,50 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <sys/time.h>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/fluid/framework/var_type.h"
+#include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
+
+#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h"
+#include "paddle/fluid/operators/distributed/send_recv.pb.h"
+
+namespace paddle {
+namespace operators {
+namespace distributed {
+
+typedef void (*DestroyCallback)(void*);
+
+void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
+                           const platform::DeviceContext& ctx,
+                           ::grpc::ByteBuffer* msg,
+                           const std::string& out_varname = std::string());
+
+void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
+                               const platform::DeviceContext& ctx,
+                               const framework::Scope* scope,
+                               framework::Variable** var);
+
+}  // namespace distributed
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/grpc_serde_test.cc b/paddle/fluid/operators/distributed/grpc_serde_test.cc
index 3d107b533bcb7bfef3f9b13ec99afbd579a62e52..96ea05e74ed76768248a27ab435dc801b7d1b995 100644
--- a/paddle/fluid/operators/distributed/grpc_serde_test.cc
+++ b/paddle/fluid/operators/distributed/grpc_serde_test.cc
@@ -21,8 +21,10 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/framework/variable.h"
+#include "paddle/fluid/operators/detail/macros.h"
+#include "paddle/fluid/operators/distributed/grpc_serde.h"
+#include "paddle/fluid/operators/distributed/grpc_variable_response.h"
 #include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
-#include "paddle/fluid/operators/distributed/variable_response.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/string/printf.h"
@@ -84,7 +86,7 @@ void RunSerdeTestSelectedRows(platform::Place place) {
   // operators::distributed::DeserializeFromByteBuffer(msg, ctx, &var2);
   framework::Scope scope;
   scope.Var("myvar");
-  operators::distributed::VariableResponse resp(&scope, &ctx);
+  operators::distributed::GRPCVariableResponse resp(&scope, &ctx);
   EXPECT_EQ(resp.Parse(msg), 0);
 
   framework::Variable* var2 = resp.GetVar();
@@ -171,7 +173,7 @@ void RunTestLodTensor(platform::Place place, int from_type = 0) {
   // deserialize zero-copy
   framework::Scope scope;
   scope.Var("myvar");
-  operators::distributed::VariableResponse resp(&scope, &ctx);
+  operators::distributed::GRPCVariableResponse resp(&scope, &ctx);
   if (from_type == 0) {
     EXPECT_EQ(resp.Parse(msg), 0);
   } else {
diff --git a/paddle/fluid/operators/distributed/grpc_server.cc b/paddle/fluid/operators/distributed/grpc_server.cc
index f35e268f6ad36da02f17db2feb3fbf1fdf6c1e41..8edb00276df3ade1b320fbf2873e8b54ff3e1464 100644
--- a/paddle/fluid/operators/distributed/grpc_server.cc
+++ b/paddle/fluid/operators/distributed/grpc_server.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include <limits>
 #include <string>
 
+#include "paddle/fluid/operators/distributed/grpc_serde.h"
 #include "paddle/fluid/operators/distributed/grpc_server.h"
 
 using ::grpc::ServerAsyncResponseWriter;
@@ -84,9 +85,9 @@ class RequestSend final : public RequestBase {
                        ::grpc::ServerCompletionQueue* cq,
                        RequestHandler* request_handler, int req_id)
       : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) {
-    request_.reset(new VariableResponse(request_handler->scope(),
-                                        request_handler->dev_ctx(),
-                                        !request_handler->sync_mode()));
+    request_.reset(new GRPCVariableResponse(request_handler->scope(),
+                                            request_handler->dev_ctx(),
+                                            !request_handler->sync_mode()));
     int method_id = static_cast<int>(distributed::GrpcMethod::kSendVariable);
     service_->RequestAsyncUnary(
         method_id, &ctx_, request_.get(), &responder_, cq_, cq_,
@@ -109,7 +110,7 @@ class RequestSend final : public RequestBase {
 
  protected:
   sendrecv::VoidMessage reply_;
-  std::shared_ptr<VariableResponse> request_;
+  std::shared_ptr<GRPCVariableResponse> request_;
   ServerAsyncResponseWriter<sendrecv::VoidMessage> responder_;
 };
 
@@ -161,8 +162,8 @@ class RequestPrefetch final : public RequestBase {
       : RequestBase(service, cq, request_handler, req_id),
         responder_(&ctx_),
         local_scope_(nullptr) {
-    request_.reset(new VariableResponse(request_handler->scope(),
-                                        request_handler->dev_ctx(), true));
+    request_.reset(new GRPCVariableResponse(request_handler->scope(),
+                                            request_handler->dev_ctx(), true));
     int method_id =
         static_cast<int>(distributed::GrpcMethod::kPrefetchVariable);
     service_->RequestAsyncUnary(
@@ -194,7 +195,7 @@ class RequestPrefetch final : public RequestBase {
   }
 
  protected:
-  std::shared_ptr<VariableResponse> request_;
+  std::shared_ptr<GRPCVariableResponse> request_;
   ::grpc::ByteBuffer reply_;
   ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_;
   framework::Scope* local_scope_;
@@ -206,8 +207,8 @@ class RequestCheckpointNotify final : public RequestBase {
                                    ::grpc::ServerCompletionQueue* cq,
                                    RequestHandler* request_handler, int req_id)
       : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) {
-    request_.reset(new VariableResponse(request_handler->scope(),
-                                        request_handler->dev_ctx()));
+    request_.reset(new GRPCVariableResponse(request_handler->scope(),
+                                            request_handler->dev_ctx()));
     int method_id =
         static_cast<int>(distributed::GrpcMethod::kCheckpointNotify);
     service_->RequestAsyncUnary(
@@ -234,7 +235,7 @@ class RequestCheckpointNotify final : public RequestBase {
   }
 
  protected:
-  std::shared_ptr<VariableResponse> request_;
+  std::shared_ptr<GRPCVariableResponse> request_;
   sendrecv::VoidMessage reply_;
   ServerAsyncResponseWriter<sendrecv::VoidMessage> responder_;
 };
diff --git a/paddle/fluid/operators/distributed/grpc_service.h b/paddle/fluid/operators/distributed/grpc_service.h
index cdc4e7b79276d6aac55aeac8ac121ca28d2cc1f0..9ae9a31a003cbb1f808fd1127a5dd78511aa3e99 100644
--- a/paddle/fluid/operators/distributed/grpc_service.h
+++ b/paddle/fluid/operators/distributed/grpc_service.h
@@ -23,8 +23,7 @@
 #include <grpc++/impl/codegen/stub_options.h>
 #include <grpc++/impl/codegen/sync_stream.h>
 #include <grpc++/support/byte_buffer.h>
-#include "paddle/fluid/operators/distributed/variable_response.h"
-
+#include "paddle/fluid/operators/distributed/grpc_variable_response.h"
 #include "paddle/fluid/platform/profiler.h"
 
 // NOTE: This method was originally created by tensorflow
@@ -42,17 +41,18 @@ class ServerContext;
 // Support parsing/unparsing of tensorflow::VariableResponse.
 // Wire-format is identical to RecvVariableResponse.
 template <>
-class SerializationTraits<paddle::operators::distributed::VariableResponse> {
+class SerializationTraits<
+    paddle::operators::distributed::GRPCVariableResponse> {
  public:
   static Status Serialize(
-      const paddle::operators::distributed::VariableResponse& msg,
+      const paddle::operators::distributed::GRPCVariableResponse& msg,
       grpc_byte_buffer** bp, bool* own_buffer) {
     PADDLE_ENFORCE(false, "SerializationTraits::Serialize not implemented!");
     return Status();
   }
   static Status Deserialize(
       grpc_byte_buffer* buffer,
-      paddle::operators::distributed::VariableResponse* msg,
+      paddle::operators::distributed::GRPCVariableResponse* msg,
       int max_message_size = INT_MAX) {
     if (buffer == nullptr) {
       return Status(StatusCode::INTERNAL, "No payload");
diff --git a/paddle/fluid/operators/distributed/grpc_variable_response.cc b/paddle/fluid/operators/distributed/grpc_variable_response.cc
new file mode 100644
index 0000000000000000000000000000000000000000..34d47f3ec0f3025109447b66078b724607d2953a
--- /dev/null
+++ b/paddle/fluid/operators/distributed/grpc_variable_response.cc
@@ -0,0 +1,308 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string>
+#include <utility>
+#include <vector>
+#ifdef PADDLE_WITH_CUDA
+#include <nccl.h>
+#endif
+
+#include "paddle/fluid/operators/distributed/grpc_variable_response.h"
+#include "paddle/fluid/platform/profiler.h"
+
+namespace paddle {
+namespace operators {
+namespace distributed {
+
+// Protobuf wire types that this hand-written parser needs to recognize.
+enum WireType { WIRETYPE_VARINT = 0, WIRETYPE_LENGTH_DELIMITED = 2 };
+
+// The field number is everything above the low three bits of a tag.
+inline int GetTagFieldNumber(uint32_t tag) { return tag >> 3; }
+
+// The wire type is stored in the low three bits of a tag.
+inline WireType GetTagWireType(uint32_t tag) {
+  return static_cast<WireType>(tag & 0x7);
+}
+
+// Reads a varint size into *result; false if unreadable or above INT_MAX.
+bool ReadVarintSizeAsInt(::google::protobuf::io::CodedInputStream* input,
+                         int* result) {
+  uint64_t raw;
+  if (!input->ReadVarint64(&raw) || raw > static_cast<uint64_t>(INT_MAX)) {
+    return false;
+  }
+  *result = static_cast<int>(raw);
+  return true;
+}
+
+// Adapts the gRPC byte buffer to a Source and runs the stream parser on it.
+int GRPCVariableResponse::Parse(const ::grpc::ByteBuffer& byte_buffer) {
+  GrpcByteBufferSource buffer_source;
+  buffer_source.Init(byte_buffer);
+  GrpcByteBufferSourceWrapper wrapped(&buffer_source);
+  return Parse(&wrapped);
+}
+
+// Parses one nested VariableMessage.LodData message, appending every value of
+// its lod_data field to *lod. Returns true once the message ends cleanly and
+// false on a read failure, an unexpected field or an unexpected wire type.
+bool ParseLodData(::google::protobuf::io::CodedInputStream* input,
+                  std::vector<int64_t>* lod) {
+  while (true) {
+    auto p = input->ReadTagWithCutoff(127);
+    int tag = GetTagFieldNumber(p.first);
+    WireType wt = GetTagWireType(p.first);
+    if (!p.second) {
+      return (tag == 0);
+    }
+    switch (tag) {
+      case sendrecv::VariableMessage_LodData::kLodDataFieldNumber: {
+        uint64_t v;
+        // Unpacked encoding: one varint per tag.
+        if (wt == WIRETYPE_VARINT) {
+          if (!input->ReadVarint64(&v)) {
+            return false;
+          }
+          lod->push_back(v);
+          break;
+        }
+        // Packed encoding: a length prefix followed by back-to-back varints.
+        if (wt == WIRETYPE_LENGTH_DELIMITED) {
+          int num_bytes = 0;
+          if (!input->ReadVarintSizeAsInt(&num_bytes)) {
+            return false;
+          }
+          int start_pos = input->CurrentPosition();
+          while (input->CurrentPosition() - start_pos < num_bytes) {
+            if (!input->ReadVarint64(&v)) {
+              return false;
+            }
+            lod->push_back(v);
+          }
+          break;
+        }
+        return false;
+      }
+      default: { return false; }
+    }
+  }
+
+  return true;
+}
+
+// Parses a serialized sendrecv::VariableMessage from `source`, filling meta_
+// and handing tensor/rows payloads to the VariableResponse copy helpers.
+// Returns 0 on success, -1 for an unknown or malformed tag, and otherwise
+// the number of the field that failed to parse.
+int GRPCVariableResponse::Parse(Source* source) {
+  ::google::protobuf::io::ZeroCopyInputStream* input_stream =
+      source->contents();
+  ::google::protobuf::io::CodedInputStream input(input_stream);
+  input.SetTotalBytesLimit(INT_MAX, INT_MAX);
+
+  while (true) {
+    auto p = input.ReadTagWithCutoff(127);
+    int tag = GetTagFieldNumber(p.first);
+    WireType wt = GetTagWireType(p.first);
+    if (!p.second) {
+      // No usable tag: field number 0 ends the message, others are errors.
+      return (tag != 0) ? -1 : 0;
+    }
+
+    switch (tag) {
+      case sendrecv::VariableMessage::kVarnameFieldNumber: {
+        uint32_t length;
+        if ((wt != WIRETYPE_LENGTH_DELIMITED) || !input.ReadVarint32(&length)) {
+          return tag;
+        }
+        std::string temp;
+        if (!input.ReadString(&temp, length)) {
+          return tag;
+        }
+
+        meta_.set_varname(temp);
+        break;
+      }
+      case sendrecv::VariableMessage::kTypeFieldNumber: {
+        uint32_t v;
+        if ((wt != WIRETYPE_VARINT) || !input.ReadVarint32(&v)) {
+          return tag;
+        }
+
+        meta_.set_type(static_cast<::sendrecv::VarType>(v));
+        break;
+      }
+      case sendrecv::VariableMessage::kDataTypeFieldNumber: {
+        uint32_t v = 0;
+        if ((wt != WIRETYPE_VARINT) || !input.ReadVarint32(&v)) {
+          return tag;
+        }
+
+        meta_.set_data_type(static_cast<::sendrecv::VariableMessage_Type>(v));
+        break;
+      }
+      case sendrecv::VariableMessage::kDimsFieldNumber: {
+        // not packed
+        if (wt == WIRETYPE_VARINT) {
+          uint64_t v;
+          if (!input.ReadVarint64(&v)) {
+            return tag;
+          }
+          meta_.add_dims(v);
+          break;
+        }
+
+        // packed
+        if (wt == WIRETYPE_LENGTH_DELIMITED) {
+          int num_bytes = 0;
+          if (!input.ReadVarintSizeAsInt(&num_bytes)) {
+            return tag;
+          }
+          int start_pos = input.CurrentPosition();
+          while (input.CurrentPosition() - start_pos < num_bytes) {
+            uint64_t v;
+            if (!input.ReadVarint64(&v)) {
+              return tag;
+            }
+            meta_.add_dims(v);
+          }
+          break;
+        }
+        return tag;
+      }
+      case sendrecv::VariableMessage::kLodLevelFieldNumber: {
+        uint64_t v = 0;
+        if ((wt != WIRETYPE_VARINT) || !input.ReadVarint64(&v)) {
+          return tag;
+        }
+        meta_.set_lod_level(static_cast<int64_t>(v));
+        break;
+      }
+      case sendrecv::VariableMessage::kLodFieldNumber: {
+        int length = 0;
+        if (wt != WIRETYPE_LENGTH_DELIMITED ||
+            !ReadVarintSizeAsInt(&input, &length)) {
+          return tag;
+        }
+        // Limit reads to the nested LodData message while it is parsed.
+        std::pair<::google::protobuf::io::CodedInputStream::Limit, int> p =
+            input.IncrementRecursionDepthAndPushLimit(length);
+
+        std::vector<int64_t> lod_data;
+        if (p.second < 0 || !ParseLodData(&input, &lod_data)) {
+          return tag;
+        }
+
+        if (!input.DecrementRecursionDepthAndPopLimit(p.first)) {
+          return tag;
+        }
+        // An empty LodData adds no lod level to meta_.
+        if (lod_data.size() == 0) {
+          break;
+        }
+
+        auto lod = meta_.add_lod();
+        for (uint32_t i = 0; i < lod_data.size(); i++) {
+          lod->add_lod_data(lod_data[i]);
+        }
+        break;
+      }
+      case sendrecv::VariableMessage::kSlrHeightFieldNumber: {
+        uint64_t v = 0;
+        if ((wt != WIRETYPE_VARINT) || !input.ReadVarint64(&v)) {
+          return tag;
+        }
+        meta_.set_slr_height(static_cast<int64_t>(v));
+        break;
+      }
+      case sendrecv::VariableMessage::kSerializedFieldNumber: {
+        int num_bytes = 0;
+        if (wt != WIRETYPE_LENGTH_DELIMITED ||
+            !ReadVarintSizeAsInt(&input, &num_bytes)) {
+          return tag;
+        }
+
+        if (!ProcSerializedField(tag, &input, num_bytes)) {
+          return tag;
+        }
+
+        break;
+      }
+      case sendrecv::VariableMessage::kRowsFieldNumber: {
+        PADDLE_ENFORCE((meta_.type() == sendrecv::SELECTED_ROWS ||
+                        meta_.type() == sendrecv::LOD_TENSOR) &&
+                           meta_.varname() != "",
+                       "meta info should be got first!");
+
+        int num_bytes = 0;
+        if (wt != WIRETYPE_LENGTH_DELIMITED ||
+            !ReadVarintSizeAsInt(&input, &num_bytes)) {
+          return tag;
+        }
+
+        if (!CopySelectRowsData(&input, *dev_ctx_, num_bytes)) {
+          return tag;
+        }
+        break;
+      }
+      case sendrecv::VariableMessage::kOutVarnameFieldNumber: {
+        uint32_t length;
+        if ((wt != WIRETYPE_LENGTH_DELIMITED) || !input.ReadVarint32(&length)) {
+          return tag;
+        }
+        std::string temp;
+        if (!input.ReadString(&temp, length)) {
+          return tag;
+        }
+
+        meta_.set_out_varname(temp);
+        break;
+      }
+      case sendrecv::VariableMessage::kProfileFieldNumber: {
+        uint64_t profiling = 0;
+        if ((wt != WIRETYPE_VARINT) || !input.ReadVarint64(&profiling)) {
+          return tag;
+        }
+        meta_.set_profile(profiling);
+        int64_t listener_id = platform::ListenerId();
+        if (listener_id <= 0) {
+          break;
+        }
+        if (profiling == platform::kEnableProfiler &&
+            !platform::IsProfileEnabled()) {
+          platform::EnableProfiler(platform::ProfilerState::kCPU);
+        } else if (profiling == platform::kDisableProfiler &&
+                   platform::IsProfileEnabled()) {
+          // TODO(panyx0718): Should we allow to customize file dir.
+          platform::DisableProfiler(
+              platform::EventSortingKey::kDefault,
+              string::Sprintf("/tmp/profile_ps_%lld", listener_id));
+        }
+        break;
+      }
+      default: {
+        // Unknown tag, return unknown error.
+        return -1;
+      }
+    }
+  }
+
+  return 0;
+}
+
+}  // namespace distributed
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/grpc_variable_response.h b/paddle/fluid/operators/distributed/grpc_variable_response.h
new file mode 100644
index 0000000000000000000000000000000000000000..89df07c92cd33bcb76c8539b5566d74fa21bba5e
--- /dev/null
+++ b/paddle/fluid/operators/distributed/grpc_variable_response.h
@@ -0,0 +1,58 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/var_type.h"
+
+#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h"
+#include "paddle/fluid/operators/distributed/send_recv.pb.h"
+
+#include "google/protobuf/io/coded_stream.h"
+#include "google/protobuf/io/zero_copy_stream.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/operators/distributed/grpc_bytebuffer_stream.h"
+#include "paddle/fluid/operators/distributed/variable_response.h"
+
+namespace paddle {
+namespace operators {
+namespace distributed {
+
+// VariableResponse that can be parsed from a Source or a ::grpc::ByteBuffer.
+class GRPCVariableResponse : public VariableResponse {
+ public:
+  GRPCVariableResponse(const framework::Scope* scope,
+                       const platform::DeviceContext* dev_ctx,
+                       bool create_scope = false)
+      : VariableResponse(scope, dev_ctx, create_scope) {}
+  virtual ~GRPCVariableResponse() {}
+
+  int Parse(Source* source) override;
+
+  // return:
+  // 0:ok.
+  // -1: unknown error.
+  // other: number of error field.
+  int Parse(const ::grpc::ByteBuffer& byte_buffer);
+};
+
+}  // namespace distributed
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/request_handler.h b/paddle/fluid/operators/distributed/request_handler.h
index 271306d5d20f1b849a81a9bfa6436f2faf261204..3d61171dff98d6752be98b4b90577bfd059525ab 100644
--- a/paddle/fluid/operators/distributed/request_handler.h
+++ b/paddle/fluid/operators/distributed/request_handler.h
@@ -51,6 +51,23 @@ constexpr char kRequestPassBarrier[] = "RequestPassBarrier";
 
 class RPCServer;
 
+// Identifies one RPC on one variable: endpoint, method, name and contexts.
+struct VarHandle {
+  // RPC endpoint.
+  std::string ep;
+  const platform::DeviceContext* ctx;
+  const framework::Scope* scope;
+  // Variable name.
+  std::string name;
+  // RPC method name.
+  std::string method;
+
+  // Returns "<method> name:[<name>], ep:[<ep>]".
+  std::string String() const {
+    return method + " name:[" + name + "], ep:[" + ep + "]";
+  }
+};
+
 class RequestHandler {
  public:
   explicit RequestHandler(bool sync_mode)
diff --git a/paddle/fluid/operators/distributed/request_handler_impl.cc b/paddle/fluid/operators/distributed/request_handler_impl.cc
index 5e6bff20f5f8c06e1497c697e3aabf7b9cb94ad6..f1f84072d47e58eaa81dd66dc018e17b182bb57b 100644
--- a/paddle/fluid/operators/distributed/request_handler_impl.cc
+++ b/paddle/fluid/operators/distributed/request_handler_impl.cc
@@ -53,7 +53,7 @@ bool RequestSendHandler::Handle(const std::string& varname,
 
   // Sync
   if (varname == BATCH_BARRIER_MESSAGE) {
-    VLOG(3) << "sync: recv batch barrier message";
+    VLOG(3) << "sync: recv BATCH_BARRIER_MESSAGE";
     rpc_server_->IncreaseBatchBarrier(kRequestSend);
   } else if (varname == BEGIN_PASS_MESSAGE) {
     VLOG(3) << "sync: recv begin pass message";
@@ -65,8 +65,7 @@ bool RequestSendHandler::Handle(const std::string& varname,
     VLOG(3) << "sync: processing received var: " << varname;
 
     if (invar == nullptr) {
-      LOG(ERROR) << "sync: Can not find server side var: " << varname;
-      PADDLE_THROW("sync: Can not find server side var");
+      LOG(FATAL) << "sync: Can not find server side var: " << varname;
       return false;
     }
     if (invar->IsType<framework::SelectedRows>()) {
diff --git a/paddle/fluid/operators/distributed/send_recv.proto b/paddle/fluid/operators/distributed/send_recv.proto.in
similarity index 97%
rename from paddle/fluid/operators/distributed/send_recv.proto
rename to paddle/fluid/operators/distributed/send_recv.proto.in
index e0902320cff003797b12ed0204f7f99c44554b62..8b0a09abe1d05dda10eda0030eb91cb9ca40683e 100644
--- a/paddle/fluid/operators/distributed/send_recv.proto
+++ b/paddle/fluid/operators/distributed/send_recv.proto.in
@@ -1,3 +1,4 @@
+
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. Licensed under
 the Apache License, Version 2.0 (the "License"); you may not use this file
 except in compliance with the License.
@@ -14,7 +15,7 @@ limitations under the License. */
 syntax = "proto3";
 package sendrecv;
 
-// option cc_generic_services = true;
+option cc_generic_services = @cc_generic_services@;
 
 service SendRecvService {
   // For parameter server round-robin like hashing, do not split tensors.
diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.cc b/paddle/fluid/operators/distributed/sendrecvop_utils.cc
index 98129d9f1014c39347e3409533f2bc10092611d2..98a5dcbbb871f90db8b685da93f6baaec209fab8 100644
--- a/paddle/fluid/operators/distributed/sendrecvop_utils.cc
+++ b/paddle/fluid/operators/distributed/sendrecvop_utils.cc
@@ -12,21 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
-
 #ifdef PADDLE_WITH_CUDA
 #include <nccl.h>
 #endif
 #include <sys/time.h>
 #include <thread>  // NOLINT
 
-#include "google/protobuf/io/coded_stream.h"
-#include "google/protobuf/io/zero_copy_stream.h"
 #include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/operators/distributed/bytebuffer_stream.h"
-#include "paddle/fluid/operators/distributed/proto_encoder_helper.h"
+#include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
 #include "paddle/fluid/operators/distributed/variable_response.h"
-#include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace operators {
@@ -34,6 +28,11 @@ namespace distributed {
 
 using VarMsg = sendrecv::VariableMessage;
 
+// Allocates `size` bytes on a CUDAPinnedPlace; `varname` is currently unused.
+void* GetVarPayLoad(const std::string varname, int64_t size) {
+  return memory::Alloc(platform::CUDAPinnedPlace(), size);
+}
+
 void GetTensorPayload(framework::Variable* var,
                       const platform::DeviceContext& ctx, VarMsg* request,
                       void** payload, size_t* payload_size) {
@@ -58,15 +57,17 @@ void GetTensorPayload(framework::Variable* var,
   if (platform::is_gpu_place(ctx.GetPlace())) {
 #ifdef PADDLE_WITH_CUDA
     PADDLE_ENFORCE(platform::is_gpu_place(tensor.place()));
-    platform::CUDAPinnedPlace cuda_pinned;
+    // Copy the GPU tensor into a newly allocated CUDA-pinned buffer.
     auto& gpu_dev_ctx = static_cast<const platform::CUDADeviceContext&>(ctx);
     auto copy_size = tensor.numel() * framework::SizeOfType(tensor.type());
-    *payload = memory::Alloc(cuda_pinned, copy_size);
+    *payload = GetVarPayLoad(request->varname(), copy_size);
 
+    platform::CUDAPinnedPlace cuda_pinned;
     memory::Copy(cuda_pinned, *payload,
                  boost::get<platform::CUDAPlace>(tensor.place()),
                  reinterpret_cast<const void*>(tensor.data<void>()), copy_size,
                  gpu_dev_ctx.stream());
+
     ctx.Wait();
 #endif
   } else {
@@ -91,10 +92,11 @@ void GetSelectedRowsPayload(framework::Variable* var,
   auto* tensor = slr->mutable_value();
   if (platform::is_gpu_place(ctx.GetPlace())) {
 #ifdef PADDLE_WITH_CUDA
-    platform::CUDAPinnedPlace cuda_pinned;
     auto& gpu_dev_ctx = static_cast<const platform::CUDADeviceContext&>(ctx);
     auto copy_size = tensor->numel() * framework::SizeOfType(tensor->type());
-    *payload = memory::Alloc(cuda_pinned, copy_size);
+    *payload = GetVarPayLoad(request->varname(), copy_size);
+
+    platform::CUDAPinnedPlace cuda_pinned;
     memory::Copy(cuda_pinned, *payload,
                  boost::get<platform::CUDAPlace>(tensor->place()),
                  reinterpret_cast<const void*>(tensor->data<void>()), copy_size,
@@ -107,126 +109,6 @@ void GetSelectedRowsPayload(framework::Variable* var,
   *payload_size = tensor->numel() * framework::SizeOfType(tensor->type());
 }
 
-void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
-                           const platform::DeviceContext& ctx,
-                           ::grpc::ByteBuffer* msg,
-                           const std::string& out_name) {
-  // Default DestroyCallback does nothing, When using GPU
-  // the CPU buffer need to be freed.
-  DestroyCallback destroy_callback = [](void* backing) {};
-  VarMsg request;
-  void* payload = nullptr;
-  size_t payload_size;
-
-  request.set_varname(name);
-  // Note: normally the profiler is enabled in 1 trainer, hence only
-  // 1 trainer returns true for ShouldSendProfileState(). It tells PS
-  // servers the trainer's profiling state so that PS can follow the
-  // trainer.
-  if (platform::ShouldSendProfileState()) {
-    if (platform::IsProfileEnabled()) {
-      request.set_profile(platform::kEnableProfiler);
-    } else {
-      request.set_profile(platform::kDisableProfiler);
-    }
-  }
-  if (!out_name.empty()) {
-    request.set_out_varname(out_name);
-  }
-  if (var->IsType<framework::LoDTensor>()) {
-    request.set_type(::sendrecv::LOD_TENSOR);
-    GetTensorPayload(var, ctx, &request, &payload, &payload_size);
-  } else if (var->IsType<framework::SelectedRows>()) {
-    request.set_type(::sendrecv::SELECTED_ROWS);
-    GetSelectedRowsPayload(var, ctx, &request, &payload, &payload_size);
-#ifdef PADDLE_WITH_CUDA
-  } else if (var->IsType<ncclUniqueId>()) {
-    request.set_type(::sendrecv::NCCL_ID);
-#endif
-  } else {
-    PADDLE_THROW("Serialize does not support type: %s",
-                 typeid(var->Type()).name());
-  }
-
-  if (platform::is_gpu_place(ctx.GetPlace())) {
-#ifdef PADDLE_WITH_CUDA
-    // GPU data is copied to CPU buffer when sending,
-    // free the buffer when possible.
-    destroy_callback = [](void* backing) {
-      platform::CUDAPinnedPlace cuda_pinned;
-      memory::Free(cuda_pinned, backing);
-    };
-#endif
-  }
-
-  std::string header;
-  request.AppendToString(&header);
-  auto buffer = std::unique_ptr<char[]>(new char[1024]);
-  void* buf = buffer.get();
-  ProtoEncodeHelper e(static_cast<char*>(buf), 1024);
-  e.WriteRawBytes(std::string(header.data(), header.size()));
-// NCCLID is copied directly to the message, return bytebuffer
-// with only one slice if serializing NCCLID.
-#ifdef PADDLE_WITH_CUDA
-  if (var->IsType<ncclUniqueId>()) {
-    e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber,
-                              NCCL_UNIQUE_ID_BYTES);
-    const ncclUniqueId& uid = var->Get<ncclUniqueId>();
-    e.WriteRawBytes(std::string(uid.internal, NCCL_UNIQUE_ID_BYTES));
-
-    // for serialize NCCL_ID
-    ::grpc::Slice slices(e.size());
-    memcpy(const_cast<uint8_t*>(slices.begin()), e.data(), e.size());
-    ::grpc::ByteBuffer tmp(&slices, 1);
-    msg->Swap(&tmp);
-    return;
-  }
-#endif
-
-  e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, payload_size);
-  // steal reference of tensor data
-  ::grpc::Slice slices[4];  // metadata, tensor, rows meta, rows
-  int num_slices = 2;       // only SelectedRows have rows buffer
-  slices[0] = ::grpc::Slice(e.size());
-  memcpy(const_cast<uint8_t*>(slices[0].begin()), e.data(), e.size());
-  slices[1] = ::grpc::Slice(
-      grpc_slice_new_with_user_data(payload, payload_size, destroy_callback,
-                                    static_cast<char*>(payload)),
-      ::grpc::Slice::STEAL_REF);
-
-  if (var->IsType<framework::SelectedRows>()) {
-    auto* slr = var->GetMutable<framework::SelectedRows>();
-    ProtoEncodeHelper e2(static_cast<char*>(buf), 128);
-    size_t rows_memory_size =
-        slr->rows().size() * framework::SizeOfType(typeid(int64_t));
-    e2.WriteVarlengthBeginning(VarMsg::kRowsFieldNumber, rows_memory_size);
-    slices[2] = ::grpc::Slice(e2.size());
-    memcpy(const_cast<uint8_t*>(slices[2].begin()), e2.data(), e2.size());
-
-    slices[3] = ::grpc::Slice(
-        grpc_slice_new_with_user_data(
-            const_cast<void*>(
-                reinterpret_cast<const void*>(slr->rows().data())),
-            rows_memory_size, [](void* backing) {},
-            const_cast<char*>(
-                reinterpret_cast<const char*>(slr->rows().data()))),
-        ::grpc::Slice::STEAL_REF);
-    num_slices = 4;
-  }
-
-  ::grpc::ByteBuffer tmp(&slices[0], num_slices);
-  msg->Swap(&tmp);
-}
-
-void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
-                               const platform::DeviceContext& ctx,
-                               const framework::Scope* scope,
-                               framework::Variable** var) {
-  operators::distributed::VariableResponse resp(scope, &ctx);
-  PADDLE_ENFORCE(resp.Parse(msg) == 0, "parse bytebuffer to tensor error!");
-  *var = resp.GetVar();
-}
-
 }  // namespace distributed
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.h b/paddle/fluid/operators/distributed/sendrecvop_utils.h
index fe25e73fa608727ba0bb912a82776b330ec8d83a..4d08d3c77afa3c1f2b4d7602f7199558bb5a79c0 100644
--- a/paddle/fluid/operators/distributed/sendrecvop_utils.h
+++ b/paddle/fluid/operators/distributed/sendrecvop_utils.h
@@ -25,24 +25,21 @@ limitations under the License. */
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/framework/var_type.h"
 
-#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h"
 #include "paddle/fluid/operators/distributed/send_recv.pb.h"
 
 namespace paddle {
 namespace operators {
 namespace distributed {
 
-typedef void (*DestroyCallback)(void*);
+using VarMsg = sendrecv::VariableMessage;
 
-void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
-                           const platform::DeviceContext& ctx,
-                           ::grpc::ByteBuffer* msg,
-                           const std::string& out_varname = std::string());
+void GetTensorPayload(framework::Variable* var,
+                      const platform::DeviceContext& ctx, VarMsg* request,
+                      void** payload, size_t* payload_size);
 
-void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
-                               const platform::DeviceContext& ctx,
-                               const framework::Scope* scope,
-                               framework::Variable** var);
+void GetSelectedRowsPayload(framework::Variable* var,
+                            const platform::DeviceContext& ctx, VarMsg* request,
+                            void** payload, size_t* payload_size);
 
 inline std::type_index ToTypeIndex(sendrecv::VariableMessage::Type type) {
   switch (type) {
diff --git a/paddle/fluid/operators/distributed/variable_response.cc b/paddle/fluid/operators/distributed/variable_response.cc
index 45832c60bf9172497afabac927ba39a7cbfb9a52..466bce18af7cf97014a7b1ba64df68eab193c7c8 100644
--- a/paddle/fluid/operators/distributed/variable_response.cc
+++ b/paddle/fluid/operators/distributed/variable_response.cc
@@ -1,4 +1,4 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -13,50 +13,20 @@
 // limitations under the License.
 
 #include "paddle/fluid/operators/distributed/variable_response.h"
-
-#include <string>
-#include <utility>
 #include <vector>
-#ifdef PADDLE_WITH_CUDA
-#include <nccl.h>
-#endif
-#include "paddle/fluid/platform/profiler.h"
-
-#include "paddle/fluid/operators/distributed/send_recv.pb.h"
 #include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
 
 namespace paddle {
 namespace operators {
 namespace distributed {
 
-enum WireType {
-  WIRETYPE_VARINT = 0,
-  WIRETYPE_LENGTH_DELIMITED = 2,
-};
-
-inline int GetTagFieldNumber(uint32_t tag) { return tag >> 3; }
-
-inline WireType GetTagWireType(uint32_t tag) {
-  return static_cast<WireType>(tag & 0x7);
-}
-
-bool ReadVarintSizeAsInt(::google::protobuf::io::CodedInputStream* input,
-                         int* result) {
-  uint64_t v;
-  if (input->ReadVarint64(&v) && v <= static_cast<uint64_t>(INT_MAX)) {
-    *result = static_cast<int>(v);
-    return true;
-  } else {
-    return false;
-  }
-}
-
-bool ReadRaw(::google::protobuf::io::CodedInputStream* input,
-             const platform::DeviceContext& dev_ctx, platform::Place place,
-             void* dest, int size) {
+bool VariableResponse::ReadRaw(::google::protobuf::io::CodedInputStream* input,
+                               const platform::DeviceContext& dev_ctx,
+                               platform::Place place, void* dest,
+                               int64_t size) {
   const void* data = NULL;
   int size_to_write = 0;
-  int length = size;
+  int64_t length = size;
   int total_written = 0;
 
   if (platform::is_gpu_place(place)) {
@@ -194,294 +164,49 @@ bool VariableResponse::CopySelectRowsData(
   return true;
 }
 
-bool ParseLodData(::google::protobuf::io::CodedInputStream* input,
-                  std::vector<int64_t>* lod) {
-  while (true) {
-    auto p = input->ReadTagWithCutoff(127);
-    int tag = GetTagFieldNumber(p.first);
-    WireType wt = GetTagWireType(p.first);
-
-    if (!p.second) {
-      return (tag == 0);
-    }
-
-    switch (tag) {
-      case sendrecv::VariableMessage_LodData::kLodDataFieldNumber: {
-        uint64_t v;
-        if (wt == WIRETYPE_VARINT) {
-          if (!input->ReadVarint64(&v)) {
-            return false;
-          }
-          lod->push_back(v);
-          break;
-        }
-
-        if (wt == WIRETYPE_LENGTH_DELIMITED) {
-          int num_bytes = 0;
-          if (!input->ReadVarintSizeAsInt(&num_bytes)) {
-            return tag;
-          }
-          int start_pos = input->CurrentPosition();
-          while (input->CurrentPosition() - start_pos < num_bytes) {
-            uint64_t v;
-            if (!input->ReadVarint64(&v)) {
-              return tag;
-            }
-            lod->push_back(v);
-          }
-          break;
-        }
+bool VariableResponse::ProcSerializedField(
+    int tag, ::google::protobuf::io::CodedInputStream* input,
+    int64_t num_bytes) {
+  PADDLE_ENFORCE((meta_.type() == sendrecv::SELECTED_ROWS ||
+                  meta_.type() == sendrecv::LOD_TENSOR ||
+                  meta_.type() == sendrecv::NCCL_ID) &&
+                     meta_.varname() != "",
+                 "meta info should be got first!");
 
+  if (meta_.type() == sendrecv::NCCL_ID) {
+#ifdef PADDLE_WITH_CUDA
+    auto* var = scope_->FindVar(meta_.varname());
+    if (var != nullptr) {
+      ncclUniqueId* id = var->GetMutable<ncclUniqueId>();
+      if (!ReadRaw(input, *dev_ctx_, platform::CPUPlace(), id->internal,
+                   num_bytes)) {
         return false;
       }
-      default: { return false; }
     }
-  }
-
-  return true;
-}
-
-int VariableResponse::Parse(const ::grpc::ByteBuffer& byte_buffer) {
-  GrpcByteBufferSource source;
-  source.Init(byte_buffer);
-  GrpcByteBufferSourceWrapper r(&source);
-
-  return Parse(&r);
-}
-
-int VariableResponse::Parse(Source* source) {
-  ::google::protobuf::io::ZeroCopyInputStream* input_stream =
-      source->contents();
-  ::google::protobuf::io::CodedInputStream input(input_stream);
-  input.SetTotalBytesLimit(INT_MAX, INT_MAX);
-
-  while (true) {
-    auto p = input.ReadTagWithCutoff(127);
-    int tag = GetTagFieldNumber(p.first);
-    WireType wt = GetTagWireType(p.first);
-    if (!p.second) {
-      if (tag != 0) {
-        return -1;
-      }
-      return 0;
-    }
-
-    switch (tag) {
-      case sendrecv::VariableMessage::kVarnameFieldNumber: {
-        uint32_t length;
-        if ((wt != WIRETYPE_LENGTH_DELIMITED) || !input.ReadVarint32(&length)) {
-          return tag;
-        }
-
-        std::string temp;
-        if (!input.ReadString(&temp, length)) {
-          return tag;
-        }
-
-        meta_.set_varname(temp);
-        break;
-      }
-      case sendrecv::VariableMessage::kTypeFieldNumber: {
-        uint32_t v;
-        if ((wt != WIRETYPE_VARINT) || !input.ReadVarint32(&v)) {
-          return tag;
-        }
-
-        meta_.set_type(static_cast<::sendrecv::VarType>(v));
-        break;
-      }
-      case sendrecv::VariableMessage::kDataTypeFieldNumber: {
-        uint32_t v = 0;
-        if ((wt != WIRETYPE_VARINT) || !input.ReadVarint32(&v)) {
-          return tag;
-        }
-
-        meta_.set_data_type(static_cast<::sendrecv::VariableMessage_Type>(v));
-        break;
-      }
-      case sendrecv::VariableMessage::kDimsFieldNumber: {
-        // not packed
-        if (wt == WIRETYPE_VARINT) {
-          uint64_t v;
-          if (!input.ReadVarint64(&v)) {
-            return tag;
-          }
-          meta_.add_dims(v);
-          break;
-        }
-
-        // packed
-        if (wt == WIRETYPE_LENGTH_DELIMITED) {
-          int num_bytes = 0;
-          if (!input.ReadVarintSizeAsInt(&num_bytes)) {
-            return tag;
-          }
-          int start_pos = input.CurrentPosition();
-          while (input.CurrentPosition() - start_pos < num_bytes) {
-            uint64_t v;
-            if (!input.ReadVarint64(&v)) {
-              return tag;
-            }
-            meta_.add_dims(v);
-          }
-          break;
-        }
-        return tag;
-      }
-      case sendrecv::VariableMessage::kLodLevelFieldNumber: {
-        uint64_t v = 0;
-        if ((wt != WIRETYPE_VARINT) || !input.ReadVarint64(&v)) {
-          return tag;
-        }
-        meta_.set_lod_level(static_cast<int64_t>(v));
-        break;
-      }
-      case sendrecv::VariableMessage::kLodFieldNumber: {
-        int length = 0;
-        if (wt != WIRETYPE_LENGTH_DELIMITED ||
-            !ReadVarintSizeAsInt(&input, &length)) {
-          return tag;
-        }
-
-        std::pair<::google::protobuf::io::CodedInputStream::Limit, int> p =
-            input.IncrementRecursionDepthAndPushLimit(length);
-
-        std::vector<int64_t> lod_data;
-        if (p.second < 0 || !ParseLodData(&input, &lod_data)) {
-          return tag;
-        }
-
-        if (!input.DecrementRecursionDepthAndPopLimit(p.first)) {
-          return false;
-        }
-
-        if (lod_data.size() == 0) {
-          break;
-        }
-
-        auto lod = meta_.add_lod();
-        for (uint32_t i = 0; i < lod_data.size(); i++) {
-          lod->add_lod_data(lod_data[i]);
-        }
-        break;
-      }
-      case sendrecv::VariableMessage::kSlrHeightFieldNumber: {
-        uint64_t v = 0;
-        if ((wt != WIRETYPE_VARINT) || !input.ReadVarint64(&v)) {
-          return tag;
-        }
-        meta_.set_slr_height(static_cast<int64_t>(v));
-        break;
-      }
-      case sendrecv::VariableMessage::kSerializedFieldNumber: {
-        PADDLE_ENFORCE((meta_.type() == sendrecv::SELECTED_ROWS ||
-                        meta_.type() == sendrecv::LOD_TENSOR ||
-                        meta_.type() == sendrecv::NCCL_ID) &&
-                           meta_.varname() != "",
-                       "meta info should be got first!");
-
-        int num_bytes = 0;
-        if (wt != WIRETYPE_LENGTH_DELIMITED ||
-            !ReadVarintSizeAsInt(&input, &num_bytes)) {
-          return tag;
-        }
-
-        if (meta_.type() == sendrecv::NCCL_ID) {
-#ifdef PADDLE_WITH_CUDA
-          auto* var = scope_->FindVar(meta_.varname());
-          if (var != nullptr) {
-            ncclUniqueId* id = var->GetMutable<ncclUniqueId>();
-            if (!ReadRaw(&input, *dev_ctx_, platform::CPUPlace(), id->internal,
-                         num_bytes)) {
-              return tag;
-            }
-          }
-          break;
+    return true;
 #else
-          PADDLE_THROW("Not compiled with CUDA!");
+    PADDLE_THROW("Not compiled with CUDA!");
+    return false;
 #endif
-        }
-
-        framework::DDim dims = GetDims(meta_.dims());
-        if (meta_.type() == sendrecv::LOD_TENSOR) {
-          PADDLE_ENFORCE(meta_.lod_size() >= 0,
-                         "lod info should be got first!");
-          if (!CopyLodTensorData(&input, *dev_ctx_, dims, num_bytes)) {
-            return tag;
-          }
-          break;
-        }
-
-        if (meta_.type() == sendrecv::SELECTED_ROWS) {
-          if (!CopySelectRowsTensorData(&input, *dev_ctx_, dims, num_bytes)) {
-            return tag;
-          }
-          break;
-        }
-
-        return tag;
-      }
-      case sendrecv::VariableMessage::kRowsFieldNumber: {
-        PADDLE_ENFORCE((meta_.type() == sendrecv::SELECTED_ROWS ||
-                        meta_.type() == sendrecv::LOD_TENSOR) &&
-                           meta_.varname() != "",
-                       "meta info should be got first!");
-
-        int num_bytes = 0;
-        if (wt != WIRETYPE_LENGTH_DELIMITED ||
-            !ReadVarintSizeAsInt(&input, &num_bytes)) {
-          return tag;
-        }
-
-        if (!CopySelectRowsData(&input, *dev_ctx_, num_bytes)) {
-          return tag;
-        }
-        break;
-      }
-      case sendrecv::VariableMessage::kOutVarnameFieldNumber: {
-        uint32_t length;
-        if ((wt != WIRETYPE_LENGTH_DELIMITED) || !input.ReadVarint32(&length)) {
-          return tag;
-        }
+  }
 
-        std::string temp;
-        if (!input.ReadString(&temp, length)) {
-          return tag;
-        }
+  framework::DDim dims = GetDims(meta_.dims());
+  if (meta_.type() == sendrecv::LOD_TENSOR) {
+    PADDLE_ENFORCE(meta_.lod_size() >= 0, "lod info should be got first!");
+    if (!CopyLodTensorData(input, *dev_ctx_, dims, num_bytes)) {
+      return false;
+    }
+    return true;
+  }
 
-        meta_.set_out_varname(temp);
-        break;
-      }
-      case sendrecv::VariableMessage::kProfileFieldNumber: {
-        uint64_t profiling = 0;
-        if (!input.ReadVarint64(&profiling)) {
-          return tag;
-        }
-        meta_.set_profile(profiling);
-        int64_t listener_id = platform::ListenerId();
-        if (listener_id <= 0) {
-          break;
-        }
-        if (profiling == platform::kEnableProfiler &&
-            !platform::IsProfileEnabled()) {
-          platform::EnableProfiler(platform::ProfilerState::kCPU);
-        } else if (profiling == platform::kDisableProfiler &&
-                   platform::IsProfileEnabled()) {
-          // TODO(panyx0718): Should we allow to customize file dir.
-          platform::DisableProfiler(
-              platform::EventSortingKey::kDefault,
-              string::Sprintf("/tmp/profile_ps_%lld", listener_id));
-        }
-        break;
-      }
-      default: {
-        // Unknown tag, return unknown error.
-        return -1;
-      }
+  if (meta_.type() == sendrecv::SELECTED_ROWS) {
+    if (!CopySelectRowsTensorData(input, *dev_ctx_, dims, num_bytes)) {
+      return false;
     }
+    return true;
   }
 
-  return 0;
+  return true;
 }
 
 };  // namespace distributed
diff --git a/paddle/fluid/operators/distributed/variable_response.h b/paddle/fluid/operators/distributed/variable_response.h
index 1db4a0a522654ff2497b8bd9ee1381b5ab64067a..6aec52ca00f59a42ecca01da8df1680ce4eda432 100644
--- a/paddle/fluid/operators/distributed/variable_response.h
+++ b/paddle/fluid/operators/distributed/variable_response.h
@@ -22,18 +22,35 @@
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/framework/var_type.h"
 
-#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h"
-#include "paddle/fluid/operators/distributed/send_recv.pb.h"
-
 #include "google/protobuf/io/coded_stream.h"
 #include "google/protobuf/io/zero_copy_stream.h"
 #include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/operators/distributed/bytebuffer_stream.h"
+#include "paddle/fluid/operators/distributed/send_recv.pb.h"
 
 namespace paddle {
 namespace operators {
 namespace distributed {
 
+// Source provides a way for a particular RPC implementation to provide
+// received data to VariableResponse::Parse.
+class Source {
+ public:
+  virtual ~Source() {}
+
+  // Return the stream that contains the data to be parsed.
+  // Note that this method might be invoked more than once if
+  // Parse needs to fall back to a more expensive parsing method.
+  // Every call must return a stream pointing at the beginning of
+  // the serialized sendrecv::VariableMessage.
+  //
+  // Note that a subsequent call to contents() invalidates previous
+  // results of contents().
+  //
+  // Ownership of the returned stream is retained by the Source and
+  // should not be deleted by the caller.
+  virtual ::google::protobuf::io::ZeroCopyInputStream* contents() = 0;
+};
+
 class VariableResponse {
  public:
   VariableResponse(const framework::Scope* scope,
@@ -51,22 +68,19 @@ class VariableResponse {
     }
   }
 
-  // return:
-  // 0:ok.
-  // -1: unkown error.
-  // other: number of error field.
-  int Parse(Source* source);
+  int Parse(Source* source, const sendrecv::VariableMessage& meta) {
+    meta_ = meta;
+    return Parse(source);
+  }
 
   // return:
   // 0:ok.
   // -1: unkown error.
   // other: number of error field.
-  int Parse(const ::grpc::ByteBuffer& byte_buffer);
-
-  const framework::Scope& GetLocalScope() const { return *local_scope_; }
-
-  framework::Scope* GetMutableLocalScope() const { return local_scope_; }
+  virtual int Parse(Source* source) = 0;
 
+  inline const framework::Scope& GetLocalScope() const { return *local_scope_; }
+  inline framework::Scope* GetMutableLocalScope() const { return local_scope_; }
   inline std::string Varname() const { return meta_.varname(); }
   inline std::string OutVarname() const { return meta_.out_varname(); }
 
@@ -78,7 +92,11 @@ class VariableResponse {
     return scope_->FindVar(meta_.varname());
   }
 
- private:
+ protected:
+  bool ReadRaw(::google::protobuf::io::CodedInputStream* input,
+               const platform::DeviceContext& dev_ctx, platform::Place place,
+               void* dest, int64_t size);
+
   bool CopySelectRowsTensorData(::google::protobuf::io::CodedInputStream* input,
                                 const platform::DeviceContext& ctx,
                                 const framework::DDim& dims, int length);
@@ -90,12 +108,16 @@ class VariableResponse {
                          const platform::DeviceContext& ctx,
                          const framework::DDim& dims, int length);
 
- private:
+  bool ProcSerializedField(int tag,
+                           ::google::protobuf::io::CodedInputStream* input,
+                           int64_t num_bytes);
+
+ protected:
   const framework::Scope* scope_;
   const platform::DeviceContext* dev_ctx_;
   bool create_scope_ = false;
   framework::Scope* local_scope_ = nullptr;
-  // only Skeleton
+
   sendrecv::VariableMessage meta_;
 };
 
diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h
index 238bd3f8def9eaa6c18afdab1031c4babfde8ae2..a0802ef90ca7e30a2b22d187cb9092163518d8e9 100644
--- a/paddle/fluid/operators/math/blas_impl.h
+++ b/paddle/fluid/operators/math/blas_impl.h
@@ -37,6 +37,7 @@ struct CBlas<float> {
     libxsmm_sgemm(args...);
   }
 #endif
+
   template <typename... ARGS>
   static void AXPY(ARGS... args) {
     platform::dynload::cblas_saxpy(args...);
@@ -76,6 +77,7 @@ struct CBlas<double> {
     libxsmm_dgemm(args...);
   }
 #endif
+
   template <typename... ARGS>
   static void AXPY(ARGS... args) {
     platform::dynload::cblas_daxpy(args...);
@@ -150,6 +152,7 @@ struct CBlas<double> {
   }
 };
 #endif
+
 template <>
 struct CBlas<platform::float16> {
   static void GEMM(...) { PADDLE_THROW("float16 GEMM not supported on CPU"); }
@@ -190,30 +193,56 @@ inline bool UseXSMM<platform::float16>(const int &m, const int &n, const int &k,
   return false;
 }
 
-template <>
+// Row-major GEMM helper shared by the CPU Blas::GEMM overloads below.
+//   1. With PADDLE_WITH_LIBXSMM, use libxsmm when UseXSMM<T> accepts the call.
+//   2. With PADDLE_MKL_SPLIT_GEMM, when M is a multiple of `bs` and neither
+//      operand is transposed, issue one small GEMM per `bs` rows of A and C.
+//   3. Otherwise forward to a single CBlas<T>::GEMM call.
+// `order` is currently unused: every path assumes a row-major layout.
 template <typename T>
-void Blas<platform::CPUDeviceContext>::GEMM(CBLAS_TRANSPOSE transA,
-                                            CBLAS_TRANSPOSE transB, int M,
-                                            int N, int K, T alpha, const T *A,
-                                            const T *B, T beta, T *C) const {
-  int lda = (transA == CblasNoTrans) ? K : M;
-  int ldb = (transB == CblasNoTrans) ? N : K;
-  int ldc = N;
+inline void GEMM_WARP(CBLAS_ORDER order, CBLAS_TRANSPOSE transA,
+                      CBLAS_TRANSPOSE transB, int M, int N, int K, T alpha,
+                      const T *A, int lda, const T *B, int ldb, T beta, T *C,
+                      int ldc) {
 #ifdef PADDLE_WITH_LIBXSMM
-  if (UseXSMM(M, N, K, transA != CblasNoTrans, transB != CblasNoTrans, alpha,
-              beta)) {
+  if (UseXSMM<T>(M, N, K, transA != CblasNoTrans, transB != CblasNoTrans, alpha,
+                 beta)) {
     // Note: SMM use ColMajor
     const char transa = 'N';
     const char transb = 'N';
     CBlas<T>::SMM_GEMM(&transa, &transb, &N, &M, &K, &alpha, B, &ldb, A, &lda,
                        &beta, C, &ldc);
-  } else {
+    return;
+  }
 #endif
-    CBlas<T>::GEMM(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B,
-                   ldb, beta, C, ldc);
-#ifdef PADDLE_WITH_LIBXSMM
+
+#ifdef PADDLE_MKL_SPLIT_GEMM
+  constexpr int bs = 2;
+  if (M % bs == 0 && transA == CblasNoTrans && transB == CblasNoTrans) {
+    for (int off = 0; off < M; off += bs) {
+      // Each call handles rows [off, off + bs): that slice of A starts at
+      // A + off * lda and the matching slice of C at C + off * ldc.
+      CBlas<T>::GEMM(CblasRowMajor, CblasNoTrans, CblasNoTrans, bs, N, K, alpha,
+                     A + off * lda, lda, B, ldb, beta, C + off * ldc, ldc);
+    }
+    return;
   }
 #endif
+  CBlas<T>::GEMM(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb,
+                 beta, C, ldc);
+}
+
+template <>
+template <typename T>
+void Blas<platform::CPUDeviceContext>::GEMM(CBLAS_TRANSPOSE transA,
+                                            CBLAS_TRANSPOSE transB, int M,
+                                            int N, int K, T alpha, const T *A,
+                                            const T *B, T beta, T *C) const {
+  int lda = (transA == CblasNoTrans) ? K : M;
+  int ldb = (transB == CblasNoTrans) ? N : K;
+  int ldc = N;
+  GEMM_WARP<T>(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb,
+               beta, C, ldc);
 }
 
 template <>
@@ -222,9 +251,9 @@ void Blas<platform::CPUDeviceContext>::GEMM(bool transA, bool transB, int M,
                                             int N, int K, T alpha, const T *A,
                                             int lda, const T *B, int ldb,
                                             T beta, T *C, int ldc) const {
-  CBlas<T>::GEMM(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans,
-                 transB == false ? CblasNoTrans : CblasTrans, M, N, K, alpha, A,
-                 lda, B, ldb, beta, C, ldc);
+  GEMM_WARP<T>(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans,
+               transB == false ? CblasNoTrans : CblasTrans, M, N, K, alpha, A,
+               lda, B, ldb, beta, C, ldc);
 }
 
 template <typename DeviceContext>
diff --git a/paddle/fluid/operators/math/math_function_test.cc b/paddle/fluid/operators/math/math_function_test.cc
index 078dd448c385dbb8a00025ee2ba08d0c41a4730a..2343e0ee965303c9fdb2ad3faf9ddf6e5bb7782f 100644
--- a/paddle/fluid/operators/math/math_function_test.cc
+++ b/paddle/fluid/operators/math/math_function_test.cc
@@ -228,3 +228,59 @@ TEST(math_funciton, set_constant) {
   }
   delete ctx;
 }
+
+// Checks that GetBlas<T>(CPUDeviceContext).GEMM, which is routed through
+// GEMM_WARP, matches a direct CBlas<T>::GEMM call on identical inputs.
+template <typename T>
+void GemmWarpTest(int m, int n, int k, T alpha, T beta) {
+  paddle::framework::Tensor mat_a;
+  paddle::framework::Tensor mat_b;
+  paddle::framework::Tensor mat_c_ref;
+  paddle::framework::Tensor mat_c_mkl;
+  paddle::platform::CPUPlace cpu_place;
+
+  T* A = mat_a.mutable_data<T>({m, k}, cpu_place);
+  T* B = mat_b.mutable_data<T>({k, n}, cpu_place);
+  T* CREF = mat_c_ref.mutable_data<T>({m, n}, cpu_place);
+  T* CMKL = mat_c_mkl.mutable_data<T>({m, n}, cpu_place);
+
+  ASSERT_EQ(mat_c_mkl.numel(), mat_c_ref.numel());
+  for (int i = 0; i < mat_a.numel(); ++i) {
+    A[i] = static_cast<T>(i);
+  }
+  for (int i = 0; i < mat_b.numel(); ++i) {
+    B[i] = static_cast<T>(i + 1);
+  }
+  for (int i = 0; i < mat_c_ref.numel(); ++i) {
+    CREF[i] = static_cast<T>(i + 2);
+    CMKL[i] = CREF[i];
+  }
+
+  // this would call gemm_warp
+  paddle::platform::CPUDeviceContext context(cpu_place);
+  GetBlas<T>(context).GEMM(CblasNoTrans, CblasNoTrans, m, n, k, alpha, A, B,
+                           beta, CREF);
+
+  // lda,ldb,ldc follow RowMajor
+  int lda = k;
+  int ldb = n;
+  int ldc = n;
+  paddle::operators::math::CBlas<T>::GEMM(CblasRowMajor, CblasNoTrans,
+                                          CblasNoTrans, m, n, k, alpha, A, lda,
+                                          B, ldb, beta, CMKL, ldc);
+
+  for (int i = 0; i < mat_c_mkl.numel(); ++i) {
+    EXPECT_FLOAT_EQ(CREF[i], CMKL[i]);
+  }
+}
+
+TEST(math_function, gemm_warp) {
+  GemmWarpTest<float>(3, 2, 5, 1.f, 0.f);
+  GemmWarpTest<float>(3, 2, 5, 2.f, 1.f);
+  GemmWarpTest<float>(8, 5, 6, 1.f, 0.f);
+  GemmWarpTest<float>(8, 5, 6, 2.f, 1.f);
+  GemmWarpTest<double>(3, 2, 5, 1.0, 0.0);
+  GemmWarpTest<double>(3, 2, 5, 2.0, 1.0);
+  GemmWarpTest<double>(8, 5, 6, 1.0, 0.0);
+  GemmWarpTest<double>(8, 5, 6, 2.0, 1.0);
+}
diff --git a/paddle/fluid/operators/momentum_op.cc b/paddle/fluid/operators/momentum_op.cc
index dcd73e3c3e40f80e07b73944d1f0cc57fea010d3..5f43c5810812260c4384349bdb709716c9a182f5 100644
--- a/paddle/fluid/operators/momentum_op.cc
+++ b/paddle/fluid/operators/momentum_op.cc
@@ -98,7 +98,7 @@ The update equations are as follows:
 $$
 velocity = mu * velocity + gradient \\
 if (use\_nesterov):   \\
-  param = param - gradient * learning\_rate + mu * velocity * learning\_rate \\
+  param = param - (gradient + mu * velocity) * learning\_rate \\
 else:   \\
   param = param - learning\_rate * velocity. \\
 $$
diff --git a/paddle/fluid/operators/momentum_op.cu b/paddle/fluid/operators/momentum_op.cu
index 5eb9d9950248bb50bb823f071c7fff0ddcc47234..a3932db1f3a50305d585cd3d5e86fa1b527df78b 100644
--- a/paddle/fluid/operators/momentum_op.cu
+++ b/paddle/fluid/operators/momentum_op.cu
@@ -30,7 +30,7 @@ __global__ void MomentumKernel(const T* p, const T* g, const T* v,
       T g_val = g[i];
       T v_new = v[i] * mu + g_val;
       v_out[i] = v_new;
-      p_out[i] = p[i] - (g_val - v_new * mu) * lr;
+      p_out[i] = p[i] - (g_val + v_new * mu) * lr;
     }
   } else {
     for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
diff --git a/paddle/fluid/operators/momentum_op.h b/paddle/fluid/operators/momentum_op.h
index 04a1929b84a93af6465bacfe7974a1530296946d..264726040fb566a52b8c0cdee0a1524197d2a675 100644
--- a/paddle/fluid/operators/momentum_op.h
+++ b/paddle/fluid/operators/momentum_op.h
@@ -46,7 +46,7 @@ class MomentumOpKernel : public framework::OpKernel<T> {
 
     v_out = v * mu + g;
     if (use_nesterov) {
-      p_out = p - (g - v_out * mu) * lr[0];
+      p_out = p - (g + v_out * mu) * lr[0];
     } else {
       p_out = p - lr[0] * v_out;
     }
diff --git a/paddle/fluid/operators/reader/CMakeLists.txt b/paddle/fluid/operators/reader/CMakeLists.txt
index 9dbcc35e6f5bb01c159980a49dd4b4c9d37d2aab..728197377df04df8c993a48bc282431473fe9959 100644
--- a/paddle/fluid/operators/reader/CMakeLists.txt
+++ b/paddle/fluid/operators/reader/CMakeLists.txt
@@ -15,12 +15,13 @@ function(reader_library TARGET_NAME)
         PARENT_SCOPE)
 endfunction()
 
-reader_library(open_files_op SRCS open_files_op.cc)
+cc_library(buffered_reader SRCS buffered_reader.cc DEPS reader simple_threadpool)
+reader_library(open_files_op SRCS open_files_op.cc DEPS buffered_reader)
 reader_library(create_random_data_generator_op SRCS create_random_data_generator_op.cc)
 reader_library(create_shuffle_reader_op SRCS create_shuffle_reader_op.cc)
 reader_library(create_batch_reader_op SRCS create_batch_reader_op.cc)
 reader_library(create_recordio_file_reader_op SRCS create_recordio_file_reader_op.cc)
-reader_library(create_double_buffer_reader_op SRCS create_double_buffer_reader_op.cc)
+reader_library(create_double_buffer_reader_op SRCS create_double_buffer_reader_op.cc DEPS buffered_reader)
 reader_library(create_multi_pass_reader_op SRCS create_multi_pass_reader_op.cc)
 reader_library(create_custom_reader_op SRCS create_custom_reader_op.cc)
 reader_library(create_py_reader_op SRCS create_py_reader_op.cc)
diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ba1b3d3e583868c5c360d4766e14c15538b22b33
--- /dev/null
+++ b/paddle/fluid/operators/reader/buffered_reader.cc
@@ -0,0 +1,112 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/reader/buffered_reader.h"
+#include <vector>
+
+namespace paddle {
+namespace operators {
+namespace reader {
+BufferedReader::~BufferedReader() {
+  reader_->Shutdown();
+  // Pending read tasks capture `this` and write into cpu_buffer_/gpu_buffer_.
+  // Those buffers are declared after thread_pool_, so they are destroyed
+  // before it; wait for every queued task here so that none of them can touch
+  // a buffer that no longer exists.
+  while (!position_.empty()) {
+    position_.front().wait();
+    position_.pop();
+  }
+}
+BufferedReader::BufferedReader(
+    const std::shared_ptr<framework::ReaderBase> &reader,
+    const platform::Place &place, size_t buffer_size)
+    : framework::DecoratedReader(reader),
+      thread_pool_(1),
+      place_(place),
+      buffer_size_(buffer_size) {
+  cpu_buffer_.resize(buffer_size);
+  gpu_buffer_.resize(buffer_size);
+  ReadTillBufferFullAsync();
+}
+// Schedules one asynchronous read for every buffer slot.
+void BufferedReader::ReadTillBufferFullAsync() {
+  PADDLE_ENFORCE_EQ(position_.size(), 0U);
+  for (size_t i = 0; i < buffer_size_; ++i) {
+    ReadAsync(i);
+  }
+}
+// Enqueues a task that reads the next batch into slot i and, when place_ is a
+// GPU place, copies it into the matching GPU slot. The queued future yields i,
+// or -1UL when the underlying reader returned no data.
+void BufferedReader::ReadAsync(size_t i) {
+  position_.emplace(thread_pool_.enqueue([this, i]() -> size_t {
+    TensorVec &cpu = cpu_buffer_[i];
+    reader_->ReadNext(&cpu);
+
+    if (cpu.empty()) {
+      return -1UL;
+    }
+
+    if (platform::is_gpu_place(place_)) {
+      TensorVec &gpu = gpu_buffer_[i];
+      gpu.resize(cpu.size());
+      // `j` indexes tensors inside the batch; `i` stays the buffer slot.
+      for (size_t j = 0; j < cpu.size(); ++j) {
+        framework::TensorCopySync(cpu[j], place_, &gpu[j]);
+        gpu[j].set_lod(cpu[j].lod());
+      }
+    }
+    return i;
+  }));
+}
+void BufferedReader::ShutdownImpl() {
+  reader_->Shutdown();
+  while (!position_.empty()) {
+    position_.pop();
+  }
+  prev_pos_ = -1UL;
+}
+void BufferedReader::StartImpl() {
+  reader_->Start();
+  ReadTillBufferFullAsync();
+}
+// Hands out the oldest prefetched batch; clears *out when nothing is queued.
+void BufferedReader::ReadNextImpl(std::vector<framework::LoDTensor> *out) {
+  if (position_.empty()) {
+    out->clear();
+    return;
+  }
+  size_t i = position_.front().get();
+  position_.pop();
+
+  if (i == -1UL) {
+    ReadNextImpl(out);
+    return;
+  }
+
+  *out = platform::is_gpu_place(place_) ? gpu_buffer_[i] : cpu_buffer_[i];
+
+  // Do not push current position into ReadAsync. Push the previous position
+  // Since all computation in fluid are async, change the data of
+  // current position may cause data error.
+  if (prev_pos_ != -1UL) {
+    ReadAsync(prev_pos_);
+  }
+  prev_pos_ = i;
+}
+
+}  // namespace reader
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/reader/buffered_reader.h b/paddle/fluid/operators/reader/buffered_reader.h
new file mode 100644
index 0000000000000000000000000000000000000000..cbe2bc1b5fdd69d1a843b768e3289acd621369a6
--- /dev/null
+++ b/paddle/fluid/operators/reader/buffered_reader.h
@@ -0,0 +1,73 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <future>
+#include <list>
+#include <memory>
+#include <queue>
+#include <vector>
+#include "ThreadPool.h"
+#include "paddle/fluid/framework/reader.h"
+
+namespace paddle {
+namespace operators {
+namespace reader {
+
+// Decorated reader that prefetches batches from the underlying reader on a
+// single background thread, keeping them in buffer_size reusable slots. When
+// place is a GPU place, each batch is also copied to that place.
+class BufferedReader : public framework::DecoratedReader {
+  using TensorVec = std::vector<framework::LoDTensor>;
+  using VecFuture = std::future<TensorVec>;
+
+ public:
+  BufferedReader(const std::shared_ptr<framework::ReaderBase>& reader,
+                 const platform::Place& place, size_t buffer_size);
+
+  ~BufferedReader() override;
+
+ private:
+  void ReadTillBufferFullAsync();
+
+  void ReadAsync(size_t i);
+
+ protected:
+  void ShutdownImpl() override;
+  void StartImpl() override;
+  void ReadNextImpl(std::vector<framework::LoDTensor>* out) override;
+
+ private:
+  ThreadPool thread_pool_;
+  platform::Place place_;
+  const size_t buffer_size_;
+
+  // Futures of the scheduled reads, oldest first; each yields a slot or -1UL.
+  std::queue<std::future<size_t>> position_;
+
+  // The buffer for reading data.
+  // NOTE: the simplest way to implement buffered reader is do not use any
+  // buffer, just read async and create futures as buffer size. However, to
+  // malloc tensors every time is extremely slow. Here we store all data in
+  // buffers and prevent alloc every time.
+  std::vector<TensorVec> cpu_buffer_;
+  std::vector<TensorVec> gpu_buffer_;
+  // Slot handed out by the latest ReadNextImpl call; -1UL when there is none.
+  size_t prev_pos_{-1UL};
+};
+
+}  // namespace reader
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
index 7b14370f4fd64e8fd5b8d9038006494b88d671dc..ed719f91d0980480aa62a5cd3c1f819e6c0e7475 100644
--- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
@@ -12,83 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include <thread>  // NOLINT
-
-#include "paddle/fluid/operators/reader/blocking_queue.h"
+#include "paddle/fluid/operators/reader/buffered_reader.h"
 #include "paddle/fluid/operators/reader/reader_op_registry.h"
 
 namespace paddle {
 namespace operators {
 namespace reader {
-
-// 'Double buffer' means we shall maintain two batches of input data at the same
-// time. So the kCacheSize shoul be at least 2.
-static constexpr size_t kCacheSize = 3;
-// There will be two bacthes out of the channel during training:
-// 1. the one waiting to be sent to the channel
-// 2. the one just be received from the channel, which is also being used by
-// subsequent operators.
-// So the channel size should be kChacheSize - 2
-static constexpr size_t kChannelSize = 1;  // kCacheSize - 2
-
-class DoubleBufferReader : public framework::DecoratedReader {
- public:
-  explicit DoubleBufferReader(
-      const std::shared_ptr<ReaderBase>& reader,
-      platform::Place target_place = platform::CPUPlace())
-      : DecoratedReader(reader), place_(target_place) {
-    cpu_tensor_cache_.resize(kCacheSize);
-    gpu_tensor_cache_.resize(kCacheSize);
-#ifdef PADDLE_WITH_CUDA
-    if (platform::is_gpu_place(place_)) {
-      for (size_t i = 0; i < kCacheSize; ++i) {
-        ctxs_.emplace_back(new platform::CUDADeviceContext(
-            boost::get<platform::CUDAPlace>(place_)));
-      }
-    }
-#endif
-    StartPrefetcher();
-  }
-
-  void ReadNextImpl(std::vector<framework::LoDTensor>* out) override;
-
-  ~DoubleBufferReader() { EndPrefetcher(); }
-
- private:
-  void ShutdownImpl() override {
-    EndPrefetcher();
-    reader_->Shutdown();
-  }
-
-  void StartImpl() override {
-    reader_->Start();
-    StartPrefetcher();
-  }
-
-  void StartPrefetcher() {
-    channel_ = new reader::BlockingQueue<size_t>(kChannelSize);
-    prefetcher_ = std::thread([this] { PrefetchThreadFunc(); });
-  }
-
-  void EndPrefetcher() {
-    channel_->Close();
-    if (prefetcher_.joinable()) {
-      prefetcher_.join();
-    }
-    delete channel_;
-    channel_ = nullptr;
-  }
-
-  void PrefetchThreadFunc();
-
-  std::thread prefetcher_;
-  reader::BlockingQueue<size_t>* channel_;
-  platform::Place place_;
-  std::vector<std::vector<framework::LoDTensor>> cpu_tensor_cache_;
-  std::vector<std::vector<framework::LoDTensor>> gpu_tensor_cache_;
-  std::vector<std::unique_ptr<platform::DeviceContext>> ctxs_;
-};
-
 class CreateDoubleBufferReaderOp : public framework::OperatorBase {
  public:
   using framework::OperatorBase::OperatorBase;
@@ -118,8 +47,8 @@ class CreateDoubleBufferReaderOp : public framework::OperatorBase {
       place = platform::CUDAPlace(static_cast<int>(num));
     }
 
-    out->Reset(framework::MakeDecoratedReader<DoubleBufferReader>(
-        underlying_reader, place));
+    out->Reset(framework::MakeDecoratedReader<BufferedReader>(underlying_reader,
+                                                              place, 2));
   }
 };
 
@@ -146,51 +75,6 @@ class CreateDoubleBufferReaderOpMaker : public DecoratedReaderMakerBase {
   }
 };
 
-void DoubleBufferReader::ReadNextImpl(std::vector<framework::LoDTensor>* out) {
-  size_t cached_tensor_id;
-  if (channel_->Receive(&cached_tensor_id)) {
-    if (platform::is_gpu_place(place_)) {
-      *out = gpu_tensor_cache_[cached_tensor_id];
-    } else {
-      // CPU place
-      *out = cpu_tensor_cache_[cached_tensor_id];
-    }
-  } else {
-    out->clear();
-  }
-}
-
-void DoubleBufferReader::PrefetchThreadFunc() {
-  VLOG(5) << "A new prefetch thread starts.";
-  size_t cached_tensor_id = 0;
-  while (true) {
-    auto& cpu_batch = cpu_tensor_cache_[cached_tensor_id];
-    reader_->ReadNext(&cpu_batch);
-    if (cpu_batch.empty()) {
-      // The underlying reader have no next data.
-      break;
-    }
-    if (platform::is_gpu_place(place_)) {
-      auto& gpu_batch = gpu_tensor_cache_[cached_tensor_id];
-      gpu_batch.resize(cpu_batch.size());
-      for (size_t i = 0; i < cpu_batch.size(); ++i) {
-        // TODO(fengjiayi): Use asynchronous TensorCopy instead
-        framework::TensorCopySync(cpu_batch[i], place_, &gpu_batch[i]);
-        gpu_batch[i].set_lod(cpu_batch[i].lod());
-      }
-    }
-    if (!channel_->Send(cached_tensor_id)) {
-      VLOG(5) << "WARNING: The double buffer channel has been closed. The "
-                 "prefetch thread will terminate.";
-      break;
-    }
-    ++cached_tensor_id;
-    cached_tensor_id %= kCacheSize;
-  }
-  channel_->Close();
-  VLOG(5) << "Prefetch thread terminates.";
-}
-
 }  // namespace reader
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/reader/create_py_reader_op.cc b/paddle/fluid/operators/reader/create_py_reader_op.cc
index 833776f56eef0ffb2ae5e963919f0482bcd511b8..0f31ca1a94326956ae5e6dffd582daedeb55a9e3 100644
--- a/paddle/fluid/operators/reader/create_py_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_py_reader_op.cc
@@ -33,6 +33,8 @@ class PyReader : public framework::FileReader {
     if (!success) out->clear();
   }
 
+  ~PyReader() { queue_->Close(); }
+
   void Shutdown() override { queue_->Close(); }
 
   void Start() override { queue_->ReOpen(); }
diff --git a/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc b/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc
index b32f09b22524c8b67ce57cc6022ef46efc2e828d..a08a9dbd0da46e73082cdd24c019e8d210d8bcc4 100644
--- a/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc
@@ -33,11 +33,14 @@ class RecordIOFileReader : public framework::FileReader {
 
  protected:
   void ReadNextImpl(std::vector<framework::LoDTensor>* out) override {
+    std::unique_ptr<std::lock_guard<std::mutex>> guard;
     if (ThreadSafe) {
-      std::lock_guard<std::mutex> guard(*mutex_);
-      *out = framework::ReadFromRecordIO(&scanner_, dev_ctx_);
-    } else {
-      *out = framework::ReadFromRecordIO(&scanner_, dev_ctx_);
+      guard.reset(new std::lock_guard<std::mutex>(*mutex_));
+    }
+
+    bool ok = framework::ReadFromRecordIO(&scanner_, dev_ctx_, out);
+    if (!ok) {
+      out->clear();
     }
   }
 
diff --git a/paddle/fluid/operators/reader/create_shuffle_reader_op.cc b/paddle/fluid/operators/reader/create_shuffle_reader_op.cc
index 4b308abc290c10a8a5846672e719b503dfc79b21..3f72890a7cee1453585d50afa04fa62a9b059dc3 100644
--- a/paddle/fluid/operators/reader/create_shuffle_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_shuffle_reader_op.cc
@@ -48,9 +48,9 @@ class ShuffleReader : public framework::DecoratedReader {
 
  private:
   void ShutdownImpl() override {
+    reader_->Shutdown();
     buffer_.clear();
     iteration_pos_ = 0;
-    reader_->Shutdown();
   }
 
   void StartImpl() override {
diff --git a/paddle/fluid/operators/reader/open_files_op.cc b/paddle/fluid/operators/reader/open_files_op.cc
index 9a8d203672fa2d560440d063d93fa5f8523690ef..38223e069975a08791d58d6ae10e2112b79a61fe 100644
--- a/paddle/fluid/operators/reader/open_files_op.cc
+++ b/paddle/fluid/operators/reader/open_files_op.cc
@@ -12,150 +12,200 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include <cmath>
+#include <stdexcept>
 #include <thread>  // NOLINT
-
+#include "ThreadPool.h"
+#include "paddle/fluid/framework/blocking_queue.h"
 #include "paddle/fluid/operators/reader/blocking_queue.h"
+#include "paddle/fluid/operators/reader/buffered_reader.h"
 #include "paddle/fluid/operators/reader/reader_op_registry.h"
 
 namespace paddle {
 namespace operators {
 namespace reader {
 
-class MultiFileReader : public framework::ReaderBase {
+class IReaderContainer {
  public:
-  MultiFileReader(const std::vector<std::string>& file_names, size_t thread_num,
-                  size_t buffer_size)
-      : buffer_size_(buffer_size) {
-    readers_.reserve(file_names.size());
-    for (const std::string& f_name : file_names) {
-      readers_.emplace_back(CreateReaderByFileName(f_name));
+  virtual ~IReaderContainer() {}
+  virtual void AppendReader(
+      std::unique_ptr<framework::ReaderBase>&& readers) = 0;
+  virtual void Stop() = 0;
+  virtual void Start() = 0;
+  virtual void ReadNext(std::vector<framework::LoDTensor>* out) = 0;
+};
+
+class OrderedReaderContainer : public IReaderContainer {
+ public:
+  void AppendReader(std::unique_ptr<framework::ReaderBase>&& reader) override {
+    pending_.emplace(std::move(reader));
+  }
+
+  void Stop() override {
+    while (!pending_.empty()) {
+      MoveFrontPendingToDone();
     }
-    prefetchers_.resize(thread_num);
-    StartNewScheduler();
   }
 
-  void ReadNextImpl(std::vector<framework::LoDTensor>* out) override;
+  void Start() override { std::swap(done_, pending_); }
 
-  ~MultiFileReader() { EndScheduler(); }
+  void ReadNext(std::vector<framework::LoDTensor>* out) override {
+    if (!pending_.empty()) {
+      pending_.front()->ReadNext(out);
+      if (out->empty()) {
+        MoveFrontPendingToDone();
+        ReadNext(out);
+      }
+    } else {
+      out->clear();
+    }
+  }
 
  private:
-  void ShutdownImpl() override { EndScheduler(); }
-
-  void StartImpl() override { StartNewScheduler(); }
-
-  void StartNewScheduler();
-  void EndScheduler();
-  void ScheduleThreadFunc();
-  void PrefetchThreadFunc(size_t reader_idx, size_t thread_idx);
-
-  std::vector<std::unique_ptr<framework::ReaderBase>> readers_;
-  std::thread scheduler_;
-  std::vector<std::thread> prefetchers_;
-  size_t buffer_size_;
-  reader::BlockingQueue<size_t>* waiting_reader_idx_;
-  reader::BlockingQueue<size_t>* available_thread_idx_;
-  reader::BlockingQueue<std::vector<framework::LoDTensor>>* buffer_;
+  void MoveFrontPendingToDone() {
+    pending_.front()->Shutdown();
+    pending_.front()->Start();
+    done_.emplace(move(pending_.front()));
+    pending_.pop();
+  }
+
+  std::queue<std::unique_ptr<framework::ReaderBase>> pending_;
+  std::queue<std::unique_ptr<framework::ReaderBase>> done_;
 };
 
-void MultiFileReader::ReadNextImpl(std::vector<framework::LoDTensor>* out) {
-  if (!buffer_->Receive(out)) {
-    out->clear();
-  }
-}
+class PreemptiveReaderContainer : public IReaderContainer {
+  using ReaderList = std::list<std::unique_ptr<framework::ReaderBase>>;
 
-void MultiFileReader::StartNewScheduler() {
-  size_t thread_num = prefetchers_.size();
-  waiting_reader_idx_ = new reader::BlockingQueue<size_t>(readers_.size());
-  available_thread_idx_ = new reader::BlockingQueue<size_t>(thread_num);
-  buffer_ = new reader::BlockingQueue<std::vector<framework::LoDTensor>>(
-      buffer_size_);
+  struct FutureItem {
+    std::vector<framework::LoDTensor> data_;
+    ReaderList::iterator reader_it_;
+    std::exception_ptr exception_;
+  };
 
-  for (size_t i = 0; i < readers_.size(); ++i) {
-    waiting_reader_idx_->Send(i);
-  }
-  waiting_reader_idx_->Close();
-  for (size_t i = 0; i < thread_num; ++i) {
-    available_thread_idx_->Send(i);
-  }
+  using FutureList = std::list<std::future<FutureItem>>;
 
-  scheduler_ = std::thread([this] { ScheduleThreadFunc(); });
-}
+ public:
+  explicit PreemptiveReaderContainer(size_t thread_num) : pool_(thread_num) {}
 
-void MultiFileReader::EndScheduler() {
-  available_thread_idx_->Close();
-  buffer_->Close();
-  waiting_reader_idx_->Close();
-  if (scheduler_.joinable()) {
-    scheduler_.join();
-  }
-  delete buffer_;
-  delete available_thread_idx_;
-  delete waiting_reader_idx_;
-}
-
-void MultiFileReader::ScheduleThreadFunc() {
-  VLOG(5) << "MultiFileReader schedule thread starts.";
-  size_t completed_thread_num = 0;
-  size_t thread_idx;
-  while (available_thread_idx_->Receive(&thread_idx)) {
-    std::thread& prefetcher = prefetchers_[thread_idx];
-    if (prefetcher.joinable()) {
-      prefetcher.join();
-    }
-    size_t reader_idx;
-    if (waiting_reader_idx_->Receive(&reader_idx)) {
-      // Still have files to read. Start a new prefetch thread.
-      prefetcher = std::thread([this, reader_idx, thread_idx] {
-        PrefetchThreadFunc(reader_idx, thread_idx);
-      });
-    } else {
-      // No more file to read.
-      ++completed_thread_num;
-      if (completed_thread_num == prefetchers_.size()) {
-        buffer_->Close();
-        break;
+  void Stop() override {
+    if (!pending_.empty()) {
+      for (auto& reader : pending_) {
+        reader->Shutdown();
+      }
+      for (auto& fu : futures_) {
+        fu.wait();
       }
+      futures_.clear();
+      for (auto& reader : pending_) {
+        reader->Start();
+        done_.emplace_back(std::move(reader));
+      }
+      pending_.clear();
+      bool timeout;
+      complete_queue_.PopAll(1000, &timeout);
+      PADDLE_ENFORCE(!timeout);
     }
   }
-  // If users invoke Shutdown() when scheduler is running, it will close the
-  // 'avaiable_thread_idx_' and prefecther threads have no way to tell scheduler
-  // to release their resource. So a check is needed before scheduler ends.
-  for (auto& p : prefetchers_) {
-    if (p.joinable()) {
-      p.join();
+
+  void Start() override {
+    for (auto& reader : done_) {
+      AppendReader(std::move(reader));
     }
+    done_.clear();
   }
-  VLOG(5) << "MultiFileReader schedule thread terminates.";
-}
-
-void MultiFileReader::PrefetchThreadFunc(size_t reader_idx, size_t thread_idx) {
-  VLOG(5) << "The prefetch thread of file idx '" << reader_idx << "' starts.";
-  std::unique_ptr<framework::ReaderBase>& reader = readers_[reader_idx];
-  while (true) {
-    std::vector<framework::LoDTensor> ins;
-    reader->ReadNext(&ins);
-    if (ins.empty()) {
-      reader->Shutdown();
-      reader->Start();
-      break;
+
+  void ReadNext(std::vector<framework::LoDTensor>* out) override {
+    if (!pending_.empty()) {
+      auto future_it = complete_queue_.Pop();
+      FutureItem item = future_it->get();
+      if (item.exception_) {
+        for (auto it = futures_.begin(); it != futures_.end(); ++it) {
+          if (it != future_it) {
+            it->wait();  // Wait all other threads complete.
+          }
+        }
+        std::rethrow_exception(item.exception_);
+
+      } else if (item.data_.empty()) {  // reader done.
+        done_.emplace_back(std::move(*item.reader_it_));
+        pending_.erase(item.reader_it_);
+        futures_.erase(future_it);
+        ReadNext(out);
+      } else {
+        *out = item.data_;
+        // Continue reading from this reader asynchronously.
+        ReadAsync(item.reader_it_, &future_it);
+      }
+    } else {
+      out->clear();
     }
-    try {
-      buffer_->Send(std::move(ins));
-    } catch (paddle::platform::EnforceNotMet e) {
-      VLOG(5) << "WARNING: The buffer channel has been closed. The prefetch "
-                 "thread of file idx '"
-              << reader_idx << "' will terminate.";
-      break;
+  }
+
+ private:
+  void AppendReader(std::unique_ptr<framework::ReaderBase>&& reader) override {
+    pending_.emplace_back(std::move(reader));
+    auto reader_it = pending_.end();
+    --reader_it;
+
+    futures_.emplace_back();
+    auto future_it = futures_.end();
+    --future_it;
+
+    ReadAsync(reader_it, &future_it);
+  }
+
+  void ReadAsync(const ReaderList::iterator& reader_it,
+                 FutureList::iterator* future_it_ptr) {
+    auto& future_it = *future_it_ptr;
+    *future_it = pool_.enqueue([reader_it, future_it, this] {
+      try {
+        FutureItem item;
+        item.reader_it_ = reader_it;
+        (*reader_it)->ReadNext(&item.data_);
+        if (item.data_.empty()) {
+          (*reader_it)->Shutdown();
+          (*reader_it)->Start();
+        }
+        complete_queue_.Push(future_it);
+        return item;
+      } catch (...) {
+        FutureItem item;
+        item.exception_ = std::current_exception();
+        complete_queue_.Push(future_it);
+        return item;
+      }
+    });
+  }
+
+  FutureList futures_;
+  ThreadPool pool_;
+  framework::BlockingQueue<FutureList::iterator> complete_queue_;
+  std::list<std::unique_ptr<framework::ReaderBase>> pending_;
+  std::list<std::unique_ptr<framework::ReaderBase>> done_;
+};
+
+class MultiFileReader : public framework::ReaderBase {
+ public:
+  MultiFileReader(const std::vector<std::string>& file_names,
+                  std::unique_ptr<IReaderContainer>&& container)
+      : container_(std::move(container)) {
+    for (auto& fn : file_names) {
+      container_->AppendReader(CreateReaderByFileName(fn));
     }
   }
 
-  if (!available_thread_idx_->Send(thread_idx)) {
-    VLOG(5) << "WARNING: The available_thread_idx_ channel has been closed. "
-               "Fail to send thread_idx.";
+  ~MultiFileReader() { container_->Stop(); }
+
+ protected:
+  void ReadNextImpl(std::vector<framework::LoDTensor>* out) override {
+    container_->ReadNext(out);
   }
-  VLOG(5) << "The prefetch thread of file idx '" << reader_idx
-          << "' terminates.";
-}
+  void ShutdownImpl() override { container_->Stop(); }
+  void StartImpl() override { container_->Start(); }
+
+ private:
+  std::unique_ptr<IReaderContainer> container_;
+};
 
 class OpenFilesOp : public framework::OperatorBase {
  public:
@@ -173,13 +223,27 @@ class OpenFilesOp : public framework::OperatorBase {
                       "shape concat's length.");
     const auto& file_names = Attr<std::vector<std::string>>("file_names");
     PADDLE_ENFORCE(!file_names.empty(), "No file to be read!");
-    const size_t thread_num = Attr<int>("thread_num");
-    const size_t buffer_size = Attr<int>("buffer_size");
+    bool is_test = Attr<bool>("is_test");
 
     auto* out = scope.FindVar(Output("Out"))
                     ->template GetMutable<framework::ReaderHolder>();
-    out->Reset(
-        std::make_shared<MultiFileReader>(file_names, thread_num, buffer_size));
+    std::unique_ptr<IReaderContainer> container;
+
+    if (is_test) {
+      container.reset(new OrderedReaderContainer());
+    } else {
+      container.reset(new PreemptiveReaderContainer(
+          static_cast<size_t>(Attr<int>("thread_num"))));
+    }
+
+    std::shared_ptr<framework::ReaderBase> reader(
+        new MultiFileReader(file_names, std::move(container)));
+    auto buffer_size = Attr<int>("buffer_size");
+    if (buffer_size > 1) {
+      reader = framework::MakeDecoratedReader<BufferedReader>(
+          reader, platform::CPUPlace(), buffer_size);
+    }
+    out->Reset(reader);
   }
 };
 
@@ -187,9 +251,7 @@ class OpenFilesOpMaker : public FileReaderMakerBase {
  protected:
   void Apply() override {
     AddAttr<std::vector<std::string>>("file_names", "Files to be read.");
-    AddAttr<int>("thread_num", "The maximal concurrent prefetch thread number.")
-        .GreaterThan(0);
-    AddAttr<int>("buffer_size", "The size of prefetch buffer.").GreaterThan(0);
+    AddAttr<bool>("is_test", "Used for testing data.").SetDefault(false);
 
     AddComment(R"DOC(
       OpenFiles Operator
@@ -197,6 +259,11 @@ class OpenFilesOpMaker : public FileReaderMakerBase {
       An OpenFilesOp creates a MultiFileReader, which is able to
       read data multi-threaded from multiple files.
     )DOC");
+    AddAttr<int>("thread_num",
+                 "The maximal concurrent prefetch thread number. Used only "
+                 "when is_test = False");
+    AddAttr<int>("buffer_size", "The reading buffer of these files.")
+        .GreaterThan(0);
   }
 };
 
diff --git a/paddle/fluid/operators/tensorrt_engine_op.cc b/paddle/fluid/operators/tensorrt_engine_op.cc
index 647cfc0a0af2be85e2868c6f68cab962c6631a8d..43672d6db92a981f0fbe6e8f7079dafc6ae4052e 100644
--- a/paddle/fluid/operators/tensorrt_engine_op.cc
+++ b/paddle/fluid/operators/tensorrt_engine_op.cc
@@ -24,6 +24,9 @@
 #include "paddle/fluid/operators/tensorrt_engine_op.h"
 
 namespace paddle {
+
+DEFINE_int32(tensorrt_engine_batch_size, 1, "the batch_size of TensorRT");
+
 namespace operators {
 
 using inference::Singleton;
@@ -52,7 +55,6 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t> &shape) {
                     "TensorRT' tensor input requires at least 2 dimensions");
   PADDLE_ENFORCE_LE(shape.size(), 4UL,
                     "TensorRT' tensor input requires at most 4 dimensions");
-
   switch (shape.size()) {
     case 2:
       return nvinfer1::Dims2(shape[0], shape[1]);
@@ -90,27 +92,36 @@ void TensorRTEngineKernel<DeviceContext, T>::Prepare(
   engine->InitNetwork();
 
   framework::BlockDesc block(nullptr /*programdesc*/, &block_desc);
+  VLOG(4) << "parsed var size " << block.AllVars().size();
   // Add inputs
   VLOG(4) << "declare inputs";
   for (auto &input : context.Inputs("Xs")) {
     VLOG(4) << "declare input " << input;
     auto *var = block.FindVar(input);
+    // The TensorRT engine needs to create parameters, so the description of
+    // each input must be set in the block desc; fail early if it is missing.
+    PADDLE_ENFORCE(var, "no variable called %s", input);
     PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR,
                       "TensorRT engine only takes LoDTensor as input");
     auto shape = var->GetShape();
+    // For the special batch_size placeholder -1, replace it with the
+    // configured batch size so that a concrete shape is declared.
+    // TODO(Superjomn) fix this with batch broadcast; otherwise a variable
+    // batch size cannot be handled.
+    if (shape[0] == -1) {
+      shape[0] = FLAGS_tensorrt_engine_batch_size;
+    }
     engine->DeclareInput(
         input, FluidDataType2TRT(
                    var->Proto()->type().lod_tensor().tensor().data_type()),
-        Vec2TRT_Dims(var->GetShape()));
+        Vec2TRT_Dims(shape));
   }
 
   inference::Singleton<inference::tensorrt::OpConverter>::Global().ConvertBlock(
       block_desc, parameters, context.scope(), engine);
 
   // Add outputs
-  VLOG(4) << "declare outputs";
   for (auto &output : context.Outputs("Ys")) {
-    VLOG(4) << "declare output " << output;
     engine->DeclareOutput(output);
   }
 
@@ -151,4 +162,7 @@ REGISTER_OP_CPU_KERNEL(
     ops::TensorRTEngineKernel<paddle::platform::CPUDeviceContext, int>,
     ops::TensorRTEngineKernel<paddle::platform::CPUDeviceContext, int64_t>);
 
+// A trick to compile with the needed TensorRT op converter.
+USE_TRT_CONVERTER(mul)
+
 #endif  // PADDLE_WITH_CUDA
diff --git a/paddle/fluid/operators/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt_engine_op.h
index 1602a913aeebe43fabe2f9c9036edd18ac4c70fd..a332d70030ffa6a033f6b2b33487a4fd279b7016 100644
--- a/paddle/fluid/operators/tensorrt_engine_op.h
+++ b/paddle/fluid/operators/tensorrt_engine_op.h
@@ -24,6 +24,9 @@
 #include "paddle/fluid/inference/tensorrt/engine.h"
 
 namespace paddle {
+
+DECLARE_int32(tensorrt_engine_batch_size);
+
 namespace operators {
 
 using inference::Singleton;
@@ -53,7 +56,6 @@ template <typename DeviceContext, typename T>
 class TensorRTEngineKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    VLOG(4) << "TensorRTEngineKernel executing";
     auto engine_name = context.Attr<std::string>("engine_uniq_key");
     if (!Singleton<TRT_EngineManager>::Global().HasEngine(engine_name)) {
       Prepare(context);
@@ -61,11 +63,8 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
     auto* engine = Singleton<TRT_EngineManager>::Global().Get(engine_name);
     auto input_names = context.op().Inputs("Xs");
     PADDLE_ENFORCE(!input_names.empty(), "should pass more than one inputs");
-    // Try to determine a batch_size
-    auto& tensor0 = inference::analysis::GetFromScope<framework::LoDTensor>(
-        context.scope(), input_names.front());
-    int batch_size = tensor0.dims()[0];
-    PADDLE_ENFORCE_LE(batch_size, context.Attr<int>("max_batch"));
+    PADDLE_ENFORCE_LE(FLAGS_tensorrt_engine_batch_size,
+                      context.Attr<int>("max_batch"));
 
     // Convert input tensor from fluid to engine.
     for (const auto& x : context.Inputs("Xs")) {
@@ -81,8 +80,8 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
       }
     }
     // Execute the engine.
-    PADDLE_ENFORCE_GT(batch_size, 0);
-    engine->Execute(batch_size);
+    PADDLE_ENFORCE_GT(FLAGS_tensorrt_engine_batch_size, 0);
+    engine->Execute(FLAGS_tensorrt_engine_batch_size);
     // Convert output tensor from engine to fluid
     for (const auto& y : context.Outputs("Ys")) {
       // convert output and copy to fluid.
@@ -94,18 +93,21 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
       auto* fluid_v = context.scope().FindVar(y);
       PADDLE_ENFORCE_NOT_NULL(fluid_v, "no output variable called %s", y);
       auto* fluid_t = fluid_v->GetMutable<framework::LoDTensor>();
-      fluid_t->Resize(framework::make_ddim(ddim));
       auto size = inference::analysis::AccuDims(dims.d, dims.nbDims);
-      if (platform::is_cpu_place(fluid_t->place())) {
-        // TODO(Superjomn) change this float to dtype size.
-        engine->GetOutputInCPU(
-            y, fluid_t->mutable_data<float>(platform::CPUPlace()),
-            size * sizeof(float));
-      } else {
-        engine->GetOutputInGPU(
-            y, fluid_t->mutable_data<float>(platform::CUDAPlace()),
-            size * sizeof(float));
-      }
+      fluid_t->Resize(framework::make_ddim(ddim));
+
+      // TODO(Superjomn) find some way to determine which device to output the
+      // tensor.
+      // if (platform::is_cpu_place(fluid_t->place())) {
+      // TODO(Superjomn) change this float to dtype size.
+      engine->GetOutputInCPU(y,
+                             fluid_t->mutable_data<float>(platform::CPUPlace()),
+                             size * sizeof(float));
+      //} else {
+      // engine->GetOutputInGPU(
+      // y, fluid_t->mutable_data<float>(platform::CUDAPlace()),
+      // size * sizeof(float));
+      //}
     }
 
     cudaStreamSynchronize(*engine->stream());
diff --git a/paddle/fluid/recordio/scanner.cc b/paddle/fluid/recordio/scanner.cc
index 06a13e6c5b6ea76456e231e3f7b1eb33492b16ea..a0a2f984228db0e7a015630655a3176aa4d1a5a4 100644
--- a/paddle/fluid/recordio/scanner.cc
+++ b/paddle/fluid/recordio/scanner.cc
@@ -28,6 +28,7 @@ Scanner::Scanner(std::unique_ptr<std::istream> &&stream)
 
 Scanner::Scanner(const std::string &filename)
     : stream_(new std::ifstream(filename)), parser_(*stream_) {
+  PADDLE_ENFORCE(static_cast<bool>(*stream_), "Cannot open file %s", filename);
   Reset();
 }
 
diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index 9837117fbbc0b4218dc89fc5de0c176abcd32b83..9e58a39eb0939fa15e9c19e1e6fc89a6f99d9a0c 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -333,8 +333,7 @@ function assert_api_not_changed() {
     python ${PADDLE_ROOT}/tools/diff_api.py ${PADDLE_ROOT}/paddle/fluid/API.spec new.spec
     deactivate
 
-    # Use git diff --name-only HEAD^ may not get file changes for update commits in one PR
-    API_CHANGE=`echo $CHANGED_FILES | grep "paddle/fluid/API.spec" || true`
+    API_CHANGE=`git diff --name-only upstream/develop | grep "paddle/fluid/API.spec" || true`
     echo "checking API.spec change, PR: ${GIT_PR_ID}, changes: ${API_CHANGE}"
     if [ ${API_CHANGE} ] && [ "${GIT_PR_ID}" != "" ]; then
         # TODO: curl -H 'Authorization: token ${TOKEN}'
@@ -600,11 +599,11 @@ function main() {
       cicheck)
         cmake_gen ${PYTHON_ABI:-""}
         build
-        assert_api_not_changed
         run_test
         gen_capi_package
         gen_fluid_inference_lib
         test_fluid_inference_lib
+        assert_api_not_changed
         ;;
       *)
         print_usage
diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py
index 782aa933f2ee86274e800045c9356d8072915fc1..79a11a30d602cb33c2583873e0d0f2e15e0fcb8c 100644
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -25,9 +25,6 @@ import numpy
 __all__ = [
     'split_lod_tensor',
     'merge_lod_tensor',
-    'BlockGuard',
-    'BlockGuardWithCompletion',
-    'WhileGuard',
     'While',
     'Switch',
     'lod_rank_table',
diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py
index 0665c09bfb52c932219be68ca801cfa951d672d3..07a7ef15ac5cfd39b4cdb30bcff95fc499ae50ae 100644
--- a/python/paddle/fluid/layers/io.py
+++ b/python/paddle/fluid/layers/io.py
@@ -12,14 +12,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import contextlib
+import multiprocessing
+import threading
 
-from .. import core
-from ..framework import convert_np_dtype_to_dtype_, default_main_program, default_startup_program, Program
-from ..unique_name import generate as unique_name
+from ..data_feeder import DataFeeder
 from control_flow import BlockGuard
-from ..layer_helper import LayerHelper
+from layer_function_generator import templatedoc
+from .. import core
 from ..executor import global_scope
-from layer_function_generator import generate_layer_fn, templatedoc
+from ..framework import convert_np_dtype_to_dtype_, default_main_program, \
+    default_startup_program, program_guard, Program
+from ..layer_helper import LayerHelper
+from ..unique_name import generate as unique_name
 
 __all__ = [
     'data', 'open_recordio_file', 'open_files', 'read_file', 'shuffle', 'batch',
@@ -445,7 +449,12 @@ def random_data_generator(low, high, shapes, lod_levels, for_parallel=True):
     return monkey_patch_reader_methods(main_prog_var)
 
 
-def py_reader(capacity, shapes, dtypes, lod_levels=None):
+def py_reader(capacity,
+              shapes,
+              dtypes,
+              lod_levels=None,
+              name=None,
+              use_double_buffer=True):
     """
     Create a reader and blocking queue for data feeding in Python
     
@@ -458,10 +467,13 @@ def py_reader(capacity, shapes, dtypes, lod_levels=None):
     using `close()` method when unused.
 
     Args:
+       use_double_buffer(bool): Whether to use double buffer or not.
        capacity(int): The maximum capacity of the BlockingQueue.
-       shapes(list): List of tuples which declaring data shapes.
-       dtypes(list): List of strs which declaring data type. 
-       lod_levels(list): List of ints which declaring data lod_level.
+       shapes(list|tuple): List of tuples declaring the data shapes.
+       dtypes(list|tuple): List of strs declaring the data types.
+       lod_levels(list|tuple): List of ints declaring the data lod_levels.
+       name(basestring): The prefix of the Python queue name and reader name.
+            If None, the names will be generated automatically.
 
     Returns:
        tuple(Variable, BlockingQueue):
@@ -502,15 +514,23 @@ def py_reader(capacity, shapes, dtypes, lod_levels=None):
     if lod_levels is None:
         lod_levels = [0] * len(shapes)
 
-    queue_name = unique_name('lod_tensor_blocking_queue')
+    if name is None:
+        queue_name = unique_name('lod_tensor_blocking_queue')
+        reader_name = unique_name('create_py_reader')
+        double_buffer_name = unique_name('double_buffer')
+    else:
+        queue_name = "_".join([name, "queue"])
+        reader_name = "_".join([name, "reader"])
+        double_buffer_name = "_".join([name, "double_buffer"])
+
     var = global_scope().var(queue_name)
     feed_queue = core.init_lod_tensor_blocking_queue(var, capacity, shapes)
 
     startup_blk = default_startup_program().current_block()
-    startup_var = startup_blk.create_var(name=unique_name('create_py_reader'))
+    startup_var = startup_blk.create_var(name=reader_name)
     startup_blk.append_op(
         type='create_py_reader',
-        inputs={'blocking_queue': queue_name},
+        inputs={'blocking_queue': [queue_name]},
         outputs={'Out': [startup_var]},
         attrs={
             'shape_concat': shape_concat,
@@ -524,17 +544,96 @@ def py_reader(capacity, shapes, dtypes, lod_levels=None):
     main_prog_var = _copy_reader_var_(default_main_program().current_block(),
                                       startup_var)
 
-    return monkey_patch_reader_methods(main_prog_var), feed_queue
+    reader = monkey_patch_reader_methods(main_prog_var)
+    if use_double_buffer:
+        double_buffer_reader = double_buffer(reader, name=double_buffer_name)
+        # we return a double buffer reader. However, the reset method comes from
+        # py_reader.
+        double_buffer_reader.reset = reader.reset
+        reader = double_buffer_reader
+
+    # monkey patch py_reader special methods
+    reader.queue = feed_queue
+    current_reset_method = reader.reset
+    reader.thread = None
+    reader.tensor_provider = None
+    reader.exited = False
+
+    def start_provide_thread(func):
+        def __provider_thread__():
+            for tensors in func():
+                array = core.LoDTensorArray()
+                for item in tensors:
+                    if not isinstance(item, core.LoDTensor):
+                        tmp = core.LoDTensor()
+                        tmp.set(item, core.CPUPlace())
+                        item = tmp
+
+                    array.append(item)
+
+                if reader.exited:
+                    break
+                feed_queue.push(array)
+                if reader.exited:
+                    break
+            feed_queue.close()
+
+        reader.thread = threading.Thread(target=__provider_thread__)
+        reader.thread.start()
+
+    def __set_tensor_provider__(func):
+        reader.tensor_provider = func
+
+    def __set_paddle_reader__(paddle_reader):
+        with program_guard(Program(), Program()):
+            feed_list = []
+            counter = 0
+            for dtype, shape, lod_level in zip(dtypes, shapes, lod_levels):
+                name = str(counter)
+                feed_list.append(
+                    data(
+                        name=name,
+                        dtype=dtype,
+                        shape=shape,
+                        lod_level=lod_level))
+                counter += 1
+
+            feeder = DataFeeder(feed_list=feed_list, place=core.CPUPlace())
+            paddle_reader = feeder.decorate_reader(
+                paddle_reader, multi_devices=False)
+
+        def __tensor_provider__():
+            for slots in paddle_reader():
+                yield [slots[str(idx)] for idx in xrange(counter)]
+
+        __set_tensor_provider__(__tensor_provider__)
+
+    def __reset__():
+        current_reset_method()
+        if reader.thread is not None and reader.tensor_provider is not None:
+            reader.exited = True
+            reader.thread.join()
+            reader.exited = False
+
+    def __start__():
+        start_provide_thread(reader.tensor_provider)
+
+    reader.reset = __reset__
+    reader.decorate_tensor_provider = __set_tensor_provider__
+    reader.decorate_paddle_reader = __set_paddle_reader__
+    reader.start = __start__
+
+    return reader
 
 
 def open_files(filenames,
                shapes,
                lod_levels,
                dtypes,
-               thread_num=1,
+               thread_num=None,
                buffer_size=None,
                pass_num=1,
-               for_parallel=True):
+               is_test=None):
     """
     Open files
 
@@ -547,14 +646,14 @@ def open_files(filenames,
        shapes(list): List of tuples which declaring data shapes.
        lod_levels(list): List of ints which declaring data lod_level.
        dtypes(list): List of strs which declaring data type.
-       thread_num(int): The maximal concurrent prefetch thread number.
-       buffer_size(int|None): The size of prefetch buffer. If it is setted None, 
-            buffer size will be thread_num * 3.
-            Default: None
+       thread_num(int|None): The number of threads used to read files.
+            Default: min(len(filenames), cpu_number).
+       buffer_size(int|None): The reader's buffer size. Default: 3 * thread_num.
        pass_num(int): Number of passes to run.
-       for_parallel(Bool): Set it as True if you are going to run 
-            subsequent operators in parallel.
-            Default: True
+       is_test(bool|None): Whether `open_files` is used for testing. If it is,
+            data is generated in the same order as the files. Otherwise, the
+            order of data is not guaranteed to be the same across epochs.
+            [Default: False].
 
     Returns:
        Variable: A Reader Variable via which we can get file data.
@@ -566,15 +665,21 @@ def open_files(filenames,
                                                      './data2.recordio'],
                                              shapes=[(3,224,224), (1)],
                                              lod_levels=[0, 0],
-                                             dtypes=['float32', 'int64'],
-                                             thread_num=2,
-                                             buffer_size=2)
+                                             dtypes=['float32', 'int64'])
 
          # Via the reader, we can use 'read_file' layer to get data:
          image, label = fluid.layers.io.read_file(reader)
     """
+    if thread_num is None:
+        num_files = 1 if isinstance(filenames, basestring) else len(filenames)
+        thread_num = min(num_files, multiprocessing.cpu_count())
+    else:
+        thread_num = int(thread_num)
     if buffer_size is None:
-        buffer_size = thread_num * 3
+        buffer_size = 3 * thread_num
+    else:
+        buffer_size = int(buffer_size)
+
     if isinstance(filenames, basestring):
         filenames = [filenames]
     dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes]
@@ -588,17 +693,18 @@ def open_files(filenames,
     multi_file_reader_name = unique_name('multi_file_reader')
     startup_blk = default_startup_program().current_block()
     startup_reader = startup_blk.create_var(name=multi_file_reader_name)
+    attrs = {
+        'shape_concat': shape_concat,
+        'lod_levels': lod_levels,
+        'ranks': ranks,
+        'file_names': filenames,
+        'thread_num': thread_num,
+        'buffer_size': buffer_size
+    }
+    if is_test is not None:
+        attrs['is_test'] = is_test
     startup_blk.append_op(
-        type='open_files',
-        outputs={'Out': [startup_reader]},
-        attrs={
-            'shape_concat': shape_concat,
-            'lod_levels': lod_levels,
-            'ranks': ranks,
-            'file_names': filenames,
-            'thread_num': thread_num,
-            'buffer_size': buffer_size
-        })
+        type='open_files', outputs={'Out': [startup_reader]}, attrs=attrs)
 
     startup_reader.desc.set_dtypes(dtypes)
     startup_reader.persistable = True
diff --git a/python/paddle/fluid/layers/metric_op.py b/python/paddle/fluid/layers/metric_op.py
index 194a16b123c441ac1318b8ce58158f67e2a8093d..e7d7a9e826de95514b6f2e04e7408075ab0b8cb6 100644
--- a/python/paddle/fluid/layers/metric_op.py
+++ b/python/paddle/fluid/layers/metric_op.py
@@ -114,23 +114,13 @@ def auc(input, label, curve='ROC', num_thresholds=200, topk=1):
             prediction = network(image, is_infer=True)
             auc_out=fluid.layers.auc(input=prediction, label=label)
     """
-
-    warnings.warn(
-        "This interface is not recommended, fluid.layers.auc compute the auc at every minibatch, \
-        but can not aggregate them and get the pass AUC, because pass \
-        auc can not be averaged with weighted from the minibatch auc value. \
-        Please use fluid.metrics.Auc, it can compute the auc value via Python natively, \
-        which can get every minibatch and every pass auc value.", Warning)
     helper = LayerHelper("auc", **locals())
-    topk_out = helper.create_tmp_variable(dtype=input.dtype)
-    topk_indices = helper.create_tmp_variable(dtype="int64")
-    topk_out, topk_indices = nn.topk(input, k=k)
-    auc_out = helper.create_tmp_variable(dtype="float32")
+    auc_out = helper.create_tmp_variable(dtype="float64")
     # make tp, tn, fp, fn persistable, so that can accumulate all batches.
-    tp = helper.create_global_variable(persistable=True)
-    tn = helper.create_global_variable(persistable=True)
-    fp = helper.create_global_variable(persistable=True)
-    fn = helper.create_global_variable(persistable=True)
+    tp = helper.create_global_variable(persistable=True, dtype='int64')
+    tn = helper.create_global_variable(persistable=True, dtype='int64')
+    fp = helper.create_global_variable(persistable=True, dtype='int64')
+    fn = helper.create_global_variable(persistable=True, dtype='int64')
     for var in [tp, tn, fp, fn]:
         helper.set_variable_initializer(
             var, Constant(
@@ -139,8 +129,7 @@ def auc(input, label, curve='ROC', num_thresholds=200, topk=1):
     helper.append_op(
         type="auc",
         inputs={
-            "Out": [topk_out],
-            "Indices": [topk_indices],
+            "Predict": [input],
             "Label": [label],
             "TP": [tp],
             "TN": [tn],
@@ -156,4 +145,4 @@ def auc(input, label, curve='ROC', num_thresholds=200, topk=1):
             "FPOut": [fp],
             "FNOut": [fn]
         })
-    return auc_out
+    return auc_out, [tp, tn, fp, fn]
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 56124663929d1e33b7144ab57ae3b3c55e1652b3..ab40d0c217f565493b30d9a4cb3a600863122bc7 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -166,7 +166,8 @@ def fc(input,
         param_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for learnable
             parameters/weights of this layer.
         bias_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for the bias
-            of this layer. If it is set to None, no bias will be added to the output units.
+            of this layer. If it is set to False, no bias will be added to the output units.
+            If it is set to None, the bias is initialized to zero. Default: None.
         act (str, default None): Activation to be applied to the output of this layer.
         is_test(bool): A flag indicating whether execution is in test phase.
         use_mkldnn(bool): Use mkldnn kernel or not, it is valid only when the mkldnn
diff --git a/python/paddle/fluid/metrics.py b/python/paddle/fluid/metrics.py
index 17bb0826a6ea86c98a069263dfab84b99e1177ad..b37b09ac81687882443c948569d9c4fca9310f78 100644
--- a/python/paddle/fluid/metrics.py
+++ b/python/paddle/fluid/metrics.py
@@ -591,7 +591,7 @@ class Auc(MetricBase):
                       for i in range(self._num_thresholds - 2)]
         thresholds = [0.0 - kepsilon] + thresholds + [1.0 + kepsilon]
 
-        # caculate TP, FN, TN, FP count
+        # calculate TP, FN, TN, FP count
         for idx_thresh, thresh in enumerate(thresholds):
             tp, fn, tn, fp = 0, 0, 0, 0
             for i, lbl in enumerate(labels):
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index 7fc8e106fb43666be9c1ea245994dc1c7ac85d7d..3fe99f55011ab7f745c3ad98ec44dfe277a13e05 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -324,7 +324,7 @@ class MomentumOptimizer(Optimizer):
 
         & if (use\_nesterov):
 
-        &\quad   param = param - gradient * learning\_rate + mu * velocity * learning\_rate
+        &\quad   param = param - (gradient + mu * velocity) * learning\_rate
 
         & else:
 
diff --git a/python/paddle/fluid/tests/demo/pyreader.py b/python/paddle/fluid/tests/demo/pyreader.py
new file mode 100644
index 0000000000000000000000000000000000000000..3a7dbf8106268582c9c771c7ab9e2a7be7a2f313
--- /dev/null
+++ b/python/paddle/fluid/tests/demo/pyreader.py
@@ -0,0 +1,95 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy
+
+import paddle
+import paddle.dataset.mnist as mnist
+import paddle.fluid as fluid
+import paddle.v2
+
+
+def network(is_train):
+    reader = fluid.layers.py_reader(
+        capacity=10,
+        shapes=((-1, 784), (-1, 1)),
+        dtypes=('float32', 'int64'),
+        name="train_reader" if is_train else "test_reader")
+    img, label = fluid.layers.read_file(reader)
+
+    hidden = img
+
+    for i in xrange(2):
+        hidden = fluid.layers.fc(input=hidden, size=100, act='tanh')
+        hidden = fluid.layers.dropout(
+            hidden, dropout_prob=0.5, is_test=not is_train)
+
+    prediction = fluid.layers.fc(input=hidden, size=10, act='softmax')
+    loss = fluid.layers.cross_entropy(input=prediction, label=label)
+    return fluid.layers.mean(loss), reader
+
+
+def main():
+    train_prog = fluid.Program()
+    startup_prog = fluid.Program()
+
+    with fluid.program_guard(train_prog, startup_prog):
+        with fluid.unique_name.guard():
+            loss, train_reader = network(True)
+            adam = fluid.optimizer.Adam(learning_rate=0.01)
+            adam.minimize(loss)
+
+    test_prog = fluid.Program()
+    test_startup = fluid.Program()
+    with fluid.program_guard(test_prog, test_startup):
+        with fluid.unique_name.guard():
+            test_loss, test_reader = network(False)
+
+    fluid.Executor(fluid.CUDAPlace(0)).run(startup_prog)
+    fluid.Executor(fluid.CUDAPlace(0)).run(test_startup)
+
+    trainer = fluid.ParallelExecutor(
+        use_cuda=True, loss_name=loss.name, main_program=train_prog)
+
+    tester = fluid.ParallelExecutor(
+        use_cuda=True, share_vars_from=trainer, main_program=test_prog)
+
+    train_reader.decorate_paddle_reader(
+        paddle.v2.reader.shuffle(
+            paddle.batch(mnist.train(), 512), buf_size=8192))
+
+    test_reader.decorate_paddle_reader(paddle.batch(mnist.test(), 512))
+
+    for epoch_id in xrange(10):
+        train_reader.start()
+        try:
+            while True:
+                print 'train_loss', numpy.array(
+                    trainer.run(fetch_list=[loss.name]))
+        except fluid.core.EOFException:
+            print 'End of epoch', epoch_id
+            train_reader.reset()
+
+        test_reader.start()
+        try:
+            while True:
+                print 'test loss', numpy.array(
+                    tester.run(fetch_list=[test_loss.name]))
+        except fluid.core.EOFException:
+            print 'End of testing'
+            test_reader.reset()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/python/paddle/fluid/tests/demo/text_classification/convert_data_to_recordio.py b/python/paddle/fluid/tests/demo/text_classification/convert_data_to_recordio.py
index 9425d472a48056e71da5da364f659971ef6c2520..8244617711138d590193b2898de5d2f3aeb1e11e 100644
--- a/python/paddle/fluid/tests/demo/text_classification/convert_data_to_recordio.py
+++ b/python/paddle/fluid/tests/demo/text_classification/convert_data_to_recordio.py
@@ -31,7 +31,10 @@ def load_vocab(filename):
 
 
 # load word dict with paddle inner function
-word_dict = load_vocab(sys.argv[1])
+if len(sys.argv) == 1:
+    word_dict = paddle.dataset.imdb.word_dict()
+else:
+    word_dict = load_vocab(sys.argv[1])
 word_dict["<unk>"] = len(word_dict)
 print "Dict dim = ", len(word_dict)
 
diff --git a/python/paddle/fluid/tests/demo/text_classification/train.py b/python/paddle/fluid/tests/demo/text_classification/train.py
index e408684c6e0941a1b317ffeac66f071c1382836d..281c2869d642c7fe41a386c42208ca2da1dc2891 100644
--- a/python/paddle/fluid/tests/demo/text_classification/train.py
+++ b/python/paddle/fluid/tests/demo/text_classification/train.py
@@ -41,16 +41,14 @@ def network_cfg(is_train, pass_num=100):
             pass_num=pass_num,
             shapes=[[-1, 1], [-1, 1]],
             lod_levels=[1, 0],
-            dtypes=['int64', 'int64'],
-            thread_num=1)
+            dtypes=['int64', 'int64'])
 
         test_file_obj = fluid.layers.open_files(
             filenames=TEST_FILES,
             pass_num=1,
             shapes=[[-1, 1], [-1, 1]],
             lod_levels=[1, 0],
-            dtypes=['int64', 'int64'],
-            thread_num=1)
+            dtypes=['int64', 'int64'])
 
         if is_train:
             file_obj = fluid.layers.shuffle(train_file_obj, buffer_size=1000)
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 8c564abf986d351b31e993b62bfe1f17c52a4e10..322d76515e76c3d322ac7c4f989bbc95875cb654 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -48,6 +48,7 @@ list(REMOVE_ITEM TEST_OPS test_warpctc_op)
 list(REMOVE_ITEM TEST_OPS test_dist_train)
 list(REMOVE_ITEM TEST_OPS test_parallel_executor_crf)
 list(REMOVE_ITEM TEST_OPS test_parallel_executor_fetch_feed)
+list(REMOVE_ITEM TEST_OPS test_dist_se_resnext)
 foreach(TEST_OP ${TEST_OPS})
     py_test_modules(${TEST_OP} MODULES ${TEST_OP})
 endforeach(TEST_OP)
@@ -60,3 +61,4 @@ if(WITH_DISTRIBUTE)
 endif()
 py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL)
 py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL)
+py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext SERIAL)
diff --git a/python/paddle/fluid/tests/unittests/dist_se_resnext.py b/python/paddle/fluid/tests/unittests/dist_se_resnext.py
new file mode 100644
index 0000000000000000000000000000000000000000..72bc1729b0f63b23ad7ecb5ad703b984a4c614ac
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/dist_se_resnext.py
@@ -0,0 +1,350 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import argparse
+import time
+import math
+
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.profiler as profiler
+from paddle.fluid import core
+import unittest
+from multiprocessing import Process
+import os
+import sys
+import signal
+
+# Fix seed for test
+fluid.default_startup_program().random_seed = 1
+fluid.default_main_program().random_seed = 1
+
+train_parameters = {
+    "input_size": [3, 224, 224],
+    "input_mean": [0.485, 0.456, 0.406],
+    "input_std": [0.229, 0.224, 0.225],
+    "learning_strategy": {
+        "name": "piecewise_decay",
+        "epochs": [30, 60, 90],
+        "steps": [0.1, 0.01, 0.001, 0.0001]
+    }
+}
+
+
+class SE_ResNeXt():
+    def __init__(self, layers=50):
+        self.params = train_parameters
+        self.layers = layers
+
+    def net(self, input, class_dim=1000):
+        layers = self.layers
+        supported_layers = [50, 101, 152]
+        assert layers in supported_layers, \
+            "supported layers are {} but input layer is {}".format(supported_layers, layers)
+        if layers == 50:
+            cardinality = 32
+            reduction_ratio = 16
+            depth = [3, 4, 6, 3]
+            num_filters = [128, 256, 512, 1024]
+
+            conv = self.conv_bn_layer(
+                input=input,
+                num_filters=64,
+                filter_size=7,
+                stride=2,
+                act='relu')
+            conv = fluid.layers.pool2d(
+                input=conv,
+                pool_size=3,
+                pool_stride=2,
+                pool_padding=1,
+                pool_type='max')
+        elif layers == 101:
+            cardinality = 32
+            reduction_ratio = 16
+            depth = [3, 4, 23, 3]
+            num_filters = [128, 256, 512, 1024]
+
+            conv = self.conv_bn_layer(
+                input=input,
+                num_filters=64,
+                filter_size=7,
+                stride=2,
+                act='relu')
+            conv = fluid.layers.pool2d(
+                input=conv,
+                pool_size=3,
+                pool_stride=2,
+                pool_padding=1,
+                pool_type='max')
+        elif layers == 152:
+            cardinality = 64
+            reduction_ratio = 16
+            depth = [3, 8, 36, 3]
+            num_filters = [128, 256, 512, 1024]
+
+            conv = self.conv_bn_layer(
+                input=input,
+                num_filters=64,
+                filter_size=3,
+                stride=2,
+                act='relu')
+            conv = self.conv_bn_layer(
+                input=conv, num_filters=64, filter_size=3, stride=1, act='relu')
+            conv = self.conv_bn_layer(
+                input=conv,
+                num_filters=128,
+                filter_size=3,
+                stride=1,
+                act='relu')
+            conv = fluid.layers.pool2d(
+                input=conv, pool_size=3, pool_stride=2, pool_padding=1, \
+                pool_type='max')
+
+        for block in range(len(depth)):
+            for i in range(depth[block]):
+                conv = self.bottleneck_block(
+                    input=conv,
+                    num_filters=num_filters[block],
+                    stride=2 if i == 0 and block != 0 else 1,
+                    cardinality=cardinality,
+                    reduction_ratio=reduction_ratio)
+
+        pool = fluid.layers.pool2d(
+            input=conv, pool_size=7, pool_type='avg', global_pooling=True)
+        drop = fluid.layers.dropout(x=pool, dropout_prob=0.2)
+        stdv = 1.0 / math.sqrt(drop.shape[1] * 1.0)
+        out = fluid.layers.fc(input=drop, size=class_dim, act='softmax')
+        return out
+
+    def shortcut(self, input, ch_out, stride):
+        ch_in = input.shape[1]
+        if ch_in != ch_out or stride != 1:
+            filter_size = 1
+            return self.conv_bn_layer(input, ch_out, filter_size, stride)
+        else:
+            return input
+
+    def bottleneck_block(self, input, num_filters, stride, cardinality,
+                         reduction_ratio):
+        conv0 = self.conv_bn_layer(
+            input=input, num_filters=num_filters, filter_size=1, act='relu')
+        conv1 = self.conv_bn_layer(
+            input=conv0,
+            num_filters=num_filters,
+            filter_size=3,
+            stride=stride,
+            groups=cardinality,
+            act='relu')
+        conv2 = self.conv_bn_layer(
+            input=conv1, num_filters=num_filters * 2, filter_size=1, act=None)
+        scale = self.squeeze_excitation(
+            input=conv2,
+            num_channels=num_filters * 2,
+            reduction_ratio=reduction_ratio)
+
+        short = self.shortcut(input, num_filters * 2, stride)
+
+        return fluid.layers.elementwise_add(x=short, y=scale, act='relu')
+
+    def conv_bn_layer(self,
+                      input,
+                      num_filters,
+                      filter_size,
+                      stride=1,
+                      groups=1,
+                      act=None):
+        conv = fluid.layers.conv2d(
+            input=input,
+            num_filters=num_filters,
+            filter_size=filter_size,
+            stride=stride,
+            padding=(filter_size - 1) / 2,
+            groups=groups,
+            act=None,
+            bias_attr=False)
+        return fluid.layers.batch_norm(input=conv, act=act)
+
+    def squeeze_excitation(self, input, num_channels, reduction_ratio):
+        pool = fluid.layers.pool2d(
+            input=input, pool_size=0, pool_type='avg', global_pooling=True)
+        stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
+        squeeze = fluid.layers.fc(input=pool,
+                                  size=num_channels / reduction_ratio,
+                                  act='relu')
+        stdv = 1.0 / math.sqrt(squeeze.shape[1] * 1.0)
+        excitation = fluid.layers.fc(input=squeeze,
+                                     size=num_channels,
+                                     act='sigmoid')
+        scale = fluid.layers.elementwise_mul(x=input, y=excitation, axis=0)
+        return scale
+
+
+def get_model(batch_size):
+    # Input data
+    image = fluid.layers.fill_constant(
+        shape=[batch_size, 3, 224, 224], dtype='float32', value=0.0)
+    label = fluid.layers.fill_constant(
+        shape=[batch_size, 1], dtype='int64', value=0.0)
+
+    # Train program
+    model = SE_ResNeXt(layers=50)
+    out = model.net(input=image, class_dim=102)
+    cost = fluid.layers.cross_entropy(input=out, label=label)
+
+    avg_cost = fluid.layers.mean(x=cost)
+    acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
+    acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
+
+    # Evaluator
+    test_program = fluid.default_main_program().clone(for_test=True)
+
+    # Optimization
+    total_images = 6149  # flowers
+    epochs = [30, 60, 90]
+    step = int(total_images / batch_size + 1)
+
+    bd = [step * e for e in epochs]
+    base_lr = 0.1
+    lr = []
+    lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
+
+    optimizer = fluid.optimizer.Momentum(
+        learning_rate=fluid.layers.piecewise_decay(
+            boundaries=bd, values=lr),
+        momentum=0.9,
+        regularization=fluid.regularizer.L2Decay(1e-4))
+    optimizer.minimize(avg_cost)
+
+    # Reader
+    train_reader = paddle.batch(
+        paddle.dataset.flowers.train(), batch_size=batch_size)
+    test_reader = paddle.batch(
+        paddle.dataset.flowers.test(), batch_size=batch_size)
+
+    return test_program, avg_cost, train_reader, test_reader, acc_top1, out
+
+
+def get_transpiler(trainer_id, main_program, pserver_endpoints, trainers):
+    t = fluid.DistributeTranspiler()
+    t.transpile(
+        trainer_id=trainer_id,
+        program=main_program,
+        pservers=pserver_endpoints,
+        trainers=trainers)
+    return t
+
+
+class DistSeResneXt2x2:
+    def run_pserver(self, pserver_endpoints, trainers, current_endpoint,
+                    trainer_id):
+        get_model(batch_size=2)
+        t = get_transpiler(trainer_id,
+                           fluid.default_main_program(), pserver_endpoints,
+                           trainers)
+        pserver_prog = t.get_pserver_program(current_endpoint)
+        startup_prog = t.get_startup_program(current_endpoint, pserver_prog)
+
+        place = fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        exe.run(startup_prog)
+        exe.run(pserver_prog)
+
+    def _wait_ps_ready(self, pid):
+        retry_times = 20
+        while True:
+            assert retry_times >= 0, "wait ps ready failed"
+            time.sleep(3)
+            print("waiting ps ready: ", pid)
+            try:
+                # the listen_and_serv_op touches a file containing the listen port
+                # in the /tmp directory once it is ready to process RPC calls.
+                os.stat("/tmp/paddle.%d.port" % pid)
+                return
+            except os.error:
+                retry_times -= 1
+
+    def run_trainer(self, place, endpoints, trainer_id, trainers, is_dist=True):
+        test_program, avg_cost, train_reader, test_reader, batch_acc, predict = get_model(
+            batch_size=20)
+        if is_dist:
+            t = get_transpiler(trainer_id,
+                               fluid.default_main_program(), endpoints,
+                               trainers)
+            trainer_prog = t.get_trainer_program()
+        else:
+            trainer_prog = fluid.default_main_program()
+
+        startup_exe = fluid.Executor(place)
+        startup_exe.run(fluid.default_startup_program())
+
+        strategy = fluid.ExecutionStrategy()
+        strategy.num_threads = 1
+        strategy.allow_op_delay = False
+        exe = fluid.ParallelExecutor(
+            True,
+            loss_name=avg_cost.name,
+            exec_strategy=strategy,
+            num_trainers=trainers,
+            trainer_id=trainer_id)
+
+        feed_var_list = [
+            var for var in trainer_prog.global_block().vars.itervalues()
+            if var.is_data
+        ]
+
+        feeder = fluid.DataFeeder(feed_var_list, place)
+        reader_generator = train_reader()
+        first_loss, = exe.run(fetch_list=[avg_cost.name])
+        print(first_loss)
+        for i in xrange(5):
+            loss, = exe.run(fetch_list=[avg_cost.name])
+        last_loss, = exe.run(fetch_list=[avg_cost.name])
+        print(last_loss)
+
+
+def main(role="pserver",
+         endpoints="127.0.0.1:9123",
+         trainer_id=0,
+         current_endpoint="127.0.0.1:9123",
+         trainers=1,
+         is_dist=True):
+    model = DistSeResneXt2x2()
+    if role == "pserver":
+        model.run_pserver(endpoints, trainers, current_endpoint, trainer_id)
+    else:
+        p = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
+        model.run_trainer(p, endpoints, trainer_id, trainers, is_dist)
+
+
+if __name__ == "__main__":
+    if len(sys.argv) != 7:
+        print("Usage: python dist_se_resnext.py [pserver/trainer] [endpoints] "
+              "[trainer_id] [current_endpoint] [trainers] [is_dist]")
+        sys.exit(1)  # abort: indexing sys.argv below would raise IndexError
+    role = sys.argv[1]
+    endpoints = sys.argv[2]
+    trainer_id = int(sys.argv[3])
+    current_endpoint = sys.argv[4]
+    trainers = int(sys.argv[5])
+    is_dist = sys.argv[6] == "TRUE"  # only the literal "TRUE" enables dist mode
+    main(
+        role=role,
+        endpoints=endpoints,
+        trainer_id=trainer_id,
+        current_endpoint=current_endpoint,
+        trainers=trainers,
+        is_dist=is_dist)
diff --git a/python/paddle/fluid/tests/unittests/test_auc_op.py b/python/paddle/fluid/tests/unittests/test_auc_op.py
index 6bd5e2332a99693f5e53e147491aa83c35859548..6580c70ca68c4ba24919f03d071f6f88fb68953c 100644
--- a/python/paddle/fluid/tests/unittests/test_auc_op.py
+++ b/python/paddle/fluid/tests/unittests/test_auc_op.py
@@ -15,13 +15,13 @@
 import unittest
 import numpy as np
 from op_test import OpTest
+from paddle.fluid import metrics
 
 
 class TestAucOp(OpTest):
     def setUp(self):
         self.op_type = "auc"
         pred = np.random.random((128, 2)).astype("float32")
-        indices = np.random.randint(0, 2, (128, 2))
         labels = np.random.randint(0, 2, (128, 1))
         num_thresholds = 200
         tp = np.zeros((num_thresholds, )).astype("int64")
@@ -30,8 +30,7 @@ class TestAucOp(OpTest):
         fn = np.zeros((num_thresholds, )).astype("int64")
 
         self.inputs = {
-            'Out': pred,
-            'Indices': indices,
+            'Predict': pred,
             'Label': labels,
             'TP': tp,
             'TN': tn,
@@ -39,57 +38,18 @@ class TestAucOp(OpTest):
             'FN': fn
         }
         self.attrs = {'curve': 'ROC', 'num_thresholds': num_thresholds}
-        # NOTE: sklearn use a different way to generate thresholds
-        #       which will cause the result differs slightly:
-        # from sklearn.metrics import roc_curve, auc
-        # fpr, tpr, thresholds = roc_curve(labels, pred)
-        # auc_value = auc(fpr, tpr)
-        # we caculate AUC again using numpy for testing
-        kepsilon = 1e-7  # to account for floating point imprecisions
-        thresholds = [(i + 1) * 1.0 / (num_thresholds - 1)
-                      for i in range(num_thresholds - 2)]
-        thresholds = [0.0 - kepsilon] + thresholds + [1.0 + kepsilon]
 
-        # caculate TP, FN, TN, FP count
-        tp_list = np.ndarray((num_thresholds, ))
-        fn_list = np.ndarray((num_thresholds, ))
-        tn_list = np.ndarray((num_thresholds, ))
-        fp_list = np.ndarray((num_thresholds, ))
-        for idx_thresh, thresh in enumerate(thresholds):
-            tp, fn, tn, fp = 0, 0, 0, 0
-            for i, lbl in enumerate(labels):
-                if lbl:
-                    if pred[i, 0] >= thresh:
-                        tp += 1
-                    else:
-                        fn += 1
-                else:
-                    if pred[i, 0] >= thresh:
-                        fp += 1
-                    else:
-                        tn += 1
-            tp_list[idx_thresh] = tp
-            fn_list[idx_thresh] = fn
-            tn_list[idx_thresh] = tn
-            fp_list[idx_thresh] = fp
-
-        epsilon = 1e-6
-        tpr = (tp_list.astype("float32") + epsilon) / (
-            tp_list + fn_list + epsilon)
-        fpr = fp_list.astype("float32") / (fp_list + tn_list + epsilon)
-        rec = (tp_list.astype("float32") + epsilon) / (
-            tp_list + fp_list + epsilon)
-
-        x = fpr[:num_thresholds - 1] - fpr[1:]
-        y = (tpr[:num_thresholds - 1] + tpr[1:]) / 2.0
-        auc_value = np.sum(x * y)
+        python_auc = metrics.Auc(name="auc",
+                                 curve='ROC',
+                                 num_thresholds=num_thresholds)
+        python_auc.update(pred, labels)
 
         self.outputs = {
-            'AUC': auc_value,
-            'TPOut': tp_list,
-            'FNOut': fn_list,
-            'TNOut': tn_list,
-            'FPOut': fp_list
+            'AUC': python_auc.eval(),
+            'TPOut': python_auc.tp_list,
+            'FNOut': python_auc.fn_list,
+            'TNOut': python_auc.tn_list,
+            'FPOut': python_auc.fp_list
         }
 
     def test_check_output(self):
diff --git a/python/paddle/fluid/tests/unittests/test_data_balance.py b/python/paddle/fluid/tests/unittests/test_data_balance.py
index 6d810920d55ccf069ff408c553069e8f5e590271..aa09b0ea445adccae3f741b53850f8182f3270cc 100644
--- a/python/paddle/fluid/tests/unittests/test_data_balance.py
+++ b/python/paddle/fluid/tests/unittests/test_data_balance.py
@@ -142,8 +142,7 @@ class TestDataBalance(unittest.TestCase):
                 filenames=[self.lod_data_file_name],
                 shapes=[[-1, 3], [-1, 1]],
                 lod_levels=[1, 0],
-                dtypes=['float32', 'int32'],
-                thread_num=1)
+                dtypes=['float32', 'int32'])
             ins, label = fluid.layers.read_file(data_reader)
 
             place = fluid.CUDAPlace(0) if self.use_cuda else fluid.CPUPlace()
@@ -156,7 +155,7 @@ class TestDataBalance(unittest.TestCase):
                 main_program=main_prog,
                 build_strategy=build_strategy)
 
-            if (parallel_exe.device_count > self.batch_size):
+            if parallel_exe.device_count > self.batch_size:
                 print("WARNING: Unittest TestDataBalance skipped. \
                     For the result is not correct when device count \
                     is larger than batch size.")
@@ -190,3 +189,7 @@ class TestDataBalance(unittest.TestCase):
     def test_all(self):
         self.main()
         self.main_lod()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py
new file mode 100644
index 0000000000000000000000000000000000000000..e3e7036f08cb88087ae45fe7d7c7565c102dab8a
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py
@@ -0,0 +1,157 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import argparse
+import time
+import math
+
+import unittest
+import os
+import signal
+import subprocess
+
+
+class TestDistSeResneXt2x2(unittest.TestCase):
+    """Compare a local dist_se_resnext.py run with a 2-trainer/2-pserver run.
+
+    Every process is started as ``python dist_se_resnext.py ...`` through
+    ``subprocess``.
+    """
+
+    def setUp(self):
+        self._trainers = 2
+        self._pservers = 2
+        self._ps_endpoints = "127.0.0.1:9123,127.0.0.1:9124"
+        self._python_interp = "python"
+
+    def start_pserver(self):
+        """Start one pserver process per endpoint and return both handles."""
+        ps0_ep, ps1_ep = self._ps_endpoints.split(",")
+        ps0_cmd = "%s dist_se_resnext.py pserver %s 0 %s %d TRUE" % \
+            (self._python_interp, self._ps_endpoints, ps0_ep, self._trainers)
+        ps1_cmd = "%s dist_se_resnext.py pserver %s 0 %s %d TRUE" % \
+            (self._python_interp, self._ps_endpoints, ps1_ep, self._trainers)
+
+        # NOTE(review): both pipes are created but never read in this file;
+        # a pserver that logs heavily could block once a pipe buffer fills.
+        ps0_proc = subprocess.Popen(
+            ps0_cmd.split(" "), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        ps1_proc = subprocess.Popen(
+            ps1_cmd.split(" "), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        return ps0_proc, ps1_proc
+
+    def _wait_ps_ready(self, pid):
+        """Poll every 3 seconds until ``/tmp/paddle.<pid>.port`` exists.
+
+        Raises AssertionError once the retry budget is exhausted.
+        """
+        retry_times = 20
+        while True:
+            assert retry_times >= 0, "wait ps ready failed"
+            time.sleep(3)
+            try:
+                # listen_and_serv_op creates a file holding its listen port
+                # under /tmp once it is ready to process RPC calls.
+                os.stat("/tmp/paddle.%d.port" % pid)
+                return
+            except os.error:
+                retry_times -= 1
+
+    def non_test_with_place(self):
+        """Run local and distributed training and compare first/last losses.
+
+        The method name does not start with ``test``, so unittest's default
+        loader does not collect it.
+        """
+        # *ATTENTION* THIS TEST NEEDS AT LEAST 2 GPUS TO RUN
+        # Forward only variables that are set: a None value in the env
+        # mapping makes subprocess.Popen raise TypeError.
+        required_envs = {"FLAGS_fraction_of_gpu_memory_to_use": "0.15"}
+        for name in ("PATH", "PYTHONPATH", "LD_LIBRARY_PATH"):
+            value = os.getenv(name)
+            if value is not None:
+                required_envs[name] = value
+
+        # Run local to get a base line
+        env_local = {"CUDA_VISIBLE_DEVICES": "0"}
+        env_local.update(required_envs)
+        # NOTE(review): "FLASE" looks like a typo for "FALSE"; kept as is
+        # because dist_se_resnext.py's argument parsing is not visible here.
+        local_cmd = "%s dist_se_resnext.py trainer %s 0 %s %d FLASE" % \
+            (self._python_interp, "127.0.0.1:1234", "127.0.0.1:1234", 1)
+        local_proc = subprocess.Popen(
+            local_cmd.split(" "), stdout=subprocess.PIPE, env=env_local)
+        # communicate() drains stdout while waiting, so a full pipe buffer
+        # cannot stall the child the way wait() followed by read() can.
+        local_ret, _ = local_proc.communicate()
+
+        # Run dist train to compare with local results
+        ps0, ps1 = self.start_pserver()
+        FNULL = open(os.devnull, 'w')
+        try:
+            self._wait_ps_ready(ps0.pid)
+            self._wait_ps_ready(ps1.pid)
+
+            ps0_ep, ps1_ep = self._ps_endpoints.split(",")
+            tr0_cmd = "%s dist_se_resnext.py trainer %s 0 %s %d TRUE" % (
+                self._python_interp, self._ps_endpoints, ps0_ep,
+                self._trainers)
+            tr1_cmd = "%s dist_se_resnext.py trainer %s 1 %s %d TRUE" % (
+                self._python_interp, self._ps_endpoints, ps1_ep,
+                self._trainers)
+
+            env0 = {"CUDA_VISIBLE_DEVICES": "0"}
+            env1 = {"CUDA_VISIBLE_DEVICES": "1"}
+            env0.update(required_envs)
+            env1.update(required_envs)
+
+            tr0_proc = subprocess.Popen(
+                tr0_cmd.split(" "),
+                stdout=subprocess.PIPE,
+                stderr=FNULL,
+                env=env0)
+            tr1_proc = subprocess.Popen(
+                tr1_cmd.split(" "),
+                stdout=subprocess.PIPE,
+                stderr=FNULL,
+                env=env1)
+
+            loss_data0, _ = tr0_proc.communicate()
+            tr1_proc.communicate()
+
+            # NOTE: eval() parses text printed by child processes started
+            # above; do not reuse this on untrusted input.
+            lines = loss_data0.split("\n")
+            dist_first_loss = eval(lines[0].replace(" ", ","))[0]
+            dist_last_loss = eval(lines[1].replace(" ", ","))[0]
+
+            local_lines = local_ret.split("\n")
+            local_first_loss = eval(local_lines[0])[0]
+            local_last_loss = eval(local_lines[1])[0]
+
+            self.assertAlmostEqual(local_first_loss, dist_first_loss)
+            self.assertAlmostEqual(local_last_loss, dist_last_loss)
+        finally:
+            # Runs even when a step above raises, so the pservers and the
+            # os.devnull handle are not leaked.
+            # FIXME: ensure the server process is killed
+            # replace with ps0.terminate()
+            os.kill(ps0.pid, signal.SIGKILL)
+            os.kill(ps1.pid, signal.SIGKILL)
+            FNULL.close()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py b/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py
index 49ea18c1691d68e7d1c4093f95f847b72062f0af..e628195e7265ec564bd64a212c4a35fdff495063 100644
--- a/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py
+++ b/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py
@@ -102,7 +102,7 @@ class TestLearningRateDecay(unittest.TestCase):
 
         exe.run(startup_prog)
 
-        # fluid.memory_optimize(main_prog)
+        fluid.memory_optimize(main_prog)
 
         for step in range(10):
             lr_val, = exe.run(main_prog, feed={}, fetch_list=[decayed_lr])
diff --git a/python/paddle/fluid/tests/unittests/test_momentum_op.py b/python/paddle/fluid/tests/unittests/test_momentum_op.py
index aaea9c1809213c5707e8540eebbdd6f269836fdc..c75d3bd276a5b494090c1aa1fea0bb4f2c067173 100644
--- a/python/paddle/fluid/tests/unittests/test_momentum_op.py
+++ b/python/paddle/fluid/tests/unittests/test_momentum_op.py
@@ -39,7 +39,7 @@ class TestMomentumOp1(OpTest):
 
         velocity_out = mu * velocity + grad
         if use_nesterov:
-            param_out = param - grad * learning_rate + \
+            param_out = param - grad * learning_rate - \
                         velocity_out * mu * learning_rate
         else:
             param_out = param - learning_rate * velocity_out
@@ -75,7 +75,7 @@ class TestMomentumOp2(OpTest):
 
         velocity_out = mu * velocity + grad
         if use_nesterov:
-            param_out = param - grad * learning_rate + \
+            param_out = param - grad * learning_rate - \
                         velocity_out * mu * learning_rate
         else:
             param_out = param - learning_rate * velocity_out
diff --git a/python/paddle/fluid/tests/unittests/test_multi_file_reader.py b/python/paddle/fluid/tests/unittests/test_multi_file_reader.py
index dbd510e64ffdd6f3b78b22bb0d37d9a7ba3fd9b5..cb0ea96ff69ce32b0bb1b49f0318c353aa08d388 100644
--- a/python/paddle/fluid/tests/unittests/test_multi_file_reader.py
+++ b/python/paddle/fluid/tests/unittests/test_multi_file_reader.py
@@ -39,17 +39,17 @@ class TestMultipleReader(unittest.TestCase):
         copyfile('./mnist_0.recordio', './mnist_1.recordio')
         copyfile('./mnist_0.recordio', './mnist_2.recordio')
 
-    def main(self, thread_num):
+    def main(self, is_test=False):
         file_list = [
             './mnist_0.recordio', './mnist_1.recordio', './mnist_2.recordio'
         ]
         with fluid.program_guard(fluid.Program(), fluid.Program()):
             data_files = fluid.layers.open_files(
                 filenames=file_list,
-                thread_num=thread_num,
                 shapes=[(-1, 784), (-1, 1)],
                 lod_levels=[0, 0],
-                dtypes=['float32', 'int64'])
+                dtypes=['float32', 'int64'],
+                is_test=is_test)
             img, label = fluid.layers.read_file(data_files)
 
             if fluid.core.is_compiled_with_cuda():
@@ -71,6 +71,9 @@ class TestMultipleReader(unittest.TestCase):
             self.assertEqual(batch_count, self.num_batch * 3)
 
     def test_main(self):
-        self.main(thread_num=3)  # thread number equals to file number
-        self.main(thread_num=10)  # thread number is larger than file number
-        self.main(thread_num=2)  # thread number is less than file number
+        self.main(is_test=False)
+        self.main(is_test=True)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
index 4580ab85e235e9f73d138058ac6a9aa3beb49a54..b21e16439a5070e5f6d763e1617d4cfffe8bd618 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
@@ -33,9 +33,7 @@ def simple_fc_net(use_feed):
             filenames=[MNIST_RECORDIO_FILE],
             shapes=[[-1, 784], [-1, 1]],
             lod_levels=[0, 0],
-            dtypes=['float32', 'int64'],
-            thread_num=1,
-            for_parallel=True)
+            dtypes=['float32', 'int64'])
         reader = fluid.layers.io.double_buffer(reader)
         img, label = fluid.layers.read_file(reader)
     hidden = img
@@ -61,9 +59,7 @@ def fc_with_batchnorm(use_feed):
             filenames=[MNIST_RECORDIO_FILE],
             shapes=[[-1, 784], [-1, 1]],
             lod_levels=[0, 0],
-            dtypes=['float32', 'int64'],
-            thread_num=1,
-            for_parallel=True)
+            dtypes=['float32', 'int64'])
         reader = fluid.layers.io.double_buffer(reader)
         img, label = fluid.layers.read_file(reader)
 
@@ -102,6 +98,16 @@ class TestMNIST(TestParallelExecutorBase):
             fluid.recordio_writer.convert_reader_to_recordio_file(
                 MNIST_RECORDIO_FILE, reader, feeder)
 
+    def _init_data(self, random=True):
+        """Build the (img, label) feed pair; img is random only if asked."""
+        np.random.seed(5)  # reseeded on every call, also when random=False
+        img_shape = [32, 784]
+        img = (np.random.random(size=img_shape).astype(np.float32)
+               if random else np.ones(shape=img_shape, dtype='float32'))
+        label = np.ones(shape=[32, 1], dtype='int64')
+        return img, label
+
+    # simple_fc
     def check_simple_fc_convergence(self, use_cuda, use_reduce=False):
         if use_cuda and not core.is_compiled_with_cuda():
             return
@@ -109,8 +115,8 @@ class TestMNIST(TestParallelExecutorBase):
         self.check_network_convergence(
             simple_fc_net, use_cuda=use_cuda, allow_op_delay=True)
 
-        img = np.zeros(shape=[32, 784], dtype='float32')
-        label = np.ones(shape=[32, 1], dtype='int64')
+        img, label = self._init_data()
+
         self.check_network_convergence(
             simple_fc_net,
             feed_dict={"image": img,
@@ -118,6 +124,37 @@ class TestMNIST(TestParallelExecutorBase):
             use_cuda=use_cuda,
             use_reduce=use_reduce)
 
+    def check_simple_fc_convergence_with_Reduce(self, use_cuda):
+        """Run simple_fc_net with use_reduce=True, then compare the fed-data
+        losses of use_reduce=False and use_reduce=True runs (delta 1e-6)."""
+        if use_cuda and not core.is_compiled_with_cuda():
+            return
+        self.check_network_convergence(
+            simple_fc_net, use_cuda=use_cuda, use_reduce=True)
+        self.check_network_convergence(
+            simple_fc_net,
+            use_cuda=use_cuda,
+            allow_op_delay=True,
+            use_reduce=True)
+
+        img, label = self._init_data()
+
+        all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence(
+            simple_fc_net,
+            feed_dict={"image": img, "label": label},
+            use_cuda=use_cuda,
+            use_reduce=False)
+        reduce_first_loss, reduce_last_loss = self.check_network_convergence(
+            simple_fc_net,
+            feed_dict={"image": img, "label": label},
+            use_cuda=use_cuda,
+            use_reduce=True)
+
+        for expected, actual in zip(all_reduce_first_loss, reduce_first_loss):
+            self.assertAlmostEqual(expected, actual, delta=1e-6)
+        for expected, actual in zip(all_reduce_last_loss, reduce_last_loss):
+            self.assertAlmostEqual(expected, actual, delta=1e-6)
+
     def test_simple_fc(self):
         # use_cuda
         self.check_simple_fc_convergence(True)
@@ -125,14 +162,15 @@ class TestMNIST(TestParallelExecutorBase):
 
     def test_simple_fc_with_new_strategy(self):
         # use_cuda, use_reduce
-        self.check_simple_fc_convergence(True, True)
-        self.check_simple_fc_convergence(False, True)
+        self.check_simple_fc_convergence_with_Reduce(True)
+        self.check_simple_fc_convergence_with_Reduce(False)
 
-    def check_simple_fc_parallel_accuracy(self, use_cuda, use_reduce=False):
+    def check_simple_fc_parallel_accuracy(self, use_cuda):
         if use_cuda and not core.is_compiled_with_cuda():
             return
-        img = np.zeros(shape=[32, 784], dtype='float32')
-        label = np.ones(shape=[32, 1], dtype='int64')
+
+        img, label = self._init_data(random=False)
+
         single_first_loss, single_last_loss = self.check_network_convergence(
             method=simple_fc_net,
             seed=1000,
@@ -146,8 +184,7 @@ class TestMNIST(TestParallelExecutorBase):
             feed_dict={"image": img,
                        "label": label},
             use_cuda=use_cuda,
-            use_parallel_executor=True,
-            use_reduce=use_reduce)
+            use_parallel_executor=True)
 
         for p_f in parallel_first_loss:
             self.assertAlmostEquals(p_f, single_first_loss[0], delta=1e-6)
@@ -158,32 +195,53 @@ class TestMNIST(TestParallelExecutorBase):
         self.check_simple_fc_parallel_accuracy(True)
         self.check_simple_fc_parallel_accuracy(False)
 
-    def test_simple_fc_parallel_accuracy_with_new_strategy(self):
-        # use_cuda, use_reduce
-        self.check_simple_fc_parallel_accuracy(True, True)
-        self.check_simple_fc_parallel_accuracy(False, True)
-
-    def check_batchnorm_fc_convergence(self, use_cuda, use_reduce=False):
+    def check_batchnorm_fc_convergence(self, use_cuda):
         if use_cuda and not core.is_compiled_with_cuda():
             return
+
         self.check_network_convergence(fc_with_batchnorm, use_cuda=use_cuda)
-        img = np.zeros(shape=[32, 784], dtype='float32')
-        label = np.ones(shape=[32, 1], dtype='int64')
+
+        img, label = self._init_data()
+
+        self.check_network_convergence(
+            fc_with_batchnorm,
+            feed_dict={"image": img,
+                       "label": label},
+            use_cuda=use_cuda)
+
+    def check_batchnorm_fc_convergence_use_reduce(self, use_cuda):
+        if use_cuda and not core.is_compiled_with_cuda():
+            return
         self.check_network_convergence(
+            fc_with_batchnorm, use_cuda=use_cuda, use_reduce=True)
+
+        img, label = self._init_data()
+
+        all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence(
             fc_with_batchnorm,
             feed_dict={"image": img,
                        "label": label},
             use_cuda=use_cuda,
-            use_reduce=use_reduce)
+            use_reduce=False)
+        reduce_first_loss, reduce_last_loss = self.check_network_convergence(
+            fc_with_batchnorm,
+            feed_dict={"image": img,
+                       "label": label},
+            use_cuda=use_cuda,
+            use_reduce=True)
+
+        for loss in zip(all_reduce_first_loss, reduce_first_loss):
+            self.assertAlmostEquals(loss[0], loss[1], delta=1e-6)
+        for loss in zip(all_reduce_last_loss, reduce_last_loss):
+            self.assertAlmostEquals(loss[0], loss[1], delta=1e-4)
 
     def test_batchnorm_fc(self):
         self.check_batchnorm_fc_convergence(True)
         self.check_batchnorm_fc_convergence(False)
 
     def test_batchnorm_fc_with_new_strategy(self):
-        # use_cuda, use_reduce
-        self.check_batchnorm_fc_convergence(True, True)
-        self.check_batchnorm_fc_convergence(False, True)
+        self.check_batchnorm_fc_convergence_use_reduce(True)
+        self.check_batchnorm_fc_convergence_use_reduce(False)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_py_reader_push_pop.py b/python/paddle/fluid/tests/unittests/test_py_reader_push_pop.py
index 05715464848d835684dd3cf0e99e5d4dd482e0b6..91b1fd2af7d8aaf85d17965f8b02c35ee3990291 100644
--- a/python/paddle/fluid/tests/unittests/test_py_reader_push_pop.py
+++ b/python/paddle/fluid/tests/unittests/test_py_reader_push_pop.py
@@ -45,12 +45,12 @@ class TestPyReader(unittest.TestCase):
             ) else fluid.CPUPlace()
             executor = fluid.Executor(place)
 
-            data_file, feed_queue = fluid.layers.py_reader(
+            data_file = fluid.layers.py_reader(
                 capacity=self.capacity,
                 dtypes=self.dtypes,
                 lod_levels=self.lod_levels,
                 shapes=self.shapes)
-
+            feed_queue = data_file.queue
             read_out_data = fluid.layers.read_file(data_file)
             self.inputs = []
 
diff --git a/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py b/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py
index 9a5b69eea46e74deeba87aefae4afac84c7745f1..9a379bdbaa7e278879117a8cdc2dddb335a10ca1 100644
--- a/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py
+++ b/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py
@@ -52,11 +52,13 @@ def simple_fc_net(in_size,
                   batch_size,
                   queue_capacity,
                   use_double_buffer=False):
-    reader, feed_queue = fluid.layers.py_reader(
+    reader = fluid.layers.py_reader(
         capacity=queue_capacity,
         shapes=[[-1, in_size], [-1, 1]],
         lod_levels=[0, 0],
-        dtypes=['float32', 'int64'])
+        dtypes=['float32', 'int64'],
+        use_double_buffer=False)
+    feed_queue = reader.queue
     reader = fluid.layers.batch(reader, batch_size=batch_size)
     if use_double_buffer:
         reader = fluid.layers.double_buffer(reader)