Commit 0a885ac1 authored by Yancey1989

Merge branch 'develop' of github.com:PaddlePaddle/Paddle into parallel_graph_mode

test=develop
...@@ -139,10 +139,12 @@ endfunction() ...@@ -139,10 +139,12 @@ endfunction()
message(STATUS "CUDA detected: " ${CUDA_VERSION}) message(STATUS "CUDA detected: " ${CUDA_VERSION})
if (${CUDA_VERSION} LESS 7.0) if (${CUDA_VERSION} LESS 7.0)
set(paddle_known_gpu_archs ${paddle_known_gpu_archs}) set(paddle_known_gpu_archs ${paddle_known_gpu_archs})
add_definitions("-DPADDLE_CUDA_BINVER=\"60\"")
elseif (${CUDA_VERSION} LESS 8.0) # CUDA 7.x elseif (${CUDA_VERSION} LESS 8.0) # CUDA 7.x
set(paddle_known_gpu_archs ${paddle_known_gpu_archs7}) set(paddle_known_gpu_archs ${paddle_known_gpu_archs7})
list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED") list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED")
list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__") list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__")
add_definitions("-DPADDLE_CUDA_BINVER=\"70\"")
elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x
set(paddle_known_gpu_archs ${paddle_known_gpu_archs8}) set(paddle_known_gpu_archs ${paddle_known_gpu_archs8})
list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED") list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED")
...@@ -150,6 +152,7 @@ elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x ...@@ -150,6 +152,7 @@ elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x
# CUDA 8 may complain that sm_20 is no longer supported. Suppress the # CUDA 8 may complain that sm_20 is no longer supported. Suppress the
# warning for now. # warning for now.
list(APPEND CUDA_NVCC_FLAGS "-Wno-deprecated-gpu-targets") list(APPEND CUDA_NVCC_FLAGS "-Wno-deprecated-gpu-targets")
add_definitions("-DPADDLE_CUDA_BINVER=\"80\"")
endif() endif()
include_directories(${CUDA_INCLUDE_DIRS}) include_directories(${CUDA_INCLUDE_DIRS})
......
...@@ -89,6 +89,7 @@ if(CUDNN_FOUND) ...@@ -89,6 +89,7 @@ if(CUDNN_FOUND)
if(NOT CUDNN_MAJOR_VERSION) if(NOT CUDNN_MAJOR_VERSION)
set(CUDNN_VERSION "???") set(CUDNN_VERSION "???")
else() else()
add_definitions("-DPADDLE_CUDNN_BINVER=\"${CUDNN_MAJOR_VERSION}\"")
math(EXPR CUDNN_VERSION math(EXPR CUDNN_VERSION
"${CUDNN_MAJOR_VERSION} * 1000 + "${CUDNN_MAJOR_VERSION} * 1000 +
${CUDNN_MINOR_VERSION} * 100 + ${CUDNN_PATCHLEVEL_VERSION}") ${CUDNN_MINOR_VERSION} * 100 + ${CUDNN_PATCHLEVEL_VERSION}")
......
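The add_definitions calls above stamp the detected CUDA and cuDNN major versions into every translation unit as the string macros PADDLE_CUDA_BINVER and PADDLE_CUDNN_BINVER. A minimal, hedged C++ sketch of how such compile-time version strings could be surfaced at runtime follows; the BuildVersionBanner helper and the fallback values are illustrative assumptions, not part of this commit.

#include <iostream>
#include <string>

// Fallbacks for builds where CMake did not pass the definitions (assumed values).
#ifndef PADDLE_CUDA_BINVER
#define PADDLE_CUDA_BINVER "none"
#endif
#ifndef PADDLE_CUDNN_BINVER
#define PADDLE_CUDNN_BINVER "none"
#endif

// Hypothetical helper: report the versions this binary was compiled against.
std::string BuildVersionBanner() {
  return std::string("CUDA binver: ") + PADDLE_CUDA_BINVER +
         ", cuDNN binver: " + PADDLE_CUDNN_BINVER;
}

int main() {
  std::cout << BuildVersionBanner() << std::endl;
  return 0;
}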
...@@ -32,4 +32,4 @@ endif() ...@@ -32,4 +32,4 @@ endif()
add_dependencies(cub extern_cub) add_dependencies(cub extern_cub)
LIST(APPEND externl_project_dependencies cub) LIST(APPEND external_project_dependencies cub)
...@@ -28,4 +28,4 @@ endif() ...@@ -28,4 +28,4 @@ endif()
add_dependencies(dlpack extern_dlpack) add_dependencies(dlpack extern_dlpack)
LIST(APPEND externl_project_dependencies dlpack) LIST(APPEND external_project_dependencies dlpack)
...@@ -37,13 +37,12 @@ INCLUDE(GNUInstallDirs) ...@@ -37,13 +37,12 @@ INCLUDE(GNUInstallDirs)
INCLUDE(ExternalProject) INCLUDE(ExternalProject)
SET(NGRAPH_PROJECT "extern_ngraph") SET(NGRAPH_PROJECT "extern_ngraph")
SET(NGRAPH_VERSION "0.9") SET(NGRAPH_GIT_TAG "v0.10.1")
SET(NGRAPH_GIT_TAG "f9fd9d4cc318dc59dd4b68448e7fbb5f67a28bd0")
SET(NGRAPH_SOURCES_DIR ${THIRD_PARTY_PATH}/ngraph) SET(NGRAPH_SOURCES_DIR ${THIRD_PARTY_PATH}/ngraph)
SET(NGRAPH_INSTALL_DIR ${THIRD_PARTY_PATH}/install/ngraph) SET(NGRAPH_INSTALL_DIR ${THIRD_PARTY_PATH}/install/ngraph)
SET(NGRAPH_INC_DIR ${NGRAPH_INSTALL_DIR}/include) SET(NGRAPH_INC_DIR ${NGRAPH_INSTALL_DIR}/include)
SET(NGRAPH_LIB_DIR ${NGRAPH_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}) SET(NGRAPH_LIB_DIR ${NGRAPH_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR})
SET(NGRAPH_SHARED_LIB_NAME libngraph.so.${NGRAPH_VERSION}) SET(NGRAPH_SHARED_LIB_NAME libngraph.so)
SET(NGRAPH_CPU_LIB_NAME libcpu_backend.so) SET(NGRAPH_CPU_LIB_NAME libcpu_backend.so)
SET(NGRAPH_TBB_LIB_NAME libtbb.so.2) SET(NGRAPH_TBB_LIB_NAME libtbb.so.2)
SET(NGRAPH_GIT_REPO "https://github.com/NervanaSystems/ngraph.git") SET(NGRAPH_GIT_REPO "https://github.com/NervanaSystems/ngraph.git")
......
...@@ -110,7 +110,7 @@ function(op_library TARGET) ...@@ -110,7 +110,7 @@ function(op_library TARGET)
# Define operators that don't need pybind here. # Define operators that don't need pybind here.
foreach(manual_pybind_op "compare_op" "logical_op" "nccl_op" foreach(manual_pybind_op "compare_op" "logical_op" "nccl_op"
"tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op" "tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op"
"fusion_transpose_flatten_concat_op") "fusion_transpose_flatten_concat_op" "fusion_conv_inception_op")
if ("${TARGET}" STREQUAL "${manual_pybind_op}") if ("${TARGET}" STREQUAL "${manual_pybind_op}")
set(pybind_flag 1) set(pybind_flag 1)
endif() endif()
......
...@@ -27,9 +27,10 @@ add_subdirectory(details) ...@@ -27,9 +27,10 @@ add_subdirectory(details)
proto_library(framework_proto SRCS framework.proto) proto_library(framework_proto SRCS framework.proto)
proto_library(async_executor_proto SRCS data_feed.proto) proto_library(async_executor_proto SRCS data_feed.proto)
cc_library(ddim SRCS ddim.cc DEPS eigen3 boost) cc_library(ddim SRCS ddim.cc DEPS eigen3 boost enforce)
cc_test(ddim_test SRCS ddim_test.cc DEPS ddim) cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
nv_test(dim_test SRCS dim_test.cu DEPS ddim) nv_test(dim_test SRCS dim_test.cu DEPS ddim)
cc_test(unroll_array_ops_test SRCS unroll_array_ops_test.cc)
cc_library(data_type SRCS data_type.cc DEPS framework_proto ddim device_context) cc_library(data_type SRCS data_type.cc DEPS framework_proto ddim device_context)
cc_test(data_type_test SRCS data_type_test.cc DEPS data_type place tensor) cc_test(data_type_test SRCS data_type_test.cc DEPS data_type place tensor)
if(WITH_GPU) if(WITH_GPU)
...@@ -77,7 +78,7 @@ if (WITH_GPU) ...@@ -77,7 +78,7 @@ if (WITH_GPU)
endif() endif()
cc_test(var_type_traits_test SRCS var_type_traits_test.cc DEPS var_type_traits) cc_test(var_type_traits_test SRCS var_type_traits_test.cc DEPS var_type_traits)
cc_library(scope SRCS scope.cc DEPS glog threadpool var_type_traits) cc_library(scope SRCS scope.cc DEPS glog threadpool xxhash var_type_traits)
cc_library(scope_pool SRCS scope_pool.cc DEPS scope) cc_library(scope_pool SRCS scope_pool.cc DEPS scope)
cc_test(scope_test SRCS scope_test.cc DEPS scope) cc_test(scope_test SRCS scope_test.cc DEPS scope)
cc_test(variable_test SRCS variable_test.cc DEPS tensor var_type_traits) cc_test(variable_test SRCS variable_test.cc DEPS tensor var_type_traits)
...@@ -129,11 +130,9 @@ cc_test(version_test SRCS version_test.cc DEPS version) ...@@ -129,11 +130,9 @@ cc_test(version_test SRCS version_test.cc DEPS version)
cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version) cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version)
if(WITH_NGRAPH) if(WITH_NGRAPH)
if(NOT WIN32)
cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto ngraph) cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto ngraph)
cc_library(ngraph_operator SRCS ngraph_operator.cc DEPS ngraph_bridge operator op_info device_context tensor scope glog cc_library(ngraph_operator SRCS ngraph_operator.cc DEPS ngraph_bridge operator op_info device_context tensor scope glog
shape_inference data_transform lod_tensor profiler ngraph) shape_inference data_transform lod_tensor profiler)
endif(NOT WIN32)
endif(WITH_NGRAPH) endif(WITH_NGRAPH)
cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc) cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc)
...@@ -175,11 +174,7 @@ if(WITH_DISTRIBUTE) ...@@ -175,11 +174,7 @@ if(WITH_DISTRIBUTE)
else() else()
if(WITH_NGRAPH) if(WITH_NGRAPH)
if(NOT WIN32) cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ngraph_operator variable_helper)
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ngraph ngraph_operator variable_helper)
else(NOT WIN32)
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper)
endif(NOT WIN32)
else(WITH_NGRAPH) else(WITH_NGRAPH)
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper) cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper)
endif(WITH_NGRAPH) endif(WITH_NGRAPH)
...@@ -194,9 +189,9 @@ cc_library(parallel_executor SRCS parallel_executor.cc DEPS ...@@ -194,9 +189,9 @@ cc_library(parallel_executor SRCS parallel_executor.cc DEPS
fast_threaded_ssa_graph_executor variable_helper) fast_threaded_ssa_graph_executor variable_helper)
if(WITH_PSLIB) if(WITH_PSLIB)
cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper pslib_brpc pslib) cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper pslib_brpc pslib timer)
else() else()
cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper) cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper timer)
endif(WITH_PSLIB) endif(WITH_PSLIB)
......
...@@ -15,34 +15,123 @@ ...@@ -15,34 +15,123 @@
#pragma once #pragma once
#include <cstdint> #include <cstdint>
#include "paddle/fluid/platform/hostdevice.h" #include "paddle/fluid/framework/unroll_array_ops.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
template <typename T, size_t N> template <typename T, size_t N>
class Array { class Array {
static_assert(N > 0, "The size of array must be larger than 0");
public: public:
HOSTDEVICE Array() {} static constexpr size_t kSize = N;
HOSTDEVICE inline Array() {}
HOSTDEVICE explicit Array(const T &val) { template <typename... Args>
for (size_t i = 0; i < N; ++i) data_[i] = val; HOSTDEVICE inline explicit Array(const T &val, Args... args) {
static_assert(N == sizeof...(Args) + 1, "Invalid argument");
UnrollVarArgsAssign<T>::Run(data_, val, args...);
} }
HOSTDEVICE const T *Get() const { return data_; } HOSTDEVICE inline void Fill(const T &val) {
UnrollFillConstant<N>::Run(data_, val);
}
HOSTDEVICE T *GetMutable() { return data_; } HOSTDEVICE inline const T *Get() const { return data_; }
HOSTDEVICE T &operator[](size_t index) { return data_[index]; } HOSTDEVICE inline T *GetMutable() { return data_; }
HOSTDEVICE const T &operator[](size_t index) const { return data_[index]; } HOSTDEVICE inline T &operator[](size_t i) { return *advance(data_, i); }
  // Writing "return data_[i]" would trigger the compilation warning/error
  // "array subscript is above array bound" in the Python 3.5 CI job.
  // That appears to be a GCC false positive when the index is not bounds-checked.
  // For performance, operator[] does not check bounds, following the STL
  // convention. If bounds checking is needed, use at() instead.
HOSTDEVICE inline const T &operator[](size_t i) const {
return *advance(data_, i);
}
HOSTDEVICE inline T &at(size_t i) {
#ifndef __CUDA_ARCH__
PADDLE_ENFORCE_LT(i, N, "Array index out of bounds");
#endif
return (*this)[i];
}
HOSTDEVICE inline const T &at(size_t i) const {
#ifndef __CUDA_ARCH__
PADDLE_ENFORCE_LT(i, N, "Array index out of bounds");
#endif
return (*this)[i];
}
HOSTDEVICE constexpr size_t size() const { return N; } HOSTDEVICE constexpr size_t size() const { return N; }
HOSTDEVICE inline bool operator==(const Array<T, N> &other) const {
return UnrollCompare<N>::Run(data_, other.data_);
}
HOSTDEVICE inline bool operator!=(const Array<T, N> &other) const {
return !(*this == other);
}
private: private:
template <typename U>
HOSTDEVICE static inline U *advance(U *ptr, size_t i) {
return ptr + i;
}
T data_[N]; T data_[N];
}; };
template <typename T>
class Array<T, 0> {
public:
static constexpr size_t kSize = 0;
HOSTDEVICE inline Array() {}
HOSTDEVICE inline void Fill(const T &val) {}
HOSTDEVICE inline constexpr T *Get() const { return nullptr; }
  // Adding constexpr to GetMutable() causes a warning on macOS
HOSTDEVICE inline T *GetMutable() { return nullptr; }
HOSTDEVICE inline T &operator[](size_t) {
#ifdef __CUDA_ARCH__
static T obj();
return obj;
#else
PADDLE_THROW("Array<T, 0> has no element");
#endif
}
HOSTDEVICE inline const T &operator[](size_t) const {
#ifdef __CUDA_ARCH__
static const T obj();
return obj;
#else
PADDLE_THROW("Array<T, 0> has no element");
#endif
}
HOSTDEVICE inline T &at(size_t i) { return (*this)[i]; }
HOSTDEVICE inline const T &at(size_t i) const { return (*this)[i]; }
HOSTDEVICE constexpr size_t size() const { return 0; }
HOSTDEVICE constexpr bool operator==(const Array<T, 0> &other) const {
return true;
}
HOSTDEVICE constexpr bool operator!=(const Array<T, 0> &other) const {
return false;
}
};
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
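The rewritten Array<T, N> replaces the element-wise loops with unrolled helpers (UnrollVarArgsAssign, UnrollFillConstant, UnrollCompare) and adds Fill, bounds-checked at(), and comparison operators, plus a zero-length specialization. A small host-side usage sketch, assuming the header compiles as shown above:

#include "paddle/fluid/framework/array.h"

using paddle::framework::Array;

void ArrayDemo() {
  Array<int64_t, 3> a(1, 2, 3);  // variadic constructor, unrolled assignment
  Array<int64_t, 3> b;
  b.Fill(2);                     // unrolled fill: {2, 2, 2}

  int64_t x = a[1];              // unchecked access, as in the STL
  int64_t y = a.at(2);           // host-side bounds check via PADDLE_ENFORCE_LT
  bool same = (a == b);          // element-wise comparison via UnrollCompare
  (void)x; (void)y; (void)same;
}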
...@@ -304,9 +304,14 @@ void AsyncExecutor::RunFromFile(const ProgramDesc& main_program, ...@@ -304,9 +304,14 @@ void AsyncExecutor::RunFromFile(const ProgramDesc& main_program,
// start executing ops in multiple threads // start executing ops in multiple threads
for (int thidx = 0; thidx < actual_thread_num; ++thidx) { for (int thidx = 0; thidx < actual_thread_num; ++thidx) {
if (debug) {
threads.push_back(std::thread(&ExecutorThreadWorker::TrainFilesWithTimer,
workers[thidx].get()));
} else {
threads.push_back( threads.push_back(
std::thread(&ExecutorThreadWorker::TrainFiles, workers[thidx].get())); std::thread(&ExecutorThreadWorker::TrainFiles, workers[thidx].get()));
} }
}
for (auto& th : threads) { for (auto& th : threads) {
th.join(); th.join();
......
...@@ -18,312 +18,159 @@ limitations under the License. */ ...@@ -18,312 +18,159 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace framework { namespace framework {
/// @cond HIDDEN
template <int i>
Dim<i> make_dim(const int64_t* d) {
return Dim<i>(*d, make_dim<i - 1>(d + 1));
}
template <>
Dim<0> make_dim<0>(const int64_t* d) {
return Dim<0>(*d);
}
void make_ddim(DDim& ddim, const int64_t* dims, int n) {
switch (n) {
case 0:
ddim = make_dim<0>(dims);
break;
case 1:
ddim = make_dim<1>(dims);
break;
case 2:
ddim = make_dim<2>(dims);
break;
case 3:
ddim = make_dim<3>(dims);
break;
case 4:
ddim = make_dim<4>(dims);
break;
case 5:
ddim = make_dim<5>(dims);
break;
case 6:
ddim = make_dim<6>(dims);
break;
case 7:
ddim = make_dim<7>(dims);
break;
case 8:
ddim = make_dim<8>(dims);
break;
case 9:
ddim = make_dim<9>(dims);
break;
default:
PADDLE_THROW("Dynamic dimensions must have between [1, 9] dimensions.");
}
}
/// @endcond
DDim make_ddim(std::initializer_list<int64_t> dims) { DDim make_ddim(std::initializer_list<int64_t> dims) {
DDim result(make_dim(0)); return DDim(dims.begin(), dims.size());
make_ddim(result, dims.begin(), dims.size());
return result;
} }
DDim make_ddim(const std::vector<int64_t>& dims) { DDim make_ddim(const std::vector<int64_t>& dims) {
DDim result(make_dim(0)); return DDim(dims.data(), dims.size());
make_ddim(result, &dims[0], dims.size());
return result;
} }
DDim make_ddim(const std::vector<int>& dims) { DDim make_ddim(const std::vector<int>& dims) {
std::vector<int64_t> res(dims.size()); return DDim(dims.data(), dims.size());
std::transform(dims.begin(), dims.end(), res.begin(),
[](int d) { return static_cast<int64_t>(d); });
return make_ddim(res);
} }
/// @cond HIDDEN struct DDimEqualityVisitor {
// XXX For some reason, putting this in an anonymous namespace causes errors explicit DDimEqualityVisitor(const int64_t* d) : d_(d) {}
class DynamicMutableIndexer : public boost::static_visitor<int64_t&> {
public:
explicit DynamicMutableIndexer(int idx) : idx_(idx) {}
template <int D> template <int D>
int64_t& operator()(Dim<D>& dim) const { inline bool operator()(const Dim<D>& self) const {
return dim[idx_]; return UnrollCompare<D>::Run(self.Get(), d_);
} }
private: const int64_t* d_;
int idx_;
}; };
class DynamicConstIndexer : public boost::static_visitor<int64_t> { bool DDim::operator==(const DDim& d) const {
public: return size() == d.size() &&
explicit DynamicConstIndexer(int idx) : idx_(idx) {} this->apply_visitor(DDimEqualityVisitor(d.Get()));
template <int D>
int64_t operator()(const Dim<D>& dim) const {
return dim[idx_];
}
private:
int idx_;
};
/// @endcond
int64_t& DDim::operator[](int idx) {
return boost::apply_visitor(DynamicMutableIndexer(idx), var);
}
int64_t DDim::operator[](int idx) const {
return boost::apply_visitor(DynamicConstIndexer(idx), var);
} }
int DDim::size() const { return arity(*this); } bool DDim::operator!=(const DDim& d) const { return !(*this == d); }
bool DDim::operator==(DDim d) const { struct DDimPlusVisitor {
if (var.which() != d.getVar().which()) { explicit DDimPlusVisitor(const int64_t* d1, const int64_t* d2)
return false; : d1_(d1), d2_(d2) {}
} else {
std::vector<int64_t> v1 = vectorize(*this);
std::vector<int64_t> v2 = vectorize(d);
for (unsigned int i = 0; i < v1.size(); i++) { template <int D>
if (v1[i] != v2[i]) { inline void operator()(Dim<D>& self) const {
return false; UnrollAdd<D>::Run(d1_, d2_, self.GetMutable());
}
} }
return true; const int64_t* d1_;
} const int64_t* d2_;
} };
bool DDim::operator!=(DDim d) const { return !(*this == d); }
DDim DDim::operator+(DDim d) const {
std::vector<int64_t> v1 = vectorize(*this);
std::vector<int64_t> v2 = vectorize(d);
std::vector<int64_t> v3;
assert(v1.size() == v2.size());
for (unsigned int i = 0; i < v1.size(); i++) {
v3.push_back(v1[i] + v2[i]);
}
return make_ddim(v3); DDim DDim::operator+(const DDim& d) const {
PADDLE_ENFORCE(size() == d.size());
DDim ret;
ret.rank_ = rank_;
ret.apply_visitor(DDimPlusVisitor(Get(), d.Get()));
return ret;
} }
DDim DDim::operator*(DDim d) const { struct DDimMulVisitor {
std::vector<int64_t> v1 = vectorize(*this); explicit DDimMulVisitor(const int64_t* d1, const int64_t* d2)
std::vector<int64_t> v2 = vectorize(d); : d1_(d1), d2_(d2) {}
std::vector<int64_t> v3; template <int D>
inline void operator()(Dim<D>& self) const {
assert(v1.size() == v2.size()); UnrollMul<D>::Run(d1_, d2_, self.GetMutable());
for (unsigned int i = 0; i < v1.size(); i++) {
v3.push_back(v1[i] * v2[i]);
} }
return make_ddim(v3); const int64_t* d1_;
const int64_t* d2_;
};
DDim DDim::operator*(const DDim& d) const {
PADDLE_ENFORCE(size() == d.size());
DDim ret;
ret.rank_ = rank_;
ret.apply_visitor(DDimMulVisitor(Get(), d.Get()));
return ret;
} }
int64_t get(const DDim& ddim, int idx) { return ddim[idx]; } int64_t get(const DDim& ddim, int idx) { return ddim[idx]; }
void set(DDim& ddim, int idx, int value) { ddim[idx] = value; } void set(DDim& ddim, int idx, int value) { ddim[idx] = value; } // NOLINT
/// @cond HIDDEN
struct VectorizeVisitor : public boost::static_visitor<> {
std::vector<int64_t>& vector;
explicit VectorizeVisitor(std::vector<int64_t>& v) : vector(v) {}
template <typename T>
void operator()(const T& t) {
vector.push_back(t.head);
this->operator()(t.tail);
}
void operator()(const Dim<0>& t) {}
};
/// @endcond
std::vector<int64_t> vectorize(const DDim& ddim) { std::vector<int64_t> vectorize(const DDim& ddim) {
std::vector<int64_t> result; std::vector<int64_t> result(DDim::kMaxRank);
VectorizeVisitor visitor(result); dynamic_dim_assign(ddim.Get(), result.data(), ddim.size());
boost::apply_visitor(visitor, ddim); result.resize(ddim.size());
return result; return result;
} }
// NOTE: framework::vectorize converts to type int64_t // NOTE: framework::vectorize converts to type int64_t
// which does not fit cudnn inputs. // which does not fit cudnn inputs.
std::vector<int> vectorize2int(const DDim& ddim) { std::vector<int> vectorize2int(const DDim& ddim) {
std::vector<int64_t> temp = vectorize(ddim); std::vector<int> result(DDim::kMaxRank);
std::vector<int> result(temp.begin(), temp.end()); dynamic_dim_assign(ddim.Get(), result.data(), ddim.size());
result.resize(ddim.size());
return result; return result;
} }
struct ProductVisitor : public boost::static_visitor<int64_t> { struct ProductVisitor {
template <int D> template <int D>
int64_t operator()(const Dim<D>& dim) { inline int64_t operator()(const Dim<D>& dim) {
return product(dim); return product(dim);
} }
}; };
int64_t product(const DDim& ddim) { int64_t product(const DDim& ddim) {
ProductVisitor visitor; return ddim.apply_visitor(ProductVisitor());
return boost::apply_visitor(visitor, ddim);
} }
struct SliceVectorizeVisitor : public boost::static_visitor<> {
std::vector<int64_t>& vector;
int begin;
int end;
SliceVectorizeVisitor(std::vector<int64_t>& v, int b, int e)
: vector(v), begin(b), end(e) {
PADDLE_ENFORCE(begin < end,
"Begin index must be less than end index in ddim slice.");
PADDLE_ENFORCE(begin >= 0,
"Begin index can't be less than zero in ddim slice.");
}
template <int S>
void operator()(const Dim<S>& dim) {
if (begin == 0) {
vector.push_back(dim.head);
} else {
--begin;
}
--end;
if (end > 0) {
this->operator()(dim.tail);
}
}
void operator()(const Dim<0>& dim) {
PADDLE_ENFORCE(end == 0, "End index in ddim slice is out of bound.");
}
};
DDim slice_ddim(const DDim& dim, int begin, int end) { DDim slice_ddim(const DDim& dim, int begin, int end) {
std::vector<int64_t> vec; PADDLE_ENFORCE(begin >= 0 && end <= dim.size(),
vec.reserve(end - begin); "[begin(%d), end(%d)) must be inside [0, %d) in ddim slice.",
SliceVectorizeVisitor visitor(vec, begin, end); begin, end, dim.size());
boost::apply_visitor(visitor, dim); // Constructor of DDim would check whether end - begin is valid
return make_ddim(vec); return DDim(dim.Get() + begin, end - begin);
} }
/// \cond HIDDEN int arity(const DDim& d) { return d.size(); }
struct ArityVisitor : boost::static_visitor<int> {
template <int D>
int operator()(Dim<D>) const {
return D;
}
};
/// \endcond struct DDimPrinter {
int arity(const DDim& d) { return boost::apply_visitor(ArityVisitor(), d); }
/// \cond HIDDEN
struct DDimPrinter : boost::static_visitor<void> {
std::ostream& os; std::ostream& os;
explicit DDimPrinter(std::ostream& os_) : os(os_) {} explicit DDimPrinter(std::ostream& os_) : os(os_) {}
template <typename T> template <int D>
void operator()(const T& t) { void operator()(const Dim<D>& t) {
os << t; os << t;
} }
}; };
/// \endcond
std::ostream& operator<<(std::ostream& os, const DDim& ddim) { std::ostream& operator<<(std::ostream& os, const DDim& ddim) {
DDimPrinter printer(os); ddim.apply_visitor(DDimPrinter(os));
boost::apply_visitor(printer, ddim);
return os; return os;
} }
DDim::DDim(std::initializer_list<int64_t> init_list) {
*this = make_ddim(init_list);
}
DDim flatten_to_2d(const DDim& src, int num_col_dims) { DDim flatten_to_2d(const DDim& src, int num_col_dims) {
int rank = src.size(); return DDim({product(slice_ddim(src, 0, num_col_dims)),
return make_ddim({product(slice_ddim(src, 0, num_col_dims)), product(slice_ddim(src, num_col_dims, src.size()))});
product(slice_ddim(src, num_col_dims, rank))});
} }
DDim flatten_to_1d(const DDim& src) { return make_ddim({product(src)}); } DDim flatten_to_1d(const DDim& src) { return DDim({product(src)}); }
DDim stride(const DDim& ddim) { DDim stride(const DDim& ddim) {
std::vector<int64_t> strides(ddim.size()); DDim strides;
strides.rank_ = ddim.size();
strides[ddim.size() - 1] = 1; strides[ddim.size() - 1] = 1;
for (int i = ddim.size() - 2; i >= 0; --i) { for (int i = ddim.size() - 2; i >= 0; --i) {
strides[i] = strides[i + 1] * ddim[i + 1]; strides[i] = strides[i + 1] * ddim[i + 1];
} }
return framework::make_ddim(strides); return strides;
} }
DDim stride_numel(const framework::DDim& ddim) { DDim stride_numel(const DDim& ddim) {
std::vector<int64_t> strides(ddim.size()); DDim strides;
strides.rank_ = ddim.size();
strides[ddim.size() - 1] = ddim[ddim.size() - 1]; strides[ddim.size() - 1] = ddim[ddim.size() - 1];
for (int i = ddim.size() - 2; i >= 0; --i) { for (int i = ddim.size() - 2; i >= 0; --i) {
strides[i] = strides[i + 1] * ddim[i]; strides[i] = strides[i + 1] * ddim[i];
} }
return framework::make_ddim(strides); return strides;
} }
} // namespace framework } // namespace framework
......
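With boost::variant removed, rank dispatch in ddim.cc now goes through DDim::apply_visitor, which switches on the runtime rank and calls a templated functor with the statically sized Dim<D> (see the PADDLE_VISIT_DDIM macro in the header diff below). A hedged sketch of a custom visitor in the same style; DDimSumVisitor and SumDims are made-up names for illustration:

// Hypothetical visitor that sums all dimension values of a DDim.
struct DDimSumVisitor {
  template <int D>
  inline int64_t operator()(const paddle::framework::Dim<D>& dim) const {
    int64_t sum = 0;
    for (int i = 0; i < D; ++i) sum += dim[i];
    return sum;
  }
};

int64_t SumDims(const paddle::framework::DDim& ddim) {
  // apply_visitor selects the Dim<D> instantiation matching ddim's rank.
  return ddim.apply_visitor(DDimSumVisitor());
}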
...@@ -18,62 +18,145 @@ limitations under the License. */ ...@@ -18,62 +18,145 @@ limitations under the License. */
#include <stdexcept> #include <stdexcept>
#include <vector> #include <vector>
#include "paddle/fluid/framework/dim.h" #include "paddle/fluid/framework/dim.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/variant.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
#define PADDLE_VISIT_DDIM_BASE(rank, callback) \
case (rank): { \
constexpr auto kRank = (rank); \
return (callback); \
}
#define PADDLE_VISIT_DDIM(rank, callback) \
switch (rank) { \
PADDLE_VISIT_DDIM_BASE(0, callback); \
PADDLE_VISIT_DDIM_BASE(1, callback); \
PADDLE_VISIT_DDIM_BASE(2, callback); \
PADDLE_VISIT_DDIM_BASE(3, callback); \
PADDLE_VISIT_DDIM_BASE(4, callback); \
PADDLE_VISIT_DDIM_BASE(5, callback); \
PADDLE_VISIT_DDIM_BASE(6, callback); \
PADDLE_VISIT_DDIM_BASE(7, callback); \
PADDLE_VISIT_DDIM_BASE(8, callback); \
PADDLE_VISIT_DDIM_BASE(9, callback); \
default: \
PADDLE_THROW("Invalid rank %d", rank); \
}
template <typename T1, typename T2>
inline void dynamic_dim_assign(const T1* in, T2* out, int n) {
PADDLE_VISIT_DDIM(n, (static_dim_assign<kRank, T1, T2>(in, out)));
}
/** /**
* \brief A dynamically sized dimension. * \brief A dynamically sized dimension.
* *
* The number of dimensions must be between [1, 9]. * The number of dimensions must be between [1, 9].
*/ */
struct DDim { class DDim {
typedef boost::variant<Dim<0>, Dim<1>, Dim<2>, Dim<3>, Dim<4>, Dim<5>, Dim<6>, public:
Dim<7>, Dim<8>, Dim<9>> constexpr static int kMaxRank = 9;
DDimVar;
DDimVar var; DDim() : rank_(1) { dim_[0] = 0; }
DDim() : var(Dim<1>()) {} DDim(const DDim& ddim) : dim_() { CopyFrom(ddim); }
DDim(const int* d, int n) : rank_(n) {
dynamic_dim_assign(d, dim_.GetMutable(), n);
}
DDim(const int64_t* d, int n) : rank_(n) {
dynamic_dim_assign(d, dim_.GetMutable(), n);
}
template <int D> template <int D>
explicit DDim(const Dim<D>& in) : var(in) {} /*implicit*/ DDim(const Dim<D>& in) : rank_(D) { // NOLINT
UnsafeCast<D>() = in;
}
/*implicit*/ DDim(std::initializer_list<int64_t> init_list)
: DDim(init_list.begin(), init_list.size()) {}
/*implicit*/ DDim(std::initializer_list<int64_t> init_list); inline DDim& operator=(const DDim& ddim) { return CopyFrom(ddim); }
template <int D> template <int D>
DDim& operator=(const Dim<D>& in) { inline DDim& operator=(const Dim<D>& dim) {
var = in; rank_ = D;
UnsafeCast<D>() = dim;
return *this; return *this;
} }
int64_t& operator[](int idx); inline int64_t& operator[](int idx) { return dim_[idx]; }
int64_t operator[](int idx) const;
inline int64_t operator[](int idx) const { return dim_[idx]; }
inline int64_t& at(int idx) {
PADDLE_ENFORCE(idx >= 0 && idx < rank_, "Invalid idx %d", idx);
return dim_[idx];
}
inline int64_t at(int idx) const {
PADDLE_ENFORCE(idx >= 0 && idx < rank_, "Invalid idx %d", idx);
return dim_[idx];
}
template <typename Visitor> template <typename Visitor>
typename Visitor::result_type apply_visitor(Visitor& visitor) { typename std::result_of<Visitor(Dim<0>&)>::type apply_visitor(
return var.apply_visitor(visitor); Visitor&& visitor) {
PADDLE_VISIT_DDIM(rank_, visitor(UnsafeCast<kRank>()));
} }
template <typename Visitor> template <typename Visitor>
typename Visitor::result_type apply_visitor(Visitor& visitor) const { typename std::result_of<Visitor(const Dim<0>&)>::type apply_visitor(
return var.apply_visitor(visitor); Visitor&& visitor) const {
PADDLE_VISIT_DDIM(rank_, visitor(UnsafeCast<kRank>()));
} }
DDimVar getVar() { return var; } bool operator==(const DDim& d) const;
bool operator!=(const DDim& d) const;
DDim operator+(const DDim& d) const;
bool operator==(DDim d) const; DDim operator*(const DDim& d) const;
bool operator!=(DDim d) const; inline const int64_t* Get() const { return dim_.Get(); }
DDim operator+(DDim d) const; inline int64_t* GetMutable() { return dim_.GetMutable(); }
DDim operator*(DDim d) const; inline int size() const { return rank_; }
private:
template <int D>
inline Dim<D>& UnsafeCast() {
static_assert(D >= 0 && D <= kMaxRank, "Invalid rank");
auto* p = static_cast<void*>(&dim_);
return *reinterpret_cast<Dim<D>*>(p);
}
template <int D>
inline const Dim<D>& UnsafeCast() const {
static_assert(D >= 0 && D <= kMaxRank, "Invalid rank");
auto* p = static_cast<const void*>(&dim_);
return *reinterpret_cast<const Dim<D>*>(p);
}
int size() const; inline DDim& CopyFrom(const DDim& ddim) {
PADDLE_VISIT_DDIM(ddim.rank_, (*this = ddim.UnsafeCast<kRank>()));
}
friend DDim stride(const DDim& ddim);
friend DDim stride_numel(const DDim& ddim);
private:
Dim<kMaxRank> dim_;
int rank_;
}; };
#undef PADDLE_VISIT_DDIM_BASE
#undef PADDLE_VISIT_DDIM
/** /**
* \brief Make a DDim from std::vector<int64_t> * \brief Make a DDim from std::vector<int64_t>
* *
...@@ -92,7 +175,7 @@ DDim make_ddim(const std::vector<int>& dims); ...@@ -92,7 +175,7 @@ DDim make_ddim(const std::vector<int>& dims);
DDim make_ddim(std::initializer_list<int64_t> dims); DDim make_ddim(std::initializer_list<int64_t> dims);
int64_t get(const DDim& dim, int idx); int64_t get(const DDim& dim, int idx);
void set(DDim& dim, int idx, int val); void set(DDim& dim, int idx, int val); // NOLINT
std::vector<int64_t> vectorize(const DDim& ddim); std::vector<int64_t> vectorize(const DDim& ddim);
std::vector<int> vectorize2int(const DDim& ddim); std::vector<int> vectorize2int(const DDim& ddim);
...@@ -129,12 +212,3 @@ DDim stride(const DDim& ddim); ...@@ -129,12 +212,3 @@ DDim stride(const DDim& ddim);
DDim stride_numel(const DDim& ddim); DDim stride_numel(const DDim& ddim);
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
namespace boost {
template <typename T>
T get(const paddle::framework::DDim& in) {
return boost::get<T>(in.var);
}
} // namespace boost
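Taken together, the new DDim stores a fixed Dim<kMaxRank> buffer plus a runtime rank_ while keeping the existing free-function API (make_ddim, product, slice_ddim, vectorize, stride). A brief host-only usage sketch, assuming the header above:

#include "paddle/fluid/framework/ddim.h"

using namespace paddle::framework;

void DDimDemo() {
  DDim d = make_ddim({2, 3, 4});           // rank 3 held in a Dim<9> buffer
  int64_t numel = product(d);              // 24
  DDim tail = slice_ddim(d, 1, 3);         // {3, 4}
  std::vector<int64_t> v = vectorize(d);   // {2, 3, 4}
  DDim strides = stride(d);                // {12, 4, 1}, row-major strides
  (void)numel; (void)tail; (void)v; (void)strides;
}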
...@@ -55,9 +55,6 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, ...@@ -55,9 +55,6 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
void AllReduceOpHandle::RunImpl() { void AllReduceOpHandle::RunImpl() {
platform::RecordEvent record_event(Name(), dev_ctxes_.cbegin()->second); platform::RecordEvent record_event(Name(), dev_ctxes_.cbegin()->second);
// FIXME(typhoonzero): If scope0(global scope) have NCCL_ID_VAR,
// this is a distributed or inter-process call, find a better way.
// Wait input done
WaitInputVarGenerated(); WaitInputVarGenerated();
auto in_var_handles = DynamicCast<VarHandle>(this->Inputs()); auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
auto out_var_handles = DynamicCast<VarHandle>(this->Outputs()); auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
......
...@@ -25,7 +25,7 @@ struct ExecutionStrategy { ...@@ -25,7 +25,7 @@ struct ExecutionStrategy {
size_t num_threads_{0}; size_t num_threads_{0};
bool use_cuda_{true}; bool use_cuda_{true};
bool allow_op_delay_{false}; bool allow_op_delay_{false};
size_t num_iteration_per_drop_scope_{100}; size_t num_iteration_per_drop_scope_{1};
ExecutorType type_{kDefault}; ExecutorType type_{kDefault};
bool dry_run_{false}; bool dry_run_{false};
}; };
......
...@@ -64,20 +64,26 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run( ...@@ -64,20 +64,26 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
} }
platform::RecordEvent e("ScopeBufferedSSAGraphExecutorAfterRun", nullptr); platform::RecordEvent e("ScopeBufferedSSAGraphExecutorAfterRun", nullptr);
drop_scope_counter_ += 1; ++drop_scope_counter_;
if (!fetch_tensors.empty() || bool stream_end = false;
drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) { if (!fetch_tensors.empty()) {
drop_scope_counter_ = 0; WaitComputationalStreams();
// Wait All computational streams stream_end = true;
for (auto p : places_) { }
platform::DeviceContextPool::Instance().Get(p)->Wait();
if (drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) {
if (!stream_end) {
WaitComputationalStreams();
} }
for (auto &scope : local_scopes_) { for (auto &scope : local_scopes_) {
auto &local_scope = auto &local_scope =
*scope->Var(details::kLocalExecScopeName)->GetMutable<Scope *>(); *scope->Var(details::kLocalExecScopeName)->GetMutable<Scope *>();
scope->DeleteScope(local_scope); scope->DeleteScope(local_scope);
} }
drop_scope_counter_ = 0;
} }
if (eptr) { if (eptr) {
std::rethrow_exception(eptr); std::rethrow_exception(eptr);
......
...@@ -47,6 +47,14 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor { ...@@ -47,6 +47,14 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor {
FeedFetchList Run(const std::vector<std::string>& fetch_tensors) override; FeedFetchList Run(const std::vector<std::string>& fetch_tensors) override;
private:
inline void WaitComputationalStreams() {
// Wait All computational streams
for (auto p : places_) {
platform::DeviceContextPool::Instance().Get(p)->Wait();
}
}
private: private:
size_t drop_scope_counter_{0}; size_t drop_scope_counter_{0};
......
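The executor change above separates stream synchronization from scope cleanup: streams are waited on whenever tensors are fetched, while local scopes are dropped only every num_iteration_per_drop_scope iterations (whose default drops from 100 to 1 in this commit). A simplified model of that counter logic; the struct and helper names are placeholders, not the real class:

// Simplified model of the counter logic in ScopeBufferedSSAGraphExecutor::Run.
struct DropScopeModel {
  size_t drop_scope_counter_{0};
  size_t num_iteration_per_drop_scope_{1};

  void Step(bool has_fetch) {
    ++drop_scope_counter_;
    bool stream_end = false;
    if (has_fetch) {
      WaitComputationalStreams();    // always sync when tensors are fetched
      stream_end = true;
    }
    if (drop_scope_counter_ == num_iteration_per_drop_scope_) {
      if (!stream_end) {
        WaitComputationalStreams();  // sync once before deleting local scopes
      }
      DeleteLocalScopes();           // stands in for scope->DeleteScope(local_scope)
      drop_scope_counter_ = 0;
    }
  }

  void WaitComputationalStreams() {}  // placeholder for device-context Wait()
  void DeleteLocalScopes() {}         // placeholder for the local-scope cleanup
};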
...@@ -16,332 +16,184 @@ ...@@ -16,332 +16,184 @@
#include <iostream> #include <iostream>
#include <sstream> #include <sstream>
#include <stdexcept> #include <stdexcept>
#include <string>
#include <type_traits> #include <type_traits>
#include "paddle/fluid/framework/array.h"
#include "paddle/fluid/platform/assert.h" #include "paddle/fluid/platform/assert.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/hostdevice.h" #include "paddle/fluid/platform/hostdevice.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
// Statically sized, statically indexed dimension // Statically sized, statically indexed dimension
template <int i> template <int D>
struct Dim { class Dim : public Array<int64_t, D> {
static constexpr int dimensions = i; public:
static_assert(D >= 0, "D must be not less than 0");
template <typename... Args> static constexpr int kRank = D;
HOSTDEVICE Dim(int64_t _head, Args... _tail) : head(_head), tail(_tail...) { using BaseClass = Array<int64_t, D>;
static_assert(sizeof...(_tail) == i - 1,
"Dim initialized with the wrong number of parameters");
}
HOSTDEVICE inline Dim(int64_t head, const Dim<D - 1>& tail) {
Dim(int64_t _head, const Dim<i - 1>& _tail) : head(_head), tail(_tail) {} (*this)[0] = head;
new (this->GetMutable() + 1) Dim<D - 1>(tail);
}
HOSTDEVICE template <typename... Args>
Dim() : head(0), tail() {} HOSTDEVICE explicit Dim(int64_t head, Args... args)
: BaseClass(head, args...) {}
/** Construct a Dim from a linear index and size. Uses Fortran order /** Construct a Dim from a linear index and size. Uses Fortran order
* indexing. */ * indexing. */
HOSTDEVICE HOSTDEVICE Dim(int64_t idx, const Dim<D>& size);
Dim(int64_t idx, const Dim<i>& size)
: head(idx % size.head), tail(idx / size.head, size.tail) {}
/** Construct a Dim with each dimension set to the given index */ /** Construct a Dim with each dimension set to the given index */
HOSTDEVICE HOSTDEVICE explicit Dim(int64_t idx) { this->Fill(idx); }
Dim(int64_t idx) : head(idx), tail(idx) {}
HOSTDEVICE HOSTDEVICE Dim() = default;
bool operator==(const Dim<i>& o) const {
return (head == o.head) && (tail == o.tail);
}
HOSTDEVICE
bool operator!=(const Dim<i>& o) const { return !(*this == o); }
HOSTDEVICE
int64_t& operator[](int idx);
HOSTDEVICE
int64_t operator[](int idx) const;
HOST std::string to_string() const; HOST std::string to_string() const;
int64_t head;
Dim<i - 1> tail;
};
// Base case specialization
template <>
struct Dim<0> {
static constexpr int dimensions = 0;
HOSTDEVICE
Dim(int64_t _head) {}
HOSTDEVICE
Dim() {}
HOSTDEVICE
Dim(int idx, const Dim<0>& size) {
#ifndef __CUDA_ARCH__
if (idx > 0) {
throw std::invalid_argument("Index out of range.");
}
#else
PADDLE_ASSERT(idx == 0);
#endif
}
HOSTDEVICE
bool operator==(const Dim<0>& o) const { return true; }
HOSTDEVICE
bool operator!=(const Dim<0>& o) const { return false; }
HOSTDEVICE
int64_t& operator[](int idx);
HOSTDEVICE
int64_t operator[](int idx) const;
}; };
namespace { namespace detail {
template <int kStart, int kEnd, bool kStop>
// Helper for accessing Dim classes struct FortranOrderIndexingConstructorFunctor {
template <int i> HOSTDEVICE inline static void Run(const int64_t* in, int64_t* idx,
struct DimGetter { int64_t* out) {
// Return a copy if Dim is const out[kStart] = (*idx) % in[kStart];
template <typename D> (*idx) /= in[kStart];
HOSTDEVICE static int64_t impl(const D& d) { FortranOrderIndexingConstructorFunctor<kStart + 1, kEnd,
return DimGetter<i - 1>::impl(d.tail); kStart + 1 == kEnd>::Run(in, idx,
} out);
// Return a reference if Dim is mutable
template <typename D>
HOSTDEVICE static int64_t& impl(D& d) {
return DimGetter<i - 1>::impl(d.tail);
} }
}; };
// Eureka! We found the element! template <int kStart, int kEnd>
template <> struct FortranOrderIndexingConstructorFunctor<kStart, kEnd, true> {
struct DimGetter<0> { HOSTDEVICE inline static void Run(const int64_t* in, int64_t* idx,
// Return a copy if Dim is const int64_t* out) {}
template <typename D>
HOSTDEVICE static int64_t impl(const D& d) {
return d.head;
}
// Return a reference if Dim is mutable
template <typename D>
HOSTDEVICE static int64_t& impl(D& d) {
return d.head;
}
}; };
} // namespace detail
template <int D> template <int D>
HOSTDEVICE int64_t& indexer(Dim<D>& dim, int idx) { HOSTDEVICE Dim<D>::Dim(int64_t idx, const Dim<D>& size) {
#ifndef __CUDA_ARCH__ detail::FortranOrderIndexingConstructorFunctor<0, D, D == 0>::Run(
if (idx < 0) { size.Get(), &idx, this->GetMutable());
throw std::invalid_argument("Tried to access a negative dimension");
}
#else
PADDLE_ASSERT(idx >= 0);
#endif
if (idx == 0) {
return dim.head;
}
return indexer(dim.tail, idx - 1);
}
template <>
HOSTDEVICE int64_t& indexer<0>(Dim<0>& dim, int idx) {
#ifndef __CUDA_ARCH__
throw std::invalid_argument("Invalid index");
#else
PADDLE_ASSERT(false);
#if CUDA_VERSION < 8000
// On CUDA versions previous to 8.0, only __shared__ variables
// could be declared as static in the device code.
int64_t head = 0;
#else
static int64_t head = 0;
#endif
return head;
#endif
}
template <int D>
HOSTDEVICE int64_t indexer(const Dim<D>& dim, int idx) {
#ifndef __CUDA_ARCH__
if (idx < 0) {
throw std::invalid_argument("Tried to access a negative dimension");
}
#else
PADDLE_ASSERT(idx >= 0);
#endif
if (idx == 0) {
return dim.head;
}
return indexer(dim.tail, idx - 1);
}
template <>
HOSTDEVICE int64_t indexer<0>(const Dim<0>& dim, int idx) {
#ifndef __CUDA_ARCH__
throw std::invalid_argument("Invalid index");
#else
PADDLE_ASSERT(false);
#if CUDA_VERSION < 8000
// On CUDA versions previous to 8.0, only __shared__ variables
// could be declared as static in the device code.
int64_t head = 0;
#else
static int64_t head = 0;
#endif
return head;
#endif
}
} // namespace
// Static access to constant Dim
template <int i, int l>
HOSTDEVICE int64_t get(const Dim<l>& d) {
return DimGetter<i>::impl(d);
} }
// Static access to mutable Dim template <int idx, int D>
template <int i, int l> HOSTDEVICE inline int64_t get(const Dim<D>& dim) {
HOSTDEVICE int64_t& get(Dim<l>& d) { return dim[idx];
return DimGetter<i>::impl(d);
} }
// Dynamic access to constant Dim template <int idx, int D>
template <int l> HOSTDEVICE inline int64_t& get(Dim<D>& dim) { // NOLINT
HOSTDEVICE int64_t Dim<l>::operator[](int i) const { return dim[idx];
return indexer(*this, i);
} }
// Dynamic access to mutable Dim template <int D>
template <int l> HOSTDEVICE inline int64_t get(const Dim<D>& dim, int idx) {
HOSTDEVICE int64_t& Dim<l>::operator[](int i) { return dim[idx];
return indexer(*this, i);
}
// Dynamic access to constant Dim
inline HOSTDEVICE int64_t Dim<0>::operator[](int i) const {
return indexer(*this, i);
}
// Dynamic access to mutable Dim
inline HOSTDEVICE int64_t& Dim<0>::operator[](int i) {
return indexer(*this, i);
}
// Dynamic access to constant Dim
// without std::enable_if will try to instantiate this on get<0>(d)
template <int l>
HOSTDEVICE typename std::enable_if<(l > 0), int64_t>::type get(const Dim<l>& d,
int i) {
return d[i];
} }
// Dynamic access to mutable Dim template <int D>
template <int l> HOSTDEVICE inline int64_t& get(Dim<D>& dim, int idx) { // NOLINT
HOSTDEVICE typename std::enable_if<(l > 0), int64_t&>::type get(Dim<l>& d, return dim[idx];
int i) {
return d[i];
} }
// Dot product of two dims // Dot product of two dims
template <int i> template <int D>
HOSTDEVICE int64_t linearize(const Dim<i>& a, const Dim<i>& b) { HOSTDEVICE inline int64_t linearize(const Dim<D>& a, const Dim<D>& b) {
return a.head * b.head + linearize(a.tail, b.tail); return UnrollProduct<D>::Run(a.Get(), b.Get());
}
// Base case dot product of two Dims
// Notice it is inline because it is no longer a template
template <>
HOSTDEVICE inline int64_t linearize(const Dim<0>& a, const Dim<0>& b) {
return 0;
} }
// Product of a Dim // Product of a Dim
template <int i> template <int D>
HOSTDEVICE int64_t product(const Dim<i>& a, int prod = 1) { HOSTDEVICE inline int64_t product(const Dim<D>& a) {
return prod * a.head * product(a.tail); return UnrollProduct<D>::Run(a.Get());
}
// Base case product of a Dim
// Notice it is inline because it is no longer a template
template <>
HOSTDEVICE inline int64_t product(const Dim<0>& a, int prod) {
return prod;
} }
// Is 0 <= idx_i < size_i for all i? // Is 0 <= idx_i < size_i for all i?
template <int i> namespace detail {
HOSTDEVICE bool contained(const Dim<i>& idx, const Dim<i>& size) { template <int kStart, int kEnd, bool kStop>
return ((0 <= idx.head) && (idx.head < size.head) && struct ContainedFunctor {
contained(idx.tail, size.tail)); HOSTDEVICE static inline bool Run(const int64_t* idx, const int64_t* size) {
} return (idx[kStart] >= 0 && idx[kStart] < size[kStart]) &&
ContainedFunctor<kStart + 1, kEnd, kStart + 1 == kEnd>::Run(idx,
size);
}
};
// Base case of is 0 <= idx_i < size_i ? template <int kStart, int kEnd>
// Notice it is inline because it is no longer a template struct ContainedFunctor<kStart, kEnd, true> {
template <> HOSTDEVICE static constexpr inline bool Run(const int64_t* idx,
HOSTDEVICE inline bool contained(const Dim<0>& idx, const Dim<0>& size) { const int64_t* size) {
return true; return true;
}
};
} // namespace detail
template <int D>
HOSTDEVICE inline bool contained(const Dim<D>& idx, const Dim<D>& size) {
return detail::ContainedFunctor<0, D, D == 0>::Run(idx.Get(), size.Get());
} }
/** /**
* \brief Compute exclusive prefix-multiply of a Dim. * \brief Compute exclusive prefix-multiply of a Dim.
*/ */
template <int i> namespace detail {
HOSTDEVICE Dim<i> ex_prefix_mul(const Dim<i>& src, int mul = 1) { template <int kStart, int kEnd, bool kStop>
return Dim<i>(mul, ex_prefix_mul(src.tail, mul * src.head)); struct ExPrefixMulFunctor {
} HOSTDEVICE static inline void Run(const int64_t* in, int64_t* out) {
kStart == 0 ? out[kStart] = 1 : out[kStart] =
out[kStart - 1] * in[kStart - 1];
detail::ExPrefixMulFunctor<kStart + 1, kEnd, kStart + 1 == kEnd>::Run(in,
out);
}
};
///\cond HIDDEN template <int kStart, int kEnd>
// Base case of ex_prefix_mul struct ExPrefixMulFunctor<kStart, kEnd, true> {
// Notice it is inline because it is no longer a template HOSTDEVICE static inline void Run(const int64_t* in, int64_t* out) {}
template <> };
HOSTDEVICE inline Dim<0> ex_prefix_mul(const Dim<0>& src, int mul) { } // namespace detail
return Dim<0>();
template <int D>
HOSTDEVICE inline Dim<D> ex_prefix_mul(const Dim<D>& src) {
Dim<D> ret;
detail::ExPrefixMulFunctor<0, D, D == 0>::Run(src.Get(), ret.GetMutable());
return ret;
} }
///\endcond
/** /**
* Add two dimensions together * Add two dimensions together
*/ */
template <int i> template <int D>
HOSTDEVICE Dim<i> dim_plus(const Dim<i>& a, const Dim<i>& b) { HOSTDEVICE inline Dim<D> dim_plus(const Dim<D>& a, const Dim<D>& b) {
return Dim<i>(a.head + b.head, dim_plus(a.tail, b.tail)); Dim<D> ret;
} UnrollAdd<D>::Run(a.Get(), b.Get(), ret.GetMutable());
return ret;
// Base case
template <>
HOSTDEVICE inline Dim<0> dim_plus(const Dim<0>& a, const Dim<0>& b) {
return Dim<0>();
} }
template <int i> template <int D>
HOSTDEVICE Dim<i> operator+(const Dim<i>& lhs, const Dim<i>& rhs) { HOSTDEVICE inline Dim<D> operator+(const Dim<D>& lhs, const Dim<D>& rhs) {
return dim_plus(lhs, rhs); return dim_plus(lhs, rhs);
} }
/** /**
* Multiply two dimensions together * Multiply two dimensions together
*/ */
template <int i> template <int D>
HOSTDEVICE Dim<i> dim_mult(const Dim<i>& a, const Dim<i>& b) { HOSTDEVICE inline Dim<D> dim_mult(const Dim<D>& a, const Dim<D>& b) {
return Dim<i>(a.head * b.head, dim_mult(a.tail, b.tail)); Dim<D> ret;
} UnrollMul<D>::Run(a.Get(), b.Get(), ret.GetMutable());
return ret;
// Base case
template <>
HOSTDEVICE inline Dim<0> dim_mult(const Dim<0>& a, const Dim<0>& b) {
return Dim<0>();
} }
template <int i> template <int D>
HOSTDEVICE Dim<i> operator*(const Dim<i>& lhs, const Dim<i>& rhs) { HOSTDEVICE Dim<D> operator*(const Dim<D>& lhs, const Dim<D>& rhs) {
return dim_mult(lhs, rhs); return dim_mult(lhs, rhs);
} }
...@@ -354,23 +206,32 @@ HOSTDEVICE Dim<i> operator*(const Dim<i>& lhs, const Dim<i>& rhs) { ...@@ -354,23 +206,32 @@ HOSTDEVICE Dim<i> operator*(const Dim<i>& lhs, const Dim<i>& rhs) {
* \return Dim object the same size as \p size with normalized strides * \return Dim object the same size as \p size with normalized strides
* *
*/ */
namespace detail {
template <int kStart, int kEnd, bool kStop>
struct NormalizeStridesFunctor {
HOSTDEVICE static void Run(const int64_t* size, const int64_t* stride,
int64_t* ret) {
ret[kStart] = (size[kStart] == 1 ? 0 : stride[kStart]);
NormalizeStridesFunctor<kStart + 1, kEnd, kStart + 1 == kEnd>::Run(
size, stride, ret);
}
};
template <int i> template <int kStart, int kEnd>
HOSTDEVICE Dim<i> normalize_strides(const Dim<i>& size, const Dim<i>& stride) { struct NormalizeStridesFunctor<kStart, kEnd, true> {
int norm_stride = size.head == 1 ? 0 : stride.head; HOSTDEVICE static void Run(const int64_t* size, const int64_t* stride,
return Dim<i>(norm_stride, normalize_strides(size.tail, stride.tail)); int64_t* ret) {}
} };
} // namespace detail
///\cond HIDDEN
template <> template <int D>
HOSTDEVICE inline Dim<0> normalize_strides(const Dim<0>& size, HOSTDEVICE Dim<D> normalize_strides(const Dim<D>& size, const Dim<D>& stride) {
const Dim<0>& stride) { Dim<D> ret;
return Dim<0>(); detail::NormalizeStridesFunctor<0, D, D == 0>::Run(size.Get(), stride.Get(),
ret.GetMutable());
return ret;
} }
///\endcond
/** /**
* Helper function to create a Dim * Helper function to create a Dim
* *
...@@ -379,25 +240,17 @@ HOSTDEVICE inline Dim<0> normalize_strides(const Dim<0>& size, ...@@ -379,25 +240,17 @@ HOSTDEVICE inline Dim<0> normalize_strides(const Dim<0>& size,
*/ */
template <typename... Args> template <typename... Args>
HOSTDEVICE Dim<sizeof...(Args)> make_dim(Args... idxes) { HOSTDEVICE inline Dim<sizeof...(Args)> make_dim(Args... idxes) {
return Dim<sizeof...(Args)>(idxes...); return Dim<sizeof...(Args)>(idxes...);
} }
// Allows us to output a Dim // Allows us to output a Dim
// XXX For some reason, overloading fails to resolve this correctly template <int D>
template <int i> inline std::ostream& operator<<(std::ostream& os, const Dim<D>& d) {
typename std::enable_if<(i > 1), std::ostream&>::type operator<<( os << d[0];
std::ostream& os, const Dim<i>& d) { for (int i = 1; i < D; ++i) {
os << d.head << ", " << d.tail; os << ", " << d[i];
return os; }
}
// Base case that allows us to output a Dim
// XXX I wish this could be an overload instead of a template
template <int i>
typename std::enable_if<(i == 1), std::ostream&>::type operator<<(
std::ostream& os, const Dim<i>& d) {
os << d.head;
return os; return os;
} }
...@@ -405,17 +258,15 @@ inline std::ostream& operator<<(std::ostream& os, const Dim<0>& d) { ...@@ -405,17 +258,15 @@ inline std::ostream& operator<<(std::ostream& os, const Dim<0>& d) {
return os; return os;
} }
template <int i> template <int D>
HOST std::string Dim<i>::to_string() const { HOST std::string Dim<D>::to_string() const {
std::stringstream stream; std::stringstream stream;
stream << *this; stream << *this;
return stream.str(); return stream.str();
} }
template <int D> template <int D>
HOSTDEVICE Dim<D> linear_to_dimension(int linear_index, Dim<D> extents) { HOSTDEVICE Dim<D> linear_to_dimension(int linear_index, const Dim<D>& extents) {
Dim<D> result; Dim<D> result;
for (int i = 0; i < D - 1; ++i) { for (int i = 0; i < D - 1; ++i) {
...@@ -428,5 +279,10 @@ HOSTDEVICE Dim<D> linear_to_dimension(int linear_index, Dim<D> extents) { ...@@ -428,5 +279,10 @@ HOSTDEVICE Dim<D> linear_to_dimension(int linear_index, Dim<D> extents) {
return result; return result;
} }
template <int D, typename T1, typename T2>
inline void static_dim_assign(const T1* in, T2* out) {
UnrollAssign<D>::Run(in, out);
}
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
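With the head/tail recursion gone, the Dim<D> helpers (make_dim, linearize, product, contained, ex_prefix_mul) delegate to the unrolled functors above. A small host-side usage sketch, assuming the header compiles as shown:

#include "paddle/fluid/framework/dim.h"

using namespace paddle::framework;

void DimDemo() {
  auto size = make_dim(2, 3, 4);    // Dim<3> holding {2, 3, 4}
  auto idx = make_dim(1, 2, 3);     // an index within 'size'

  bool inside = contained(idx, size);                    // all 0 <= idx_i < size_i
  int64_t offset = linearize(idx, ex_prefix_mul(size));  // Fortran-order linear offset
  int64_t numel = product(size);                         // 24
  (void)inside; (void)offset; (void)numel;
}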
...@@ -59,7 +59,7 @@ static DLDataType GetDLDataTypeFromTypeIndex(proto::VarType::Type type) { ...@@ -59,7 +59,7 @@ static DLDataType GetDLDataTypeFromTypeIndex(proto::VarType::Type type) {
struct DLContextVisitor : public boost::static_visitor<::DLContext> { struct DLContextVisitor : public boost::static_visitor<::DLContext> {
inline ::DLContext operator()(const platform::CPUPlace &place) const { inline ::DLContext operator()(const platform::CPUPlace &place) const {
DLContext ctx; ::DLContext ctx;
ctx.device_type = kDLCPU; ctx.device_type = kDLCPU;
ctx.device_id = 0; ctx.device_id = 0;
return ctx; return ctx;
...@@ -67,7 +67,7 @@ struct DLContextVisitor : public boost::static_visitor<::DLContext> { ...@@ -67,7 +67,7 @@ struct DLContextVisitor : public boost::static_visitor<::DLContext> {
inline ::DLContext operator()(const platform::CUDAPlace &place) const { inline ::DLContext operator()(const platform::CUDAPlace &place) const {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
DLContext ctx; ::DLContext ctx;
ctx.device_type = kDLGPU; ctx.device_type = kDLGPU;
ctx.device_id = place.device; ctx.device_id = place.device;
return ctx; return ctx;
...@@ -78,7 +78,7 @@ struct DLContextVisitor : public boost::static_visitor<::DLContext> { ...@@ -78,7 +78,7 @@ struct DLContextVisitor : public boost::static_visitor<::DLContext> {
inline ::DLContext operator()(const platform::CUDAPinnedPlace &place) const { inline ::DLContext operator()(const platform::CUDAPinnedPlace &place) const {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
DLContext ctx; ::DLContext ctx;
ctx.device_type = kDLCPUPinned; ctx.device_type = kDLCPUPinned;
ctx.device_id = 0; ctx.device_id = 0;
return ctx; return ctx;
......
...@@ -38,7 +38,7 @@ class DLPackTensor { ...@@ -38,7 +38,7 @@ class DLPackTensor {
// The shape in DLTensor is defined as int64_t* // The shape in DLTensor is defined as int64_t*
// Add this member to make TVMTensor init without heap allocation // Add this member to make TVMTensor init without heap allocation
ShapeType shape_[9]; ShapeType shape_[DDim::kMaxRank];
}; };
} // namespace framework } // namespace framework
......
...@@ -22,7 +22,7 @@ limitations under the License. */ ...@@ -22,7 +22,7 @@ limitations under the License. */
#include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/framework/transfer_scope_cache.h" #include "paddle/fluid/framework/transfer_scope_cache.h"
#include "paddle/fluid/framework/variable_helper.h" #include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/operators/distributed/distributed.h"
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
......
...@@ -29,6 +29,7 @@ limitations under the License. */ ...@@ -29,6 +29,7 @@ limitations under the License. */
#include "paddle/fluid/inference/io.h" #include "paddle/fluid/inference/io.h"
#include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/timer.h"
#include "paddle/fluid/pybind/pybind.h" #include "paddle/fluid/pybind/pybind.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -180,6 +181,7 @@ void ExecutorThreadWorker::SetDevice() { ...@@ -180,6 +181,7 @@ void ExecutorThreadWorker::SetDevice() {
return; return;
#else #else
static unsigned concurrency_cap = std::thread::hardware_concurrency(); static unsigned concurrency_cap = std::thread::hardware_concurrency();
LOG(WARNING) << "concurrency capacity " << concurrency_cap;
int thread_id = this->thread_id_; int thread_id = this->thread_id_;
if (static_cast<unsigned>(thread_id) < concurrency_cap) { if (static_cast<unsigned>(thread_id) < concurrency_cap) {
...@@ -238,6 +240,55 @@ static void print_fetch_var(Scope* scope, const std::string& var_name) { ...@@ -238,6 +240,55 @@ static void print_fetch_var(Scope* scope, const std::string& var_name) {
VLOG(1) << "print_fetch_var: unrecognized data type:" << tensor.type(); VLOG(1) << "print_fetch_var: unrecognized data type:" << tensor.type();
} }
void ExecutorThreadWorker::TrainFilesWithTimer() {
platform::SetNumThreads(1);
SetDevice();
thread_reader_->Start();
std::vector<double> op_total_time;
std::vector<std::string> op_name;
for (auto& op : ops_) {
op_name.push_back(op->Type());
}
op_total_time.resize(ops_.size());
for (size_t i = 0; i < op_total_time.size(); ++i) {
op_total_time[i] = 0.0;
}
platform::Timer timeline;
double total_time = 0.0;
double read_time = 0.0;
int cur_batch;
int batch_cnt = 0;
timeline.Start();
while ((cur_batch = thread_reader_->Next()) > 0) {
timeline.Pause();
read_time += timeline.ElapsedSec();
total_time += timeline.ElapsedSec();
for (size_t i = 0; i < ops_.size(); ++i) {
timeline.Start();
ops_[i]->Run(*thread_scope_, place_);
timeline.Pause();
op_total_time[i] += timeline.ElapsedSec();
total_time += timeline.ElapsedSec();
}
++batch_cnt;
thread_scope_->DropKids();
if (thread_id_ == 0) {
if (batch_cnt > 0 && batch_cnt % 1000 == 0) {
for (size_t i = 0; i < ops_.size(); ++i) {
fprintf(stderr, "op_name:[%zu][%s], op_mean_time:[%fs]\n", i,
op_name[i].c_str(), op_total_time[i] / batch_cnt);
}
fprintf(stderr, "mean read time: %fs\n", read_time / batch_cnt);
int fetch_var_num = fetch_var_names_.size();
for (int i = 0; i < fetch_var_num; ++i) {
print_fetch_var(thread_scope_, fetch_var_names_[i]);
}
}
}
timeline.Start();
}
}
void ExecutorThreadWorker::TrainFiles() { void ExecutorThreadWorker::TrainFiles() {
platform::SetNumThreads(1); platform::SetNumThreads(1);
...@@ -320,10 +371,12 @@ void AsyncExecutorThreadWorker::SetPSlibPtr( ...@@ -320,10 +371,12 @@ void AsyncExecutorThreadWorker::SetPSlibPtr(
std::shared_ptr<paddle::distributed::PSlib> pslib_ptr) { std::shared_ptr<paddle::distributed::PSlib> pslib_ptr) {
_pslib_ptr = pslib_ptr; _pslib_ptr = pslib_ptr;
} }
void AsyncExecutorThreadWorker::SetPullDenseThread( void AsyncExecutorThreadWorker::SetPullDenseThread(
std::shared_ptr<DensePullThread> dpt) { std::shared_ptr<DensePullThread> dpt) {
_pull_dense_thread = dpt; _pull_dense_thread = dpt;
} }
void AsyncExecutorThreadWorker::TrainOneNetwork() { void AsyncExecutorThreadWorker::TrainOneNetwork() {
PrepareParams(); PrepareParams();
......
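TrainFilesWithTimer above drives a single platform::Timer through the reader and every op, accumulating per-op wall time and printing averages every 1000 batches on thread 0. The diff only adds the timer.h include, so the sketch below uses a generic steady_clock stand-in with the same Start/Pause/ElapsedSec surface; the class and function names are illustrative assumptions:

#include <chrono>
#include <cstdio>
#include <vector>

// Minimal stand-in for platform::Timer as used in the loop above (assumed API).
class SimpleTimer {
 public:
  void Start() { begin_ = std::chrono::steady_clock::now(); }
  void Pause() {
    elapsed_ = std::chrono::duration<double>(
                   std::chrono::steady_clock::now() - begin_).count();
  }
  double ElapsedSec() const { return elapsed_; }

 private:
  std::chrono::steady_clock::time_point begin_;
  double elapsed_{0.0};
};

// Mirrors the periodic reporting: print mean per-op time over batch_cnt batches.
void ReportMeanTimes(const std::vector<double>& op_total_time, int batch_cnt) {
  for (size_t i = 0; i < op_total_time.size(); ++i) {
    std::fprintf(stderr, "op:[%zu], op_mean_time:[%fs]\n", i,
                 op_total_time[i] / batch_cnt);
  }
}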
...@@ -155,6 +155,8 @@ class ExecutorThreadWorker { ...@@ -155,6 +155,8 @@ class ExecutorThreadWorker {
void SetDataFeed(const std::shared_ptr<DataFeed>& datafeed); void SetDataFeed(const std::shared_ptr<DataFeed>& datafeed);
// A multi-thread training function // A multi-thread training function
virtual void TrainFiles(); virtual void TrainFiles();
// with timer log
virtual void TrainFilesWithTimer();
// set fetch variable names from python interface assigned by users // set fetch variable names from python interface assigned by users
void SetFetchVarNames(const std::vector<std::string>& fetch_var_names); void SetFetchVarNames(const std::vector<std::string>& fetch_var_names);
#ifdef PADDLE_WITH_PSLIB #ifdef PADDLE_WITH_PSLIB
......
...@@ -75,6 +75,7 @@ std::unique_ptr<Graph> BatchMergePass::ApplyImpl( ...@@ -75,6 +75,7 @@ std::unique_ptr<Graph> BatchMergePass::ApplyImpl(
std::vector<Node*> optimize_ops; std::vector<Node*> optimize_ops;
std::vector<Node*> lr_ops; // ops other than forward/backward/optimize std::vector<Node*> lr_ops; // ops other than forward/backward/optimize
std::unordered_set<std::string> grad_names; std::unordered_set<std::string> grad_names;
std::unordered_map<std::string, std::string> gradname2paramname;
std::vector<ir::Node*> nodes = TopologySortOperations(*graph); std::vector<ir::Node*> nodes = TopologySortOperations(*graph);
auto origin_nodes = graph->ReleaseNodes(); auto origin_nodes = graph->ReleaseNodes();
...@@ -99,6 +100,7 @@ std::unique_ptr<Graph> BatchMergePass::ApplyImpl( ...@@ -99,6 +100,7 @@ std::unique_ptr<Graph> BatchMergePass::ApplyImpl(
auto op_role_vars = boost::get<std::vector<std::string>>(op_role_var); auto op_role_vars = boost::get<std::vector<std::string>>(op_role_var);
for (size_t i = 0; i < op_role_vars.size(); i += 2) { for (size_t i = 0; i < op_role_vars.size(); i += 2) {
grad_names.insert(op_role_vars[i + 1]); grad_names.insert(op_role_vars[i + 1]);
gradname2paramname[op_role_vars[i + 1]] = op_role_vars[i];
} }
} else if (op_role & static_cast<int>(framework::OpRole::kLRSched)) { } else if (op_role & static_cast<int>(framework::OpRole::kLRSched)) {
lr_ops.push_back(node); lr_ops.push_back(node);
...@@ -109,7 +111,7 @@ std::unique_ptr<Graph> BatchMergePass::ApplyImpl( ...@@ -109,7 +111,7 @@ std::unique_ptr<Graph> BatchMergePass::ApplyImpl(
// 2. copy forward backward // 2. copy forward backward
ir::Node* prev_repeat_last_op_node = nullptr; ir::Node* prev_repeat_last_op_node = nullptr;
// record origin_grad -> repeated grad list map. // record origin_grad -> repeated_grad_list map.
std::map<ir::Node*, std::vector<ir::Node*>> grad_repeated_map; std::map<ir::Node*, std::vector<ir::Node*>> grad_repeated_map;
std::map<std::string, std::vector<ir::Node*>> created; std::map<std::string, std::vector<ir::Node*>> created;
std::unordered_set<std::string> bn_vars_need_rename; std::unordered_set<std::string> bn_vars_need_rename;
...@@ -124,10 +126,16 @@ std::unique_ptr<Graph> BatchMergePass::ApplyImpl( ...@@ -124,10 +126,16 @@ std::unique_ptr<Graph> BatchMergePass::ApplyImpl(
if (grad_names.find(outname) != grad_names.end()) { if (grad_names.find(outname) != grad_names.end()) {
std::string new_gname = string::Sprintf("%s.repeat.%d", outname, i); std::string new_gname = string::Sprintf("%s.repeat.%d", outname, i);
repeated_op.RenameOutput(outname, new_gname); repeated_op.RenameOutput(outname, new_gname);
// remove op_role_var for backward ops that outputs grad for a
// parameter.
repeated_op.SetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName(),
std::vector<std::string>());
} }
} }
// 3.5 let batch_norm ops use independent vars, note batch_norm_grad do // 3.5 let batch_norm ops use independent vars, note batch_norm_grad do
// not need this update     // not need this update, because only the moving mean and variance
// should differ; the trainable scale and bias parameters are the same as
// other parameters.
if (node->Name() == "batch_norm") { if (node->Name() == "batch_norm") {
// NOTE: assume bn op created by layers use save var as output mean and // NOTE: assume bn op created by layers use save var as output mean and
// variance // variance
...@@ -224,16 +232,25 @@ std::unique_ptr<Graph> BatchMergePass::ApplyImpl( ...@@ -224,16 +232,25 @@ std::unique_ptr<Graph> BatchMergePass::ApplyImpl(
var->inputs.push_back(repeated_node); var->inputs.push_back(repeated_node);
} }
} }
} } // end copy forward backward
// 5. create GRAD merge op node // 5. create GRAD merge op node: sum(repeat.0...repeat.n) ->
// scale(1/num_repeats)
for (auto kv : grad_repeated_map) { for (auto kv : grad_repeated_map) {
OpDesc sum_op; OpDesc sum_op;
sum_op.SetType("sum"); sum_op.SetType("sum");
std::vector<std::string> repeated_grad_names; std::vector<std::string> repeated_grad_names;
std::vector<std::string> param_grad_op_role_var;
for (auto r : kv.second) { for (auto r : kv.second) {
repeated_grad_names.push_back(r->Var()->Name()); repeated_grad_names.push_back(r->Var()->Name());
} }
// NOTE: use op_role_var to control allreduce op appending in
// multi_devices_graph_pass; we want to append op_role_var
// only once for the merged gradient, so it is set once here, outside the per-repeat loop.
param_grad_op_role_var.push_back(
gradname2paramname.at(kv.first->Var()->Name())); // param
param_grad_op_role_var.push_back(kv.first->Var()->Name()); // grad
sum_op.SetInput("X", repeated_grad_names); sum_op.SetInput("X", repeated_grad_names);
sum_op.SetOutput("Out", {kv.first->Var()->Name()}); sum_op.SetOutput("Out", {kv.first->Var()->Name()});
sum_op.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), sum_op.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
...@@ -256,6 +273,10 @@ std::unique_ptr<Graph> BatchMergePass::ApplyImpl( ...@@ -256,6 +273,10 @@ std::unique_ptr<Graph> BatchMergePass::ApplyImpl(
scale_op.SetAttr("scale", static_cast<float>(1.0f / num_repeats)); scale_op.SetAttr("scale", static_cast<float>(1.0f / num_repeats));
scale_op.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), scale_op.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
static_cast<int>(OpRole::kBackward)); static_cast<int>(OpRole::kBackward));
scale_op.SetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName(),
param_grad_op_role_var);
auto scale_op_node = result.CreateOpNode(&scale_op); auto scale_op_node = result.CreateOpNode(&scale_op);
scale_op_node->inputs.push_back(sum_out_var_node); scale_op_node->inputs.push_back(sum_out_var_node);
sum_out_var_node->outputs.push_back(scale_op_node); sum_out_var_node->outputs.push_back(scale_op_node);
......
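The batch-merge pass above merges the repeated gradients of each parameter with a sum op followed by a scale op using factor 1/num_repeats, i.e. it averages the per-repeat gradients before the allreduce/optimizer stage. A minimal numeric sketch of that reduce-then-scale step on plain float buffers (the function and buffers are illustrative, not part of the pass):

#include <cassert>
#include <vector>

// merged = (1 / num_repeats) * sum_i repeats[i], mirroring sum -> scale above.
void MergeRepeatedGrads(const std::vector<std::vector<float>>& repeats,
                        std::vector<float>* merged) {
  const size_t num_repeats = repeats.size();
  assert(num_repeats > 0);
  merged->assign(repeats[0].size(), 0.0f);
  for (const auto& g : repeats) {                      // sum(repeat.0..repeat.n)
    for (size_t i = 0; i < g.size(); ++i) (*merged)[i] += g[i];
  }
  for (float& v : *merged) v *= 1.0f / num_repeats;    // scale(1/num_repeats)
}

int main() {
  std::vector<std::vector<float>> repeats = {{2.f, 4.f}, {4.f, 8.f}};
  std::vector<float> merged;
  MergeRepeatedGrads(repeats, &merged);
  assert(merged[0] == 3.f && merged[1] == 6.f);
  return 0;
}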
...@@ -399,7 +399,7 @@ void NgraphEngine::BuildNgFunction() { ...@@ -399,7 +399,7 @@ void NgraphEngine::BuildNgFunction() {
BuildNgNodes(); BuildNgNodes();
ngraph_function_ = nullptr; ngraph_function_ = nullptr;
ngraph::NodeVector func_outputs; ngraph::NodeVector func_outputs;
ngraph::op::ParameterVector func_inputs; ngraph::ParameterVector func_inputs;
for (auto& vo : var_out_) { for (auto& vo : var_out_) {
func_outputs.push_back(var_node_map_->at(vo)); func_outputs.push_back(var_node_map_->at(vo));
......
...@@ -16,7 +16,6 @@ limitations under the License. */ ...@@ -16,7 +16,6 @@ limitations under the License. */
#include <glog/logging.h> #include <glog/logging.h>
#include <algorithm> #include <algorithm>
#include "paddle/fluid/framework/data_transform.h" #include "paddle/fluid/framework/data_transform.h"
#include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
...@@ -1041,12 +1040,11 @@ Scope* OperatorWithKernel::PrepareData( ...@@ -1041,12 +1040,11 @@ Scope* OperatorWithKernel::PrepareData(
proto::VarType::Type OperatorWithKernel::IndicateDataType( proto::VarType::Type OperatorWithKernel::IndicateDataType(
const ExecutionContext& ctx) const { const ExecutionContext& ctx) const {
auto& scope = ctx.scope();
int data_type = -1; int data_type = -1;
std::string last_input_name;
for (auto& input : this->inputs_) { for (auto& input : this->inputs_) {
for (auto& ipt_name : input.second) { const std::vector<const Variable*> vars = ctx.MultiInputVar(input.first);
auto* var = scope.FindVar(ipt_name); for (size_t i = 0; i < vars.size(); ++i) {
const Variable* var = vars[i];
if (var != nullptr) { if (var != nullptr) {
const Tensor* t = nullptr; const Tensor* t = nullptr;
if (var->IsType<Tensor>()) { if (var->IsType<Tensor>()) {
...@@ -1057,15 +1055,14 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType( ...@@ -1057,15 +1055,14 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType(
t = &(var->Get<SelectedRows>().value()); t = &(var->Get<SelectedRows>().value());
} }
if (t != nullptr) { if (t != nullptr) {
PADDLE_ENFORCE(t->IsInitialized(), "Input %s is not initialized",         PADDLE_ENFORCE(t->IsInitialized(), "Input %s(%lu) is not initialized",
ipt_name); input.first, i);
int tmp = static_cast<int>(t->type()); int tmp = static_cast<int>(t->type());
PADDLE_ENFORCE( PADDLE_ENFORCE(
tmp == data_type || data_type == -1, tmp == data_type || data_type == -1,
"DataType of Paddle Op %s must be the same. Get %s(%d) != %s(%d)", "DataType of Paddle Op %s must be the same. Get (%d) != (%d)",
Type(), last_input_name, data_type, ipt_name, tmp); Type(), data_type, tmp);
data_type = tmp; data_type = tmp;
last_input_name = ipt_name;
} }
} }
} }
......
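The rewritten IndicateDataType above walks every input through ctx.MultiInputVar, skips uninitialized variables, and enforces that all initialized input tensors share a single data type, which then becomes the kernel's data type. A hedged sketch of that consistency rule over a flat list of optional dtypes (the enum and helper are assumptions for illustration only):

#include <stdexcept>
#include <vector>

enum class DType { kUnset = -1, kFP32, kFP64, kINT64 };  // illustrative enum

// Pick the single data type shared by all present inputs; throw on a mismatch.
DType IndicateSingleDType(const std::vector<DType>& input_dtypes) {
  DType result = DType::kUnset;
  for (DType t : input_dtypes) {
    if (t == DType::kUnset) continue;            // uninitialized input: skip it
    if (result != DType::kUnset && t != result)  // all inputs must agree
      throw std::runtime_error("DataType of inputs must be the same");
    result = t;
  }
  return result;
}

int main() {
  DType d = IndicateSingleDType({DType::kUnset, DType::kFP32, DType::kFP32});
  return d == DType::kFP32 ? 0 : 1;
}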
...@@ -81,6 +81,10 @@ class RuntimeContext { ...@@ -81,6 +81,10 @@ class RuntimeContext {
RuntimeContext(const VariableNameMap& innames, RuntimeContext(const VariableNameMap& innames,
const VariableNameMap& outnames, const Scope& scope); const VariableNameMap& outnames, const Scope& scope);
RuntimeContext(const VariableValueMap& invars,
const VariableValueMap& outvars)
: inputs(invars), outputs(outvars) {}
VariableValueMap inputs; VariableValueMap inputs;
VariableValueMap outputs; VariableValueMap outputs;
}; };
...@@ -447,8 +451,9 @@ class OperatorWithKernel : public OperatorBase { ...@@ -447,8 +451,9 @@ class OperatorWithKernel : public OperatorBase {
void RuntimeInferShape(const Scope& scope, const platform::Place& place, void RuntimeInferShape(const Scope& scope, const platform::Place& place,
const RuntimeContext& ctx) const override; const RuntimeContext& ctx) const override;
protected:
virtual OpKernelType GetExpectedKernelType(const ExecutionContext& ctx) const; virtual OpKernelType GetExpectedKernelType(const ExecutionContext& ctx) const;
protected:
virtual OpKernelType GetKernelTypeForVar( virtual OpKernelType GetKernelTypeForVar(
const std::string& var_name, const Tensor& tensor, const std::string& var_name, const Tensor& tensor,
const OpKernelType& expected_kernel_type) const; const OpKernelType& expected_kernel_type) const;
......
...@@ -16,6 +16,8 @@ limitations under the License. */ ...@@ -16,6 +16,8 @@ limitations under the License. */
#if !defined(_WIN32) #if !defined(_WIN32)
#include <pthread.h> #include <pthread.h>
#else
#include <mutex> // NOLINT
#endif // !_WIN32 #endif // !_WIN32
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
...@@ -29,17 +31,17 @@ struct RWLock { ...@@ -29,17 +31,17 @@ struct RWLock {
~RWLock() { pthread_rwlock_destroy(&lock_); } ~RWLock() { pthread_rwlock_destroy(&lock_); }
void RDLock() { inline void RDLock() {
PADDLE_ENFORCE_EQ(pthread_rwlock_rdlock(&lock_), 0, PADDLE_ENFORCE_EQ(pthread_rwlock_rdlock(&lock_), 0,
"acquire read lock failed"); "acquire read lock failed");
} }
void WRLock() { inline void WRLock() {
PADDLE_ENFORCE_EQ(pthread_rwlock_wrlock(&lock_), 0, PADDLE_ENFORCE_EQ(pthread_rwlock_wrlock(&lock_), 0,
"acquire write lock failed"); "acquire write lock failed");
} }
void UNLock() { inline void UNLock() {
PADDLE_ENFORCE_EQ(pthread_rwlock_unlock(&lock_), 0, "unlock failed"); PADDLE_ENFORCE_EQ(pthread_rwlock_unlock(&lock_), 0, "unlock failed");
} }
...@@ -51,81 +53,46 @@ struct RWLock { ...@@ -51,81 +53,46 @@ struct RWLock {
// https://stackoverflow.com/questions/7125250/making-pthread-rwlock-wrlock-recursive // https://stackoverflow.com/questions/7125250/making-pthread-rwlock-wrlock-recursive
// In windows, rw_lock seems like a hack. Use empty object and do nothing. // In windows, rw_lock seems like a hack. Use empty object and do nothing.
struct RWLock { struct RWLock {
void RDLock() {} // FIXME(minqiyang): use mutex here to do fake lock
void WRLock() {} inline void RDLock() { mutex_.lock(); }
void UNLock() {}
inline void WRLock() { mutex_.lock(); }
inline void UNLock() { mutex_.unlock(); }
private:
std::mutex mutex_;
}; };
#endif #endif
class RWLockGuard { class AutoWRLock {
public: public:
enum Status { kUnLock, kWRLock, kRDLock }; explicit AutoWRLock(RWLock* rw_lock) : lock_(rw_lock) { Lock(); }
RWLockGuard(RWLock* rw_lock, Status init_status)
: lock_(rw_lock), status_(Status::kUnLock) {
switch (init_status) {
case Status::kRDLock: {
RDLock();
break;
}
case Status::kWRLock: {
WRLock();
break;
}
case Status::kUnLock: {
break;
}
}
}
void WRLock() { ~AutoWRLock() { UnLock(); }
switch (status_) {
case Status::kUnLock: {
lock_->WRLock();
status_ = Status::kWRLock;
break;
}
case Status::kWRLock: {
break;
}
case Status::kRDLock: {
PADDLE_THROW(
"Please unlock read lock first before invoking write lock.");
break;
}
}
}
void RDLock() { private:
switch (status_) { inline void Lock() { lock_->WRLock(); }
case Status::kUnLock: {
lock_->RDLock();
status_ = Status::kRDLock;
break;
}
case Status::kRDLock: {
break;
}
case Status::kWRLock: {
PADDLE_THROW(
"Please unlock write lock first before invoking read lock.");
break;
}
}
}
void UnLock() { inline void UnLock() { lock_->UNLock(); }
if (status_ != Status::kUnLock) {
lock_->UNLock(); private:
status_ = Status::kUnLock; RWLock* lock_;
} };
}
class AutoRDLock {
public:
explicit AutoRDLock(RWLock* rw_lock) : lock_(rw_lock) { Lock(); }
~AutoRDLock() { UnLock(); }
private:
inline void Lock() { lock_->RDLock(); }
~RWLockGuard() { UnLock(); } inline void UnLock() { lock_->UNLock(); }
private: private:
RWLock* lock_; RWLock* lock_;
Status status_;
}; };
} // namespace framework } // namespace framework
......
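AutoRDLock and AutoWRLock above are plain RAII guards: the constructor acquires the read or write side of the RWLock and the destructor releases it, so a scoped block is the whole critical section. A minimal usage sketch with std::shared_mutex standing in for RWLock (the stand-in type is an assumption; only the guard shape mirrors the classes above):

#include <shared_mutex>
#include <string>
#include <unordered_map>

std::shared_mutex rw_lock;                      // stands in for RWLock
std::unordered_map<std::string, int> table;

int Read(const std::string& key) {
  std::shared_lock<std::shared_mutex> guard(rw_lock);  // like AutoRDLock
  auto it = table.find(key);
  return it == table.end() ? -1 : it->second;
}                                                      // lock released here

void Write(const std::string& key, int value) {
  std::unique_lock<std::shared_mutex> guard(rw_lock);  // like AutoWRLock
  table[key] = value;
}

int main() {
  Write("a", 1);
  return Read("a") == 1 ? 0 : 1;
}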
...@@ -47,9 +47,15 @@ DEFINE_bool(fast_eager_deletion_mode, false, ...@@ -47,9 +47,15 @@ DEFINE_bool(fast_eager_deletion_mode, false,
// the mutex will cause serious performance issue. // the mutex will cause serious performance issue.
// So the mutex is disabled when `ON_INFER`. // So the mutex is disabled when `ON_INFER`.
#ifdef PADDLE_ON_INFERENCE #ifdef PADDLE_ON_INFERENCE
#define SCOPE_LOCK_GUARD #define SCOPE_KIDS_READER_LOCK
#define SCOPE_KIDS_WRITER_LOCK
#define SCOPE_VARS_READER_LOCK
#define SCOPE_VARS_WRITER_LOCK
#else #else
#define SCOPE_LOCK_GUARD std::lock_guard<std::mutex> lock(mutex_); #define SCOPE_KIDS_READER_LOCK AutoRDLock auto_lock(&kids_lock_);
#define SCOPE_KIDS_WRITER_LOCK AutoWRLock auto_lock(&kids_lock_);
#define SCOPE_VARS_READER_LOCK AutoRDLock auto_lock(&vars_lock_);
#define SCOPE_VARS_WRITER_LOCK AutoWRLock auto_lock(&vars_lock_);
#endif #endif
namespace paddle { namespace paddle {
...@@ -67,64 +73,69 @@ bool IsFastEagerDeletionModeEnabled() { return FLAGS_fast_eager_deletion_mode; } ...@@ -67,64 +73,69 @@ bool IsFastEagerDeletionModeEnabled() { return FLAGS_fast_eager_deletion_mode; }
Scope::~Scope() { DropKids(); } Scope::~Scope() { DropKids(); }
Scope& Scope::NewScope() const { Scope& Scope::NewScope() const {
SCOPE_LOCK_GUARD Scope* child = new Scope(this);
kids_.push_back(new Scope(this)); {
return *kids_.back(); SCOPE_KIDS_WRITER_LOCK
kids_.push_back(child);
}
return *child;
} }
Variable* Scope::Var(const std::string& name) { Variable* Scope::Var(const std::string& name) {
SCOPE_LOCK_GUARD SCOPE_VARS_WRITER_LOCK
return VarInternal(name); return VarInternal(name);
} }
Variable* Scope::Var(std::string* name) { Variable* Scope::Var(std::string* name) {
SCOPE_LOCK_GUARD
auto new_name = string::Sprintf("%p.%d", this, vars_.size()); auto new_name = string::Sprintf("%p.%d", this, vars_.size());
if (name != nullptr) { if (name != nullptr) {
*name = new_name; *name = new_name;
} }
SCOPE_VARS_WRITER_LOCK
return VarInternal(new_name); return VarInternal(new_name);
} }
Variable* Scope::FindVar(const std::string& name) const { Variable* Scope::FindVar(const std::string& name) const {
SCOPE_LOCK_GUARD SCOPE_VARS_READER_LOCK
return FindVarInternal(name); return FindVarInternal(name);
} }
Variable* Scope::FindLocalVar(const std::string& name) const { Variable* Scope::FindLocalVar(const std::string& name) const {
SCOPE_LOCK_GUARD SCOPE_VARS_READER_LOCK
return FindVarLocally(name); return FindVarLocally(name);
} }
const Scope* Scope::FindScope(const Variable* var) const { const Scope* Scope::FindScope(const Variable* var) const {
SCOPE_LOCK_GUARD SCOPE_VARS_READER_LOCK
return FindScopeInternal(var); return FindScopeInternal(var);
} }
void Scope::DropKids() { void Scope::DropKids() {
SCOPE_LOCK_GUARD SCOPE_KIDS_WRITER_LOCK
for (Scope* s : kids_) delete s; for (Scope* s : kids_) delete s;
kids_.clear(); kids_.clear();
} }
bool Scope::HasKid(const Scope* scope) const { bool Scope::HasKid(const Scope* scope) const {
SCOPE_LOCK_GUARD SCOPE_KIDS_READER_LOCK
auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
return it != this->kids_.end(); return it != this->kids_.end();
} }
std::vector<std::string> Scope::LocalVarNames() const { std::vector<std::string> Scope::LocalVarNames() const {
SCOPE_LOCK_GUARD
std::vector<std::string> known_vars; std::vector<std::string> known_vars;
{
SCOPE_VARS_READER_LOCK
known_vars.reserve(this->vars_.size()); known_vars.reserve(this->vars_.size());
for (auto& p : vars_) { for (auto& p : vars_) {
known_vars.emplace_back(p.first); known_vars.emplace_back(p.first);
} }
}
return known_vars; return known_vars;
} }
void Scope::DeleteScope(Scope* scope) const { void Scope::DeleteScope(Scope* scope) const {
SCOPE_LOCK_GUARD SCOPE_KIDS_WRITER_LOCK
auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
PADDLE_ENFORCE(it != this->kids_.end(), "%p Cannot find %p as kid scope", PADDLE_ENFORCE(it != this->kids_.end(), "%p Cannot find %p as kid scope",
this, scope); this, scope);
...@@ -138,8 +149,8 @@ void Scope::DeleteScope(Scope* scope) const { ...@@ -138,8 +149,8 @@ void Scope::DeleteScope(Scope* scope) const {
} }
void Scope::EraseVars(const std::vector<std::string>& var_names) { void Scope::EraseVars(const std::vector<std::string>& var_names) {
SCOPE_LOCK_GUARD
std::set<std::string> var_set(var_names.begin(), var_names.end()); std::set<std::string> var_set(var_names.begin(), var_names.end());
SCOPE_VARS_WRITER_LOCK
for (auto it = vars_.begin(); it != vars_.end();) { for (auto it = vars_.begin(); it != vars_.end();) {
if (var_set.find(it->first) != var_set.end()) { if (var_set.find(it->first) != var_set.end()) {
it = vars_.erase(it); it = vars_.erase(it);
...@@ -151,12 +162,12 @@ void Scope::EraseVars(const std::vector<std::string>& var_names) { ...@@ -151,12 +162,12 @@ void Scope::EraseVars(const std::vector<std::string>& var_names) {
void Scope::Rename(const std::string& origin_name, void Scope::Rename(const std::string& origin_name,
const std::string& new_name) const { const std::string& new_name) const {
SCOPE_LOCK_GUARD SCOPE_VARS_WRITER_LOCK
RenameInternal(origin_name, new_name); RenameInternal(origin_name, new_name);
} }
std::string Scope::Rename(const std::string& origin_name) const { std::string Scope::Rename(const std::string& origin_name) const {
SCOPE_LOCK_GUARD SCOPE_VARS_WRITER_LOCK
auto new_name = string::Sprintf("%p.%d", this, vars_.size()); auto new_name = string::Sprintf("%p.%d", this, vars_.size());
RenameInternal(origin_name, new_name); RenameInternal(origin_name, new_name);
return new_name; return new_name;
......
...@@ -14,12 +14,18 @@ limitations under the License. */ ...@@ -14,12 +14,18 @@ limitations under the License. */
#pragma once #pragma once
extern "C" {
#include <xxhash.h>
}
#include <list> #include <list>
#include <mutex> // NOLINT #include <memory>
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
#include <utility>
#include <vector> #include <vector>
#include "paddle/fluid/framework/rw_lock.h"
#include "paddle/fluid/framework/variable.h" #include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/platform/macros.h" #include "paddle/fluid/platform/macros.h"
...@@ -95,7 +101,14 @@ class Scope { ...@@ -95,7 +101,14 @@ class Scope {
std::string Rename(const std::string& origin_name) const; std::string Rename(const std::string& origin_name) const;
protected: protected:
mutable std::unordered_map<std::string, std::unique_ptr<Variable>> vars_; struct KeyHasher {
std::size_t operator()(const std::string& key) const {
return XXH32(key.c_str(), key.size(), 1);
}
};
mutable std::unordered_map<std::string, std::unique_ptr<Variable>, KeyHasher>
vars_;
private: private:
// Call Scope::NewScope for a sub-scope. // Call Scope::NewScope for a sub-scope.
...@@ -124,7 +137,8 @@ class Scope { ...@@ -124,7 +137,8 @@ class Scope {
DISABLE_COPY_AND_ASSIGN(Scope); DISABLE_COPY_AND_ASSIGN(Scope);
private: private:
mutable std::mutex mutex_; mutable RWLock kids_lock_;
mutable RWLock vars_lock_;
}; };
// Generate some debug string about the inheritance structure of scope, quite // Generate some debug string about the inheritance structure of scope, quite
......
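Scope above now hashes variable names with XXH32 through the KeyHasher functor rather than std::hash<std::string>; the hook is simply the third template argument of std::unordered_map. A small sketch of the same plumbing with a toy FNV-1a hasher standing in for xxhash (only the wiring matches the change above; the hash body is illustrative):

#include <cstdint>
#include <string>
#include <unordered_map>

// Toy FNV-1a hash standing in for XXH32(key.c_str(), key.size(), 1).
struct KeyHasher {
  std::size_t operator()(const std::string& key) const {
    std::uint64_t h = 1469598103934665603ull;
    for (unsigned char c : key) {
      h ^= c;
      h *= 1099511628211ull;
    }
    return static_cast<std::size_t>(h);
  }
};

int main() {
  std::unordered_map<std::string, int, KeyHasher> vars;  // hasher plugged in here
  vars["fc_0.w_0@GRAD"] = 1;
  return vars.count("fc_0.w_0@GRAD") == 1 ? 0 : 1;
}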
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cstddef>
#include <type_traits>
#include "paddle/fluid/platform/hostdevice.h"
namespace paddle {
namespace framework {
namespace detail {
template <size_t kStart, size_t kEnd, bool kStop>
struct UnrollFillConstant {
template <typename T>
HOSTDEVICE inline static void Run(T *data, T val) {
data[kStart] = val;
UnrollFillConstant<kStart + 1, kEnd, kStart + 1 == kEnd>::Run(data, val);
}
};
template <size_t kStart, size_t kEnd>
struct UnrollFillConstant<kStart, kEnd, true> {
template <typename T>
HOSTDEVICE inline static void Run(T *data, T val) {}
};
template <size_t kStart, size_t kEnd, bool kStop>
struct UnrollAssign {
template <typename Tin, typename Tout>
HOSTDEVICE inline static void Run(const Tin *d1, Tout *d2) {
d2[kStart] = static_cast<Tout>(d1[kStart]);
UnrollAssign<kStart + 1, kEnd, kStart + 1 == kEnd>::Run(d1, d2);
}
};
template <size_t kStart, size_t kEnd>
struct UnrollAssign<kStart, kEnd, true> {
template <typename Tin, typename Tout>
HOSTDEVICE inline static void Run(const Tin *d1, Tout *d2) {}
};
template <typename T, size_t kStart, size_t kEnd, bool kStop>
struct UnrollVarArgsAssignImpl {
template <typename... Args>
HOSTDEVICE inline static void Run(T *d, T val, Args... args) {
static_assert(sizeof...(args) + 1 == kEnd - kStart, "Wrong argument");
d[kStart] = val;
UnrollVarArgsAssignImpl<T, kStart + 1, kEnd, kStart + 1 == kEnd>::Run(
d, args...);
}
};
template <typename T, size_t kStart, size_t kEnd>
struct UnrollVarArgsAssignImpl<T, kStart, kEnd, true> {
HOSTDEVICE inline static void Run(T *d) {}
};
template <typename T>
struct UnrollVarArgsAssign {
template <typename... Args>
HOSTDEVICE inline static void Run(T *d, Args... args) {
UnrollVarArgsAssignImpl<T, 0, sizeof...(Args), sizeof...(Args) == 0>::Run(
d, args...);
}
};
template <size_t kStart, size_t kEnd, bool kStop>
struct UnrollCompare {
template <typename T>
HOSTDEVICE inline static bool Run(const T *d1, const T *d2) {
return d1[kStart] == d2[kStart] &&
UnrollCompare<kStart + 1, kEnd, kStart + 1 == kEnd>::Run(d1, d2);
}
};
template <size_t kStart, size_t kEnd>
struct UnrollCompare<kStart, kEnd, true> {
template <typename T>
HOSTDEVICE inline constexpr static bool Run(const T *d1, const T *d2) {
return true;
}
};
template <size_t kStart, size_t kEnd, bool kStop>
struct UnrollAdd {
template <typename T>
HOSTDEVICE inline static void Run(const T *d1, const T *d2, T *d3) {
d3[kStart] = d1[kStart] + d2[kStart];
UnrollAdd<kStart + 1, kEnd, kStart + 1 == kEnd>::Run(d1, d2, d3);
}
};
template <size_t kStart, size_t kEnd>
struct UnrollAdd<kStart, kEnd, true> {
template <typename T>
HOSTDEVICE inline static void Run(const T *d1, const T *d2, T *d3) {}
};
template <size_t kStart, size_t kEnd, bool kStop>
struct UnrollMul {
template <typename T>
HOSTDEVICE inline static void Run(const T *d1, const T *d2, T *d3) {
d3[kStart] = d1[kStart] * d2[kStart];
UnrollMul<kStart + 1, kEnd, kStart + 1 == kEnd>::Run(d1, d2, d3);
}
};
template <size_t kStart, size_t kEnd>
struct UnrollMul<kStart, kEnd, true> {
template <typename T>
HOSTDEVICE inline static void Run(const T *d1, const T *d2, T *d3) {}
};
template <size_t kStart, size_t kEnd, bool kStop>
struct UnrollProduct {
template <typename T>
HOSTDEVICE inline static T Run(const T *d) {
return d[kStart] *
UnrollProduct<kStart + 1, kEnd, kStart + 1 == kEnd>::Run(d);
}
template <typename T>
HOSTDEVICE inline static T Run(const T *d1, const T *d2) {
return d1[kStart] * d2[kStart] +
UnrollProduct<kStart + 1, kEnd, kStart + 1 == kEnd>::Run(d1, d2);
}
};
template <size_t kStart, size_t kEnd>
struct UnrollProduct<kStart, kEnd, true> {
template <typename T>
HOSTDEVICE inline constexpr static T Run(const T *d) {
return 1;
}
template <typename T>
HOSTDEVICE inline constexpr static T Run(const T *d1, const T *d2) {
return 0;
}
};
} // namespace detail
template <size_t N>
using UnrollFillConstant = detail::UnrollFillConstant<0, N, N == 0>;
template <size_t N>
using UnrollAssign = detail::UnrollAssign<0, N, N == 0>;
template <typename T>
using UnrollVarArgsAssign = detail::UnrollVarArgsAssign<T>;
template <size_t N>
using UnrollCompare = detail::UnrollCompare<0, N, N == 0>;
template <size_t N>
using UnrollAdd = detail::UnrollAdd<0, N, N == 0>;
template <size_t N>
using UnrollMul = detail::UnrollMul<0, N, N == 0>;
template <size_t N>
using UnrollProduct = detail::UnrollProduct<0, N, N == 0>;
} // namespace framework
} // namespace paddle
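Each Unroll* helper above recurses on kStart + 1 and terminates through the partial specialization selected once kStart + 1 == kEnd, so the whole loop is expanded at compile time. A self-contained miniature of the same termination idiom, checked with static_assert (MiniProduct is an illustrative stand-in, not the header's UnrollProduct):

#include <cstddef>

template <size_t kStart, size_t kEnd, bool kStop>
struct MiniProduct {
  template <typename T>
  static constexpr T Run(const T *d) {
    // The next instantiation flips kStop to true once kStart + 1 == kEnd.
    return d[kStart] *
           MiniProduct<kStart + 1, kEnd, kStart + 1 == kEnd>::Run(d);
  }
};

template <size_t kStart, size_t kEnd>
struct MiniProduct<kStart, kEnd, true> {  // kStop == true ends the expansion
  template <typename T>
  static constexpr T Run(const T *) {
    return 1;
  }
};

constexpr int kDims[] = {2, 3, 4};
// Unfolds to kDims[0] * kDims[1] * kDims[2] * 1 == 24, all at compile time.
static_assert(MiniProduct<0, 3, false>::Run(kDims) == 24, "unrolled product");

int main() { return 0; }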
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/unroll_array_ops.h"
#include <gtest/gtest.h>
#include <algorithm>
#include <array>
#include <cstdint>
namespace paddle {
namespace framework {
template <typename T>
bool CheckEquality(const T* p, size_t n, T val) {
return std::all_of(p, p + n, [val](const T& v) { return v == val; });
}
template <int D1, int D2>
bool FillConstantTestMain() {
static_assert(D1 >= D2, "");
std::array<int, D1> arr;
arr.fill(0);
UnrollFillConstant<D2>::Run(arr.data(), 1);
return CheckEquality(arr.data(), D2, 1) &&
CheckEquality(arr.data() + D2, arr.size() - D2, 0);
}
TEST(unroll_ops, fill_constant) {
EXPECT_TRUE((FillConstantTestMain<9, 0>()));
EXPECT_TRUE((FillConstantTestMain<9, 1>()));
EXPECT_TRUE((FillConstantTestMain<9, 4>()));
EXPECT_TRUE((FillConstantTestMain<9, 9>()));
}
TEST(unroll_ops, assign) {
const int a[] = {1, 2, 3, 4, 5};
int b[] = {0, 0, 0, 0, 0};
UnrollAssign<3>::Run(a, b);
EXPECT_EQ(b[0], 1);
EXPECT_EQ(b[1], 2);
EXPECT_EQ(b[2], 3);
EXPECT_EQ(b[3], 0);
EXPECT_EQ(b[4], 0);
}
TEST(unroll_ops, var_args_assign) {
int a[] = {0, 0, 0};
UnrollVarArgsAssign<int>::Run(a, 1, 2);
EXPECT_EQ(a[0], 1);
EXPECT_EQ(a[1], 2);
EXPECT_EQ(a[2], 0);
}
TEST(unroll_ops, compare) {
int a[] = {1, 2, 3};
int b[] = {1, 2, 4};
EXPECT_TRUE(UnrollCompare<2>::Run(a, b));
EXPECT_FALSE(UnrollCompare<3>::Run(a, b));
b[0] = -1;
EXPECT_TRUE(UnrollCompare<0>::Run(a, b));
EXPECT_FALSE(UnrollCompare<1>::Run(a, b));
}
TEST(unroll_ops, add) {
int a[] = {2, 3, 4};
int b[] = {5, 10, 102};
int c[] = {0, 0, 0};
UnrollAdd<2>::Run(a, b, c);
EXPECT_EQ(a[0] + b[0], c[0]);
EXPECT_EQ(a[1] + b[1], c[1]);
EXPECT_EQ(c[2], 0);
}
TEST(unroll_ops, mul) {
int a[] = {2, 3, 4};
int b[] = {5, 10, 102};
int c[] = {0, 0, 0};
UnrollMul<2>::Run(a, b, c);
EXPECT_EQ(a[0] * b[0], c[0]);
EXPECT_EQ(a[1] * b[1], c[1]);
EXPECT_EQ(c[2], 0);
}
TEST(unroll_ops, product) {
int a[] = {2, 3, 4};
int b[] = {5, 10, 102};
EXPECT_EQ(UnrollProduct<3>::Run(a), a[0] * a[1] * a[2]);
EXPECT_EQ(UnrollProduct<3>::Run(a, b),
a[0] * b[0] + a[1] * b[1] + a[2] * b[2]);
}
} // namespace framework
} // namespace paddle
...@@ -42,13 +42,9 @@ void AddTo(Variable* src, Variable* dst) { ...@@ -42,13 +42,9 @@ void AddTo(Variable* src, Variable* dst) {
class Autograd { class Autograd {
public: public:
explicit Autograd(framework::Scope* scope) : scope_(scope) {} Autograd() {}
void RunBackward(VarBase* var) { void RunBackward(VarBase* var) {
PADDLE_ENFORCE(var->pre_op_->op_desc_);
// TODO(panyx0718): Only create for vars that "require_grad"
(*var->pre_op_->output_vars_)[var->pre_op_out_idx_]->grads_ = var->grads_;
std::deque<OpBase*> ready; std::deque<OpBase*> ready;
ready.push_back(var->pre_op_); ready.push_back(var->pre_op_);
...@@ -57,11 +53,14 @@ class Autograd { ...@@ -57,11 +53,14 @@ class Autograd {
while (!ready.empty()) { while (!ready.empty()) {
OpBase* ready_op = ready.front(); OpBase* ready_op = ready.front();
ready.pop_front(); ready.pop_front();
std::vector<Variable*> input_grads = ready_op->ApplyGrad(scope_); std::map<std::string, std::vector<VarBase*>> input_grads =
ready_op->ApplyGrad();
for (size_t i = 0; i < input_grads.size(); ++i) {
if (!input_grads[i]) continue; for (auto it : input_grads) {
OpBase* pre_op = ready_op->pre_ops_->at(i); const std::vector<VarBase*>& ingrads = it.second;
for (size_t i = 0; i < ingrads.size(); ++i) {
if (!ingrads[i]) continue;
OpBase* pre_op = ready_op->pre_ops_[it.first][i];
if (!pre_op) continue; if (!pre_op) continue;
dep_counts[pre_op] -= 1; dep_counts[pre_op] -= 1;
...@@ -73,6 +72,7 @@ class Autograd { ...@@ -73,6 +72,7 @@ class Autograd {
} }
} }
} }
}
private: private:
std::map<OpBase*, int> ComputeDepCounts(OpBase* op) { std::map<OpBase*, int> ComputeDepCounts(OpBase* op) {
...@@ -85,7 +85,8 @@ class Autograd { ...@@ -85,7 +85,8 @@ class Autograd {
while (!queue.empty()) { while (!queue.empty()) {
OpBase* candidate = queue.front(); OpBase* candidate = queue.front();
queue.pop_front(); queue.pop_front();
for (OpBase* pre_op : *(candidate->pre_ops_)) { for (auto it : candidate->pre_ops_) {
for (OpBase* pre_op : it.second) {
if (!pre_op) continue; if (!pre_op) continue;
if (visited.find(pre_op) == visited.end()) { if (visited.find(pre_op) == visited.end()) {
visited.insert(pre_op); visited.insert(pre_op);
...@@ -94,129 +95,74 @@ class Autograd { ...@@ -94,129 +95,74 @@ class Autograd {
ret[pre_op] += 1; ret[pre_op] += 1;
} }
} }
}
return ret; return ret;
} }
framework::Scope* scope_;
}; };
framework::Variable* CreateVariable(const std::string& name,
const framework::DDim& dim, float val,
framework::Scope* scope,
bool random_name = true) {
std::string varname = name;
if (random_name) {
std::mt19937 rng;
rng.seed(std::random_device()());
std::uniform_int_distribution<std::mt19937::result_type> dist6(
1, std::numeric_limits<int>::max());
int id = dist6(rng);
varname = string::Sprintf("%s@%d", varname, id);
}
VLOG(3) << "creating var " << varname;
framework::Variable* var = scope->Var(varname);
framework::LoDTensor* tensor = var->GetMutable<framework::LoDTensor>();
float* data = tensor->mutable_data<float>(dim, platform::CPUPlace());
std::fill(data, data + tensor->numel(), val);
return var;
}
framework::LoDTensor& VarBase::Grad() { framework::LoDTensor& VarBase::Grad() {
VLOG(3) << "get var grad " << var_desc_->Name(); VLOG(3) << "get var grad " << var_desc_->Name();
return *grads_->GetMutable<framework::LoDTensor>(); return *grads_->GetMutable<framework::LoDTensor>();
} }
void VarBase::ApplyGrad(framework::Scope* scope, Variable* grad) { std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() {
VLOG(3) << "apply var grad " << var_desc_->Name() << " " if (!grad_op_desc_) {
<< grad->Get<framework::LoDTensor>().data<float>()[0]; VLOG(3) << "op with no grad: " << op_desc_->Type();
if (!grads_) { return {};
grads_ = }
CreateVariable(string::Sprintf("%s@IGrad", var_desc_->Name()),
var_->Get<framework::LoDTensor>().dims(), 0.0, scope);
}
AddTo(grad, grads_);
VLOG(3) << "grad_ after apply var grad " << var_desc_->Name() << " "
<< grads_->Get<framework::LoDTensor>().data<float>()[0];
}
std::vector<Variable*> OpBase::ApplyGrad(framework::Scope* scope) {
VLOG(3) << "op grad " << grad_op_desc_->Type(); VLOG(3) << "op grad " << grad_op_desc_->Type();
for (const std::string& grad_invar : grad_op_desc_->InputArgumentNames()) { std::vector<std::unique_ptr<framework::Variable>> tmp_vars;
if (grad_to_var_->find(grad_invar) == grad_to_var_->end()) { std::map<std::string, std::vector<framework::Variable*>> grad_outputs;
// grad op inputs can be forward inputs, so not in grad_to_var. for (auto it : grad_output_vars_) {
continue; auto& outputs = grad_outputs[it.first];
} for (size_t i = 0; i < it.second.size(); ++i) {
VLOG(3) << "op grad in var " << grad_invar; tmp_vars.emplace_back(new framework::Variable());
block_->FindRecursiveOrCreateVar(grad_invar); outputs.push_back(tmp_vars.back().get());
framework::Variable* var = scope->Var(grad_invar); outputs.back()->GetMutable<framework::LoDTensor>();
const std::string& invar = grad_to_var_->at(grad_invar);
for (VarBase* varbase : *output_vars_) {
// Use the accumulated grads_ by sharing the input with grads_.
if (varbase->var_desc_->Name() == invar) {
var->GetMutable<framework::LoDTensor>()->ShareDataWith(
varbase->grads_->Get<framework::LoDTensor>());
break;
}
} }
} }
for (const std::string& outvar : grad_op_desc_->OutputArgumentNames()) { framework::RuntimeContext ctx(grad_input_vars_, grad_outputs);
VLOG(3) << "grad outvar " << outvar;
block_->FindRecursiveOrCreateVar(outvar); // No need to do static infer shape here.
framework::Variable* var = scope->Var(outvar); // grad_op_desc_->InferShape(*block_);
if (!var->IsInitialized()) {
framework::VarDesc* var_desc = block_->FindVar(outvar);
if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) {
var->GetMutable<framework::LoDTensor>();
} else {
LOG(ERROR) << "tracer doesn't support yet";
}
}
}
grad_op_desc_->InferShape(*block_);
grad_op_desc_->InferVarType(block_); grad_op_desc_->InferVarType(block_);
std::unique_ptr<framework::OperatorBase> opbase = std::unique_ptr<framework::OperatorBase> opbase =
framework::OpRegistry::CreateOp(*grad_op_desc_); framework::OpRegistry::CreateOp(*grad_op_desc_);
framework::OperatorWithKernel* op_kernel =
dynamic_cast<framework::OperatorWithKernel*>(opbase.get());
PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel");
opbase->Run(*scope, platform::CPUPlace()); framework::Scope scope;
platform::CPUPlace place;
PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place);
p.op.RuntimeInferShape(scope, place, ctx);
p.func(framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx));
// `ret` matches exactly with `input_vars_` of forward op. for (auto it : grad_output_vars_) {
std::vector<Variable*> ret; auto& outputs = grad_outputs[it.first];
for (size_t i = 0; i < input_vars_->size(); ++i) { auto& origin_outputs = it.second;
bool found = false; for (size_t i = 0; i < outputs.size(); ++i) {
VarBase* origin_var = (*input_vars_)[i]; framework::Variable* orig_grad = origin_outputs[i];
for (const std::string& outvar : grad_op_desc_->OutputArgumentNames()) { AddTo(outputs[i], orig_grad);
Variable* var = scope->FindVar(outvar);
std::string orig_var = grad_to_var_->at(outvar);
if (origin_var->var_desc_->Name() != orig_var) {
continue;
} }
VLOG(3) << "apply grad " << outvar << " with origin " << orig_var;
origin_var->ApplyGrad(scope, var);
found = true;
ret.push_back(var);
// TODO(panyx0718): There might be another outvar with the same name.
// In that case, it doesn't matter the first one or the second one is
// used.
break;
} }
if (!found) { return input_vars_;
ret.push_back(nullptr);
}
}
return ret;
} }
void VarBase::RunBackward(framework::Scope* scope) { void VarBase::RunBackward() {
grads_ = CreateVariable(framework::GradVarName(var_desc_->Name()),
var_->Get<framework::LoDTensor>().dims(), 1.0, scope,
false);
if (!pre_op_) return; if (!pre_op_) return;
Autograd(scope).RunBackward(this);
auto grads_t = grads_->GetMutable<framework::LoDTensor>();
float* data = grads_t->mutable_data<float>(platform::CPUPlace());
std::fill(data, data + grads_t->numel(), 1.0);
PADDLE_ENFORCE(
grads_ ==
pre_op_->output_vars_[pre_op_out_name_][pre_op_out_idx_]->grads_);
Autograd().RunBackward(this);
} }
} // namespace imperative } // namespace imperative
......
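OpBase::ApplyGrad above runs the grad op into freshly created temporary output variables and only afterwards folds each result into the persistent gradient with AddTo, so repeated backward contributions accumulate rather than overwrite each other. A minimal sketch of that compute-into-scratch-then-accumulate pattern on raw float buffers (names are illustrative):

#include <cassert>
#include <vector>

// Accumulate src into dst, mirroring what imperative::AddTo does for tensors.
void AddTo(const std::vector<float>& src, std::vector<float>* dst) {
  for (size_t i = 0; i < src.size(); ++i) (*dst)[i] += src[i];
}

int main() {
  std::vector<float> grads(2, 0.0f);  // persistent grads_ of a VarBase
  for (int step = 0; step < 2; ++step) {
    std::vector<float> scratch = {1.0f, 2.0f};  // temporary grad-op output
    AddTo(scratch, &grads);                     // fold into the persistent grad
  }
  assert(grads[0] == 2.0f && grads[1] == 4.0f);
  return 0;
}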
...@@ -14,17 +14,69 @@ ...@@ -14,17 +14,69 @@
#pragma once #pragma once
#include <map>
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/var_desc.h" #include "paddle/fluid/framework/var_desc.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
namespace paddle { namespace paddle {
namespace imperative { namespace imperative {
class PreparedOp {
public:
PreparedOp(const framework::OperatorBase& op,
const framework::RuntimeContext& ctx,
framework::OperatorWithKernel::OpKernelFunc func,
platform::DeviceContext* dev_ctx)
: op(op), ctx(ctx), func(func), dev_ctx(dev_ctx) {}
static PreparedOp Prepare(const framework::RuntimeContext& ctx,
const framework::OperatorWithKernel& op,
const platform::Place& place) {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto* dev_ctx = pool.Get(place);
// check if op[type] has kernel registered.
auto& all_op_kernels = op.AllOpKernels();
auto kernels_iter = all_op_kernels.find(op.Type());
if (kernels_iter == all_op_kernels.end()) {
PADDLE_THROW(
"There are no kernels which are registered in the %s operator.",
op.Type());
}
framework::OperatorWithKernel::OpKernelMap& kernels = kernels_iter->second;
auto expected_kernel_key = op.GetExpectedKernelType(
framework::ExecutionContext(op, framework::Scope(), *dev_ctx, ctx));
VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
auto kernel_iter = kernels.find(expected_kernel_key);
#ifdef PADDLE_WITH_MKLDNN
// workaround for missing MKLDNN kernel when FLAGS_use_mkldnn env var is set
if (kernel_iter == kernels.end() &&
expected_kernel_key.library_type_ == framework::LibraryType::kMKLDNN) {
VLOG(3) << "missing MKLDNN kernel: falling back to the PLAIN one";
expected_kernel_key.library_type_ = framework::LibraryType::kPlain;
expected_kernel_key.data_layout_ = framework::DataLayout::kAnyLayout;
kernel_iter = kernels.find(expected_kernel_key);
}
#endif
if (kernel_iter == kernels.end()) {
PADDLE_THROW("op %s does not have kernel for %s", op.Type(),
KernelTypeToString(expected_kernel_key));
}
return PreparedOp(op, ctx, kernel_iter->second, dev_ctx);
}
const framework::OperatorBase& op;
const framework::RuntimeContext& ctx;
framework::OperatorWithKernel::OpKernelFunc func;
platform::DeviceContext* dev_ctx;
};
class OpBase; class OpBase;
class VarBase { class VarBase {
...@@ -33,18 +85,26 @@ class VarBase { ...@@ -33,18 +85,26 @@ class VarBase {
: pre_op_(nullptr), : pre_op_(nullptr),
pre_op_out_idx_(-1), pre_op_out_idx_(-1),
var_desc_(nullptr), var_desc_(nullptr),
var_(nullptr), var_(new framework::Variable()),
grads_(nullptr) {} grads_(new framework::Variable()) {}
virtual ~VarBase() {}
void ApplyGrad(framework::Scope* scope, framework::Variable* grad); virtual ~VarBase() {
if (var_) {
delete var_;
var_ = nullptr;
}
if (grads_) {
delete grads_;
grads_ = nullptr;
}
}
void RunBackward(framework::Scope* scope); void RunBackward();
framework::LoDTensor& Grad(); framework::LoDTensor& Grad();
OpBase* pre_op_; OpBase* pre_op_;
std::string pre_op_out_name_;
int pre_op_out_idx_; int pre_op_out_idx_;
framework::VarDesc* var_desc_; framework::VarDesc* var_desc_;
...@@ -54,35 +114,24 @@ class VarBase { ...@@ -54,35 +114,24 @@ class VarBase {
class OpBase { class OpBase {
public: public:
OpBase() OpBase() : op_desc_(nullptr), grad_op_desc_(nullptr) {}
: input_vars_(new std::vector<VarBase*>()),
output_vars_(new std::vector<VarBase*>()),
pre_ops_(new std::vector<OpBase*>()),
pre_ops_out_idx_(new std::vector<int>()),
op_desc_(nullptr),
grad_op_desc_(nullptr) {}
virtual ~OpBase() { virtual ~OpBase() {
delete input_vars_;
delete output_vars_;
delete pre_ops_;
delete pre_ops_out_idx_;
if (grad_op_desc_) delete grad_op_desc_; if (grad_op_desc_) delete grad_op_desc_;
if (grad_to_var_) delete grad_to_var_;
} }
std::vector<framework::Variable*> ApplyGrad(framework::Scope* scope); std::map<std::string, std::vector<VarBase*>> ApplyGrad();
std::vector<VarBase*>* input_vars_;
std::vector<VarBase*>* output_vars_;
std::vector<OpBase*>* pre_ops_;
std::vector<int>* pre_ops_out_idx_;
framework::OpDesc* op_desc_; framework::OpDesc* op_desc_;
framework::OpDesc* grad_op_desc_; framework::OpDesc* grad_op_desc_;
std::unordered_map<std::string, std::string>* grad_to_var_;
std::map<std::string, std::vector<VarBase*>> input_vars_;
std::map<std::string, std::vector<VarBase*>> output_vars_;
std::map<std::string, std::vector<OpBase*>> pre_ops_;
std::map<std::string, std::vector<int>> pre_ops_out_idx_;
std::map<std::string, std::vector<framework::Variable*>> grad_input_vars_;
std::map<std::string, std::vector<framework::Variable*>> grad_output_vars_;
framework::BlockDesc* block_; framework::BlockDesc* block_;
}; };
......
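PreparedOp::Prepare above resolves a kernel by its expected kernel key and, when an MKLDNN kernel is requested but not registered, retries with a plain-layout key before failing. A hedged sketch of that lookup-with-fallback protocol over a toy registry (the key and registry types are assumptions, not the framework's):

#include <functional>
#include <map>
#include <stdexcept>
#include <string>
#include <utility>

using Kernel = std::function<void()>;
using KernelKey = std::pair<std::string, std::string>;  // {library, layout}

Kernel Prepare(const std::map<KernelKey, Kernel>& kernels, KernelKey expected) {
  auto it = kernels.find(expected);
  if (it == kernels.end() && expected.first == "MKLDNN") {
    // Missing MKLDNN kernel: fall back to the plain / any-layout variant.
    expected = {"PLAIN", "ANY"};
    it = kernels.find(expected);
  }
  if (it == kernels.end()) throw std::runtime_error("no kernel registered");
  return it->second;
}

int main() {
  std::map<KernelKey, Kernel> kernels{{{"PLAIN", "ANY"}, [] {}}};
  Prepare(kernels, {"MKLDNN", "NCHW"})();  // falls back and runs the plain kernel
  return 0;
}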
...@@ -20,7 +20,6 @@ ...@@ -20,7 +20,6 @@
#include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/imperative/engine.h" #include "paddle/fluid/imperative/engine.h"
#include "paddle/fluid/imperative/layer.h" #include "paddle/fluid/imperative/layer.h"
...@@ -41,22 +40,28 @@ void CreateGradOp(const framework::OpDesc& op_desc, ...@@ -41,22 +40,28 @@ void CreateGradOp(const framework::OpDesc& op_desc,
*grad_op_desc = grad_op_descs[0].release(); *grad_op_desc = grad_op_descs[0].release();
} }
void InitVar(framework::Variable* var, framework::Variable* grad_var) {
auto& var_t = var->Get<framework::LoDTensor>();
float* data =
grad_var->GetMutable<framework::LoDTensor>()->mutable_data<float>(
var_t.dims(), platform::CPUPlace());
std::fill(data, data + var_t.numel(), 0.0);
}
class Tracer { class Tracer {
public: public:
explicit Tracer(framework::BlockDesc* root_block, explicit Tracer(framework::BlockDesc* root_block,
framework::BlockDesc* startup_block) framework::BlockDesc* startup_block)
: root_block_(root_block), startup_block_(startup_block) { : root_block_(root_block), startup_block_(startup_block) {}
root_scope_ = new framework::Scope();
scopes_[root_block_] = root_scope_;
scopes_[startup_block_] = root_scope_;
}
virtual ~Tracer() { delete root_scope_; } virtual ~Tracer() {}
void Trace(OpBase* op, const std::vector<VarBase*>& inputs, void Trace(OpBase* op,
const std::vector<VarBase*>& outputs, const std::map<std::string, std::vector<VarBase*>>& inputs,
const std::map<std::string, std::vector<VarBase*>>& outputs,
framework::BlockDesc* block) { framework::BlockDesc* block) {
framework::Scope* scope = GetScope(block); std::map<std::string, VarBase*> vars;
framework::OpDesc* op_desc = op->op_desc_; framework::OpDesc* op_desc = op->op_desc_;
VLOG(3) << "tracer tracing " << op_desc->Type(); VLOG(3) << "tracer tracing " << op_desc->Type();
op_desc->InferShape(*block); op_desc->InferShape(*block);
...@@ -64,77 +69,113 @@ class Tracer { ...@@ -64,77 +69,113 @@ class Tracer {
std::unique_ptr<framework::OperatorBase> op_base = std::unique_ptr<framework::OperatorBase> op_base =
framework::OpRegistry::CreateOp(*op_desc); framework::OpRegistry::CreateOp(*op_desc);
*op->input_vars_ = inputs; framework::VariableValueMap invars_map;
for (VarBase* input : inputs) { framework::VariableValueMap outvars_map;
const std::string vname = input->var_desc_->Name();
framework::Variable* var = scope->Var(vname); op->input_vars_ = inputs;
input->var_ = var; for (auto it : op->input_vars_) {
if (!var->IsInitialized()) { auto& invars = invars_map[it.first];
framework::VarDesc* var_desc = block->FindVar(vname); for (VarBase* inp : it.second) {
if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) { PADDLE_ENFORCE_NOT_NULL(inp->var_, "op %s input %s nullptr",
var->GetMutable<framework::LoDTensor>(); op->op_desc_->Type(), inp->var_desc_->Name());
invars.push_back(inp->var_);
vars[inp->var_desc_->Name()] = inp;
if (inp->pre_op_) {
op->pre_ops_[it.first].push_back(inp->pre_op_);
op->pre_ops_out_idx_[it.first].push_back(inp->pre_op_out_idx_);
} else { } else {
LOG(ERROR) << "tracer doesn't support yet"; op->pre_ops_[it.first].push_back(nullptr);
}
} }
if (input->pre_op_) { VLOG(3) << "input vname " << inp->var_desc_->Name() << " "
op->pre_ops_->push_back(input->pre_op_); << inp->var_->IsInitialized();
op->pre_ops_out_idx_->push_back(input->pre_op_out_idx_);
} else {
op->pre_ops_->push_back(nullptr);
} }
VLOG(3) << "input vname " << vname << " "
<< var->Get<framework::LoDTensor>().dims().size();
} }
*op->output_vars_ = outputs; op->output_vars_ = outputs;
for (auto it : op->output_vars_) {
auto& outvars = outvars_map[it.first];
const std::vector<VarBase*>& outputs = it.second;
for (size_t i = 0; i < outputs.size(); ++i) { for (size_t i = 0; i < outputs.size(); ++i) {
const std::string vname = outputs[i]->var_desc_->Name(); VarBase* out = outputs[i];
framework::Variable* var = scope->Var(vname); outvars.push_back(out->var_);
if (!var->IsInitialized()) { vars[out->var_desc_->Name()] = out;
framework::VarDesc* var_desc = block->FindVar(vname);
framework::VarDesc* var_desc = block->FindVar(out->var_desc_->Name());
if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) { if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) {
var->GetMutable<framework::LoDTensor>(); out->var_->GetMutable<framework::LoDTensor>();
} else { } else {
LOG(ERROR) << "tracer doesn't support yet"; LOG(ERROR) << "tracer doesn't support yet";
} }
out->pre_op_ = op;
out->pre_op_out_name_ = it.first;
out->pre_op_out_idx_ = i;
VLOG(3) << "output vname " << out->var_desc_->Name() << " "
<< out->var_->IsInitialized();
} }
outputs[i]->var_ = var;
outputs[i]->pre_op_ = op;
outputs[i]->pre_op_out_idx_ = i;
} }
VLOG(3) << "tracer running " << op_desc->Type(); VLOG(3) << "tracer running " << op_desc->Type();
op_base->Run(*scope, platform::CPUPlace()); framework::RuntimeContext ctx(invars_map, outvars_map);
// TODO(panyx0718): Cache p.
framework::OperatorWithKernel* op_kernel =
dynamic_cast<framework::OperatorWithKernel*>(op_base.get());
PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel");
framework::Scope scope;
platform::CPUPlace place;
PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place);
p.op.RuntimeInferShape(scope, place, ctx);
p.func(framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx));
if (block == startup_block_) { if (block == startup_block_) {
op->grad_op_desc_ = nullptr; op->grad_op_desc_ = nullptr;
op->grad_to_var_ = nullptr;
} else { } else {
framework::OpDesc* grad_op_desc; framework::OpDesc* grad_op_desc;
auto grad_to_var = new std::unordered_map<std::string, std::string>(); auto grad_to_var = new std::unordered_map<std::string, std::string>();
CreateGradOp(*op_desc, {}, {block}, &grad_op_desc, grad_to_var); CreateGradOp(*op_desc, {}, {block}, &grad_op_desc, grad_to_var);
op->grad_op_desc_ = grad_op_desc; op->grad_op_desc_ = grad_op_desc;
op->grad_to_var_ = grad_to_var;
for (auto it : grad_op_desc->Inputs()) {
auto& grad_in_vars = op->grad_input_vars_[it.first];
for (const std::string& grad_invar : it.second) {
block->FindRecursiveOrCreateVar(grad_invar);
auto var_it = grad_to_var->find(grad_invar);
if (var_it == grad_to_var->end()) {
auto fwd_var_it = vars.find(grad_invar);
PADDLE_ENFORCE(fwd_var_it != vars.end());
grad_in_vars.push_back(fwd_var_it->second->var_);
} else {
VarBase* var = vars[var_it->second];
if (!var->grads_->IsInitialized()) {
InitVar(var->var_, var->grads_);
} }
op->block_ = block; grad_in_vars.push_back(var->grads_);
} }
framework::Scope* GetScope(framework::BlockDesc* block) {
if (scopes_.find(block) != scopes_.end()) {
return scopes_.at(block);
} }
framework::BlockDesc* parent_block = block->ParentBlock(); }
PADDLE_ENFORCE(scopes_.find(parent_block) != scopes_.end()); for (auto it : grad_op_desc->Outputs()) {
framework::Scope* scope = &scopes_[parent_block]->NewScope(); auto& grad_out_vars = op->grad_output_vars_[it.first];
scopes_[block] = scope; for (const std::string& grad_outvar : it.second) {
return scope; block->FindRecursiveOrCreateVar(grad_outvar);
auto var_it = grad_to_var->find(grad_outvar);
PADDLE_ENFORCE(var_it != grad_to_var->end());
VarBase* var = vars[var_it->second];
if (!var->grads_->IsInitialized()) {
InitVar(var->var_, var->grads_);
}
grad_out_vars.push_back(var->grads_);
}
}
}
op->block_ = block;
} }
private: private:
std::map<framework::BlockDesc*, framework::Scope*> scopes_;
framework::BlockDesc* root_block_; framework::BlockDesc* root_block_;
framework::BlockDesc* startup_block_; framework::BlockDesc* startup_block_;
framework::Scope* root_scope_;
}; };
} // namespace imperative } // namespace imperative
......
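InitVar above lazily materializes a gradient variable the first time the tracer needs it, sized like its forward variable and filled with zeros. A minimal sketch of that lazy zero-initialization with plain vectors in place of LoDTensor (names are illustrative):

#include <cassert>
#include <vector>

struct Var {
  std::vector<float> data;
  bool initialized = false;
};

// Mirror of InitVar: size the grad like the forward var and zero-fill it.
void InitGrad(const Var& fwd, Var* grad) {
  grad->data.assign(fwd.data.size(), 0.0f);
  grad->initialized = true;
}

int main() {
  Var x;                       // forward variable produced by the traced op
  x.data = {1.f, 2.f, 3.f};
  x.initialized = true;
  Var x_grad;                  // gradient buffer, created on demand
  if (!x_grad.initialized) InitGrad(x, &x_grad);
  assert(x_grad.data.size() == 3 && x_grad.data[1] == 0.f);
  return 0;
}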
...@@ -86,8 +86,6 @@ class UnaryLogicalOpInferShape : public framework::InferShapeBase { ...@@ -86,8 +86,6 @@ class UnaryLogicalOpInferShape : public framework::InferShapeBase {
OpComment comment; OpComment comment;
PADDLE_ENFORCE(context->HasInput("X"), PADDLE_ENFORCE(context->HasInput("X"),
"Input(X) of %s operator must not be null", comment.type); "Input(X) of %s operator must not be null", comment.type);
auto dim_x = context->GetInputDim("X");
context->SetOutputDim("Out", context->GetInputDim("X")); context->SetOutputDim("Out", context->GetInputDim("X"));
context->ShareLoD("X", "Out"); context->ShareLoD("X", "Out");
} }
......
...@@ -19,6 +19,10 @@ limitations under the License. */ ...@@ -19,6 +19,10 @@ limitations under the License. */
#include <vector> #include <vector>
#include "paddle/fluid/platform/cudnn_helper.h" #include "paddle/fluid/platform/cudnn_helper.h"
DECLARE_uint64(conv_workspace_size_limit);
DECLARE_bool(cudnn_exhaustive_search);
DECLARE_int64(cudnn_exhaustive_search_times);
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -45,6 +49,7 @@ static constexpr size_t kNUM_CUDNN_BWD_DATA_ALGS = 5; ...@@ -45,6 +49,7 @@ static constexpr size_t kNUM_CUDNN_BWD_DATA_ALGS = 5;
template <typename TAlgorithm> template <typename TAlgorithm>
class AlgorithmsCache { class AlgorithmsCache {
public: public:
AlgorithmsCache() : search_times_(0) { hash_.clear(); }
// Caches the best algorithm for a given // Caches the best algorithm for a given
// combination of tensor dimensions & compute data type. // combination of tensor dimensions & compute data type.
TAlgorithm GetAlgorithm( TAlgorithm GetAlgorithm(
...@@ -54,9 +59,14 @@ class AlgorithmsCache { ...@@ -54,9 +59,14 @@ class AlgorithmsCache {
int algorithmFlags, // can set for different data type int algorithmFlags, // can set for different data type
std::function<TAlgorithm()> gen_func); std::function<TAlgorithm()> gen_func);
TAlgorithm GetAlgorithm(int64_t area, int search_times, int algorithmFlags,
std::function<TAlgorithm()> gen_func);
private: private:
std::unordered_map<int64_t, TAlgorithm> hash_; std::unordered_map<int64_t, TAlgorithm> hash_;
std::mutex mutex_; std::mutex mutex_;
int search_times_;
}; };
template <typename TAlgorithm> template <typename TAlgorithm>
...@@ -107,5 +117,29 @@ TAlgorithm AlgorithmsCache<TAlgorithm>::GetAlgorithm( ...@@ -107,5 +117,29 @@ TAlgorithm AlgorithmsCache<TAlgorithm>::GetAlgorithm(
return hash_[seed]; return hash_[seed];
} }
template <typename TAlgorithm>
TAlgorithm AlgorithmsCache<TAlgorithm>::GetAlgorithm(
int64_t area, int search_times, int algorithmFlags,
std::function<TAlgorithm()> gen_func) {
if (hash_.find(area) != hash_.end()) {
return hash_[area];
}
if (search_times_ < search_times) {
auto algo = gen_func();
hash_[area] = algo;
++search_times_;
return algo;
}
TAlgorithm algo;
int64_t min = static_cast<uint64_t>(INT_MAX);
for (const auto& m : hash_) {
if (m.first < min) {
min = m.first;
algo = m.second;
}
}
return algo;
}
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
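The new area-keyed GetAlgorithm overload above runs the expensive gen_func for at most search_times distinct input areas, caches each result, and afterwards answers from the cache (picking the entry with the smallest stored area). A hedged usage sketch of that budgeted-search cache shape (BudgetedCache is a simplified stand-in, not the Paddle template):

#include <cstdint>
#include <functional>
#include <map>

struct BudgetedCache {
  std::map<int64_t, int> cached;  // input area -> algorithm id
  int searches_done = 0;

  int Get(int64_t area, int budget, const std::function<int()>& search) {
    auto it = cached.find(area);
    if (it != cached.end()) return it->second;  // hit: reuse cached algo
    if (searches_done < budget) {               // budget left: search and cache
      int algo = search();
      cached[area] = algo;
      ++searches_done;
      return algo;
    }
    if (cached.empty()) return search();        // nothing cached yet
    return cached.begin()->second;              // smallest cached area wins
  }
};

int main() {
  BudgetedCache cache;
  auto exhaustive_search = [] { return 42; };   // stands in for cudnnFind*
  int a = cache.Get(224 * 224, /*budget=*/1, exhaustive_search);  // searches
  int b = cache.Get(112 * 112, /*budget=*/1, exhaustive_search);  // cached path
  return (a == 42 && b == 42) ? 0 : 1;
}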
...@@ -28,6 +28,8 @@ namespace operators { ...@@ -28,6 +28,8 @@ namespace operators {
// x is Input, // x is Input,
// z is ResidualData, // z is ResidualData,
// bias is Bias // bias is Bias
// When `split_channels` is set, y will be split into multiple outputs,
// each output having split_channels[i] channels.
class Conv2DFusionOpMaker : public Conv2DOpMaker { class Conv2DFusionOpMaker : public Conv2DOpMaker {
protected: protected:
void Apply() override { void Apply() override {
...@@ -36,8 +38,65 @@ class Conv2DFusionOpMaker : public Conv2DOpMaker { ...@@ -36,8 +38,65 @@ class Conv2DFusionOpMaker : public Conv2DOpMaker {
"The activation type can be 'identity', 'sigmoid', 'relu', 'relu6' " "The activation type can be 'identity', 'sigmoid', 'relu', 'relu6' "
"'relux' , 'tanh', 'band_pass'") "'relux' , 'tanh', 'band_pass'")
.SetDefault("relu"); .SetDefault("relu");
    AddAttr<std::vector<int>>(
        "split_channels",
        "When `split_channels` is set, there will be multiple outputs; the "
        "number of outputs equals the number of entries in `split_channels`.")
.SetDefault({});
    AddOutput("Outputs",
              "Outputs is used when `split_channels` is set. "
              "Usually used to fuse convs that share the same input and the "
              "same filter size, padding, stride, and dilation.")
.AsDuplicable()
.AsDispensable();
AddInput("AlgoCache",
"The cache of convolution algorithm, a RAW type variable.")
.AsDispensable();
AddAttr<int>(
"search_times",
"The number of exhaustive search times for convolution algorithm.")
.SetDefault(-1);
} }
}; };
class Conv2DFusionOpInferShape : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Input"),
"Input(Input) of ConvOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Filter"),
"Input(Filter) of ConvOp should not be null.");
auto in_dims = ctx->GetInputDim("Input");
auto filter_dims = ctx->GetInputDim("Filter");
std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
std::vector<int> dilations =
ctx->Attrs().Get<std::vector<int>>("dilations");
std::vector<int64_t> oshape({in_dims[0], filter_dims[0]});
for (size_t i = 0; i < strides.size(); ++i) {
oshape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
dilations[i], paddings[i], strides[i]));
}
PADDLE_ENFORCE(ctx->HasOutput("Output"),
"Output(Output) of ConvOp should not be null.");
ctx->SetOutputDim("Output", framework::make_ddim(oshape));
std::vector<int> channels =
ctx->Attrs().Get<std::vector<int>>("split_channels");
if (channels.size()) {
PADDLE_ENFORCE(ctx->HasOutputs("Outputs"),
"Output(Outputs) of ConvOp should not be null.");
std::vector<framework::DDim> oshapes;
oshapes.reserve(channels.size());
for (size_t i = 0; i < channels.size(); ++i) {
oshapes.push_back({oshape[0], channels[i], oshape[2], oshape[3]});
}
ctx->SetOutputsDim("Outputs", oshapes);
}
}
};
// TODO(qingqing): add gradient operator for conv2d_fusion // TODO(qingqing): add gradient operator for conv2d_fusion
} // namespace operators } // namespace operators
...@@ -45,4 +104,5 @@ class Conv2DFusionOpMaker : public Conv2DOpMaker { ...@@ -45,4 +104,5 @@ class Conv2DFusionOpMaker : public Conv2DOpMaker {
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OPERATOR(conv2d_fusion, ops::ConvOp, ops::Conv2DFusionOpMaker, REGISTER_OPERATOR(conv2d_fusion, ops::ConvOp, ops::Conv2DFusionOpMaker,
ops::ConvOpInferVarType, paddle::framework::EmptyGradOpMaker); ops::Conv2DFusionOpInferShape, ops::ConvOpInferVarType,
paddle::framework::EmptyGradOpMaker);
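Conv2DFusionOpInferShape above derives each spatial output size from ConvOutputSize. Assuming the usual convolution arithmetic, out = (in + 2*pad - (dilation*(kernel-1) + 1)) / stride + 1; a small numeric check of that formula (ConvOut below is a local assumption mirroring that arithmetic, not the Paddle helper itself):

#include <cassert>

// Standard convolution output-size arithmetic.
int ConvOut(int in, int kernel, int dilation, int pad, int stride) {
  int dkernel = dilation * (kernel - 1) + 1;
  return (in + 2 * pad - dkernel) / stride + 1;
}

int main() {
  // 224x224 input, 3x3 filter, stride 2, pad 1, dilation 1 -> 112x112.
  assert(ConvOut(224, 3, 1, 1, 2) == 112);
  // With split_channels = {32, 32}, a 64-channel Output of that size would be
  // split into two NCHW Outputs with 32 channels each.
  assert(ConvOut(7, 3, 1, 0, 1) == 5);
  return 0;
}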
...@@ -16,8 +16,9 @@ limitations under the License. */ ...@@ -16,8 +16,9 @@ limitations under the License. */
#include "paddle/fluid/operators/conv_cudnn_op_cache.h" #include "paddle/fluid/operators/conv_cudnn_op_cache.h"
#include "paddle/fluid/platform/cudnn_helper.h" #include "paddle/fluid/platform/cudnn_helper.h"
DECLARE_uint64(conv_workspace_size_limit); DEFINE_int64(cudnn_exhaustive_search_times, -1,
DECLARE_bool(cudnn_exhaustive_search); "Exhaustive search times for cuDNN convolution, "
"defalut is 1, only search once.");
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -117,20 +118,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> { ...@@ -117,20 +118,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
workspace_size_limit, &algo)); workspace_size_limit, &algo));
VLOG(3) << "cuDNN forward algo " << algo; VLOG(3) << "cuDNN forward algo " << algo;
} else { } else {
AlgorithmsCache<cudnnConvolutionFwdAlgo_t>* algo_cache = nullptr; auto search_func = [&]() {
if (ctx.scope().FindVar(kCUDNNFwdAlgoCache)) {
algo_cache =
ctx.scope()
.FindVar(kCUDNNFwdAlgoCache)
->GetMutable<AlgorithmsCache<cudnnConvolutionFwdAlgo_t>>();
} else {
algo_cache =
const_cast<framework::Scope&>(ctx.scope())
.Var(kCUDNNFwdAlgoCache)
->GetMutable<AlgorithmsCache<cudnnConvolutionFwdAlgo_t>>();
}
algo = algo_cache->GetAlgorithm(
x_dims, f_dims, strides, paddings, dilations, 0, [&]() {
int returned_algo_count; int returned_algo_count;
std::array<cudnnConvolutionFwdAlgoPerf_t, kNUM_CUDNN_FWD_ALGS> std::array<cudnnConvolutionFwdAlgoPerf_t, kNUM_CUDNN_FWD_ALGS>
fwd_perf_stat; fwd_perf_stat;
...@@ -138,20 +126,52 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> { ...@@ -138,20 +126,52 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
CUDNN_ENFORCE( CUDNN_ENFORCE(
platform::dynload::cudnnFindConvolutionForwardAlgorithmEx( platform::dynload::cudnnFindConvolutionForwardAlgorithmEx(
handle, cudnn_input_desc, input_data, cudnn_filter_desc, handle, cudnn_input_desc, input_data, cudnn_filter_desc,
filter_data, cudnn_conv_desc, cudnn_output_desc, filter_data, cudnn_conv_desc, cudnn_output_desc, output_data,
output_data, kNUM_CUDNN_FWD_ALGS, &returned_algo_count, kNUM_CUDNN_FWD_ALGS, &returned_algo_count,
fwd_perf_stat.data(), cudnn_workspace, fwd_perf_stat.data(), cudnn_workspace, workspace_size_limit));
workspace_size_limit));
}; };
workspace_handle.RunFunc(cudnn_find_func, workspace_size_limit); workspace_handle.RunFunc(cudnn_find_func, workspace_size_limit);
VLOG(3) << "Perf result: (algo: stat, time, memory)"; VLOG(3) << "Perf result: (algo: stat, time, memory)";
for (int i = 0; i < returned_algo_count; ++i) { for (int i = 0; i < returned_algo_count; ++i) {
const auto& stat = fwd_perf_stat[i]; const auto& stat = fwd_perf_stat[i];
VLOG(3) << stat.algo << ": " << stat.status << " " << stat.time VLOG(3) << stat.algo << ": " << stat.status << " " << stat.time << " "
<< " " << stat.memory; << stat.memory;
} }
return fwd_perf_stat[0].algo; return fwd_perf_stat[0].algo;
}); };
AlgorithmsCache<cudnnConvolutionFwdAlgo_t>* algo_cache = nullptr;
int search_times = ctx.Attr<int>("search_times");
search_times = std::max(
static_cast<int>(FLAGS_cudnn_exhaustive_search_times), search_times);
if (search_times > 0) {
      // The searched algorithm is cached for up to `search_times` different
      // input dimensions. For other dimensions, the cached algorithm whose
      // input area is closest is selected.
auto var_name = ctx.Inputs("AlgoCache")[0];
algo_cache =
ctx.scope()
.FindVar(var_name)
->GetMutable<AlgorithmsCache<cudnnConvolutionFwdAlgo_t>>();
algo = algo_cache->GetAlgorithm(x_dims[2] * x_dims[3], search_times, 0,
search_func);
} else {
      // Cache the searched algorithm in Var(kCUDNNFwdAlgoCache).
      // All conv ops share the same kCUDNNFwdAlgoCache variable.
if (ctx.scope().FindVar(kCUDNNFwdAlgoCache)) {
algo_cache =
ctx.scope()
.FindVar(kCUDNNFwdAlgoCache)
->GetMutable<AlgorithmsCache<cudnnConvolutionFwdAlgo_t>>();
} else {
// TODO(qingqing) remove const_cast
algo_cache =
const_cast<framework::Scope*>(ctx.scope().parent())
->Var(kCUDNNFwdAlgoCache)
->GetMutable<AlgorithmsCache<cudnnConvolutionFwdAlgo_t>>();
}
algo = algo_cache->GetAlgorithm(x_dims, f_dims, strides, paddings,
dilations, 0, search_func);
}
VLOG(3) << "choose algo " << algo; VLOG(3) << "choose algo " << algo;
} }
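When `search_times > 0`, the algorithm cache above is keyed by the input area (`x_dims[2] * x_dims[3]`) rather than the full shape tuple. A minimal sketch of that idea, assuming a simplified stand-in for `AlgorithmsCache` (this is not the actual implementation):

```cpp
#include <cstdint>
#include <functional>
#include <map>

// Hypothetical simplified cache: remembers one algorithm per observed input
// area, up to max_entries entries; once full, unseen areas reuse the entry
// whose area is closest instead of triggering another exhaustive search.
template <typename AlgoT>
class AreaAlgoCache {
 public:
  AlgoT Get(int64_t area, int max_entries,
            const std::function<AlgoT()>& search) {
    auto it = cache_.find(area);
    if (it != cache_.end()) return it->second;
    if (static_cast<int>(cache_.size()) < max_entries) {
      return cache_[area] = search();  // exhaustive search, then remember
    }
    // Cache full: fall back to the cached algorithm with the closest area.
    AlgoT best{};
    int64_t best_dist = -1;
    for (const auto& kv : cache_) {
      int64_t dist = kv.first > area ? kv.first - area : area - kv.first;
      if (best_dist < 0 || dist < best_dist) {
        best_dist = dist;
        best = kv.second;
      }
    }
    return best;
  }

 private:
  std::map<int64_t, AlgoT> cache_;
};
```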
...@@ -195,6 +215,27 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> { ...@@ -195,6 +215,27 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
}; };
workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes); workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
} }
std::vector<int> channels = ctx.Attr<std::vector<int>>("split_channels");
if (channels.size()) {
auto outs = ctx.MultiOutput<framework::Tensor>("Outputs");
if (x_dims[0] == 1) {
// share data with Output
framework::Tensor t;
t.ShareDataWith(*output);
auto y_dims = output->dims();
t.Resize({y_dims[1], y_dims[2], y_dims[3]});
int s = 0;
for (size_t i = 0; i < channels.size(); ++i) {
int e = s + channels[i];
outs[i]->ShareDataWith(t.Slice(s, e));
outs[i]->Resize({x_dims[0], channels[i], y_dims[2], y_dims[3]});
s = e;
}
} else {
        // TODO(qingqing): do the copy when batch size is larger than 1
        PADDLE_THROW("Batch size greater than 1 is unsupported");
}
}
} }
}; };
#endif #endif
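The `split_channels` branch above slices the fused output along the channel axis without copying: with batch size 1, each `Outputs[i]` shares a contiguous range of channels of `Output`. A standalone sketch of the same bookkeeping (offsets are in elements; names are illustrative only):

```cpp
#include <cstdint>
#include <vector>

struct ChannelSlice {
  int64_t offset;  // element offset into the fused [1, C, H, W] buffer
  int64_t numel;   // number of elements in this split
};

// Example: H = W = 16, split_channels = {32, 32, 64} -> three views into one
// [1, 128, 16, 16] buffer, mirroring ShareDataWith + Slice with no data copy.
std::vector<ChannelSlice> split_by_channels(
    int64_t H, int64_t W, const std::vector<int>& split_channels) {
  std::vector<ChannelSlice> slices;
  int64_t start = 0;
  for (int c : split_channels) {
    slices.push_back({start * H * W, static_cast<int64_t>(c) * H * W});
    start += c;
  }
  return slices;
}
```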
......
...@@ -68,7 +68,6 @@ void CropFunction(const framework::ExecutionContext& context) { ...@@ -68,7 +68,6 @@ void CropFunction(const framework::ExecutionContext& context) {
} }
out->mutable_data<T>(out_dims, context.GetPlace()); out->mutable_data<T>(out_dims, context.GetPlace());
auto x_stride = framework::stride(x->dims()); auto x_stride = framework::stride(x->dims());
auto out_stride = framework::stride(out->dims());
auto offsets = GetOffsets(context); auto offsets = GetOffsets(context);
int64_t offset = 0; int64_t offset = 0;
for (size_t i = 0; i < offsets.size(); ++i) { for (size_t i = 0; i < offsets.size(); ++i) {
......
...@@ -147,7 +147,6 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel<T> { ...@@ -147,7 +147,6 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel<T> {
->GetMutable<CudnnRNNCache>(); ->GetMutable<CudnnRNNCache>();
auto input_dims = input->dims(); auto input_dims = input->dims();
auto weight_dims = weight->dims();
auto init_h_dims = init_h->dims(); auto init_h_dims = init_h->dims();
auto init_c_dims = init_c->dims(); auto init_c_dims = init_c->dims();
in_grad->mutable_data<T>(ctx.GetPlace()); in_grad->mutable_data<T>(ctx.GetPlace());
......
...@@ -27,8 +27,8 @@ struct StridedMemcpyFunctor; ...@@ -27,8 +27,8 @@ struct StridedMemcpyFunctor;
template <typename T> template <typename T>
struct StridedMemcpyFunctor<T, 0> { struct StridedMemcpyFunctor<T, 0> {
void operator()(const platform::DeviceContext& dev_ctx, const T* src, void operator()(const platform::DeviceContext& dev_ctx, const T* src,
framework::Dim<0> src_stride, framework::Dim<0> dst_dim, const int64_t* src_stride, const int64_t* dst_dim,
framework::Dim<0> dst_stride, T* dst) const { const int64_t* dst_stride, T* dst) const {
auto place = dev_ctx.GetPlace(); auto place = dev_ctx.GetPlace();
if (platform::is_cpu_place(place)) { if (platform::is_cpu_place(place)) {
auto& cpu_place = boost::get<platform::CPUPlace>(place); auto& cpu_place = boost::get<platform::CPUPlace>(place);
...@@ -50,18 +50,18 @@ struct StridedMemcpyFunctor<T, 0> { ...@@ -50,18 +50,18 @@ struct StridedMemcpyFunctor<T, 0> {
template <typename T> template <typename T>
struct StridedMemcpyFunctor<T, 1> { struct StridedMemcpyFunctor<T, 1> {
void operator()(const platform::DeviceContext& dev_ctx, const T* src, void operator()(const platform::DeviceContext& dev_ctx, const T* src,
framework::Dim<1> src_stride, framework::Dim<1> dst_dim, const int64_t* src_stride, const int64_t* dst_dim,
framework::Dim<1> dst_stride, T* dst) const { const int64_t* dst_stride, T* dst) const {
auto place = dev_ctx.GetPlace(); auto place = dev_ctx.GetPlace();
if (platform::is_cpu_place(place)) { if (platform::is_cpu_place(place)) {
auto& cpu_place = boost::get<platform::CPUPlace>(place); auto& cpu_place = boost::get<platform::CPUPlace>(place);
memory::Copy(cpu_place, dst, cpu_place, src, sizeof(T) * dst_dim.head); memory::Copy(cpu_place, dst, cpu_place, src, sizeof(T) * dst_dim[0]);
} else { } else {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
auto& gpu_place = boost::get<platform::CUDAPlace>(place); auto& gpu_place = boost::get<platform::CUDAPlace>(place);
auto& cuda_ctx = auto& cuda_ctx =
reinterpret_cast<const platform::CUDADeviceContext&>(dev_ctx); reinterpret_cast<const platform::CUDADeviceContext&>(dev_ctx);
memory::Copy(gpu_place, dst, gpu_place, src, sizeof(T) * dst_dim.head, memory::Copy(gpu_place, dst, gpu_place, src, sizeof(T) * dst_dim[0],
cuda_ctx.stream()); cuda_ctx.stream());
#else #else
PADDLE_THROW("Paddle is not compiled with GPU"); PADDLE_THROW("Paddle is not compiled with GPU");
...@@ -73,19 +73,19 @@ struct StridedMemcpyFunctor<T, 1> { ...@@ -73,19 +73,19 @@ struct StridedMemcpyFunctor<T, 1> {
template <typename T, int Rank> template <typename T, int Rank>
struct StridedMemcpyFunctor { struct StridedMemcpyFunctor {
void operator()(const platform::DeviceContext& dev_ctx, const T* src, void operator()(const platform::DeviceContext& dev_ctx, const T* src,
framework::Dim<Rank> src_stride, framework::Dim<Rank> dst_dim, const int64_t* src_stride, const int64_t* dst_dim,
framework::Dim<Rank> dst_stride, T* dst) const { const int64_t* dst_stride, T* dst) const {
for (int64_t i = 0; i < dst_dim.head; ++i) { for (int64_t i = 0; i < dst_dim[0]; ++i) {
StridedMemcpyFunctor<T, Rank - 1> func; StridedMemcpyFunctor<T, Rank - 1> func;
func(dev_ctx, src, src_stride.tail, dst_dim.tail, dst_stride.tail, dst); func(dev_ctx, src, src_stride + 1, dst_dim + 1, dst_stride + 1, dst);
src += src_stride.head; src += src_stride[0];
dst += dst_stride.head; dst += dst_stride[0];
} }
} }
}; };
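The refactor above replaces `framework::Dim<N>` head/tail recursion with raw `int64_t*` stride and dim pointers. A CPU-only sketch of the same recursion, written with a runtime rank purely for illustration (the real functor keeps the rank as a template parameter and also handles CUDA places):

```cpp
#include <cstdint>
#include <cstring>

// Copies a dst-shaped block out of a strided src buffer. Rank 0 copies one
// element, rank 1 copies dst_dim[0] contiguous elements, and higher ranks
// recurse while advancing src/dst by their leading strides.
template <typename T>
void strided_copy(const T* src, const int64_t* src_stride,
                  const int64_t* dst_dim, const int64_t* dst_stride, T* dst,
                  int rank) {
  if (rank == 0) {
    std::memcpy(dst, src, sizeof(T));
    return;
  }
  if (rank == 1) {
    std::memcpy(dst, src, sizeof(T) * dst_dim[0]);
    return;
  }
  for (int64_t i = 0; i < dst_dim[0]; ++i) {
    strided_copy(src, src_stride + 1, dst_dim + 1, dst_stride + 1, dst,
                 rank - 1);
    src += src_stride[0];
    dst += dst_stride[0];
  }
}
```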
template <typename T> template <typename T>
struct StridedCopyDimVisitor : public boost::static_visitor<void> { struct StridedCopyDimVisitor {
StridedCopyDimVisitor(const platform::DeviceContext& dev_ctx, const T* src, StridedCopyDimVisitor(const platform::DeviceContext& dev_ctx, const T* src,
const framework::DDim& src_stride, const framework::DDim& src_stride,
const framework::DDim& dst_stride, T* dst) const framework::DDim& dst_stride, T* dst)
...@@ -95,13 +95,11 @@ struct StridedCopyDimVisitor : public boost::static_visitor<void> { ...@@ -95,13 +95,11 @@ struct StridedCopyDimVisitor : public boost::static_visitor<void> {
dst_stride_(dst_stride), dst_stride_(dst_stride),
dst_(dst) {} dst_(dst) {}
template <typename Dim> template <int D>
void operator()(Dim dst_dim) const { void operator()(const framework::Dim<D>& dst_dim) const {
Dim src_stride = boost::get<Dim>(src_stride_); StridedMemcpyFunctor<T, D> functor;
Dim dst_stride = boost::get<Dim>(dst_stride_); functor(dev_ctx_, src_, src_stride_.Get(), dst_dim.Get(), dst_stride_.Get(),
constexpr int dim = Dim::dimensions; dst_);
StridedMemcpyFunctor<T, dim> functor;
functor(dev_ctx_, src_, src_stride, dst_dim, dst_stride, dst_);
} }
const platform::DeviceContext& dev_ctx_; const platform::DeviceContext& dev_ctx_;
......
...@@ -64,8 +64,6 @@ class GenerateProposalLabelsOp : public framework::OperatorWithKernel { ...@@ -64,8 +64,6 @@ class GenerateProposalLabelsOp : public framework::OperatorWithKernel {
"Output(BboxOutsideWeights) of RpnTargetAssignOp should not be null"); "Output(BboxOutsideWeights) of RpnTargetAssignOp should not be null");
auto rpn_rois_dims = ctx->GetInputDim("RpnRois"); auto rpn_rois_dims = ctx->GetInputDim("RpnRois");
auto gt_classes_dims = ctx->GetInputDim("GtClasses");
auto is_crowd_dims = ctx->GetInputDim("IsCrowd");
auto gt_boxes_dims = ctx->GetInputDim("GtBoxes"); auto gt_boxes_dims = ctx->GetInputDim("GtBoxes");
auto im_info_dims = ctx->GetInputDim("ImInfo"); auto im_info_dims = ctx->GetInputDim("ImInfo");
......
...@@ -53,12 +53,6 @@ class GenerateProposalsOp : public framework::OperatorWithKernel { ...@@ -53,12 +53,6 @@ class GenerateProposalsOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE(ctx->HasInput("Variances"), PADDLE_ENFORCE(ctx->HasInput("Variances"),
"Input(Variances) shouldn't be null."); "Input(Variances) shouldn't be null.");
auto scores_dims = ctx->GetInputDim("Scores");
auto bbox_deltas_dims = ctx->GetInputDim("BboxDeltas");
auto im_info_dims = ctx->GetInputDim("ImInfo");
auto anchors_dims = ctx->GetInputDim("Anchors");
auto variances_dims = ctx->GetInputDim("Variances");
ctx->SetOutputDim("RpnRois", {-1, 4}); ctx->SetOutputDim("RpnRois", {-1, 4});
ctx->SetOutputDim("RpnRoiProbs", {-1, 1}); ctx->SetOutputDim("RpnRoiProbs", {-1, 1});
} }
......
...@@ -58,7 +58,6 @@ class RpnTargetAssignOp : public framework::OperatorWithKernel { ...@@ -58,7 +58,6 @@ class RpnTargetAssignOp : public framework::OperatorWithKernel {
auto anchor_dims = ctx->GetInputDim("Anchor"); auto anchor_dims = ctx->GetInputDim("Anchor");
auto gt_boxes_dims = ctx->GetInputDim("GtBoxes"); auto gt_boxes_dims = ctx->GetInputDim("GtBoxes");
auto is_crowd_dims = ctx->GetInputDim("IsCrowd");
auto im_info_dims = ctx->GetInputDim("ImInfo"); auto im_info_dims = ctx->GetInputDim("ImInfo");
PADDLE_ENFORCE_EQ(anchor_dims.size(), 2, PADDLE_ENFORCE_EQ(anchor_dims.size(), 2,
"The rank of Input(Anchor) must be 2."); "The rank of Input(Anchor) must be 2.");
......
...@@ -7,56 +7,52 @@ if(WITH_GRPC) ...@@ -7,56 +7,52 @@ if(WITH_GRPC)
else() else()
set(cc_generic_services "true") set(cc_generic_services "true")
endif() endif()
configure_file(send_recv.proto.in ${CMAKE_CURRENT_SOURCE_DIR}/send_recv.proto @ONLY) configure_file(send_recv.proto.in ${CMAKE_CURRENT_BINARY_DIR}/send_recv.proto @ONLY)
# FIXME(typhoonzero): use add_subdirectory once we clean the dependency of these files
set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
if(WITH_GRPC) if(WITH_GRPC)
grpc_library(sendrecvop_rpc SRCS grpc_bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc set(GRPC_SRCS grpc/grpc_client.cc grpc/grpc_server.cc grpc/grpc_serde.cc grpc/grpc_bytebuffer_stream.cc grpc/grpc_variable_response.cc)
request_handler_impl.cc rpc_client.cc rpc_server.cc grpc_server.cc variable_response.cc grpc_variable_response.cc grpc_serde.cc collective_client.cc collective_server.cc grpc_library(sendrecvop_rpc SRCS sendrecvop_utils.cc
PROTO send_recv.proto request_handler_impl.cc rpc_client.cc rpc_server.cc
variable_response.cc
collective_client.cc collective_server.cc
${GRPC_SRCS}
PROTO ${CMAKE_CURRENT_BINARY_DIR}/send_recv.proto
DEPS lod_tensor selected_rows_functor memory) DEPS lod_tensor selected_rows_functor memory)
set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set(RPC_DEPS sendrecvop_rpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf)
cc_test(grpc_serde_test SRCS grpc_serde_test.cc cc_test(grpc_serde_test SRCS grpc/grpc_serde_test.cc
DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_rpc scope profiler math_function SERIAL) DEPS ${RPC_DEPS} scope profiler math_function SERIAL)
cc_test(rpc_server_test SRCS rpc_server_test.cc
DEPS sendrecvop_rpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor proto_desc lookup_sparse_table_op SERIAL)
cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler)
if(WITH_GPU)
cc_test(collective_server_test SRCS collective_server_test.cc
DEPS sendrecvop_rpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor
selected_rows_functor scope math_function SERIAL)
endif()
cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_rpc memory)
else() else()
set_source_files_properties(brpc_server.cc parameter_prefetch.cc brpc_client.cc rpc_server_test.cc brpc_serde_test.cc set_source_files_properties(brpc_server.cc parameter_prefetch.cc brpc_client.cc rpc_server_test.cc brpc_serde_test.cc
brpc_variable_response.cc brpc_sendrecvop_utils.cc brpc_rdma_pool.cc collective_server.cc collective_server_test.cc brpc_variable_response.cc brpc_sendrecvop_utils.cc brpc_rdma_pool.cc collective_server.cc collective_server_test.cc
collective_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) collective_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
  brpc_library(sendrecvop_rpc SRCS brpc_client.cc brpc_server.cc rpc_server.cc rpc_client.cc request_handler_impl.cc brpc_sendrecvop_utils.cc   set(BRPC_SRCS brpc/brpc_client.cc brpc/brpc_server.cc brpc/brpc_sendrecvop_utils.cc brpc/brpc_variable_response.cc brpc/brpc_rdma_pool.cc)
brpc_variable_response.cc variable_response.cc sendrecvop_utils.cc brpc_rdma_pool.cc collective_client.cc collective_server.cc brpc_library(sendrecvop_rpc SRCS sendrecvop_utils.cc
PROTO send_recv.proto request_handler_impl.cc rpc_client.cc rpc_server.cc
variable_response.cc
collective_client.cc collective_server.cc
${BRPC_SRCS}
PROTO ${CMAKE_CURRENT_BINARY_DIR}/send_recv.proto
DEPS lod_tensor selected_rows memory) DEPS lod_tensor selected_rows memory)
cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_rpc memory) set(RPC_DEPS sendrecvop_rpc brpc ssl crypto protobuf leveldb snappystream snappy zlib)
cc_test(brpc_serde_test SRCS brpc/brpc_serde_test.cc
set(brpc_test_depends sendrecvop_rpc brpc ssl crypto protobuf leveldb gflags glog executor DEPS ${RPC_DEPS} gflags glog executor proto_desc lookup_sparse_table_op SERIAL)
proto_desc lookup_sparse_table_op snappystream snappy zlib) endif()
cc_test(rpc_server_test SRCS rpc_server_test.cc
DEPS ${brpc_test_depends} SERIAL)
cc_test(brpc_serde_test SRCS brpc_serde_test.cc
DEPS ${brpc_test_depends} SERIAL)
if(WITH_GPU) cc_test(rpc_server_test SRCS rpc_server_test.cc
DEPS ${RPC_DEPS} executor proto_desc lookup_sparse_table_op SERIAL)
cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler)
cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_rpc memory)
if(WITH_GPU)
cc_test(collective_server_test SRCS collective_server_test.cc cc_test(collective_server_test SRCS collective_server_test.cc
DEPS ${brpc_test_depends} selected_rows_functor scope math_function SERIAL) DEPS sendrecvop_rpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor
endif() selected_rows_functor scope math_function SERIAL)
endif() endif()
...@@ -12,9 +12,9 @@ ...@@ -12,9 +12,9 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/operators/distributed/brpc_client.h" #include "paddle/fluid/operators/distributed/brpc/brpc_client.h"
#include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/framework/threadpool.h"
#include "paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h" #include "paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
namespace paddle { namespace paddle {
......
...@@ -31,10 +31,10 @@ limitations under the License. */ ...@@ -31,10 +31,10 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h" #include "paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h"
#include "paddle/fluid/operators/distributed/distributed_pb.h"
#include "paddle/fluid/operators/distributed/request_handler.h" #include "paddle/fluid/operators/distributed/request_handler.h"
#include "paddle/fluid/operators/distributed/rpc_client.h" #include "paddle/fluid/operators/distributed/rpc_client.h"
#include "paddle/fluid/operators/distributed/send_recv.pb.h"
#include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN #include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN
namespace paddle { namespace paddle {
......
...@@ -14,7 +14,7 @@ ...@@ -14,7 +14,7 @@
#ifdef PADDLE_WITH_BRPC_RDMA #ifdef PADDLE_WITH_BRPC_RDMA
#include "paddle/fluid/operators/distributed/brpc_rdma_pool.h" #include "paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.h"
#include "brpc/channel.h" #include "brpc/channel.h"
#include "brpc/rdma/rdma_helper.h" #include "brpc/rdma/rdma_helper.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
......
...@@ -20,10 +20,10 @@ limitations under the License. */ ...@@ -20,10 +20,10 @@ limitations under the License. */
#include <thread> // NOLINT #include <thread> // NOLINT
#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/operators/distributed/brpc_rdma_pool.h" #include "paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.h"
#include "paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h" #include "paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h"
#include "paddle/fluid/operators/distributed/brpc_variable_response.h" #include "paddle/fluid/operators/distributed/brpc/brpc_variable_response.h"
#include "paddle/fluid/operators/distributed/send_recv.pb.h" #include "paddle/fluid/operators/distributed/distributed_pb.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
namespace paddle { namespace paddle {
......
...@@ -26,7 +26,7 @@ limitations under the License. */ ...@@ -26,7 +26,7 @@ limitations under the License. */
#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/operators/distributed/send_recv.pb.h" #include "paddle/fluid/operators/distributed/distributed_pb.h"
#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" #include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
namespace paddle { namespace paddle {
......
...@@ -22,8 +22,8 @@ limitations under the License. */ ...@@ -22,8 +22,8 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/framework/variable.h" #include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h" #include "paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h"
#include "paddle/fluid/operators/distributed/brpc_variable_response.h" #include "paddle/fluid/operators/distributed/brpc/brpc_variable_response.h"
#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" #include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
#include "paddle/fluid/operators/distributed/variable_response.h" #include "paddle/fluid/operators/distributed/variable_response.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
......
...@@ -12,10 +12,10 @@ ...@@ -12,10 +12,10 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/operators/distributed/brpc_server.h" #include "paddle/fluid/operators/distributed/brpc/brpc_server.h"
#include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/framework/threadpool.h"
#include "paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h" #include "paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h"
#include "paddle/fluid/operators/distributed/brpc_variable_response.h" #include "paddle/fluid/operators/distributed/brpc/brpc_variable_response.h"
#include "paddle/fluid/operators/distributed/request_handler.h" #include "paddle/fluid/operators/distributed/request_handler.h"
namespace sendrecv { namespace sendrecv {
......
...@@ -19,8 +19,8 @@ limitations under the License. */ ...@@ -19,8 +19,8 @@ limitations under the License. */
#include <string> #include <string>
#include "brpc/server.h" #include "brpc/server.h"
#include "paddle/fluid/operators/distributed/distributed_pb.h"
#include "paddle/fluid/operators/distributed/rpc_server.h" #include "paddle/fluid/operators/distributed/rpc_server.h"
#include "paddle/fluid/operators/distributed/send_recv.pb.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -13,7 +13,7 @@ ...@@ -13,7 +13,7 @@
// limitations under the License. // limitations under the License.
// //
#include "paddle/fluid/operators/distributed/brpc_variable_response.h" #include "paddle/fluid/operators/distributed/brpc/brpc_variable_response.h"
#include "paddle/fluid/operators/distributed/send_recv.pb.h" #include "paddle/fluid/operators/distributed/send_recv.pb.h"
namespace paddle { namespace paddle {
......
...@@ -23,7 +23,7 @@ ...@@ -23,7 +23,7 @@
#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/operators/distributed/send_recv.pb.h" #include "paddle/fluid/operators/distributed/distributed_pb.h"
#include "google/protobuf/io/coded_stream.h" #include "google/protobuf/io/coded_stream.h"
#include "google/protobuf/io/zero_copy_stream.h" #include "google/protobuf/io/zero_copy_stream.h"
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/operators/distributed/distributed.h"
#include "paddle/fluid/operators/distributed/request_handler.h" #include "paddle/fluid/operators/distributed/request_handler.h"
DECLARE_int32(rpc_deadline); DECLARE_int32(rpc_deadline);
......
...@@ -23,7 +23,7 @@ limitations under the License. */ ...@@ -23,7 +23,7 @@ limitations under the License. */
#include "gflags/gflags.h" #include "gflags/gflags.h"
#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/operators/distributed/distributed.h"
#include "paddle/fluid/operators/distributed/request_handler.h" #include "paddle/fluid/operators/distributed/request_handler.h"
#include "paddle/fluid/operators/distributed/request_handler_impl.h" #include "paddle/fluid/operators/distributed/request_handler_impl.h"
#include "paddle/fluid/operators/distributed/rpc_server.h" #include "paddle/fluid/operators/distributed/rpc_server.h"
......
...@@ -21,9 +21,9 @@ limitations under the License. */ ...@@ -21,9 +21,9 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/detail/macros.h"
#include "paddle/fluid/operators/distributed/collective_client.h" #include "paddle/fluid/operators/distributed/collective_client.h"
#include "paddle/fluid/operators/distributed/collective_server.h" #include "paddle/fluid/operators/distributed/collective_server.h"
#include "paddle/fluid/operators/distributed/distributed.h"
#include "paddle/fluid/operators/distributed/request_handler_impl.h" #include "paddle/fluid/operators/distributed/request_handler_impl.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
...@@ -52,12 +52,12 @@ std::unique_ptr<framework::Scope> GenerateVars(platform::Place place) { ...@@ -52,12 +52,12 @@ std::unique_ptr<framework::Scope> GenerateVars(platform::Place place) {
framework::Scope* scope = new framework::Scope(); framework::Scope* scope = new framework::Scope();
framework::Variable* var = scope->Var("var1"); framework::Variable* var = scope->Var("var1");
auto* slr = var->GetMutable<framework::SelectedRows>(); auto* slr = var->GetMutable<framework::SelectedRows>();
slr->set_height(1000); slr->set_height(20000);
auto* tensor = slr->mutable_value(); auto* tensor = slr->mutable_value();
auto* rows = slr->mutable_rows(); auto* rows = slr->mutable_rows();
tensor->Resize(framework::make_ddim({3, 5})); tensor->Resize(framework::make_ddim({20000, 1024}));
tensor->mutable_data<float>(place); tensor->mutable_data<float>(place);
paddle::operators::math::set_constant(ctx, tensor, 32.7); paddle::operators::math::set_constant(ctx, tensor, 32.7);
...@@ -83,6 +83,7 @@ void Gather(const std::vector<distributed::RemoteVar>& vars, ...@@ -83,6 +83,7 @@ void Gather(const std::vector<distributed::RemoteVar>& vars,
} }
TEST(PREFETCH, GPU) { TEST(PREFETCH, GPU) {
setenv("FLAGS_max_body_size", "2147483647", 1);
platform::CUDAPlace place; platform::CUDAPlace place;
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto& ctx = *pool.Get(place); auto& ctx = *pool.Get(place);
......
...@@ -18,15 +18,15 @@ ...@@ -18,15 +18,15 @@
#ifdef PADDLE_WITH_GRPC #ifdef PADDLE_WITH_GRPC
#include "paddle/fluid/operators/distributed/grpc_client.h" #include "paddle/fluid/operators/distributed/grpc/grpc_client.h"
#include "paddle/fluid/operators/distributed/grpc_server.h" #include "paddle/fluid/operators/distributed/grpc/grpc_server.h"
#define RPCSERVER_T paddle::operators::distributed::AsyncGRPCServer #define RPCSERVER_T paddle::operators::distributed::AsyncGRPCServer
#define RPCCLIENT_T paddle::operators::distributed::GRPCClient #define RPCCLIENT_T paddle::operators::distributed::GRPCClient
#else // PADDLE_WITH_GRPC #else // PADDLE_WITH_GRPC
#include "paddle/fluid/operators/distributed/brpc_client.h" #include "paddle/fluid/operators/distributed/brpc/brpc_client.h"
#include "paddle/fluid/operators/distributed/brpc_server.h" #include "paddle/fluid/operators/distributed/brpc/brpc_server.h"
#define RPCSERVER_T paddle::operators::distributed::AsyncBRPCServer #define RPCSERVER_T paddle::operators::distributed::AsyncBRPCServer
#define RPCCLIENT_T paddle::operators::distributed::BRPCClient #define RPCCLIENT_T paddle::operators::distributed::BRPCClient
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#ifdef PADDLE_WITH_DISTRIBUTE
#ifdef PADDLE_WITH_GRPC
#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h"
#include "paddle/fluid/operators/distributed/send_recv.pb.h"
#else // PADDLE_WITH_GRPC
#include "paddle/fluid/operators/distributed/send_recv.pb.h"
#endif // PADDLE_WITH_GRPC
#endif // PADDLE_WITH_DISTRIBUTE
...@@ -17,7 +17,7 @@ limitations under the License. */ ...@@ -17,7 +17,7 @@ limitations under the License. */
// file and did some modifications so that we can send gRPC // file and did some modifications so that we can send gRPC
// requests without too much copying of the tensor data. // requests without too much copying of the tensor data.
#include "paddle/fluid/operators/distributed/grpc_bytebuffer_stream.h" #include "paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -17,8 +17,8 @@ limitations under the License. */ ...@@ -17,8 +17,8 @@ limitations under the License. */
#include "glog/logging.h" // For VLOG #include "glog/logging.h" // For VLOG
#include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/framework/threadpool.h"
#include "paddle/fluid/operators/distributed/grpc_client.h" #include "paddle/fluid/operators/distributed/grpc/grpc_client.h"
#include "paddle/fluid/operators/distributed/grpc_serde.h" #include "paddle/fluid/operators/distributed/grpc/grpc_serde.h"
#include "paddle/fluid/operators/distributed/request_handler.h" #include "paddle/fluid/operators/distributed/request_handler.h"
#include "paddle/fluid/platform/port.h" #include "paddle/fluid/platform/port.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
......
...@@ -39,10 +39,9 @@ limitations under the License. */ ...@@ -39,10 +39,9 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/operators/distributed/distributed_pb.h"
#include "paddle/fluid/operators/distributed/request_handler.h" #include "paddle/fluid/operators/distributed/request_handler.h"
#include "paddle/fluid/operators/distributed/rpc_client.h" #include "paddle/fluid/operators/distributed/rpc_client.h"
#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h"
#include "paddle/fluid/operators/distributed/send_recv.pb.h"
#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" #include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
#include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN #include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN
......
...@@ -21,9 +21,9 @@ limitations under the License. */ ...@@ -21,9 +21,9 @@ limitations under the License. */
#include "google/protobuf/io/coded_stream.h" #include "google/protobuf/io/coded_stream.h"
#include "google/protobuf/io/zero_copy_stream.h" #include "google/protobuf/io/zero_copy_stream.h"
#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/operators/distributed/grpc_bytebuffer_stream.h" #include "paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h"
#include "paddle/fluid/operators/distributed/grpc_serde.h" #include "paddle/fluid/operators/distributed/grpc/grpc_serde.h"
#include "paddle/fluid/operators/distributed/grpc_variable_response.h" #include "paddle/fluid/operators/distributed/grpc/grpc_variable_response.h"
#include "paddle/fluid/operators/distributed/proto_encoder_helper.h" #include "paddle/fluid/operators/distributed/proto_encoder_helper.h"
#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" #include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
#include "paddle/fluid/platform/port.h" #include "paddle/fluid/platform/port.h"
......
...@@ -27,8 +27,7 @@ limitations under the License. */ ...@@ -27,8 +27,7 @@ limitations under the License. */
#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" #include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
#include "paddle/fluid/platform/port.h" #include "paddle/fluid/platform/port.h"
#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h" #include "paddle/fluid/operators/distributed/distributed_pb.h"
#include "paddle/fluid/operators/distributed/send_recv.pb.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -21,9 +21,9 @@ limitations under the License. */ ...@@ -21,9 +21,9 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/framework/variable.h" #include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/operators/distributed/distributed.h"
#include "paddle/fluid/operators/distributed/grpc_serde.h" #include "paddle/fluid/operators/distributed/grpc/grpc_serde.h"
#include "paddle/fluid/operators/distributed/grpc_variable_response.h" #include "paddle/fluid/operators/distributed/grpc/grpc_variable_response.h"
#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" #include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
......
...@@ -15,8 +15,8 @@ limitations under the License. */ ...@@ -15,8 +15,8 @@ limitations under the License. */
#include <limits> #include <limits>
#include <string> #include <string>
#include "paddle/fluid/operators/distributed/grpc_serde.h" #include "paddle/fluid/operators/distributed/grpc/grpc_serde.h"
#include "paddle/fluid/operators/distributed/grpc_server.h" #include "paddle/fluid/operators/distributed/grpc/grpc_server.h"
using ::grpc::ServerAsyncResponseWriter; using ::grpc::ServerAsyncResponseWriter;
......
...@@ -29,11 +29,10 @@ limitations under the License. */ ...@@ -29,11 +29,10 @@ limitations under the License. */
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/operators/distributed/grpc_service.h" #include "paddle/fluid/operators/distributed/distributed_pb.h"
#include "paddle/fluid/operators/distributed/grpc/grpc_service.h"
#include "paddle/fluid/operators/distributed/request_handler.h" #include "paddle/fluid/operators/distributed/request_handler.h"
#include "paddle/fluid/operators/distributed/rpc_server.h" #include "paddle/fluid/operators/distributed/rpc_server.h"
#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h"
#include "paddle/fluid/operators/distributed/send_recv.pb.h"
#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" #include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
......
...@@ -23,7 +23,7 @@ ...@@ -23,7 +23,7 @@
#include <grpc++/impl/codegen/stub_options.h> #include <grpc++/impl/codegen/stub_options.h>
#include <grpc++/impl/codegen/sync_stream.h> #include <grpc++/impl/codegen/sync_stream.h>
#include <grpc++/support/byte_buffer.h> #include <grpc++/support/byte_buffer.h>
#include "paddle/fluid/operators/distributed/grpc_variable_response.h" #include "paddle/fluid/operators/distributed/grpc/grpc_variable_response.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
// NOTE: This method was originally created by tensorflow // NOTE: This method was originally created by tensorflow
......
...@@ -19,7 +19,7 @@ ...@@ -19,7 +19,7 @@
#include <nccl.h> #include <nccl.h>
#endif #endif
#include "paddle/fluid/operators/distributed/grpc_variable_response.h" #include "paddle/fluid/operators/distributed/grpc/grpc_variable_response.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
namespace paddle { namespace paddle {
......
...@@ -22,13 +22,11 @@ ...@@ -22,13 +22,11 @@
#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h"
#include "paddle/fluid/operators/distributed/send_recv.pb.h"
#include "google/protobuf/io/coded_stream.h" #include "google/protobuf/io/coded_stream.h"
#include "google/protobuf/io/zero_copy_stream.h" #include "google/protobuf/io/zero_copy_stream.h"
#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/distributed/grpc_bytebuffer_stream.h" #include "paddle/fluid/operators/distributed/distributed_pb.h"
#include "paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h"
#include "paddle/fluid/operators/distributed/variable_response.h" #include "paddle/fluid/operators/distributed/variable_response.h"
namespace paddle { namespace paddle {
......
...@@ -23,7 +23,7 @@ ...@@ -23,7 +23,7 @@
#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/operators/distributed/distributed.h"
#include "paddle/fluid/operators/distributed/rpc_client.h" #include "paddle/fluid/operators/distributed/rpc_client.h"
#include "paddle/fluid/operators/distributed/variable_response.h" #include "paddle/fluid/operators/distributed/variable_response.h"
#include "paddle/fluid/operators/distributed_ops/send_recv_util.h" #include "paddle/fluid/operators/distributed_ops/send_recv_util.h"
......
...@@ -12,12 +12,12 @@ ...@@ -12,12 +12,12 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/operators/distributed/rpc_server.h"
#include <fstream> #include <fstream>
#include <iostream> #include <iostream>
#include <limits> #include <limits>
#include <string> #include <string>
#include "paddle/fluid/operators/distributed/rpc_server.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
namespace paddle { namespace paddle {
......
...@@ -21,7 +21,7 @@ limitations under the License. */ ...@@ -21,7 +21,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/operators/distributed/distributed.h"
#include "paddle/fluid/operators/distributed/request_handler_impl.h" #include "paddle/fluid/operators/distributed/request_handler_impl.h"
#include "paddle/fluid/operators/distributed/rpc_client.h" #include "paddle/fluid/operators/distributed/rpc_client.h"
#include "paddle/fluid/operators/distributed/rpc_server.h" #include "paddle/fluid/operators/distributed/rpc_server.h"
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. Licensed under /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. Licensed under
the Apache License, Version 2.0 (the "License"); you may not use this file the Apache License, Version 2.0 (the "License"); you may not use this file
except in compliance with the License. except in compliance with the License.
...@@ -18,13 +17,8 @@ package sendrecv; ...@@ -18,13 +17,8 @@ package sendrecv;
option cc_generic_services = @cc_generic_services@; option cc_generic_services = @cc_generic_services@;
service SendRecvService { service SendRecvService {
// For parameter server round-robin like hashing, do not split tensors.
// Send and recv only one tensor
// TODO(typhoonzero): add streaming API
rpc SendVariable(VariableMessage) returns (VoidMessage) {} rpc SendVariable(VariableMessage) returns (VoidMessage) {}
// Argument VariableMessage for GetVariable should only contain varname.
rpc GetVariable(VariableMessage) returns (VariableMessage) {} rpc GetVariable(VariableMessage) returns (VariableMessage) {}
// pre-fetch variable by given variable name and Ids
rpc PrefetchVariable(VariableMessage) returns (VariableMessage) {} rpc PrefetchVariable(VariableMessage) returns (VariableMessage) {}
rpc CheckpointNotify(VariableMessage) returns (VoidMessage) {} rpc CheckpointNotify(VariableMessage) returns (VoidMessage) {}
...@@ -33,19 +27,12 @@ service SendRecvService { ...@@ -33,19 +27,12 @@ service SendRecvService {
rpc GetMonomerBarrier(VariableMessage) returns (VoidMessage) {} rpc GetMonomerBarrier(VariableMessage) returns (VoidMessage) {}
} }
// VariableMessage is serialized paddle variable message.
// It can be:
// LoDTensor
// SelectedRows
enum VarType { enum VarType {
LOD_TENSOR = 0; LOD_TENSOR = 0;
SELECTED_ROWS = 1; SELECTED_ROWS = 1;
NCCL_ID = 2; NCCL_ID = 2;
} }
// NOTICE(gongwb):don't modify this proto if you are not
// not familar with how we serialize in sendrecvop_utils.h
// and deserilize it in variable_response.h.
message VariableMessage { message VariableMessage {
enum Type { enum Type {
// Pod Types // Pod Types
...@@ -62,21 +49,14 @@ message VariableMessage { ...@@ -62,21 +49,14 @@ message VariableMessage {
string varname = 1; string varname = 1;
// TODO(Yancey1989): reference framework::proto::VarDesc::VarType // TODO(Yancey1989): reference framework::proto::VarDesc::VarType
VarType type = 2; VarType type = 2;
// bool persistable is not needed for sending.
// tensor info:
Type data_type = 3; Type data_type = 3;
repeated int64 dims = 4; repeated int64 dims = 4;
// lod details:
int64 lod_level = 5; int64 lod_level = 5;
repeated LodData lod = 6; repeated LodData lod = 6;
// selected_rows height, aka. original dim0
int64 slr_height = 7; int64 slr_height = 7;
// tensor data
bytes serialized = 8; bytes serialized = 8;
// selected_rows data
bytes rows = 9; bytes rows = 9;
// Look up table block execution output variable name.
string out_varname = 10; string out_varname = 10;
// If 1, the ps server will start profiling, the ps // If 1, the ps server will start profiling, the ps
// server stops profiling and generates a profile to /tmp/profile_ps_* // server stops profiling and generates a profile to /tmp/profile_ps_*
......
...@@ -18,7 +18,6 @@ limitations under the License. */ ...@@ -18,7 +18,6 @@ limitations under the License. */
#include <thread> // NOLINT #include <thread> // NOLINT
#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/operators/distributed/brpc_rdma_pool.h"
#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" #include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
#include "paddle/fluid/operators/distributed/variable_response.h" #include "paddle/fluid/operators/distributed/variable_response.h"
#include "paddle/fluid/platform/port.h" #include "paddle/fluid/platform/port.h"
......
...@@ -24,7 +24,7 @@ limitations under the License. */ ...@@ -24,7 +24,7 @@ limitations under the License. */
#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/operators/distributed/send_recv.pb.h" #include "paddle/fluid/operators/distributed/distributed_pb.h"
#include "paddle/fluid/platform/port.h" #include "paddle/fluid/platform/port.h"
namespace paddle { namespace paddle {
......
...@@ -25,7 +25,7 @@ ...@@ -25,7 +25,7 @@
#include "google/protobuf/io/coded_stream.h" #include "google/protobuf/io/coded_stream.h"
#include "google/protobuf/io/zero_copy_stream.h" #include "google/protobuf/io/zero_copy_stream.h"
#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/distributed/send_recv.pb.h" #include "paddle/fluid/operators/distributed/distributed_pb.h"
DECLARE_string(rpc_server_profile_path); DECLARE_string(rpc_server_profile_path);
......
...@@ -18,7 +18,7 @@ limitations under the License. */ ...@@ -18,7 +18,7 @@ limitations under the License. */
#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/operators/distributed/distributed.h"
#include "paddle/fluid/operators/distributed_ops/send_recv_util.h" #include "paddle/fluid/operators/distributed_ops/send_recv_util.h"
#include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/printf.h"
......
...@@ -19,7 +19,7 @@ limitations under the License. */ ...@@ -19,7 +19,7 @@ limitations under the License. */
#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/operators/distributed/distributed.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
namespace paddle { namespace paddle {
......
...@@ -21,7 +21,7 @@ limitations under the License. */ ...@@ -21,7 +21,7 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/framework/threadpool.h"
#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/operators/distributed/distributed.h"
#include "paddle/fluid/operators/distributed/request_handler_impl.h" #include "paddle/fluid/operators/distributed/request_handler_impl.h"
#include "paddle/fluid/platform/nccl_helper.h" #include "paddle/fluid/platform/nccl_helper.h"
......
...@@ -21,7 +21,7 @@ limitations under the License. */ ...@@ -21,7 +21,7 @@ limitations under the License. */
#include "gflags/gflags.h" #include "gflags/gflags.h"
#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/operators/distributed/distributed.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/distributed/request_handler_impl.h" #include "paddle/fluid/operators/distributed/request_handler_impl.h"
......
...@@ -18,7 +18,7 @@ limitations under the License. */ ...@@ -18,7 +18,7 @@ limitations under the License. */
#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/operators/distributed/distributed.h"
#include "paddle/fluid/operators/distributed_ops/send_recv_util.h" #include "paddle/fluid/operators/distributed_ops/send_recv_util.h"
namespace paddle { namespace paddle {
......
...@@ -19,7 +19,7 @@ limitations under the License. */ ...@@ -19,7 +19,7 @@ limitations under the License. */
#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/operators/distributed/distributed.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
namespace paddle { namespace paddle {
......
...@@ -19,7 +19,7 @@ limitations under the License. */ ...@@ -19,7 +19,7 @@ limitations under the License. */
#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/operators/distributed/distributed.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
......
...@@ -19,7 +19,7 @@ limitations under the License. */ ...@@ -19,7 +19,7 @@ limitations under the License. */
#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/operators/distributed/distributed.h"
#include "paddle/fluid/operators/distributed_ops/send_recv_util.h" #include "paddle/fluid/operators/distributed_ops/send_recv_util.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
......
...@@ -20,7 +20,7 @@ limitations under the License. */ ...@@ -20,7 +20,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/operators/distributed/distributed.h"
#include "paddle/fluid/operators/distributed/request_handler_impl.h" #include "paddle/fluid/operators/distributed/request_handler_impl.h"
#include "paddle/fluid/operators/distributed_ops/listen_and_serv_op.h" #include "paddle/fluid/operators/distributed_ops/listen_and_serv_op.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
......
...@@ -178,7 +178,6 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel { ...@@ -178,7 +178,6 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel {
auto x_dims = ctx->GetInputDim("X"); auto x_dims = ctx->GetInputDim("X");
auto y_dims = ctx->GetInputDim("Y"); auto y_dims = ctx->GetInputDim("Y");
auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(), PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(),
"Rank of first input must >= rank of second input."); "Rank of first input must >= rank of second input.");
......
...@@ -77,7 +77,6 @@ class ExpandKernel : public framework::OpKernel<T> { ...@@ -77,7 +77,6 @@ class ExpandKernel : public framework::OpKernel<T> {
auto& expand_times = context.Attr<std::vector<int>>("expand_times"); auto& expand_times = context.Attr<std::vector<int>>("expand_times");
auto* out0 = context.Output<Tensor>("Out"); auto* out0 = context.Output<Tensor>("Out");
Eigen::DSizes<int, Rank> bcast_dims; Eigen::DSizes<int, Rank> bcast_dims;
auto x_dims = in0->dims();
for (size_t i = 0; i < expand_times.size(); ++i) { for (size_t i = 0; i < expand_times.size(); ++i) {
bcast_dims[i] = expand_times[i]; bcast_dims[i] = expand_times[i];
} }
......
...@@ -146,7 +146,6 @@ class FCOpKernel : public framework::OpKernel<T> { ...@@ -146,7 +146,6 @@ class FCOpKernel : public framework::OpKernel<T> {
auto w = ctx.Input<Tensor>("W"); auto w = ctx.Input<Tensor>("W");
auto bias = ctx.Input<Tensor>("Bias"); auto bias = ctx.Input<Tensor>("Bias");
auto output = ctx.Output<Tensor>("Out"); auto output = ctx.Output<Tensor>("Out");
auto in_dims = input->dims();
auto w_dims = w->dims(); auto w_dims = w->dims();
auto out_dims = output->dims(); auto out_dims = output->dims();
int M = framework::product(out_dims) / out_dims[out_dims.size() - 1]; int M = framework::product(out_dims) / out_dims[out_dims.size() - 1];
......
...@@ -12,68 +12,40 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,68 +12,40 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/operators/fill_constant_op.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
class FillConstantInferShape : public framework::InferShapeBase { class FillConstantOp : public framework::OperatorWithKernel {
public: public:
void operator()(framework::InferShapeContext *ctx) const override { using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasOutput("Out"), PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of FillConstantOp should not be null."); "Output(Out) of FillConstantOp should not be null.");
auto &shape = ctx->Attrs().Get<std::vector<int64_t>>("shape"); auto& shape = ctx->Attrs().Get<std::vector<int64_t>>("shape");
ctx->SetOutputDim("Out", framework::make_ddim(shape)); ctx->SetOutputDim("Out", framework::make_ddim(shape));
} }
};
class FillConstantOp : public framework::OperatorBase {
public:
using framework::OperatorBase::OperatorBase;
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &dev_place) const override {
auto data_type =
static_cast<framework::proto::VarType::Type>(Attr<int>("dtype"));
auto value = Attr<float>("value");
auto force_cpu = Attr<bool>("force_cpu");
framework::Tensor *tensor = nullptr;
auto &out_var = *scope.FindVar(Output("Out"));
if (out_var.IsType<framework::LoDTensor>()) {
tensor = out_var.GetMutable<framework::LoDTensor>();
tensor->Resize(framework::make_ddim(Attr<std::vector<int64_t>>("shape")));
} else if (out_var.IsType<framework::SelectedRows>()) {
tensor = out_var.GetMutable<framework::SelectedRows>()->mutable_value();
tensor->Resize(framework::make_ddim(Attr<std::vector<int64_t>>("shape")));
} else {
PADDLE_THROW(
"fill constant op's output only"
"supports SelectedRows and LoDTensor");
}
if (force_cpu) {
auto cpu = platform::CPUPlace();
tensor->mutable_data(cpu, data_type);
} else {
tensor->mutable_data(dev_place, data_type);
}
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); protected:
auto &dev_ctx = *pool.Get(dev_place); framework::OpKernelType GetExpectedKernelType(
math::set_constant(dev_ctx, tensor, value); const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
framework::proto::VarType::Type(ctx.Attr<int>("dtype")),
ctx.GetPlace());
} }
}; };
class FillConstantOpVarTypeInference : public framework::VarTypeInference { class FillConstantOpVarTypeInference : public framework::VarTypeInference {
public: public:
void operator()(const framework::OpDesc &op_desc, void operator()(const framework::OpDesc& op_desc,
framework::BlockDesc *block) const override {} framework::BlockDesc* block) const override {
auto data_type = static_cast<framework::proto::VarType::Type>(
boost::get<int>(op_desc.GetAttr("dtype")));
auto& out_var_name = op_desc.Output("Out").front();
block->Var(out_var_name)->SetDataType(data_type);
}
}; };
class FillConstantOpMaker : public framework::OpProtoAndCheckerMaker { class FillConstantOpMaker : public framework::OpProtoAndCheckerMaker {
...@@ -107,7 +79,13 @@ Fill up a variable with specified constant value. ...@@ -107,7 +79,13 @@ Fill up a variable with specified constant value.
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OPERATOR(fill_constant, ops::FillConstantOp,
ops::FillConstantInferShape, ops::FillConstantOpMaker, REGISTER_OPERATOR(fill_constant, ops::FillConstantOp, ops::FillConstantOpMaker,
paddle::framework::EmptyGradOpMaker, ops::FillConstantOpVarTypeInference,
ops::FillConstantOpVarTypeInference); paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL(fill_constant, ops::FillConstantKernel<float>,
ops::FillConstantKernel<double>,
ops::FillConstantKernel<int64_t>,
ops::FillConstantKernel<int>,
ops::FillConstantKernel<paddle::platform::float16>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/fill_constant_op.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(fill_constant, ops::FillConstantKernel<float>,
ops::FillConstantKernel<double>,
ops::FillConstantKernel<int64_t>,
ops::FillConstantKernel<int>,
ops::FillConstantKernel<paddle::platform::float16>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle {
namespace operators {
template <typename T>
class FillConstantKernel : public framework::OpKernel<T> {
public:
void Compute(const paddle::framework::ExecutionContext &ctx) const override {
auto data_type =
static_cast<framework::proto::VarType::Type>(ctx.Attr<int>("dtype"));
auto value = ctx.Attr<float>("value");
auto force_cpu = ctx.Attr<bool>("force_cpu");
framework::Tensor *tensor = nullptr;
framework::Variable *out_var = ctx.OutputVar("Out");
if (out_var->IsType<framework::LoDTensor>()) {
tensor = out_var->GetMutable<framework::LoDTensor>();
tensor->Resize(
framework::make_ddim(ctx.Attr<std::vector<int64_t>>("shape")));
} else if (out_var->IsType<framework::SelectedRows>()) {
tensor = out_var->GetMutable<framework::SelectedRows>()->mutable_value();
tensor->Resize(
framework::make_ddim(ctx.Attr<std::vector<int64_t>>("shape")));
} else {
PADDLE_THROW(
"fill constant op's output only"
"supports SelectedRows and LoDTensor");
}
if (force_cpu) {
tensor->mutable_data(platform::CPUPlace(), data_type);
} else {
tensor->mutable_data(ctx.GetPlace(), data_type);
}
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(ctx.GetPlace());
math::set_constant(dev_ctx, tensor, value);
}
};
} // namespace operators
} // namespace paddle
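For reference, a minimal sketch of how the reworked fill_constant kernel is exercised end to end from Python; it assumes the usual fluid.layers.fill_constant entry point and a CPU executor, and is illustrative rather than part of this change.

```python
import paddle.fluid as fluid

# Build a tiny program whose only op is fill_constant; the shape/dtype/value/
# force_cpu arguments map directly onto the attributes read by the kernel above.
out = fluid.layers.fill_constant(shape=[2, 3], dtype='int64', value=7, force_cpu=True)

exe = fluid.Executor(fluid.CPUPlace())
result, = exe.run(fluid.default_main_program(), fetch_list=[out])
print(result)  # a 2x3 array filled with 7
```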
include(operators) include(operators)
register_operators(EXCLUDES fusion_transpose_flatten_concat_op) register_operators(EXCLUDES fusion_transpose_flatten_concat_op fusion_conv_inception_op)
if (WITH_GPU) if (WITH_GPU)
op_library(fusion_transpose_flatten_concat_op) op_library(fusion_transpose_flatten_concat_op)
op_library(fusion_conv_inception_op)
file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(fusion_transpose_flatten_concat);\n") file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(fusion_transpose_flatten_concat);\n")
file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(conv2d_inception_fusion);\n")
endif() endif()
...@@ -243,7 +243,7 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel<T> { ...@@ -243,7 +243,7 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel<T> {
#define INIT_BASE_SIZES \ #define INIT_BASE_SIZES \
auto ids_dims = ids->dims(); /* T x M*/ \ auto ids_dims = ids->dims(); /* T x M*/ \
auto ids_numel = ids->numel(); /* T x 1*/ \ auto ids_numel = framework::product(ids_dims); /* T x 1*/ \
auto wh_dims = wh->dims(); /* D x 4D*/ \ auto wh_dims = wh->dims(); /* D x 4D*/ \
const int D = wh_dims[0]; \ const int D = wh_dims[0]; \
const int D2 = D * 2; \ const int D2 = D * 2; \
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cudnn_helper.h"
#endif
namespace paddle {
namespace operators {
class ConvInceptionFusionOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
// 1 input (x)
auto in_dims = ctx->GetInputDim("Input");
// 4 filters
auto w_dims = ctx->GetInputsDim("Filter");
PADDLE_ENFORCE_EQ(in_dims.size(), 4, "Conv input should be 4-D tensor.");
PADDLE_ENFORCE_EQ(w_dims.size(), 4, "There should be 4 filters");
PADDLE_ENFORCE_EQ(w_dims[0][1], in_dims[1]);
PADDLE_ENFORCE_EQ(w_dims[1][1], in_dims[1]);
int n = in_dims[0];
// compute output channel
// 1st channel
int c = w_dims[0][0];
// add 2nd channel
c += (w_dims[1][0] - w_dims[2][1] * 2);
// add 3rd channel
c += (w_dims[2][0] - w_dims[3][1]);
// add 4-th channel
c += w_dims[3][0];
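// Worked example with hypothetical filter shapes [64, C, 1, 1], [96, C, 1, 1],
// [128, 32, 3, 3] and [192, 64, 3, 3]:
// c = 64 + (96 - 2 * 32) + (128 - 64) + 192 = 352.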
int h = in_dims[2];
int w = in_dims[3];
ctx->SetOutputDim("Output", {n, c, h, w});
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
ctx.Input<framework::LoDTensor>("Input")->type(), ctx.device_context());
}
};
class ConvInceptionFusionOpMaker : public framework::OpProtoAndCheckerMaker {
protected:
void Make() override {
AddInput("Input", "(Tensor) NCHW layout.");
AddInput("Filter", "(vector<Tensor>) 4 aggregated filters").AsDuplicable();
AddInput("Bias", "(vector<Tensor>) it's lenght is equal to Filter")
.AsDuplicable();
AddOutput("Output",
"(Tensor) The output tensor of convolution operator. "
"The format of output tensor is also NCHW.");
AddOutput("TempOutput", "").AsDuplicable();
AddAttr<std::string>(
"pooling_type",
"(string), pooling type, can be \"max\" for max-pooling "
"and \"avg\" for average-pooling.")
.InEnum({"max", "avg"});
AddAttr<bool>(
"exclusive",
"(bool, default True) When true, will exclude the zero-padding in the "
"averaging calculating, otherwise, include the zero-padding. Note, it "
"is only used when pooling_type is avg. The defalut is True.")
.SetDefault(true);
AddAttr<std::string>(
"activation",
"The activation type can be 'identity', 'sigmoid', 'relu', 'relu6' "
"'relux' , 'tanh', 'band_pass'")
.SetDefault("relu");
AddAttr<int>("workspace_size_MB",
"Only used in cudnn kernel. Need set use_cudnn to true."
"workspace size for cudnn, in MB, "
"workspace is a section of GPU memory which will be "
"allocated/freed each time the operator runs, larger "
"workspace size can increase performance but also requires "
"better hardware. This size should be chosen carefully.")
.SetDefault(4096);
AddComment(R"DOC(
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(conv2d_inception_fusion, ops::ConvInceptionFusionOp,
ops::ConvInceptionFusionOpMaker,
paddle::framework::EmptyGradOpMaker);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/conv_cudnn_op_cache.h"
#include "paddle/fluid/platform/cudnn_helper.h"
DECLARE_uint64(conv_workspace_size_limit);
namespace paddle {
namespace operators {
#if CUDNN_VERSION >= 7001
using Tensor = framework::Tensor;
using ScopedTensorDescriptor = platform::ScopedTensorDescriptor;
using ScopedFilterDescriptor = platform::ScopedFilterDescriptor;
using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor;
using ScopedActivationDescriptor = platform::ScopedActivationDescriptor;
using DataLayout = platform::DataLayout;
using ScopedPoolingDescriptor = platform::ScopedPoolingDescriptor;
using PoolingMode = platform::PoolingMode;
template <typename T>
using ScalingParamType = typename platform::CudnnDataType<T>::ScalingParamType;
template <typename T>
using CudnnDataType = platform::CudnnDataType<T>;
template <typename T>
class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto* input = ctx.Input<Tensor>("Input");
auto filters = ctx.MultiInput<framework::Tensor>("Filter");
auto bias = ctx.MultiInput<framework::Tensor>("Bias");
auto* output = ctx.Output<Tensor>("Output");
auto temp_outs = ctx.MultiOutput<framework::Tensor>("TempOutput");
const std::string pool_type = ctx.Attr<std::string>("pooling_type");
const std::string activation = ctx.Attr<std::string>("activation");
const bool exclusive = ctx.Attr<bool>("exclusive");
int64_t user_workspace_size =
static_cast<size_t>(ctx.Attr<int>("workspace_size_MB"));
const T* input_data = input->data<T>();
T* output_data = output->mutable_data<T>(ctx.GetPlace());
T* temp_data = temp_outs[0]->mutable_data<T>(input->dims(), ctx.GetPlace());
DataLayout layout = DataLayout::kNCHW;
std::vector<int> in_dim = framework::vectorize2int(input->dims());
// ------------------- cudnn descriptors ---------------------
PoolingMode pooling_mode;
if (pool_type == "max") {
pooling_mode = PoolingMode::kMaximum;
} else {
pooling_mode = exclusive ? PoolingMode::kAverageExclusive
: (PoolingMode::kAverageInclusive);
}
std::vector<int> k0x0 = {0, 0};
std::vector<int> k1x1 = {1, 1};
std::vector<int> k1x1_2 = {1, 1};
std::vector<int> k3x3 = {3, 3};
ScopedPoolingDescriptor pool_desc;
ScopedActivationDescriptor act_desc;
ScopedTensorDescriptor out_pool_desc;
ScopedTensorDescriptor input_desc;
cudnnPoolingDescriptor_t cudnn_pool_desc =
pool_desc.descriptor(pooling_mode, k3x3, k1x1, k1x1);
cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
layout, framework::vectorize2int(input->dims()));
cudnnTensorDescriptor_t pool_out_desc = out_pool_desc.descriptor<T>(
layout, framework::vectorize2int(input->dims()));
cudnnDataType_t cudnn_dtype = CudnnDataType<T>::type;
cudnnTensorDescriptor_t* out_desc = new cudnnTensorDescriptor_t[4];
cudnnFilterDescriptor_t* filter_desc = new cudnnFilterDescriptor_t[4];
cudnnTensorDescriptor_t* bias_desc = new cudnnTensorDescriptor_t[4];
cudnnTensorDescriptor_t* in_desc = new cudnnTensorDescriptor_t[4];
cudnnConvolutionDescriptor_t* conv_desc =
new cudnnConvolutionDescriptor_t[4];
for (int i = 0; i < 4; ++i) {
CUDNN_ENFORCE(
platform::dynload::cudnnCreateFilterDescriptor(&filter_desc[i]));
CUDNN_ENFORCE(
platform::dynload::cudnnCreateTensorDescriptor(&bias_desc[i]));
CUDNN_ENFORCE(
platform::dynload::cudnnCreateTensorDescriptor(&in_desc[i]));
CUDNN_ENFORCE(
platform::dynload::cudnnCreateTensorDescriptor(&out_desc[i]));
CUDNN_ENFORCE(
platform::dynload::cudnnCreateConvolutionDescriptor(&conv_desc[i]));
}
std::vector<std::vector<int>> filter_dims;
std::vector<std::vector<int>> bias_dims;
std::vector<std::vector<int>> in_dims;
std::vector<std::vector<int>> out_dims;
std::vector<std::vector<int>> in_strides;
std::vector<std::vector<int>> out_strides;
std::vector<std::vector<int>> bias_strides;
cudnnTensorFormat_t format = CUDNN_TENSOR_NCHW;
int n = in_dim[0];
int h = in_dim[2];
int w = in_dim[3];
int oc = output->dims()[1];
cudnnDataType_t compute_type = (cudnn_dtype == CUDNN_DATA_DOUBLE)
? CUDNN_DATA_DOUBLE
: CUDNN_DATA_FLOAT;
for (int i = 0; i < 4; ++i) {
filter_dims.push_back(framework::vectorize2int(filters[i]->dims()));
CUDNN_ENFORCE(platform::dynload::cudnnSetFilterNdDescriptor(
filter_desc[i], cudnn_dtype, format, 4, filter_dims[i].data()));
bias_dims.push_back({1, filter_dims[i][0], 1, 1});
bias_strides.push_back({filter_dims[i][0], 1, 1, 1});
CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
bias_desc[i], cudnn_dtype, 4, bias_dims[i].data(),
bias_strides[i].data()));
in_dims.push_back({n, filter_dims[i][1], h, w});
out_dims.push_back({n, filter_dims[i][0], h, w});
in_strides.push_back({filter_dims[i][1] * h * w, h * w, w, 1});
out_strides.push_back({oc * h * w, h * w, w, 1});
if (i < 2) {
CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionNdDescriptor(
conv_desc[i], 2, k0x0.data(), k1x1.data(), k1x1.data(),
CUDNN_CROSS_CORRELATION, compute_type));
} else {
CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionNdDescriptor(
conv_desc[i], 2, k1x1.data(), k1x1.data(), k1x1.data(),
CUDNN_CROSS_CORRELATION, compute_type));
}
CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
conv_desc[i], CUDNN_DEFAULT_MATH));
}
in_dims[2][1] *= 2;
in_strides[2][0] = oc * h * w;
out_strides[2][0] = filter_dims[2][0] * h * w;  // this output is contiguous.
in_strides[3][0] = filter_dims[2][0] * h * w;
CUDNN_ENFORCE(
platform::dynload::cudnnSetConvolutionGroupCount(conv_desc[2], 2));
cudnnConvolutionFwdAlgo_t algo[4];
auto handle = dev_ctx.cudnn_handle();
size_t workspace_size_in_bytes = 0; // final workspace to allocate.
size_t workspace_size_limit = kCONV_CUDNN_WORKSPACE_LIMIT_BYTES;
if (FLAGS_conv_workspace_size_limit > 0 || user_workspace_size > 0) {
int64_t max_user_size =
std::max(static_cast<int64_t>(FLAGS_conv_workspace_size_limit),
user_workspace_size);
workspace_size_limit = max_user_size * 1024 * 1024;
}
for (int i = 0; i < 4; ++i) {
CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
in_desc[i], cudnn_dtype, 4, in_dims[i].data(), in_strides[i].data()));
CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
out_desc[i], cudnn_dtype, 4, out_dims[i].data(),
out_strides[i].data()));
CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
handle, in_desc[i], filter_desc[i], conv_desc[i], out_desc[i],
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, workspace_size_limit,
&algo[i]));
size_t tmp_size = 0;
CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
handle, in_desc[i], filter_desc[i], conv_desc[i], out_desc[i],
algo[i], &tmp_size));
workspace_size_in_bytes = std::max(workspace_size_in_bytes, tmp_size);
}
cudnnActivationDescriptor_t cudnn_act_desc =
act_desc.descriptor<T>(activation);
int oc0 = filter_dims[0][0];
int oc1 = filter_dims[1][0] - filter_dims[2][1] * 2;
int oc3 = filter_dims[3][0];
int oc2 = oc - oc0 - oc1 - oc3;
// branch1: pool + 1x1 conv
ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
CUDNN_ENFORCE(platform::dynload::cudnnPoolingForward(
handle, cudnn_pool_desc, &alpha, cudnn_input_desc, input_data, &beta,
pool_out_desc, temp_data));
std::vector<const void*> in_datas;
in_datas.push_back(static_cast<const void*>(temp_data));
in_datas.push_back(static_cast<const void*>(input_data));
in_datas.push_back(
static_cast<const void*>(output_data + (oc0 + oc1) * h * w));
T* temp2_data = temp_outs[1]->mutable_data<T>(
framework::make_ddim(out_dims[2]), ctx.GetPlace());
in_datas.push_back(static_cast<const void*>(temp2_data + oc2 * h * w));
std::vector<void*> out_datas;
out_datas.push_back(static_cast<void*>(output_data));
out_datas.push_back(static_cast<void*>(output_data + oc0 * h * w));
out_datas.push_back(static_cast<void*>(temp2_data));
out_datas.push_back(
static_cast<void*>(output_data + (oc0 + oc1 + oc2) * h * w));
for (int i = 0; i < 4; ++i) {
auto func = [&](void* cudnn_workspace) {
CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward(
handle, &alpha, in_desc[i], in_datas[i], filter_desc[i],
static_cast<const void*>(filters[i]->data<T>()), conv_desc[i],
algo[i], cudnn_workspace, workspace_size_in_bytes, &beta,
out_desc[i], out_datas[i], bias_desc[i],
static_cast<const void*>(bias[i]->data<T>()), cudnn_act_desc,
out_desc[i], out_datas[i]));
};
auto workspace_handle = dev_ctx.cudnn_workspace_handle();
workspace_handle.RunFunc(func, workspace_size_in_bytes);
}
cudnnTensorDescriptor_t x_desc;
cudnnTensorDescriptor_t y_desc;
CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&x_desc));
CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&y_desc));
CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
x_desc, cudnn_dtype, 4, out_dims[3].data(), out_strides[2].data()));
CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
y_desc, cudnn_dtype, 4, out_dims[3].data(), out_strides[3].data()));
CUDNN_ENFORCE(platform::dynload::cudnnTransformTensor(
handle, CudnnDataType<T>::kOne(), x_desc,
static_cast<const void*>(out_datas[2]), CudnnDataType<T>::kZero(),
y_desc, static_cast<void*>(output_data + (oc0 + oc1) * h * w)));
for (int i = 0; i < 4; ++i) {
CUDNN_ENFORCE(
platform::dynload::cudnnDestroyTensorDescriptor(in_desc[i]));
CUDNN_ENFORCE(
platform::dynload::cudnnDestroyTensorDescriptor(out_desc[i]));
CUDNN_ENFORCE(
platform::dynload::cudnnDestroyFilterDescriptor(filter_desc[i]));
CUDNN_ENFORCE(
platform::dynload::cudnnDestroyTensorDescriptor(bias_desc[i]));
CUDNN_ENFORCE(
platform::dynload::cudnnDestroyConvolutionDescriptor(conv_desc[i]));
}
CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(x_desc));
CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(y_desc));
}
};
#endif
} // namespace operators
} // namespace paddle
#if CUDNN_VERSION >= 7001
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(conv2d_inception_fusion,
ops::CUDNNConvInceptionFusionOpKernel<float>,
ops::CUDNNConvInceptionFusionOpKernel<double>);
#endif
...@@ -88,7 +88,6 @@ class HingeLossGradOp : public framework::OperatorWithKernel { ...@@ -88,7 +88,6 @@ class HingeLossGradOp : public framework::OperatorWithKernel {
"Input(Logits@GRAD) should not be null."); "Input(Logits@GRAD) should not be null.");
auto pred_dims = ctx->GetInputDim("Logits"); auto pred_dims = ctx->GetInputDim("Logits");
auto lab_dims = ctx->GetInputDim("Labels");
auto loss_grad_dims = ctx->GetInputDim(framework::GradVarName("Loss")); auto loss_grad_dims = ctx->GetInputDim(framework::GradVarName("Loss"));
PADDLE_ENFORCE_EQ(loss_grad_dims, pred_dims); PADDLE_ENFORCE_EQ(loss_grad_dims, pred_dims);
......
...@@ -92,7 +92,6 @@ class LogLossGradOp : public framework::OperatorWithKernel { ...@@ -92,7 +92,6 @@ class LogLossGradOp : public framework::OperatorWithKernel {
"Output(Predicted@GRAD) should not be null."); "Output(Predicted@GRAD) should not be null.");
auto pred_dims = ctx->GetInputDim("Predicted"); auto pred_dims = ctx->GetInputDim("Predicted");
auto label_dims = ctx->GetInputDim("Labels");
auto loss_grad_dims = ctx->GetInputDim(framework::GradVarName("Loss")); auto loss_grad_dims = ctx->GetInputDim(framework::GradVarName("Loss"));
PADDLE_ENFORCE_EQ(loss_grad_dims, pred_dims); PADDLE_ENFORCE_EQ(loss_grad_dims, pred_dims);
......
...@@ -37,9 +37,6 @@ void Transpose<DeviceContext, T, Rank>::operator()( ...@@ -37,9 +37,6 @@ void Transpose<DeviceContext, T, Rank>::operator()(
for (int i = 0; i < Rank; i++) { for (int i = 0; i < Rank; i++) {
permute[i] = axis[i]; permute[i] = axis[i];
} }
auto in_dim = in.dims();
auto out_dim = out->dims();
auto eigen_in = framework::EigenTensor<T, Rank>::From(in); auto eigen_in = framework::EigenTensor<T, Rank>::From(in);
auto eigen_out = framework::EigenTensor<T, Rank>::From(*out); auto eigen_out = framework::EigenTensor<T, Rank>::From(*out);
auto* dev = context.eigen_device(); auto* dev = context.eigen_device();
......
...@@ -76,7 +76,6 @@ class SoftmaxFunctor<DeviceContext, float, true, enable_if_CPU<DeviceContext>> { ...@@ -76,7 +76,6 @@ class SoftmaxFunctor<DeviceContext, float, true, enable_if_CPU<DeviceContext>> {
void operator()(const DeviceContext& context, const framework::Tensor* X, void operator()(const DeviceContext& context, const framework::Tensor* X,
framework::Tensor* Y) { framework::Tensor* Y) {
auto in_dims = X->dims(); auto in_dims = X->dims();
auto out_dims = Y->dims();
const float* in_data = X->data<float>(); const float* in_data = X->data<float>();
float* out_data = Y->data<float>(); float* out_data = Y->data<float>();
const int kBatchDim = 0; const int kBatchDim = 0;
......
...@@ -87,7 +87,6 @@ class ModifiedHuberLossGradOp : public framework::OperatorWithKernel { ...@@ -87,7 +87,6 @@ class ModifiedHuberLossGradOp : public framework::OperatorWithKernel {
"Input(Out@Grad) must not be null."); "Input(Out@Grad) must not be null.");
auto x_dims = ctx->GetInputDim("X"); auto x_dims = ctx->GetInputDim("X");
auto y_dims = ctx->GetInputDim("Y");
auto intermediate_dims = ctx->GetInputDim("IntermediateVal"); auto intermediate_dims = ctx->GetInputDim("IntermediateVal");
auto out_grad_dims = ctx->GetInputDim(framework::GradVarName("Out")); auto out_grad_dims = ctx->GetInputDim(framework::GradVarName("Out"));
......
...@@ -147,12 +147,6 @@ class MulGradOp : public framework::OperatorWithKernel { ...@@ -147,12 +147,6 @@ class MulGradOp : public framework::OperatorWithKernel {
"Input(Out@GRAD) should not be null"); "Input(Out@GRAD) should not be null");
auto x_dims = ctx->GetInputDim("X"); auto x_dims = ctx->GetInputDim("X");
auto y_dims = ctx->GetInputDim("Y"); auto y_dims = ctx->GetInputDim("Y");
auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
auto x_mat_dims = framework::flatten_to_2d(
x_dims, ctx->Attrs().Get<int>("x_num_col_dims"));
auto y_mat_dims = framework::flatten_to_2d(
y_dims, ctx->Attrs().Get<int>("y_num_col_dims"));
auto x_grad_name = framework::GradVarName("X"); auto x_grad_name = framework::GradVarName("X");
auto y_grad_name = framework::GradVarName("Y"); auto y_grad_name = framework::GradVarName("Y");
......
...@@ -36,7 +36,6 @@ class NCEOp : public framework::OperatorWithKernel { ...@@ -36,7 +36,6 @@ class NCEOp : public framework::OperatorWithKernel {
auto x_dims = ctx->GetInputDim("Input"); auto x_dims = ctx->GetInputDim("Input");
auto label_dims = ctx->GetInputDim("Label"); auto label_dims = ctx->GetInputDim("Label");
auto w_dims = ctx->GetInputDim("Weight");
PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0]); PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0]);
int num_true_classes = label_dims.size() == 2 ? label_dims[1] : 1; int num_true_classes = label_dims.size() == 2 ? label_dims[1] : 1;
if (ctx->HasInput("Bias")) { if (ctx->HasInput("Bias")) {
......
...@@ -43,7 +43,6 @@ class NormKernel : public framework::OpKernel<T> { ...@@ -43,7 +43,6 @@ class NormKernel : public framework::OpKernel<T> {
out_norm->mutable_data<T>(ctx.GetPlace()); out_norm->mutable_data<T>(ctx.GetPlace());
auto xdim = in_x->dims(); auto xdim = in_x->dims();
auto ndim = out_norm->dims();
T eps = static_cast<T>(ctx.Attr<float>("epsilon")); T eps = static_cast<T>(ctx.Attr<float>("epsilon"));
int axis = ctx.Attr<int>("axis"); int axis = ctx.Attr<int>("axis");
if (axis < 0) axis = xdim.size() + axis; if (axis < 0) axis = xdim.size() + axis;
......
...@@ -41,7 +41,6 @@ class CPUPSROIPoolOpKernel : public framework::OpKernel<T> { ...@@ -41,7 +41,6 @@ class CPUPSROIPoolOpKernel : public framework::OpKernel<T> {
int rois_num = rois->dims()[0]; int rois_num = rois->dims()[0];
auto in_stride = framework::stride(in_dims); auto in_stride = framework::stride(in_dims);
auto roi_stride = framework::stride(rois->dims());
auto out_stride = framework::stride(out->dims()); auto out_stride = framework::stride(out->dims());
const T* input_data = in->data<T>(); const T* input_data = in->data<T>();
......
...@@ -143,8 +143,6 @@ class SequenceSliceGradOpKernel : public framework::OpKernel<T> { ...@@ -143,8 +143,6 @@ class SequenceSliceGradOpKernel : public framework::OpKernel<T> {
set_zero(ctx.template device_context<DeviceContext>(), x_grad, set_zero(ctx.template device_context<DeviceContext>(), x_grad,
static_cast<T>(0)); static_cast<T>(0));
auto out_grad_stride = framework::stride(out_grad->dims());
for (size_t i = 0; i < out_lod[0].size() - 1; ++i) { for (size_t i = 0; i < out_lod[0].size() - 1; ++i) {
Tensor out_grad_t = Tensor out_grad_t =
out_grad->Slice(static_cast<int>(out_lod[0][i]), out_grad->Slice(static_cast<int>(out_lod[0][i]),
......
...@@ -40,7 +40,7 @@ inline void StridedMemcpy(const platform::DeviceContext& dev_ctx, const T* src, ...@@ -40,7 +40,7 @@ inline void StridedMemcpy(const platform::DeviceContext& dev_ctx, const T* src,
const framework::DDim& dst_stride, T* dst) { const framework::DDim& dst_stride, T* dst) {
paddle::operators::detail::StridedCopyDimVisitor<T> func( paddle::operators::detail::StridedCopyDimVisitor<T> func(
dev_ctx, src, src_stride, dst_stride, dst); dev_ctx, src, src_stride, dst_stride, dst);
boost::apply_visitor(func, dst_dim); dst_dim.apply_visitor(func);
} }
// Strided numel memory copy from src to dst by the specified axis // Strided numel memory copy from src to dst by the specified axis
......
...@@ -84,6 +84,9 @@ cc_test(init_test SRCS init_test.cc DEPS device_context) ...@@ -84,6 +84,9 @@ cc_test(init_test SRCS init_test.cc DEPS device_context)
nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda) nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context) nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context)
cc_library(timer SRCS timer.cc)
cc_test(timer_test SRCS timer_test.cc DEPS timer)
cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS}) cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS})
cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer) cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer)
cc_test(profiler_test SRCS profiler_test.cc DEPS profiler) cc_test(profiler_test SRCS profiler_test.cc DEPS profiler)
......
...@@ -38,6 +38,10 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DEFINE_WRAP); ...@@ -38,6 +38,10 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DEFINE_WRAP);
CUDNN_DNN_ROUTINE_EACH_R5(DEFINE_WRAP); CUDNN_DNN_ROUTINE_EACH_R5(DEFINE_WRAP);
#endif #endif
#ifdef CUDNN_DNN_ROUTINE_EACH_R6
CUDNN_DNN_ROUTINE_EACH_R6(DEFINE_WRAP);
#endif
#ifdef CUDNN_DNN_ROUTINE_EACH_R7 #ifdef CUDNN_DNN_ROUTINE_EACH_R7
CUDNN_DNN_ROUTINE_EACH_R7(DEFINE_WRAP); CUDNN_DNN_ROUTINE_EACH_R7(DEFINE_WRAP);
#endif #endif
......
...@@ -53,6 +53,12 @@ namespace platform { ...@@ -53,6 +53,12 @@ namespace platform {
namespace dynload { namespace dynload {
static constexpr char cupti_lib_path[] = CUPTI_LIB_PATH; static constexpr char cupti_lib_path[] = CUPTI_LIB_PATH;
#if defined(_WIN32) && defined(PADDLE_WITH_CUDA)
static constexpr char* win_cublas_lib = "cublas64_" PADDLE_CUDA_BINVER ".dll";
static constexpr char* win_curand_lib = "curand64_" PADDLE_CUDA_BINVER ".dll";
static constexpr char* win_cudnn_lib = "cudnn64_" PADDLE_CUDNN_BINVER ".dll";
#endif
static inline std::string join(const std::string& part1, static inline std::string join(const std::string& part1,
const std::string& part2) { const std::string& part2) {
// directory separator // directory separator
...@@ -165,6 +171,8 @@ static inline void* GetDsoHandleFromSearchPath(const std::string& search_root, ...@@ -165,6 +171,8 @@ static inline void* GetDsoHandleFromSearchPath(const std::string& search_root,
void* GetCublasDsoHandle() { void* GetCublasDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__) #if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.dylib"); return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.dylib");
#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_cublas_lib);
#else #else
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so"); return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so");
#endif #endif
...@@ -173,6 +181,8 @@ void* GetCublasDsoHandle() { ...@@ -173,6 +181,8 @@ void* GetCublasDsoHandle() {
void* GetCUDNNDsoHandle() { void* GetCUDNNDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__) #if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", false); return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", false);
#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, win_cudnn_lib);
#else #else
return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", false); return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", false);
#endif #endif
...@@ -193,6 +203,8 @@ void* GetCUPTIDsoHandle() { ...@@ -193,6 +203,8 @@ void* GetCUPTIDsoHandle() {
void* GetCurandDsoHandle() { void* GetCurandDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__) #if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib"); return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib");
#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_curand_lib);
#else #else
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so"); return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so");
#endif #endif
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/timer.h"
namespace paddle {
namespace platform {
void Timer::Reset() {
_start.tv_sec = 0;
_start.tv_usec = 0;
_count = 0;
_elapsed = 0;
_paused = true;
}
void Timer::Start() {
Reset();
Resume();
}
void Timer::Pause() {
if (_paused) {
return;
}
_elapsed += Tickus();
++_count;
_paused = true;
}
void Timer::Resume() {
gettimeofday(&_start, NULL);
_paused = false;
}
int Timer::Count() { return _count; }
double Timer::ElapsedUS() { return static_cast<double>(_elapsed); }
double Timer::ElapsedMS() { return _elapsed / 1000.0; }
double Timer::ElapsedSec() { return _elapsed / 1000000.0; }
int64_t Timer::Tickus() {
gettimeofday(&_now, NULL);
return (_now.tv_sec - _start.tv_sec) * 1000 * 1000L +
(_now.tv_usec - _start.tv_usec);
}
} // namespace platform
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <stdlib.h>
#include "paddle/fluid/platform/port.h"
namespace paddle {
namespace platform {
// A Standard Timer implementation for debugging
class Timer {
public:
// a timer class for profiling
// Reset() will be called during initialization
// all timing variables will be set 0 in Reset()
Timer() { Reset(); }
void Reset();
void Start();
void Pause();
// Resume will get current system time
void Resume();
int Count();
// return elapsed time in us
double ElapsedUS();
// return elapsed time in ms
double ElapsedMS();
// return elapsed time in sec
double ElapsedSec();
private:
struct timeval _start;
struct timeval _now;
int _count;
int64_t _elapsed;
bool _paused;
// microseconds elapsed between _start and _now
int64_t Tickus();
};
} // namespace platform
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/timer.h"
#include "gtest/gtest.h"
TEST(Timer, Reset) {
paddle::platform::Timer timeline;
timeline.Start();
sleep(3);
timeline.Pause();
timeline.Reset();
}
TEST(Timer, Start) {
paddle::platform::Timer timeline;
timeline.Start();
sleep(3);
timeline.Pause();
}
TEST(Timer, Pause) {
paddle::platform::Timer timeline;
timeline.Start();
sleep(3);
timeline.Pause();
}
TEST(Timer, Resume) {
paddle::platform::Timer timeline;
timeline.Start();
sleep(3);
timeline.Pause();
timeline.Resume();
}
...@@ -14,7 +14,6 @@ limitations under the License. */ ...@@ -14,7 +14,6 @@ limitations under the License. */
#include "paddle/fluid/pybind/imperative.h" #include "paddle/fluid/pybind/imperative.h"
#include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/imperative/tracer.h" #include "paddle/fluid/imperative/tracer.h"
namespace paddle { namespace paddle {
...@@ -28,9 +27,7 @@ void BindTracer(pybind11::module *m) { ...@@ -28,9 +27,7 @@ void BindTracer(pybind11::module *m) {
framework::BlockDesc *startup_block) { framework::BlockDesc *startup_block) {
new (&self) imperative::Tracer(root_block, startup_block); new (&self) imperative::Tracer(root_block, startup_block);
}) })
.def("trace", &imperative::Tracer::Trace) .def("trace", &imperative::Tracer::Trace);
.def("get_scope", &imperative::Tracer::GetScope,
pybind11::return_value_policy::reference);
} }
} // namespace pybind } // namespace pybind
......
...@@ -84,11 +84,15 @@ bool IsCompiledWithCUDA() { ...@@ -84,11 +84,15 @@ bool IsCompiledWithCUDA() {
} }
bool IsCompiledWithBrpc() { bool IsCompiledWithBrpc() {
#if defined(PADDLE_WITH_BRPC) || defined(PADDLE_WITH_BRPC_RDMA) #ifndef PADDLE_WITH_DISTRIBUTE
return true; return false;
#else #endif
#ifdef PADDLE_WITH_GRPC
return false; return false;
#endif #endif
return true;
} }
bool IsCompiledWithDIST() { bool IsCompiledWithDIST() {
...@@ -124,9 +128,7 @@ PYBIND11_MODULE(core, m) { ...@@ -124,9 +128,7 @@ PYBIND11_MODULE(core, m) {
py::class_<imperative::VarBase, PyVarBase>(m, "VarBase", R"DOC()DOC") py::class_<imperative::VarBase, PyVarBase>(m, "VarBase", R"DOC()DOC")
.def(py::init<>()) .def(py::init<>())
.def("_run_backward", .def("_run_backward",
[](imperative::VarBase &self, framework::Scope *scope) { [](imperative::VarBase &self) { self.RunBackward(); })
self.RunBackward(scope);
})
.def("_grad", &imperative::VarBase::Grad) .def("_grad", &imperative::VarBase::Grad)
.def_property( .def_property(
"desc", "desc",
...@@ -134,6 +136,12 @@ PYBIND11_MODULE(core, m) { ...@@ -134,6 +136,12 @@ PYBIND11_MODULE(core, m) {
[](imperative::VarBase &self, framework::VarDesc *var_desc) { [](imperative::VarBase &self, framework::VarDesc *var_desc) {
self.var_desc_ = var_desc; self.var_desc_ = var_desc;
}, },
py::return_value_policy::reference)
.def_property("var",
[](const imperative::VarBase &self) { return self.var_; },
[](imperative::VarBase &self, framework::Variable *var) {
self.var_ = var;
},
py::return_value_policy::reference); py::return_value_policy::reference);
py::class_<imperative::OpBase, PyOpBase>(m, "OpBase", R"DOC()DOC") py::class_<imperative::OpBase, PyOpBase>(m, "OpBase", R"DOC()DOC")
......
...@@ -28,20 +28,53 @@ int main(int argc, char** argv) { ...@@ -28,20 +28,53 @@ int main(int argc, char** argv) {
for (int i = 0; i < argc; ++i) { for (int i = 0; i < argc; ++i) {
new_argv.push_back(argv[i]); new_argv.push_back(argv[i]);
} }
std::vector<std::string> envs;
std::vector<std::string> undefok;
#if defined(PADDLE_WITH_DISTRIBUTE) && !defined(PADDLE_WITH_GRPC)
envs.push_back("max_body_size");
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
new_argv.push_back( envs.push_back("fraction_of_gpu_memory_to_use");
strdup("--tryfromenv=fraction_of_gpu_memory_to_use,allocator_strategy")); envs.push_back("allocator_strategy");
#elif __clang__ #elif __clang__
new_argv.push_back( envs.push_back("use_mkldnn");
strdup("--tryfromenv=use_mkldnn,initial_cpu_memory_in_" envs.push_back("initial_cpu_memory_in_mb");
"mb,allocator_strategy")); envs.push_back("allocator_strategy");
new_argv.push_back(strdup("--undefok=use_mkldnn,initial_cpu_memory_in_mb"));
undefok.push_back("use_mkldnn");
undefok.push_back("initial_cpu_memory_in_mb");
#else #else
new_argv.push_back( envs.push_back("use_pinned_memory");
strdup("--tryfromenv=use_pinned_memory,use_mkldnn,initial_cpu_memory_in_" envs.push_back("use_mkldnn");
"mb,allocator_strategy")); envs.push_back("initial_cpu_memory_in_mb");
new_argv.push_back(strdup("--undefok=use_mkldnn,initial_cpu_memory_in_mb")); envs.push_back("allocator_strategy");
undefok.push_back("use_mkldnn");
undefok.push_back("initial_cpu_memory_in_mb");
#endif #endif
if (envs.size() > 0) {
std::string env_string = "--tryfromenv=";
for (auto t : envs) {
env_string += t + ",";
}
env_string = env_string.substr(0, env_string.length() - 1);
new_argv.push_back(strdup(env_string.c_str()));
VLOG(1) << "gtest env_string:" << env_string;
}
if (undefok.size() > 0) {
std::string undefok_string = "--undefok=";
for (auto t : undefok) {
undefok_string += t + ",";
}
undefok_string = undefok_string.substr(0, undefok_string.length() - 1);
new_argv.push_back(strdup(undefok_string.c_str()));
VLOG(1) << "gtest undefok_string:" << undefok_string;
}
int new_argc = static_cast<int>(new_argv.size()); int new_argc = static_cast<int>(new_argv.size());
char** new_argv_address = new_argv.data(); char** new_argv_address = new_argv.data();
google::ParseCommandLineFlags(&new_argc, &new_argv_address, false); google::ParseCommandLineFlags(&new_argc, &new_argv_address, false);
......
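For the CUDA build, for example, the loops above assemble new_argv entries equivalent to the old hard-coded string, i.e. `--tryfromenv=fraction_of_gpu_memory_to_use,allocator_strategy`, and since `undefok` stays empty no `--undefok` argument is appended.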
...@@ -151,13 +151,17 @@ def __bootstrap__(): ...@@ -151,13 +151,17 @@ def __bootstrap__():
read_env_flags.append('rpc_get_thread_num') read_env_flags.append('rpc_get_thread_num')
read_env_flags.append('rpc_prefetch_thread_num') read_env_flags.append('rpc_prefetch_thread_num')
read_env_flags.append('rpc_disable_reuse_port') read_env_flags.append('rpc_disable_reuse_port')
if core.is_compiled_with_brpc():
read_env_flags.append('max_body_size')
# set brpc max body size
os.environ['FLAGS_max_body_size'] = "2147483647"
if core.is_compiled_with_cuda(): if core.is_compiled_with_cuda():
read_env_flags += [ read_env_flags += [
'fraction_of_gpu_memory_to_use', 'cudnn_deterministic', 'fraction_of_gpu_memory_to_use', 'cudnn_deterministic',
'enable_cublas_tensor_op_math', 'conv_workspace_size_limit', 'enable_cublas_tensor_op_math', 'conv_workspace_size_limit',
'cudnn_exhaustive_search', 'memory_optimize_debug', 'selected_gpus', 'cudnn_exhaustive_search', 'memory_optimize_debug', 'selected_gpus',
'sync_nccl_allreduce' 'cudnn_exhaustive_search_times', 'sync_nccl_allreduce'
] ]
core.init_gflags([sys.argv[0]] + core.init_gflags([sys.argv[0]] +
......
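As a hedged illustration of how these read_env_flags are consumed: the values are picked up from FLAGS_-prefixed environment variables the first time paddle.fluid is imported (the values below are made up).

```python
import os

# Hypothetical values; any flag listed in read_env_flags above can be set this way.
os.environ['FLAGS_fraction_of_gpu_memory_to_use'] = '0.5'
os.environ['FLAGS_cudnn_exhaustive_search_times'] = '3'

import paddle.fluid as fluid  # __bootstrap__ runs on first import and reads the flags
```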
...@@ -272,8 +272,7 @@ class DataFeeder(object): ...@@ -272,8 +272,7 @@ class DataFeeder(object):
dict: the result of conversion. dict: the result of conversion.
Raises: Raises:
ValueError: If drop_last is False and the data batch which cannot ValueError: If drop_last is False and the data batch cannot fit evenly across the devices.
fit for devices.
""" """
def __reader_creator__(): def __reader_creator__():
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
from __future__ import print_function from __future__ import print_function
import collections import collections
from collections import defaultdict
import contextlib import contextlib
import os import os
import re import re
...@@ -369,13 +370,11 @@ class Variable(object): ...@@ -369,13 +370,11 @@ class Variable(object):
self._ivar.desc = self.desc self._ivar.desc = self.desc
def _numpy(self): def _numpy(self):
scope = _imperative_tracer().get_scope(self.block.desc) tensor = self._ivar.var.get_tensor()
tensor = core.get_variable_tensor(scope, self.desc.name())
return np.array(tensor) return np.array(tensor)
def _backward(self): def _backward(self):
scope = _imperative_tracer().get_scope(self.block.desc) self._ivar._run_backward()
self._ivar._run_backward(scope)
def _gradient(self): def _gradient(self):
return np.array(self._ivar._grad()) return np.array(self._ivar._grad())
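A small sketch of the imperative flow after this change, assuming the guard()/to_variable helpers exported from paddle.fluid.imperative; _numpy(), _backward() and _gradient() now go through self._ivar directly, with no tracer-owned scope.

```python
import numpy as np
import paddle.fluid as fluid
from paddle.fluid.imperative.base import to_variable

with fluid.imperative.guard():                  # assumed imperative-mode context manager
    x = to_variable(np.ones([2, 2], dtype='float32'))
    y = fluid.layers.relu(x)
    print(y._numpy())                           # tensor fetched via y._ivar.var
    y._backward()                               # no explicit scope argument any more
    print(x._gradient())
```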
...@@ -648,20 +647,16 @@ class Operator(object): ...@@ -648,20 +647,16 @@ class Operator(object):
self.desc.set_input(in_proto.name, []) self.desc.set_input(in_proto.name, [])
if outputs is not None: if outputs is not None:
given = set()
need = set()
for n in outputs:
given.add(n)
for m in proto.outputs: for m in proto.outputs:
need.add(m.name) if (m.name not in outputs) and m.dispensable:
if not given == need: continue
raise ValueError(("Incorrect setting for output(s) of " if not ((m.name in outputs) or m.dispensable):
"operator \"%s\". Need: [%s] Given: [%s]") % raise ValueError(
(type, ("Incorrect setting for output(s) of "
", ".join(six.binary_type(e) for e in need), "operator \"%s\", should set: [%s].") % (type, m.name))
", ".join(six.binary_type(e) for e in given)))
for out_proto in proto.outputs: for out_proto in proto.outputs:
if out_proto.name not in outputs:
continue
out_args = outputs[out_proto.name] out_args = outputs[out_proto.name]
if not isinstance(out_args, list): if not isinstance(out_args, list):
out_args = [out_args] out_args = [out_args]
...@@ -692,20 +687,20 @@ class Operator(object): ...@@ -692,20 +687,20 @@ class Operator(object):
if _in_imperative_mode(): if _in_imperative_mode():
self.iop = core.OpBase() self.iop = core.OpBase()
self.iop.desc = self.desc self.iop.desc = self.desc
self.inputs = [] self.inputs = defaultdict(list)
if inputs is not None: if inputs is not None:
for inp in inputs.values(): for k, v in six.iteritems(inputs):
if isinstance(inp, Variable): if isinstance(v, Variable):
self.inputs.append(inp) self.inputs[k].append(v._ivar)
elif isinstance(inp, list) or isinstance(inp, tuple): elif isinstance(v, list) or isinstance(v, tuple):
self.inputs.extend(inp[:]) self.inputs[k].extend([var._ivar for var in v])
self.outputs = [] self.outputs = defaultdict(list)
if outputs is not None: if outputs is not None:
for out in outputs.values(): for k, v in six.iteritems(outputs):
if isinstance(out, Variable): if isinstance(v, Variable):
self.outputs.append(out) self.outputs[k].append(v._ivar)
elif isinstance(out, list) or isinstance(out, tuple): elif isinstance(v, list) or isinstance(v, tuple):
self.outputs.extend(out[:]) self.outputs[k].extend([var._ivar for var in v])
def _has_kernel(self, op_type): def _has_kernel(self, op_type):
return op_type not in self.OP_WITHOUT_KERNEL_SET return op_type not in self.OP_WITHOUT_KERNEL_SET
...@@ -1273,8 +1268,7 @@ class Block(object): ...@@ -1273,8 +1268,7 @@ class Block(object):
op_desc = self.desc.append_op() op_desc = self.desc.append_op()
op = Operator(block=self, desc=op_desc, *args, **kwargs) op = Operator(block=self, desc=op_desc, *args, **kwargs)
if _in_imperative_mode(): if _in_imperative_mode():
_imperative_tracer().trace(op.iop, [v._ivar for v in op.inputs], _imperative_tracer().trace(op.iop, op.inputs, op.outputs, self.desc)
[v._ivar for v in op.outputs], self.desc)
self.ops.append(op) self.ops.append(op)
return op return op
...@@ -1325,8 +1319,7 @@ class Block(object): ...@@ -1325,8 +1319,7 @@ class Block(object):
op_desc = self.desc._prepend_op() op_desc = self.desc._prepend_op()
op = Operator(self, op_desc, *args, **kwargs) op = Operator(self, op_desc, *args, **kwargs)
if _in_imperative_mode(): if _in_imperative_mode():
_imperative_tracer().trace(op.iop, [v._ivar for v in op.inputs], _imperative_tracer().trace(op.iop, op.inputs, op.outputs, self.desc)
[v._ivar for v in op.outputs], self.desc)
self.ops.insert(0, op) self.ops.insert(0, op)
return op return op
...@@ -1641,8 +1634,8 @@ class Program(object): ...@@ -1641,8 +1634,8 @@ class Program(object):
parameters, e.g., :code:`trainable`, :code:`optimize_attr`, need parameters, e.g., :code:`trainable`, :code:`optimize_attr`, need
to print. to print.
Returns Returns:
(str): The debug string. str : The debug string.
Raises: Raises:
ValueError: If any of required fields is not set and throw_on_error is ValueError: If any of required fields is not set and throw_on_error is
......
...@@ -46,8 +46,7 @@ def to_variable(value, block=None): ...@@ -46,8 +46,7 @@ def to_variable(value, block=None):
name=None, name=None,
shape=value.shape, shape=value.shape,
dtype=value.dtype) dtype=value.dtype)
scope = framework._imperative_tracer().get_scope(block.desc) var = py_var._ivar.var
var = scope.var(py_var.name)
tensor = var.get_tensor() tensor = var.get_tensor()
tensor.set(value, core.CPUPlace()) tensor.set(value, core.CPUPlace())
return py_var return py_var
......
...@@ -20,7 +20,7 @@ import six ...@@ -20,7 +20,7 @@ import six
import sys import sys
import numpy as np import numpy as np
from .framework import Variable, Parameter, default_main_program, default_startup_program, dtype_is_floating from .framework import Variable, Parameter, default_main_program, default_startup_program, dtype_is_floating, _in_imperative_mode
from . import unique_name from . import unique_name
from paddle.fluid.initializer import Constant, Xavier from paddle.fluid.initializer import Constant, Xavier
from paddle.fluid.imperative import base from paddle.fluid.imperative import base
...@@ -313,9 +313,20 @@ class LayerHelper(object): ...@@ -313,9 +313,20 @@ class LayerHelper(object):
param = self._create_weight_normalize(attr, shape, dtype) param = self._create_weight_normalize(attr, shape, dtype)
WeightNormParamAttr.params_with_weight_norm.append(param) WeightNormParamAttr.params_with_weight_norm.append(param)
return param return param
if _in_imperative_mode():
self.main_program.global_block().create_parameter(
dtype=dtype, shape=shape, **attr._to_kwargs())
# In imperative mode, we want the returned parameter to be
# initialized so that it can be used imperatively.
return self.startup_program.global_block().create_parameter(
dtype=dtype,
shape=shape,
**attr._to_kwargs(with_initializer=True))
else:
self.startup_program.global_block().create_parameter( self.startup_program.global_block().create_parameter(
dtype=dtype, shape=shape, **attr._to_kwargs(with_initializer=True)) dtype=dtype,
shape=shape,
**attr._to_kwargs(with_initializer=True))
return self.main_program.global_block().create_parameter( return self.main_program.global_block().create_parameter(
dtype=dtype, shape=shape, **attr._to_kwargs()) dtype=dtype, shape=shape, **attr._to_kwargs())
......
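A hypothetical illustration of why the imperative branch returns the startup-block parameter: a layer created under the imperative guard can be run immediately because its weights are already initialized (fc being traceable in this mode is an assumption here, not something this diff states).

```python
import numpy as np
import paddle.fluid as fluid
from paddle.fluid.imperative.base import to_variable

with fluid.imperative.guard():
    x = to_variable(np.random.rand(3, 4).astype('float32'))
    y = fluid.layers.fc(input=x, size=5)  # parameters are created and initialized eagerly
    print(y._numpy())
```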
...@@ -1452,6 +1452,7 @@ class DynamicRNN(object): ...@@ -1452,6 +1452,7 @@ class DynamicRNN(object):
def step_input(self, x): def step_input(self, x):
""" """
Mark a sequence as a dynamic RNN input. Mark a sequence as a dynamic RNN input.
Args: Args:
x(Variable): The input sequence. x(Variable): The input sequence.
...@@ -1505,6 +1506,7 @@ class DynamicRNN(object): ...@@ -1505,6 +1506,7 @@ class DynamicRNN(object):
""" """
Mark a variable as a RNN input. The input will not be scattered into Mark a variable as a RNN input. The input will not be scattered into
time steps. time steps.
Args: Args:
x(Variable): The input variable. x(Variable): The input variable.
...@@ -1629,13 +1631,11 @@ class DynamicRNN(object): ...@@ -1629,13 +1631,11 @@ class DynamicRNN(object):
Args: Args:
init(Variable|None): The initialized variable. init(Variable|None): The initialized variable.
shape(list|tuple): The memory shape. NOTE the shape does not contain shape(list|tuple): The memory shape. NOTE the shape does not contain batch_size.
batch_size.
value(float): the initialized value. value(float): the initialized value.
need_reorder(bool): True if the initialized memory depends on the need_reorder(bool): True if the initialized memory depends on the input sample.
input sample.
dtype(str|numpy.dtype): The data type of the initialized memory. dtype(str|numpy.dtype): The data type of the initialized memory.
...@@ -1714,6 +1714,7 @@ class DynamicRNN(object): ...@@ -1714,6 +1714,7 @@ class DynamicRNN(object):
""" """
Update the memory from ex_mem to new_mem. NOTE that the shape and data Update the memory from ex_mem to new_mem. NOTE that the shape and data
type of :code:`ex_mem` and :code:`new_mem` must be same. type of :code:`ex_mem` and :code:`new_mem` must be same.
Args: Args:
ex_mem(Variable): the memory variable. ex_mem(Variable): the memory variable.
new_mem(Variable): the plain variable generated in RNN block. new_mem(Variable): the plain variable generated in RNN block.
......
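A compact usage sketch of the DynamicRNN pieces documented above (step_input, memory, update_memory); the names and sizes are illustrative only.

```python
import paddle.fluid as fluid

sentence = fluid.layers.data(name='sentence', shape=[32], dtype='float32', lod_level=1)

drnn = fluid.layers.DynamicRNN()
with drnn.block():
    word = drnn.step_input(sentence)             # one time step of the input sequence
    prev = drnn.memory(shape=[200], value=0.0)   # zero-initialized state; shape has no batch_size
    hidden = fluid.layers.fc(input=[word, prev], size=200, act='tanh')
    drnn.update_memory(prev, hidden)             # ex_mem and new_mem share shape and dtype
    drnn.output(hidden)

last = fluid.layers.sequence_last_step(drnn())
```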
...@@ -65,7 +65,7 @@ def rpn_target_assign(bbox_pred, ...@@ -65,7 +65,7 @@ def rpn_target_assign(bbox_pred,
rpn_negative_overlap=0.3, rpn_negative_overlap=0.3,
use_random=True): use_random=True):
""" """
** Target Assign Layer for region proposal network (RPN) in Faster-RCNN detection. ** **Target Assign Layer for region proposal network (RPN) in Faster-RCNN detection.**
This layer can be, for given the Intersection-over-Union (IoU) overlap This layer can be, for given the Intersection-over-Union (IoU) overlap
between anchors and ground truth boxes, to assign classification and between anchors and ground truth boxes, to assign classification and
...@@ -148,6 +148,7 @@ def rpn_target_assign(bbox_pred, ...@@ -148,6 +148,7 @@ def rpn_target_assign(bbox_pred,
cls_logits=cls_logits, cls_logits=cls_logits,
anchor_box=anchor_box, anchor_box=anchor_box,
gt_boxes=gt_boxes) gt_boxes=gt_boxes)
""" """
helper = LayerHelper('rpn_target_assign', **locals()) helper = LayerHelper('rpn_target_assign', **locals())
...@@ -1525,20 +1526,23 @@ def anchor_generator(input, ...@@ -1525,20 +1526,23 @@ def anchor_generator(input,
anchors, e.g. [0.5, 1.0, 2.0]. anchors, e.g. [0.5, 1.0, 2.0].
variance(list|tuple): The variances to be used in box regression deltas. variance(list|tuple): The variances to be used in box regression deltas.
Default:[0.1, 0.1, 0.2, 0.2]. Default:[0.1, 0.1, 0.2, 0.2].
stride(list|tuple): The anchors stride across width and height, stride(list|tuple): The anchors stride across width and height, e.g. [16.0, 16.0]
e.g. [16.0, 16.0]
offset(float): Prior boxes center offset. Default: 0.5 offset(float): Prior boxes center offset. Default: 0.5
name(str): Name of the prior box op. Default: None. name(str): Name of the prior box op. Default: None.
Returns: Returns:
Anchors(Variable): The output anchors with a layout of [H, W, num_anchors, 4]. Anchors(Variable),Variances(Variable):
H is the height of input, W is the width of input,
num_anchors is the box count of each position. two variables:
- Anchors(Variable): The output anchors with a layout of [H, W, num_anchors, 4]. \
H is the height of input, W is the width of input, \
num_anchors is the box count of each position. \
Each anchor is in (xmin, ymin, xmax, ymax) format and unnormalized. Each anchor is in (xmin, ymin, xmax, ymax) format and unnormalized.
Variances(Variable): The expanded variances of anchors - Variances(Variable): The expanded variances of anchors \
with a layout of [H, W, num_priors, 4]. with a layout of [H, W, num_priors, 4]. \
H is the height of input, W is the width of input H is the height of input, W is the width of input \
num_anchors is the box count of each position. num_anchors is the box count of each position. \
Each variance is in (xcenter, ycenter, w, h) format. Each variance is in (xcenter, ycenter, w, h) format.
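A short sketch of the documented return layout, using made-up feature-map and anchor settings.

```python
import paddle.fluid as fluid

image = fluid.layers.data(name='image', shape=[3, 320, 320], dtype='float32')
conv = fluid.layers.conv2d(input=image, num_filters=48, filter_size=3, padding=1)

# With 4 sizes x 3 ratios there are 12 anchors per position, so
# anchor.shape == [H, W, 12, 4] and var.shape == [H, W, 12, 4].
anchor, var = fluid.layers.anchor_generator(
    input=conv,
    anchor_sizes=[64, 128, 256, 512],
    aspect_ratios=[0.5, 1.0, 2.0],
    variance=[0.1, 0.1, 0.2, 0.2],
    stride=[16.0, 16.0],
    offset=0.5)
```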
...@@ -1748,7 +1752,7 @@ def generate_proposals(scores, ...@@ -1748,7 +1752,7 @@ def generate_proposals(scores,
eta=1.0, eta=1.0,
name=None): name=None):
""" """
** Generate proposal Faster-RCNN ** **Generate proposal Faster-RCNN**
This operation proposes RoIs according to each box with its probability of being a foreground object, and This operation proposes RoIs according to each box with its probability of being a foreground object, and
the box can be calculated by anchors. Bbox_deltas and the scores of boxes being an object are the output of RPN. Final proposals the box can be calculated by anchors. Bbox_deltas and the scores of boxes being an object are the output of RPN. Final proposals
...@@ -1762,7 +1766,6 @@ def generate_proposals(scores, ...@@ -1762,7 +1766,6 @@ def generate_proposals(scores,
4. Remove predicted boxes with small area. 4. Remove predicted boxes with small area.
5. Apply NMS to get final proposals as output. 5. Apply NMS to get final proposals as output.
Args: Args:
scores(Variable): A 4-D Tensor with shape [N, A, H, W] represents the probability for each box to be an object. scores(Variable): A 4-D Tensor with shape [N, A, H, W] represents the probability for each box to be an object.
N is batch size, A is number of anchors, H and W are height and width of the feature map. N is batch size, A is number of anchors, H and W are height and width of the feature map.
...@@ -1777,6 +1780,7 @@ def generate_proposals(scores, ...@@ -1777,6 +1780,7 @@ def generate_proposals(scores,
nms_thresh(float): Threshold in NMS, 0.5 by default. nms_thresh(float): Threshold in NMS, 0.5 by default.
min_size(float): Remove predicted boxes with either height or width < min_size. 0.1 by default. min_size(float): Remove predicted boxes with either height or width < min_size. 0.1 by default.
eta(float): Apply in adaptive NMS, if adaptive threshold > 0.5, adaptive_threshold = adaptive_threshold * eta in each iteration. eta(float): Apply in adaptive NMS, if adaptive threshold > 0.5, adaptive_threshold = adaptive_threshold * eta in each iteration.
""" """
helper = LayerHelper('generate_proposals', **locals()) helper = LayerHelper('generate_proposals', **locals())
......
...@@ -949,12 +949,11 @@ def shuffle(reader, buffer_size): ...@@ -949,12 +949,11 @@ def shuffle(reader, buffer_size):
is determined by argument buf_size. is determined by argument buf_size.
Args: Args:
param reader: the original reader whose output will be shuffled. reader(callable): the original reader whose output will be shuffled.
type reader: callable buf_size(int): shuffle buffer size.
param buf_size: shuffle buffer size.
type buf_size: int Returns:
return: the new reader whose output is shuffled. callable: the new reader whose output is shuffled.
rtype: callable
""" """
return __create_unshared_decorated_reader__( return __create_unshared_decorated_reader__(
'create_shuffle_reader', reader, {'buffer_size': int(buffer_size)}) 'create_shuffle_reader', reader, {'buffer_size': int(buffer_size)})
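As a side note, the buffering behaviour documented above can be sketched in a few lines of plain Python. This is only an illustration of the semantics (a reader is assumed to be any no-argument callable returning an iterable); the real operator is created by create_shuffle_reader on the C++ side.

.. code-block:: python

    import random

    def shuffle_reader(reader, buf_size):
        def decorated():
            buf = []
            for item in reader():
                buf.append(item)
                if len(buf) >= buf_size:
                    random.shuffle(buf)
                    for b in buf:
                        yield b
                    buf = []
            # flush the remaining items in the buffer
            random.shuffle(buf)
            for b in buf:
                yield b
        return decorated

    shuffled = shuffle_reader(lambda: iter(range(10)), buf_size=4)
    print(list(shuffled()))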
......
...@@ -233,7 +233,7 @@ def fc(input, ...@@ -233,7 +233,7 @@ def fc(input,
dimensions will be flatten to form the first dimension of the final matrix (height of dimensions will be flatten to form the first dimension of the final matrix (height of
the matrix), and the rest `rank(X) - num_flatten_dims` dimensions are flattened to the matrix), and the rest `rank(X) - num_flatten_dims` dimensions are flattened to
form the second dimension of the final matrix (width of the matrix). For example, suppose form the second dimension of the final matrix (width of the matrix). For example, suppose
`X` is a 6-dimensional tensor with a shape [2, 3, 4, 5, 6], and `num_flatten_dims` = 3. `X` is a 5-dimensional tensor with a shape [2, 3, 4, 5, 6], and `num_flatten_dims` = 3.
Then, the flattened matrix will have a shape [2 x 3 x 4, 5 x 6] = [24, 30]. Then, the flattened matrix will have a shape [2 x 3 x 4, 5 x 6] = [24, 30].
param_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for learnable param_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for learnable
parameters/weights of this layer. parameters/weights of this layer.
...@@ -505,31 +505,33 @@ def lstm(input, ...@@ -505,31 +505,33 @@ def lstm(input,
In the forward pass the output ht and cell output ct for a given iteration can be computed from the recurrent input ht-1, In the forward pass the output ht and cell output ct for a given iteration can be computed from the recurrent input ht-1,
the cell input ct-1 and the previous layer input xt given matrices W, R and biases bW, bR from the following equations: the cell input ct-1 and the previous layer input xt given matrices W, R and biases bW, bR from the following equations:
$$ i_t = \\sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + bx_i + bh_i) $$ .. math::
i_t &= \sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + bx_i + bh_i)
$$ f_t = \\sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + bx_f + bh_f) $$ f_t &= \sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + bx_f + bh_f)
$$ o_t = \\sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + bx_o + bh_o) $$ o_t &= \sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + bx_o + bh_o)
$$ \\tilde{c_t} = tanh(W_{cx}x_t + W_{ch}h_{t-1} + bx_c + bh_c) $$ \\tilde{c_t} &= tanh(W_{cx}x_t + W_{ch}h_{t-1} + bx_c + bh_c)
$$ c_t = f_t \\odot c_{t-1} + i_t \\odot \\tilde{c_t} $$ c_t &= f_t \odot c_{t-1} + i_t \odot \\tilde{c_t}
$$ h_t = o_t \\odot tanh(c_t) $$ h_t &= o_t \odot tanh(c_t)
- W terms denote weight matrices (e.g. $W_{ix}$ is the matrix - $W$ terms denote weight matrices (e.g. $W_{ix}$ is the matrix
of weights from the input gate to the input) of weights from the input gate to the input)
- The b terms denote bias vectors ($bx_i$ and $bh_i$ are the input gate bias vector). - The b terms denote bias vectors ($bx_i$ and $bh_i$ are the input gate bias vector).
- sigmoid is the logistic sigmoid function. - sigmoid is the logistic sigmoid function.
- $i, f, o$ and $c$ are the input gate, forget gate, output gate, - $i, f, o$ and $c$ are the input gate, forget gate, output gate,
and cell activation vectors, respectively, all of which have the same size as and cell activation vectors, respectively, all of which have the same size as
the cell output activation vector $h$. the cell output activation vector $h$.
- The $\odot$ is the element-wise product of the vectors. - The :math:`\odot` is the element-wise product of the vectors.
- `tanh` is the activation function. - :math:`tanh` is the activation function.
- $\tilde{c_t}$ is also called candidate hidden state, - :math:`\\tilde{c_t}` is also called candidate hidden state,
which is computed based on the current input and the previous hidden state. which is computed based on the current input and the previous hidden state.
Where sigmoid is the sigmoid operator: sigmoid(x) = 1 / (1 + e^-x), * represents a point-wise multiplication, Where sigmoid is the sigmoid operator: :math:`sigmoid(x) = 1 / (1 + e^{-x})` , * represents a point-wise multiplication,
X represents a matrix multiplication X represents a matrix multiplication
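To make the gate equations above concrete, here is a minimal numpy sketch of a single forward step for one layer. The shapes and the combined [W_x | W_h] weight layout are illustrative assumptions, not the layout used by the cudnn-backed lstm op.

.. code-block:: python

    import numpy as np

    def sigmoid(z):
        return 1.0 / (1.0 + np.exp(-z))

    hidden, inp = 4, 3
    rng = np.random.RandomState(0)
    W = {g: rng.randn(hidden, inp + hidden) for g in 'ifoc'}  # one [W_x | W_h] per gate
    b = {g: np.zeros(hidden) for g in 'ifoc'}

    x_t = rng.randn(inp)
    h_prev, c_prev = np.zeros(hidden), np.zeros(hidden)
    xh = np.concatenate([x_t, h_prev])

    i_t = sigmoid(W['i'].dot(xh) + b['i'])        # input gate
    f_t = sigmoid(W['f'].dot(xh) + b['f'])        # forget gate
    o_t = sigmoid(W['o'].dot(xh) + b['o'])        # output gate
    c_tilde = np.tanh(W['c'].dot(xh) + b['c'])    # candidate cell state
    c_t = f_t * c_prev + i_t * c_tilde
    h_t = o_t * np.tanh(c_t)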
...@@ -556,13 +558,17 @@ def lstm(input, ...@@ -556,13 +558,17 @@ def lstm(input,
Returns: Returns:
rnn_out(Tensor): result of LSTM hidden, shape is (seq_len x batch_size x hidden_size) rnn_out(Tensor),last_h(Tensor),last_c(Tensor):
Three tensors, rnn_out, last_h, last_c:
- rnn_out is result of LSTM hidden, shape is (seq_len x batch_size x hidden_size) \
if is_bidirec set to True, shape will be ( seq_len x batch_size x hidden_size*2) if is_bidirec set to True, shape will be ( seq_len x batch_size x hidden_size*2)
last_h(Tensor): the hidden state of the last step of LSTM - last_h is the hidden state of the last step of LSTM \
shape is ( num_layers x batch_size x hidden_size ) shape is ( num_layers x batch_size x hidden_size ) \
if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size) if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size)
last_c(Tensor): the cell state of the last step of LSTM - last_c(Tensor): the cell state of the last step of LSTM \
shape is ( num_layers x batch_size x hidden_size ) shape is ( num_layers x batch_size x hidden_size ) \
if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size) if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size)
...@@ -1220,6 +1226,8 @@ def dropout(x, ...@@ -1220,6 +1226,8 @@ def dropout(x,
probability) the outputs of some units to zero, while others remain probability) the outputs of some units to zero, while others remain
unchanged. unchanged.
dropout op can be removed from the program to make the program more efficient.
Args: Args:
x (Variable): The input tensor variable. x (Variable): The input tensor variable.
dropout_prob (float): Probability of setting units to zero. dropout_prob (float): Probability of setting units to zero.
...@@ -1230,20 +1238,22 @@ def dropout(x, ...@@ -1230,20 +1238,22 @@ def dropout(x,
units will be dropped. DO NOT use a fixed seed in training. units will be dropped. DO NOT use a fixed seed in training.
name (str|None): A name for this layer(optional). If set None, the layer name (str|None): A name for this layer(optional). If set None, the layer
will be named automatically. will be named automatically.
dropout_implementation(string): ['downgrade_in_infer'(defauld)|'upscale_in_train'] dropout_implementation(string): ['downgrade_in_infer'(default)|'upscale_in_train']
1. downgrade_in_infer(default), downgrade the outcome at inference 1. downgrade_in_infer(default), downgrade the outcome at inference
train: out = input * mask
inference: out = input * dropout_prob - train: out = input * mask
(make is a tensor same shape with input, value is 0 or 1 - inference: out = input * dropout_prob
(mask is a tensor same shape with input, value is 0 or 1
ratio of 0 is dropout_prob) ratio of 0 is dropout_prob)
2. upscale_in_train, upscale the outcome at training time 2. upscale_in_train, upscale the outcome at training time
train: out = input * mask / ( 1.0 - dropout_prob )
inference: out = input
(make is a tensor same shape with input, value is 0 or 1
ratio of 0 is dropout_prob)
dropout op can be removed from the program.
the program will be efficient
- train: out = input * mask / ( 1.0 - dropout_prob )
- inference: out = input
(mask is a tensor same shape with input, value is 0 or 1
ratio of 0 is dropout_prob)
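A direct numpy transcription of the two formula sets above, just to show how the modes differ at training and inference time (illustrative only, not the device kernel):

.. code-block:: python

    import numpy as np

    def make_mask(shape, dropout_prob, seed=0):
        # 0/1 mask whose ratio of zeros is roughly dropout_prob
        rng = np.random.RandomState(seed)
        return (rng.rand(*shape) >= dropout_prob).astype('float32')

    def dropout_downgrade_in_infer(x, mask, dropout_prob, is_test):
        # train: out = input * mask; inference: out = input * dropout_prob
        return x * dropout_prob if is_test else x * mask

    def dropout_upscale_in_train(x, mask, dropout_prob, is_test):
        # train: out = input * mask / (1 - dropout_prob); inference: out = input
        return x if is_test else x * mask / (1.0 - dropout_prob)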
Returns: Returns:
...@@ -1333,11 +1343,15 @@ def cross_entropy(input, label, soft_label=False, ignore_index=kIgnoreIndex): ...@@ -1333,11 +1343,15 @@ def cross_entropy(input, label, soft_label=False, ignore_index=kIgnoreIndex):
A 2-D tensor with shape [N x 1], the cross entropy loss. A 2-D tensor with shape [N x 1], the cross entropy loss.
Raises: Raises:
`ValueError`: 1) the 1st dimension of `input` and `label` are not equal. ValueError:
2) when `soft_label == True`, and the 2nd dimension of
`input` and `label` are not equal. 1. the 1st dimension of ``input`` and ``label`` are not equal.
3) when `soft_label == False`, and the 2nd dimension of
`label` is not 1. 2. when ``soft_label == True``, and the 2nd dimension of
``input`` and ``label`` are not equal.
3. when ``soft_label == False``, and the 2nd dimension of
``label`` is not 1.
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -1458,7 +1472,7 @@ def chunk_eval(input, ...@@ -1458,7 +1472,7 @@ def chunk_eval(input,
F1-score of chunk detection. F1-score of chunk detection.
For some basics of chunking, please refer to For some basics of chunking, please refer to
'Chunking with Support Vector Machines <https://aclanthology.info/pdf/N/N01/N01-1025.pdf>'. `Chunking with Support Vector Machines <https://aclanthology.info/pdf/N/N01/N01-1025.pdf>`_ .
ChunkEvalOp computes the precision, recall, and F1-score of chunk detection, ChunkEvalOp computes the precision, recall, and F1-score of chunk detection,
and supports IOB, IOE, IOBES and IO (also known as plain) tagging schemes. and supports IOB, IOE, IOBES and IO (also known as plain) tagging schemes.
...@@ -2292,7 +2306,8 @@ def sequence_slice(input, offset, length, name=None): ...@@ -2292,7 +2306,8 @@ def sequence_slice(input, offset, length, name=None):
out.lod = [[2, 1]], out.lod = [[2, 1]],
out.dims = (3, 2). out.dims = (3, 2).
NOTE: The first dimension size of **input**, **offset** and **length** Note:
The first dimension size of **input**, **offset** and **length**
should be equal. The **offset** should start from 0. should be equal. The **offset** should start from 0.
Args: Args:
...@@ -2570,12 +2585,7 @@ def adaptive_pool2d(input, ...@@ -2570,12 +2585,7 @@ def adaptive_pool2d(input,
raise ValueError( raise ValueError(
"invalid setting 'require_index' true when 'pool_type' is 'avg'.") "invalid setting 'require_index' true when 'pool_type' is 'avg'.")
def _is_list_or_tuple_(data): pool_size = utils.convert_to_list(pool_size, 2, 'pool_size')
return (isinstance(data, list) or isinstance(data, tuple))
if not _is_list_or_tuple_(pool_size) or len(pool_size) != 2:
raise ValueError(
"'pool_size' should be a list or tuple with length as 2.")
if pool_type == "max": if pool_type == "max":
l_type = 'max_pool2d_with_index' l_type = 'max_pool2d_with_index'
...@@ -2671,12 +2681,7 @@ def adaptive_pool3d(input, ...@@ -2671,12 +2681,7 @@ def adaptive_pool3d(input,
raise ValueError( raise ValueError(
"invalid setting 'require_index' true when 'pool_type' is 'avg'.") "invalid setting 'require_index' true when 'pool_type' is 'avg'.")
def _is_list_or_tuple_(data): pool_size = utils.convert_to_list(pool_size, 3, 'pool_size')
return (isinstance(data, list) or isinstance(data, tuple))
if not _is_list_or_tuple_(pool_size) or len(pool_size) != 3:
raise ValueError(
"'pool_size' should be a list or tuple with length as 3.")
if pool_type == "max": if pool_type == "max":
l_type = 'max_pool3d_with_index' l_type = 'max_pool3d_with_index'
...@@ -3013,7 +3018,7 @@ def group_norm(input, ...@@ -3013,7 +3018,7 @@ def group_norm(input,
""" """
**Group Normalization Layer** **Group Normalization Layer**
Refer to `Group Normalization <https://arxiv.org/abs/1803.08494>` Refer to `Group Normalization <https://arxiv.org/abs/1803.08494>`_ .
Args: Args:
input(Variable): The input tensor variable. input(Variable): The input tensor variable.
...@@ -3140,8 +3145,8 @@ def conv2d_transpose(input, ...@@ -3140,8 +3145,8 @@ def conv2d_transpose(input,
H^\prime_{out} &= (H_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (H_f - 1) + 1 \\\\ H^\prime_{out} &= (H_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (H_f - 1) + 1 \\\\
W^\prime_{out} &= (W_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (W_f - 1) + 1 \\\\ W^\prime_{out} &= (W_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (W_f - 1) + 1 \\\\
H_{out} \in [ H^\prime_{out}, H^\prime_{out} + strides[0] ) \\\\ H_{out} &\in [ H^\prime_{out}, H^\prime_{out} + strides[0] ) \\\\
W_{out} \in [ W^\prime_{out}, W^\prime_{out} + strides[1] ) W_{out} &\in [ W^\prime_{out}, W^\prime_{out} + strides[1] )
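A tiny helper that evaluates the formulas above for one spatial dimension makes the admissible output range easy to check (hypothetical numbers, independent of the operator itself):

.. code-block:: python

    def deconv_out_range(in_size, stride, padding, dilation, filter_size):
        # H'_out = (H_in - 1) * stride - 2 * padding + dilation * (H_f - 1) + 1
        lo = (in_size - 1) * stride - 2 * padding + dilation * (filter_size - 1) + 1
        # H_out may be any value in [H'_out, H'_out + stride)
        return lo, lo + stride

    print(deconv_out_range(in_size=7, stride=2, padding=1, dilation=1, filter_size=3))
    # (13, 15): any output height in [13, 15) is consistent with these settings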
Args: Args:
input(Variable): The input image with [N, C, H, W] format. input(Variable): The input image with [N, C, H, W] format.
...@@ -4704,9 +4709,9 @@ def ctc_greedy_decoder(input, blank, name=None): ...@@ -4704,9 +4709,9 @@ def ctc_greedy_decoder(input, blank, name=None):
name (str): The name of this layer. It is optional. name (str): The name of this layer. It is optional.
Returns: Returns:
Variable: CTC greedy decode result which is a 2-D tensor with shape [Lp, 1]. Variable: CTC greedy decode result which is a 2-D tensor with shape [Lp, 1]. \
'Lp' is the sum of all output sequences' length. If all the sequences 'Lp' is the sum of all output sequences' length. If all the sequences \
in result were empty, the result LoDTensor will be [-1] with in result were empty, the result LoDTensor will be [-1] with \
LoD [[]] and dims [1, 1]. LoD [[]] and dims [1, 1].
Examples: Examples:
...@@ -5072,6 +5077,7 @@ def hsigmoid(input, ...@@ -5072,6 +5077,7 @@ def hsigmoid(input,
<http://www.iro.umontreal.ca/~lisa/pointeurs/hierarchical-nnlm-aistats05.pdf>`_ <http://www.iro.umontreal.ca/~lisa/pointeurs/hierarchical-nnlm-aistats05.pdf>`_
And if you want to use the customized tree by setting 'is_custom' as true, you may need to do the following things first: And if you want to use the customized tree by setting 'is_custom' as true, you may need to do the following things first:
1. using your word dict to build a binary tree, each leaf node should be a word of your word dict 1. using your word dict to build a binary tree, each leaf node should be a word of your word dict
2. build a dict to store word_id -> word's leaf to root path, we call it path_table. 2. build a dict to store word_id -> word's leaf to root path, we call it path_table.
3. build a dict to store word_id -> code of word's leaf to root path, we call it path_code. Code 3. build a dict to store word_id -> code of word's leaf to root path, we call it path_code. Code
...@@ -5079,7 +5085,6 @@ def hsigmoid(input, ...@@ -5079,7 +5085,6 @@ def hsigmoid(input,
4. now, each word should have its path and code along the path, you can pass a batch of path and code 4. now, each word should have its path and code along the path, you can pass a batch of path and code
related to the same batch of inputs. related to the same batch of inputs.
Args: Args:
input (Variable): The input tensor variable with shape input (Variable): The input tensor variable with shape
:math:`[N \\times D]`, where :math:`N` is the size of mini-batch, :math:`[N \\times D]`, where :math:`N` is the size of mini-batch,
...@@ -5485,11 +5490,11 @@ def softmax_with_cross_entropy(logits, ...@@ -5485,11 +5490,11 @@ def softmax_with_cross_entropy(logits,
.. math:: .. math::
max_j = \\max_{i=0}^{K}{\\text{logit}_i} max_j &= \\max_{i=0}^{K}{\\text{logit}_i}
log\\_max\\_sum_j = \\log\\sum_{i=0}^{K}\\exp(logit_i - max_j) log\\_max\\_sum_j &= \\log\\sum_{i=0}^{K}\\exp(logit_i - max_j)
softmax_j = \\exp(logit_j - max_j - {log\\_max\\_sum}_j) softmax_j &= \\exp(logit_j - max_j - {log\\_max\\_sum}_j)
and then cross entropy loss is calculated by softmax and label. and then cross entropy loss is calculated by softmax and label.
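For hard labels, the numerically stable evaluation described by the equations above can be sketched in numpy as follows (this mirrors the math only; the fused operator runs on device and also handles soft labels):

.. code-block:: python

    import numpy as np

    def stable_softmax_cross_entropy(logits, label):
        # logits: [N, K]; label: [N] integer class ids
        max_j = logits.max(axis=1, keepdims=True)
        log_sum = np.log(np.exp(logits - max_j).sum(axis=1, keepdims=True))
        log_softmax = logits - max_j - log_sum
        loss = -log_softmax[np.arange(len(label)), label].reshape(-1, 1)  # [N, 1]
        return loss, np.exp(log_softmax)

    loss, softmax = stable_softmax_cross_entropy(
        np.array([[2.0, 1.0, 0.1]]), np.array([0]))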
...@@ -5515,10 +5520,10 @@ def softmax_with_cross_entropy(logits, ...@@ -5515,10 +5520,10 @@ def softmax_with_cross_entropy(logits,
along with the cross entropy loss. Default: False along with the cross entropy loss. Default: False
Returns: Returns:
Variable or Tuple of two Variables: Return the cross entropy loss if Variable or Tuple of two Variables: Return the cross entropy loss if \
`return_softmax` is False, otherwise the tuple `return_softmax` is False, otherwise the tuple \
(loss, softmax), where the cross entropy loss is (loss, softmax), where the cross entropy loss is \
a 2-D tensor with shape [N x 1], and softmax is a a 2-D tensor with shape [N x 1], and softmax is a \
2-D tensor with shape [N x K]. 2-D tensor with shape [N x K].
Examples: Examples:
...@@ -5792,15 +5797,21 @@ def squeeze(input, axes, name=None): ...@@ -5792,15 +5797,21 @@ def squeeze(input, axes, name=None):
the single dimensions will be removed from the shape. If an axis is the single dimensions will be removed from the shape. If an axis is
selected with shape entry not equal to one, an error is raised. selected with shape entry not equal to one, an error is raised.
Examples: For example:
.. code-block:: text
Case 1: Case 1:
Given Given
X.shape = (1, 3, 1, 5) X.shape = (1, 3, 1, 5)
and and
axes = [0] axes = [0]
we get: we get:
Out.shape = (3, 1, 5) Out.shape = (3, 1, 5)
Case 2: Case 2:
Given Given
X.shape = (1, 3, 1, 5) X.shape = (1, 3, 1, 5)
and and
...@@ -5842,6 +5853,9 @@ def unsqueeze(input, axes, name=None): ...@@ -5842,6 +5853,9 @@ def unsqueeze(input, axes, name=None):
Dimension indices in axes are as seen in the output tensor. Dimension indices in axes are as seen in the output tensor.
For example: For example:
.. code-block:: text
Given a tensor such that tensor with shape [3, 4, 5], Given a tensor such that tensor with shape [3, 4, 5],
then Unsqueezed tensor with axes=[0, 4] has shape [1, 3, 4, 5, 1]. then Unsqueezed tensor with axes=[0, 4] has shape [1, 3, 4, 5, 1].
...@@ -6729,8 +6743,11 @@ def sequence_scatter(input, index, updates, name=None): ...@@ -6729,8 +6743,11 @@ def sequence_scatter(input, index, updates, name=None):
the columns to update in each row of X. the columns to update in each row of X.
Here is an example: Here is an example:
Given the following input: Given the following input:
.. code-block:: text .. code-block:: text
input.data = [[1.0, 1.0, 1.0, 1.0, 1.0, 1.0], input.data = [[1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0], [1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0]] [1.0, 1.0, 1.0, 1.0, 1.0, 1.0]]
...@@ -6743,7 +6760,9 @@ def sequence_scatter(input, index, updates, name=None): ...@@ -6743,7 +6760,9 @@ def sequence_scatter(input, index, updates, name=None):
updates.lod = [[ 0, 3, 8, 12]] updates.lod = [[ 0, 3, 8, 12]]
Then we have the output: Then we have the output:
.. code-block:: text .. code-block:: text
out.data = [[1.3, 1.3, 1.4, 1.0, 1.0, 1.0], out.data = [[1.3, 1.3, 1.4, 1.0, 1.0, 1.0],
[1.0, 1.0, 1.4, 1.3, 1.2, 1.1], [1.0, 1.0, 1.4, 1.3, 1.2, 1.1],
[1.0, 1.0, 1.3, 1.2, 1.4, 1.1]] [1.0, 1.0, 1.3, 1.2, 1.4, 1.1]]
...@@ -6759,7 +6778,7 @@ def sequence_scatter(input, index, updates, name=None): ...@@ -6759,7 +6778,7 @@ def sequence_scatter(input, index, updates, name=None):
name (str|None): The output variable name. Default None. name (str|None): The output variable name. Default None.
Returns: Returns:
output (Variable): The output is a tensor with the same shape as input. Variable: The output is a tensor with the same shape as input.
Examples: Examples:
...@@ -6933,7 +6952,7 @@ def mean_iou(input, label, num_classes): ...@@ -6933,7 +6952,7 @@ def mean_iou(input, label, num_classes):
.. math:: .. math::
IOU = \\frac{true\_positiv}{(true\_positive + false\_positive + false\_negative)}. IOU = \\frac{true\_positive}{(true\_positive + false\_positive + false\_negative)}.
The predictions are accumulated in a confusion matrix and mean-IOU The predictions are accumulated in a confusion matrix and mean-IOU
is then calculated from it. is then calculated from it.
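The metric itself is easy to reproduce from an accumulated confusion matrix; the numpy sketch below follows the formula above and is only for illustration (the op additionally returns the per-class wrong/correct counts):

.. code-block:: python

    import numpy as np

    def mean_iou(pred, label, num_classes):
        cm = np.zeros((num_classes, num_classes), dtype=np.int64)
        for p, l in zip(pred.ravel(), label.ravel()):
            cm[l, p] += 1                              # accumulate confusion matrix
        tp = np.diag(cm)
        denom = cm.sum(axis=0) + cm.sum(axis=1) - tp   # TP + FP + FN per class
        return (tp / np.maximum(denom, 1)).mean()

    print(mean_iou(np.array([0, 1, 1, 2]), np.array([0, 1, 2, 2]), num_classes=3))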
...@@ -6946,9 +6965,13 @@ def mean_iou(input, label, num_classes): ...@@ -6946,9 +6965,13 @@ def mean_iou(input, label, num_classes):
num_classes (int): The possible number of labels. num_classes (int): The possible number of labels.
Returns: Returns:
mean_iou (Variable): A Tensor representing the mean intersection-over-union with shape [1]. mean_iou (Variable),out_wrong(Variable),out_correct(Variable):
out_wrong(Variable): A Tensor with shape [num_classes]. The wrong numbers of each class.
out_correct(Variable): A Tensor with shape [num_classes]. The correct numbers of each class. Three variables:
- mean_iou : A Tensor representing the mean intersection-over-union with shape [1].
- out_wrong: A Tensor with shape [num_classes]. The wrong numbers of each class.
- out_correct: A Tensor with shape [num_classes]. The correct numbers of each class.
Examples: Examples:
...@@ -7144,7 +7167,7 @@ def affine_grid(theta, out_shape, name=None): ...@@ -7144,7 +7167,7 @@ def affine_grid(theta, out_shape, name=None):
Args: Args:
theta (Variable): A batch of affine transform parameters with shape [N, 2, 3]. theta (Variable): A batch of affine transform parameters with shape [N, 2, 3].
out_shape (Variable | list | tuple): The shape of target output with format [N, C, H, W]. out_shape (Variable | list | tuple): The shape of target output with format [N, C, H, W].
out_shape can be a Variable or a list or tuple. ``out_shape`` can be a Variable or a list or tuple.
name(str|None): A name for this layer(optional). If set None, the layer name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically. will be named automatically.
...@@ -7157,6 +7180,7 @@ def affine_grid(theta, out_shape, name=None): ...@@ -7157,6 +7180,7 @@ def affine_grid(theta, out_shape, name=None):
Examples: Examples:
.. code-block:: python .. code-block:: python
theta = fluid.layers.data(name="x", shape=[2, 3], dtype="float32") theta = fluid.layers.data(name="x", shape=[2, 3], dtype="float32")
out_shape = fluid.layers.data(name="y", shape=[-1], dtype="float32") out_shape = fluid.layers.data(name="y", shape=[-1], dtype="float32")
data = fluid.layers.affine_grid(theta, out_shape) data = fluid.layers.affine_grid(theta, out_shape)
...@@ -7192,9 +7216,10 @@ def affine_grid(theta, out_shape, name=None): ...@@ -7192,9 +7216,10 @@ def affine_grid(theta, out_shape, name=None):
def rank_loss(label, left, right, name=None): def rank_loss(label, left, right, name=None):
""" """
**Rank loss layer for RankNet** **Rank loss layer for RankNet**
RankNet(http://icml.cc/2015/wp-content/uploads/2015/06/icml_ranking.pdf) `RankNet <http://icml.cc/2015/wp-content/uploads/2015/06/icml_ranking.pdf>`_
is a pairwise ranking model with a training sample consisting of a pair is a pairwise ranking model with a training sample consisting of a pair
of documents, A and B. Label P indicates whether A is ranked higher than B of documents, A and B. Label P indicates whether A is ranked higher than B
or not: or not:
...@@ -7202,16 +7227,19 @@ def rank_loss(label, left, right, name=None): ...@@ -7202,16 +7227,19 @@ def rank_loss(label, left, right, name=None):
P = {0, 1} or {0, 0.5, 1}, where 0.5 means that there is no information P = {0, 1} or {0, 0.5, 1}, where 0.5 means that there is no information
about the rank of the input pair. about the rank of the input pair.
Rank loss layer takes three inputs: left (o_i), right (o_j) and Rank loss layer takes three inputs: left ( :math:`o_i` ), right ( :math:`o_j` ) and
label (P_{i,j}). The inputs respectively represent RankNet's output scores label ( :math:`P_{i,j}` ). The inputs respectively represent RankNet's output scores
for documents A and B and the value of label P. The following equation for documents A and B and the value of label P. The following equation
computes rank loss C_{i,j} from the inputs: computes rank loss C_{i,j} from the inputs:
$$ .. math::
C_{i,j} = -\tilde{P_{ij}} * o_{i,j} + \log(1 + e^{o_{i,j}}) \\
o_{i,j} = o_i - o_j \\ C_{i,j} &= -\\tilde{P_{ij}} * o_{i,j} + \log(1 + e^{o_{i,j}}) \\\\
\tilde{P_{i,j}} = \left \{0, 0.5, 1 \right \} \ or \ \left \{0, 1 \right \}
$$ o_{i,j} &= o_i - o_j \\\\
\\tilde{P_{i,j}} &= \\left \{0, 0.5, 1 \\right \} \ or \ \\left \{0, 1 \\right \}
Rank loss layer takes batch inputs with size batch_size (batch_size >= 1). Rank loss layer takes batch inputs with size batch_size (batch_size >= 1).
...@@ -7237,7 +7265,6 @@ def rank_loss(label, left, right, name=None): ...@@ -7237,7 +7265,6 @@ def rank_loss(label, left, right, name=None):
right = fluid.layers.data(name="right", shape=[4, 1], dtype="float32") right = fluid.layers.data(name="right", shape=[4, 1], dtype="float32")
out = fluid.layers.rank_loss(label, left, right) out = fluid.layers.rank_loss(label, left, right)
""" """
helper = LayerHelper('rank_loss', **locals()) helper = LayerHelper('rank_loss', **locals())
...@@ -7269,7 +7296,7 @@ def margin_rank_loss(label, left, right, margin=0.1, name=None): ...@@ -7269,7 +7296,7 @@ def margin_rank_loss(label, left, right, margin=0.1, name=None):
.. math:: .. math::
rank\_loss &= max(0, -label * (left - right) + margin) rank\_loss = max(0, -label * (left - right) + margin)
Args: Args:
label (Variable): Indicates whether the left is ranked higher than the right or not. label (Variable): Indicates whether the left is ranked higher than the right or not.
...@@ -7278,12 +7305,17 @@ def margin_rank_loss(label, left, right, margin=0.1, name=None): ...@@ -7278,12 +7305,17 @@ def margin_rank_loss(label, left, right, margin=0.1, name=None):
margin (float): Indicates the given margin. margin (float): Indicates the given margin.
name (str|None): A name for this layer (optional). If set None, the layer name (str|None): A name for this layer (optional). If set None, the layer
will be named automatically. will be named automatically.
Returns: Returns:
Variable: The ranking loss. Variable: The ranking loss.
Raises: Raises:
ValueError: Any of label, left, and right is not a Variable. ValueError: Any of label, left, and right is not a Variable.
Examples: Examples:
.. code-block:: python .. code-block:: python
label = fluid.layers.data(name="label", shape=[4, 1], dtype="float32") label = fluid.layers.data(name="label", shape=[4, 1], dtype="float32")
left = fluid.layers.data(name="left", shape=[4, 1], dtype="float32") left = fluid.layers.data(name="left", shape=[4, 1], dtype="float32")
right = fluid.layers.data(name="right", shape=[4, 1], dtype="float32") right = fluid.layers.data(name="right", shape=[4, 1], dtype="float32")
...@@ -7587,7 +7619,8 @@ def prelu(x, mode, param_attr=None, name=None): ...@@ -7587,7 +7619,8 @@ def prelu(x, mode, param_attr=None, name=None):
""" """
Equation: Equation:
y = \max(0, x) + alpha * \min(0, x) .. math::
y = \max(0, x) + \\alpha * \min(0, x)
Args: Args:
x (Variable): The input tensor. x (Variable): The input tensor.
...@@ -7730,20 +7763,29 @@ def flatten(x, axis=1, name=None): ...@@ -7730,20 +7763,29 @@ def flatten(x, axis=1, name=None):
**Flatten layer** **Flatten layer**
Flattens the input tensor into a 2D matrix. Flattens the input tensor into a 2D matrix.
Examples: For example:
.. code-block:: text
Case 1: Case 1:
Given Given
X.shape = (3, 100, 100, 4) X.shape = (3, 100, 100, 4)
and and
axis = 2 axis = 2
We get: We get:
Out.shape = (3 * 100, 4 * 100) Out.shape = (3 * 100, 4 * 100)
Case 2: Case 2:
Given Given
X.shape = (3, 100, 100, 4) X.shape = (3, 100, 100, 4)
and and
axis = 0 axis = 0
We get: We get:
Out.shape = (1, 3 * 100 * 100 * 4) Out.shape = (1, 3 * 100 * 100 * 4)
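The same shape arithmetic can be checked with numpy (only the two cases above are reproduced; the helper below is hypothetical):

.. code-block:: python

    import numpy as np

    def flatten_shape(shape, axis):
        outer = int(np.prod(shape[:axis])) if axis > 0 else 1
        inner = int(np.prod(shape[axis:]))
        return (outer, inner)

    print(flatten_shape((3, 100, 100, 4), axis=2))  # (300, 400)
    print(flatten_shape((3, 100, 100, 4), axis=0))  # (1, 120000)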
...@@ -7759,9 +7801,9 @@ def flatten(x, axis=1, name=None): ...@@ -7759,9 +7801,9 @@ def flatten(x, axis=1, name=None):
will be named automatically. will be named automatically.
Returns: Returns:
Variable: A 2D tensor with the contents of the input tensor, with input Variable: A 2D tensor with the contents of the input tensor, with input \
dimensions up to axis flattened to the outer dimension of dimensions up to axis flattened to the outer dimension of \
the output and remaining input dimensions flattened into the the output and remaining input dimensions flattened into the \
inner dimension of the output. inner dimension of the output.
Raises: Raises:
...@@ -7801,15 +7843,19 @@ def sequence_enumerate(input, win_size, pad_value=0, name=None): ...@@ -7801,15 +7843,19 @@ def sequence_enumerate(input, win_size, pad_value=0, name=None):
The enumerated sequence has the same 1st dimension with variable `input`, and The enumerated sequence has the same 1st dimension with variable `input`, and
the 2nd dimension is `win_size`, padded by `pad_value` if necessary in generation. the 2nd dimension is `win_size`, padded by `pad_value` if necessary in generation.
Examples: .. code-block:: text
Case 1: Case 1:
Input: Input:
X.lod = [[0, 3, 5]] X.lod = [[0, 3, 5]]
X.data = [[1], [2], [3], [4], [5]] X.data = [[1], [2], [3], [4], [5]]
X.dims = [5, 1] X.dims = [5, 1]
Attrs: Attrs:
win_size = 2 win_size = 2
pad_value = 0 pad_value = 0
Output: Output:
Out.lod = [[0, 3, 5]] Out.lod = [[0, 3, 5]]
Out.data = [[1, 2], [2, 3], [3, 0], [4, 5], [5, 0]] Out.data = [[1, 2], [2, 3], [3, 0], [4, 5], [5, 0]]
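The windowing in Case 1 can be mimicked per sequence with a few lines of Python (a hypothetical helper working on plain lists; the real op consumes LoDTensors):

.. code-block:: python

    def enumerate_sequence(seq, win_size, pad_value=0):
        # for every position take win_size consecutive elements, padding at the end
        out = []
        for i in range(len(seq)):
            win = seq[i:i + win_size]
            out.append(win + [pad_value] * (win_size - len(win)))
        return out

    print(enumerate_sequence([1, 2, 3], win_size=2))  # [[1, 2], [2, 3], [3, 0]]
    print(enumerate_sequence([4, 5], win_size=2))     # [[4, 5], [5, 0]]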
...@@ -8896,6 +8942,7 @@ def similarity_focus(input, axis, indexes, name=None): ...@@ -8896,6 +8942,7 @@ def similarity_focus(input, axis, indexes, name=None):
SimilarityFocus Operator SimilarityFocus Operator
Generate a similarity focus mask with the same shape of input using the following method: Generate a similarity focus mask with the same shape of input using the following method:
1. Extract the 3-D tensor(here the first dimension is BatchSize) corresponding 1. Extract the 3-D tensor(here the first dimension is BatchSize) corresponding
to the axis according to the indexes. For example, if axis=1 and indexes=[a], to the axis according to the indexes. For example, if axis=1 and indexes=[a],
it will get the matrix T=X[:, a, :, :]. In this case, if the shape of input X it will get the matrix T=X[:, a, :, :]. In this case, if the shape of input X
...@@ -8969,14 +9016,16 @@ def similarity_focus(input, axis, indexes, name=None): ...@@ -8969,14 +9016,16 @@ def similarity_focus(input, axis, indexes, name=None):
indexes(list): Indicating the indexes of the selected dimension. indexes(list): Indicating the indexes of the selected dimension.
Returns: Returns:
Variable: A tensor variable with the same shape and same type Variable: A tensor variable with the same shape and same type \
as the input. as the input.
Examples: Examples:
.. code-block:: python .. code-block:: python
data = fluid.layers.data( data = fluid.layers.data(
name='data', shape=[2, 3, 2, 2], dtype='float32') name='data', shape=[2, 3, 2, 2], dtype='float32')
x = fluid.layers.similarity_focus(input=data, axis=1, indexes=[0]) x = fluid.layers.similarity_focus(input=data, axis=1, indexes=[0])
""" """
helper = LayerHelper('similarity_focus', **locals()) helper = LayerHelper('similarity_focus', **locals())
# check attrs # check attrs
...@@ -9055,6 +9104,7 @@ def hash(input, hash_size, num_hash=1, name=None): ...@@ -9055,6 +9104,7 @@ def hash(input, hash_size, num_hash=1, name=None):
Examples: Examples:
.. code-block:: python .. code-block:: python
word_dict = paddle.dataset.imdb.word_dict() word_dict = paddle.dataset.imdb.word_dict()
x = fluid.layers.data(name='x', shape=[1], dtype='int32', lod_level=1) x = fluid.layers.data(name='x', shape=[1], dtype='int32', lod_level=1)
out = fluid.layers.hash(input=x, num_hash=4, hash_size=1000) out = fluid.layers.hash(input=x, num_hash=4, hash_size=1000)
...@@ -9075,13 +9125,15 @@ def hash(input, hash_size, num_hash=1, name=None): ...@@ -9075,13 +9125,15 @@ def hash(input, hash_size, num_hash=1, name=None):
def grid_sampler(x, grid, name=None): def grid_sampler(x, grid, name=None):
""" """
This operation samples input X by using bilinear interpolation based on This operation samples input X by using bilinear interpolation based on
flow field grid, which is usually generated by affine_grid. The grid of flow field grid, which is usually generated by :code:`affine_grid` . The grid of
shape [N, H, W, 2] is the concatenation of (grid_x, grid_y) coordinates shape [N, H, W, 2] is the concatenation of (grid_x, grid_y) coordinates
with shape [N, H, W] each, where grid_x is indexing the 4th dimension with shape [N, H, W] each, where grid_x is indexing the 4th dimension
(in width dimension) of input data x and grid_y is indexing the 3rd (in width dimension) of input data x and grid_y is indexing the 3rd
dimension (in height dimension); finally the result is the bilinear dimension (in height dimension); finally the result is the bilinear
interpolation value of the 4 nearest corner points. interpolation value of the 4 nearest corner points.
.. code-block:: text
Step 1: Step 1:
Get (x, y) grid coordinates and scale to [0, H-1/W-1]. Get (x, y) grid coordinates and scale to [0, H-1/W-1].
...@@ -9126,16 +9178,18 @@ def grid_sampler(x, grid, name=None): ...@@ -9126,16 +9178,18 @@ def grid_sampler(x, grid, name=None):
name (str, default None): The name of this layer. name (str, default None): The name of this layer.
Returns: Returns:
out(Variable): Output of shape [N, C, H, W] data samples input X Variable: Output of shape [N, C, H, W] data samples input X
using bilinear interpolation based on input grid. using bilinear interpolation based on input grid.
Exmples: Examples:
.. code-block:: python .. code-block:: python
x = fluid.layers.data(name='x', shape=[3, 10, 32, 32], dtype='float32') x = fluid.layers.data(name='x', shape=[3, 10, 32, 32], dtype='float32')
theta = fluid.layers.data(name='theta', shape=[3, 2, 3], dtype='float32') theta = fluid.layers.data(name='theta', shape=[3, 2, 3], dtype='float32')
grid = fluid.layers.affine_grid(input=theta, size=[3, 10, 32, 32]) grid = fluid.layers.affine_grid(input=theta, size=[3, 10, 32, 32])
out = fluid.layers.grid_sampler(x=x, grid=grid) out = fluid.layers.grid_sampler(x=x, grid=grid)
""" """
helper = LayerHelper("grid_sampler", **locals()) helper = LayerHelper("grid_sampler", **locals())
...@@ -9203,19 +9257,19 @@ def add_position_encoding(input, alpha, beta, name=None): ...@@ -9203,19 +9257,19 @@ def add_position_encoding(input, alpha, beta, name=None):
""" """
**Add Position Encoding Layer** **Add Position Encoding Layer**
This layer accepts an input 3D-Tensor of shape [N x M x P], and return an This layer accepts an input 3D-Tensor of shape [N x M x P], and returns an
output Tensor of shape [N x M x P] with positional encoding value. output Tensor of shape [N x M x P] with positional encoding value.
Refer to `Attention Is All You Need<http://arxiv.org/pdf/1706.03762.pdf>`_ . Refer to `Attention Is All You Need <http://arxiv.org/pdf/1706.03762.pdf>`_ .
.. math:: .. math::
PE(pos, 2i) = \\sin{(pos / 10000^{2i / P})} \\\\ PE(pos, 2i) &= \\sin{(pos / 10000^{2i / P})} \\\\
PE(pos, 2i + 1) = \\cos{(pos / 10000^{2i / P})} \\\\ PE(pos, 2i + 1) &= \\cos{(pos / 10000^{2i / P})} \\\\
Out(:, pos, i) = \\alpha * input(:, pos, i) + \\beta * PE(pos, i) Out(:, pos, i) &= \\alpha * input(:, pos, i) + \\beta * PE(pos, i)
Where: Where:
* PE(pos, 2i): the increment for the number at even position - :math:`PE(pos, 2i)` : the increment for the number at even position
* PE(pos, 2i + 1): the increment for the number at odd position - :math:`PE(pos, 2i + 1)` : the increment for the number at odd position
Args: Args:
input (Variable): 3-D input tensor with shape [N x M x P] input (Variable): 3-D input tensor with shape [N x M x P]
...@@ -9230,6 +9284,7 @@ def add_position_encoding(input, alpha, beta, name=None): ...@@ -9230,6 +9284,7 @@ def add_position_encoding(input, alpha, beta, name=None):
.. code-block:: python .. code-block:: python
position_tensor = fluid.layers.add_position_encoding(input=tensor) position_tensor = fluid.layers.add_position_encoding(input=tensor)
""" """
helper = LayerHelper('add_position_encoding', **locals()) helper = LayerHelper('add_position_encoding', **locals())
dtype = helper.input_dtype() dtype = helper.input_dtype()
...@@ -9262,13 +9317,13 @@ def bilinear_tensor_product(x, ...@@ -9262,13 +9317,13 @@ def bilinear_tensor_product(x,
For example: For example:
.. math:: .. math::
out{i} = x * W_{i} * {y^\mathrm{T}}, i=0,1,...,size-1 out_{i} = x * W_{i} * {y^\mathrm{T}}, i=0,1,...,size-1
In this formula: In this formula:
- :math:`x`: the first input contains M elements, shape is [batch_size, M]. - :math:`x`: the first input contains M elements, shape is [batch_size, M].
- :math:`y`: the second input contains N elements, shape is [batch_size, N]. - :math:`y`: the second input contains N elements, shape is [batch_size, N].
- :math:`W_{i}`: the i-th learned weight, shape is [M, N] - :math:`W_{i}`: the i-th learned weight, shape is [M, N]
- :math:`out{i}`: the i-th element of out, shape is [batch_size, size]. - :math:`out_{i}`: the i-th element of out, shape is [batch_size, size].
- :math:`y^\mathrm{T}`: the transpose of :math:`y_{2}`. - :math:`y^\mathrm{T}`: the transpose of :math:`y_{2}`.
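A numpy sketch of the formula, with random data standing in for the learned weight W (shapes follow the description above):

.. code-block:: python

    import numpy as np

    batch, M, N, size = 2, 3, 4, 5
    rng = np.random.RandomState(0)
    x = rng.randn(batch, M)
    y = rng.randn(batch, N)
    W = rng.randn(size, M, N)            # one [M, N] weight matrix per output element

    # out[:, i] = x * W_i * y^T, evaluated row-wise over the batch
    out = np.einsum('bm,imn,bn->bi', x, W, y)
    print(out.shape)                      # (2, 5) -> [batch_size, size]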
Args: Args:
......
...@@ -393,9 +393,6 @@ def fill_constant_batch_size_like(input, ...@@ -393,9 +393,6 @@ def fill_constant_batch_size_like(input,
It also sets *stop_gradient* to True. It also sets *stop_gradient* to True.
>>> data = fluid.layers.fill_constant_batch_size_like(
>>> input=like, shape=[1], value=0, dtype='int64')
Args: Args:
input(${input_type}): ${input_comment}. input(${input_type}): ${input_comment}.
...@@ -411,6 +408,14 @@ def fill_constant_batch_size_like(input, ...@@ -411,6 +408,14 @@ def fill_constant_batch_size_like(input,
Returns: Returns:
${out_comment}. ${out_comment}.
Examples:
.. code-block:: python
data = fluid.layers.fill_constant_batch_size_like(
input=like, shape=[1], value=0, dtype='int64')
""" """
helper = LayerHelper("fill_constant_batch_size_like", **locals()) helper = LayerHelper("fill_constant_batch_size_like", **locals())
out = helper.create_variable_for_type_inference(dtype=dtype) out = helper.create_variable_for_type_inference(dtype=dtype)
......
...@@ -362,7 +362,7 @@ class ChunkEvaluator(MetricBase): ...@@ -362,7 +362,7 @@ class ChunkEvaluator(MetricBase):
compute the precision recall and F1-score using the accumulated counter compute the precision recall and F1-score using the accumulated counter
numbers. numbers.
For some basics of chunking, please refer to For some basics of chunking, please refer to
'Chunking with Support Vector Machines <https://aclanthology.info/pdf/N/N01/N01-1025.pdf>'. `Chunking with Support Vector Machines <https://aclanthology.info/pdf/N/N01/N01-1025.pdf>`_ .
ChunkEvalEvaluator computes the precision, recall, and F1-score of chunk detection, ChunkEvalEvaluator computes the precision, recall, and F1-score of chunk detection,
and supports IOB, IOE, IOBES and IO (also known as plain) tagging schemes. and supports IOB, IOE, IOBES and IO (also known as plain) tagging schemes.
...@@ -391,6 +391,7 @@ class ChunkEvaluator(MetricBase): ...@@ -391,6 +391,7 @@ class ChunkEvaluator(MetricBase):
def update(self, num_infer_chunks, num_label_chunks, num_correct_chunks): def update(self, num_infer_chunks, num_label_chunks, num_correct_chunks):
""" """
Update the states based on the layers.chunk_eval() outputs. Update the states based on the layers.chunk_eval() outputs.
Args: Args:
num_infer_chunks(int|numpy.array): The number of chunks in Inference on the given minibatch. num_infer_chunks(int|numpy.array): The number of chunks in Inference on the given minibatch.
num_label_chunks(int|numpy.array): The number of chunks in Label on the given mini-batch. num_label_chunks(int|numpy.array): The number of chunks in Label on the given mini-batch.
...@@ -450,9 +451,9 @@ class EditDistance(MetricBase): ...@@ -450,9 +451,9 @@ class EditDistance(MetricBase):
distance, instance_error = distance_evaluator.eval() distance, instance_error = distance_evaluator.eval()
In the above example: In the above example:
'distance' is the average of the edit distance in a pass.
'instance_error' is the instance error rate in a pass. - 'distance' is the average of the edit distance in a pass.
- 'instance_error' is the instance error rate in a pass.
""" """
...@@ -567,12 +568,15 @@ class DetectionMAP(object): ...@@ -567,12 +568,15 @@ class DetectionMAP(object):
Calculate the detection mean average precision (mAP). Calculate the detection mean average precision (mAP).
The general steps are as follows: The general steps are as follows:
1. calculate the true positive and false positive according to the input 1. calculate the true positive and false positive according to the input
of detection and labels. of detection and labels.
2. calculate mAP value, support two versions: '11 point' and 'integral'. 2. calculate mAP value, support two versions: '11 point' and 'integral'.
Please get more information from the following articles: Please get more information from the following articles:
https://sanchom.wordpress.com/tag/average-precision/ https://sanchom.wordpress.com/tag/average-precision/
https://arxiv.org/abs/1512.02325 https://arxiv.org/abs/1512.02325
Args: Args:
...@@ -615,8 +619,10 @@ class DetectionMAP(object): ...@@ -615,8 +619,10 @@ class DetectionMAP(object):
In the above example: In the above example:
'cur_map_v' is the mAP of current mini-batch. - 'cur_map_v' is the mAP of current mini-batch.
'accum_map_v' is the accumulative mAP of one pass. - 'accum_map_v' is the accumulative mAP of one pass.
""" """
def __init__(self, def __init__(self,
......
...@@ -32,6 +32,8 @@ class TestConv2dFusionOp(OpTest): ...@@ -32,6 +32,8 @@ class TestConv2dFusionOp(OpTest):
self.activation = 'relu' self.activation = 'relu'
self.add_bias = True self.add_bias = True
self.add_residual_data = True self.add_residual_data = True
self.channels = None
self.outputs = None
self.init_group() self.init_group()
self.init_dilation() self.init_dilation()
...@@ -49,7 +51,7 @@ class TestConv2dFusionOp(OpTest): ...@@ -49,7 +51,7 @@ class TestConv2dFusionOp(OpTest):
input = np.random.random(self.input_size).astype(self.dtype) input = np.random.random(self.input_size).astype(self.dtype)
filter = np.random.random(self.filter_size).astype(self.dtype) filter = np.random.random(self.filter_size).astype(self.dtype)
output = conv2d_forward_naive(input, filter, self.groups, self.output = conv2d_forward_naive(input, filter, self.groups,
conv2d_param).astype(self.dtype) conv2d_param).astype(self.dtype)
self.inputs = { self.inputs = {
...@@ -58,19 +60,20 @@ class TestConv2dFusionOp(OpTest): ...@@ -58,19 +60,20 @@ class TestConv2dFusionOp(OpTest):
} }
if self.add_residual_data: if self.add_residual_data:
residual_data = np.random.random(output.shape).astype(self.dtype) residual_data = np.random.random(self.output.shape).astype(
self.dtype)
self.inputs['ResidualData'] = OpTest.np_dtype_to_fluid_dtype( self.inputs['ResidualData'] = OpTest.np_dtype_to_fluid_dtype(
residual_data) residual_data)
output += residual_data self.output += residual_data
if self.add_bias: if self.add_bias:
bias = np.random.random(self.filter_size[0]).astype(self.dtype) bias = np.random.random(self.filter_size[0]).astype(self.dtype)
self.inputs['Bias'] = OpTest.np_dtype_to_fluid_dtype(bias) self.inputs['Bias'] = OpTest.np_dtype_to_fluid_dtype(bias)
output = output + bias.reshape((1, bias.size, 1, 1)) self.output = self.output + bias.reshape((1, bias.size, 1, 1))
assert self.activation in ['relu', 'identity'] assert self.activation in ['relu', 'identity']
if self.activation == 'relu': if self.activation == 'relu':
output = np.maximum(output, 0) self.output = np.maximum(self.output, 0)
self.attrs = { self.attrs = {
'strides': self.stride, 'strides': self.stride,
...@@ -79,9 +82,12 @@ class TestConv2dFusionOp(OpTest): ...@@ -79,9 +82,12 @@ class TestConv2dFusionOp(OpTest):
'dilations': self.dilations, 'dilations': self.dilations,
'data_format': self.data_format, 'data_format': self.data_format,
'exhaustive_search': self.exhaustive_search, 'exhaustive_search': self.exhaustive_search,
'activation': self.activation 'activation': self.activation,
'split_channels': self.channels
} }
self.outputs = {'Output': output} self.outputs = {'Output': self.output}
self.set_outputs()
def testcuda(self): def testcuda(self):
return core.is_compiled_with_cuda() return core.is_compiled_with_cuda()
...@@ -117,6 +123,9 @@ class TestConv2dFusionOp(OpTest): ...@@ -117,6 +123,9 @@ class TestConv2dFusionOp(OpTest):
def set_search_method(self): def set_search_method(self):
self.exhaustive_search = False self.exhaustive_search = False
def set_outputs(self):
pass
class TestWithoutResidual(TestConv2dFusionOp): class TestWithoutResidual(TestConv2dFusionOp):
def init_bias_residual(self): def init_bias_residual(self):
...@@ -160,5 +169,21 @@ class TestCUDNNExhaustiveSearch(TestConv2dFusionOp): ...@@ -160,5 +169,21 @@ class TestCUDNNExhaustiveSearch(TestConv2dFusionOp):
self.exhaustive_search = True self.exhaustive_search = True
class TestMultipleOutputs(TestConv2dFusionOp):
def init_test_case(self):
self.pad = [1, 1]
self.stride = [1, 1]
self.input_size = [1, 32, 17, 17] # NCHW
assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] // self.groups
self.filter_size = [126, f_c, 3, 3]
self.channels = [84, 42]
def set_outputs(self):
out1 = self.output[:, 0:84, :, :]
out2 = self.output[:, 84:126, :, :]
self.outputs['Outputs'] = [('out1', out1), ('out2', out2)]
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -194,4 +194,6 @@ class TestDataBalance(unittest.TestCase): ...@@ -194,4 +194,6 @@ class TestDataBalance(unittest.TestCase):
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() # Disable data balance unittest, because data balance would be removed
# unittest.main()
pass
...@@ -38,7 +38,9 @@ class MyLayer(fluid.imperative.PyLayer): ...@@ -38,7 +38,9 @@ class MyLayer(fluid.imperative.PyLayer):
def forward(self, inputs): def forward(self, inputs):
x = fluid.layers.relu(inputs[0]) x = fluid.layers.relu(inputs[0])
self._x_for_debug = x self._x_for_debug = x
return [fluid.layers.elementwise_mul(x, x)] x = fluid.layers.elementwise_mul(x, x)
x = fluid.layers.reduce_sum(x)
return [x]
class MLP(fluid.imperative.PyLayer): class MLP(fluid.imperative.PyLayer):
......
...@@ -243,6 +243,10 @@ class TestBook(unittest.TestCase): ...@@ -243,6 +243,10 @@ class TestBook(unittest.TestCase):
pool, mask = layers.adaptive_pool2d(x, [3, 3], require_index=True) pool, mask = layers.adaptive_pool2d(x, [3, 3], require_index=True)
self.assertIsNotNone(pool) self.assertIsNotNone(pool)
self.assertIsNotNone(mask) self.assertIsNotNone(mask)
self.assertIsNotNone(layers.adaptive_pool2d(x, 3, pool_type='avg'))
pool, mask = layers.adaptive_pool2d(x, 3, require_index=True)
self.assertIsNotNone(pool)
self.assertIsNotNone(mask)
def test_adaptive_pool3d(self): def test_adaptive_pool3d(self):
program = Program() program = Program()
...@@ -255,6 +259,10 @@ class TestBook(unittest.TestCase): ...@@ -255,6 +259,10 @@ class TestBook(unittest.TestCase):
x, [3, 3, 3], require_index=True) x, [3, 3, 3], require_index=True)
self.assertIsNotNone(pool) self.assertIsNotNone(pool)
self.assertIsNotNone(mask) self.assertIsNotNone(mask)
self.assertIsNotNone(layers.adaptive_pool3d(x, 3, pool_type='avg'))
pool, mask = layers.adaptive_pool3d(x, 3, require_index=True)
self.assertIsNotNone(pool)
self.assertIsNotNone(mask)
def test_lstm_unit(self): def test_lstm_unit(self):
program = Program() program = Program()
......
...@@ -209,6 +209,7 @@ class TestPyReaderUsingExecutor(unittest.TestCase): ...@@ -209,6 +209,7 @@ class TestPyReaderUsingExecutor(unittest.TestCase):
else: else:
thread = threading.Thread( thread = threading.Thread(
target=feed_data, args=(feed_queue, reader)) target=feed_data, args=(feed_queue, reader))
thread.daemon = True
thread.start() thread.start()
self.outputs = [] self.outputs = []
...@@ -219,6 +220,8 @@ class TestPyReaderUsingExecutor(unittest.TestCase): ...@@ -219,6 +220,8 @@ class TestPyReaderUsingExecutor(unittest.TestCase):
feed_queue.close() feed_queue.close()
self.validate() self.validate()
if not use_decorate_paddle_reader:
thread.join()
def validate(self): def validate(self):
self.assertEqual(len(self.inputs), len(self.outputs)) self.assertEqual(len(self.inputs), len(self.outputs))
......
...@@ -137,9 +137,9 @@ def append_input_output(block, op_proto, np_list, is_input, dtype): ...@@ -137,9 +137,9 @@ def append_input_output(block, op_proto, np_list, is_input, dtype):
var_dict = {} var_dict = {}
for var_proto in proto_list: for var_proto in proto_list:
var_name = str(var_proto.name) var_name = str(var_proto.name)
if is_input:
if (var_name not in np_list) and var_proto.dispensable: if (var_name not in np_list) and var_proto.dispensable:
continue continue
if is_input:
assert (var_name in np_list) or (var_proto.dispensable), \ assert (var_name in np_list) or (var_proto.dispensable), \
"Missing {} as input".format(var_name) "Missing {} as input".format(var_name)
if var_proto.duplicable: if var_proto.duplicable:
......
...@@ -125,14 +125,23 @@ def slice_variable(var_list, slice_count, min_block_size): ...@@ -125,14 +125,23 @@ def slice_variable(var_list, slice_count, min_block_size):
class DistributeTranspilerConfig(object): class DistributeTranspilerConfig(object):
""" """
Args: .. py:attribute:: slice_var_up (bool)
slice_var_up (bool): Do Tensor slice for pservers, default is True.
split_method (PSDispatcher): RoundRobin or HashName can be used Do Tensor slice for pservers, default is True.
try to choose the best method to balance loads for pservers.
min_block_size (int): Minimum splitted element number in block. .. py:attribute:: split_method (PSDispatcher)
According:https://github.com/PaddlePaddle/Paddle/issues/8638#issuecomment-369912156
RoundRobin or HashName can be used.
Try to choose the best method to balance loads for pservers.
.. py:attribute:: min_block_size (int)
Minimum number of split elements in a block.
According to : https://github.com/PaddlePaddle/Paddle/issues/8638#issuecomment-369912156
We can use bandwidth efficiently when data size is larger than 2MB. If you We can use bandwidth efficiently when data size is larger than 2MB. If you
want to change it, please be sure you see the slice_variable function. want to change it, please be sure you have read the slice_variable function.
""" """
slice_var_up = True slice_var_up = True
......
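For context, a typical way these attributes are consumed when transpiling a program for parameter servers looks roughly like the sketch below; the endpoint strings and trainer count are placeholders, and the classes are assumed to be exposed as fluid.DistributeTranspilerConfig and fluid.DistributeTranspiler.

.. code-block:: python

    import paddle.fluid as fluid

    config = fluid.DistributeTranspilerConfig()
    config.slice_var_up = True       # slice large variables across pservers
    config.min_block_size = 8192     # minimum number of elements per block

    t = fluid.DistributeTranspiler(config=config)
    t.transpile(trainer_id=0,
                program=fluid.default_main_program(),
                pservers="127.0.0.1:6170,127.0.0.1:6171",  # placeholder endpoints
                trainers=2)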