Commit 9c69fdf5 authored by Yibing Liu

Merge branch 'develop' of upstream into argsort_dev

...@@ -23,7 +23,7 @@ repos: ...@@ -23,7 +23,7 @@ repos:
- id: clang-format-with-version-check - id: clang-format-with-version-check
name: clang-format name: clang-format
description: Format files with ClangFormat. description: Format files with ClangFormat.
entry: bash ./.clang_format.hook -i entry: bash ./tools/codestyle/clang_format.hook -i
language: system language: system
files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto)$ files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto)$
- repo: local - repo: local
...@@ -52,7 +52,7 @@ repos: ...@@ -52,7 +52,7 @@ repos:
hooks: hooks:
- id: copyright_checker - id: copyright_checker
name: copyright_checker name: copyright_checker
entry: python ./.copyright.hook entry: python ./tools/codestyle/copyright.hook
language: system language: system
files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py)$ files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py)$
exclude: (?!.*third_party)^.*$ | (?!.*book)^.*$ exclude: (?!.*third_party)^.*$ | (?!.*book)^.*$
...@@ -76,7 +76,8 @@ RUN easy_install -U pip && \ ...@@ -76,7 +76,8 @@ RUN easy_install -U pip && \
pip install sphinx-rtd-theme==0.1.9 recommonmark pip install sphinx-rtd-theme==0.1.9 recommonmark
RUN pip install pre-commit 'ipython==5.3.0' && \ RUN pip install pre-commit 'ipython==5.3.0' && \
pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
pip install opencv-python
#For docstring checker #For docstring checker
RUN pip install pylint pytest astroid isort RUN pip install pylint pytest astroid isort
......
FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04 FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04
# Using UBUNTU_MIRROR can speed up apt-get.
# ARG UBUNTU_MIRROR
# RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'
RUN apt-get update && apt-get install -y python python-pip iputils-ping libgtk2.0-dev wget vim net-tools iftop python-opencv RUN apt-get update && apt-get install -y python python-pip iputils-ping libgtk2.0-dev wget vim net-tools iftop python-opencv
RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.7 /usr/lib/libcudnn.so && ln -s /usr/lib/x86_64-linux-gnu/libnccl.so.2 /usr/lib/libnccl.so RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.7 /usr/lib/libcudnn.so && ln -s /usr/lib/x86_64-linux-gnu/libnccl.so.2 /usr/lib/libnccl.so
RUN pip install -U pip
RUN pip install -U kubernetes paddlepaddle
# IMPORTANT: # IMPORTANT:
# Add "ENV http_proxy=http://ip:port" if your download is slow, and don't forget to unset it at runtime. # Add "ENV http_proxy=http://ip:port" if your download is slow, and don't forget to unset it at runtime.
# example: unset http_proxy && unset https_proxy && python fluid_benchmark.py ...
RUN pip install -U pip
RUN pip install -U kubernetes paddlepaddle
RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()\npaddle.dataset.flowers.fetch()" | python' RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()\npaddle.dataset.flowers.fetch()" | python'
RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.mnist.train()\npaddle.dataset.mnist.test()\npaddle.dataset.imdb.fetch()" | python' RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.mnist.train()\npaddle.dataset.mnist.test()\npaddle.dataset.imdb.fetch()" | python'
...@@ -14,9 +21,11 @@ RUN pip uninstall -y paddlepaddle && mkdir /workspace ...@@ -14,9 +21,11 @@ RUN pip uninstall -y paddlepaddle && mkdir /workspace
ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root
RUN chmod +x /usr/bin/paddle_k8s
ADD *.whl / ADD *.whl /
RUN pip install /*.whl && rm -f /*.whl && chmod +x /usr/bin/paddle_k8s RUN pip install /*.whl && rm -f /*.whl
ENV LD_LIBRARY_PATH=/usr/local/lib ENV LD_LIBRARY_PATH=/usr/local/lib
ADD fluid_benchmark.py recordio_converter.py models/ /workspace/ ADD fluid_benchmark.py recordio_converter.py args.py recordio_converter.py run.sh run_fluid_benchmark.sh /workspace/
ADD models/ /workspace/models/
...@@ -97,7 +97,7 @@ def dist_transpile(trainer_id, args): ...@@ -97,7 +97,7 @@ def dist_transpile(trainer_id, args):
return train_program, fluid.default_startup_program() return train_program, fluid.default_startup_program()
else: else:
raise ValueError( raise ValueError(
'TRAINING_ROLE environment variable must be either TRAINER or PSERVER' 'PADDLE_TRAINING_ROLE environment variable must be either TRAINER or PSERVER'
) )
...@@ -264,8 +264,6 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader, ...@@ -264,8 +264,6 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
break break
else: else:
loss, = exe.run([avg_loss.name], feed=feeder.feed(data)) loss, = exe.run([avg_loss.name], feed=feeder.feed(data))
if args.update_method == "pserver":
exe.bcast_params()
if args.use_reader_op: if args.use_reader_op:
num_samples += args.batch_size * args.gpus num_samples += args.batch_size * args.gpus
else: else:
...@@ -301,9 +299,18 @@ def print_train_time(start_time, end_time, num_samples): ...@@ -301,9 +299,18 @@ def print_train_time(start_time, end_time, num_samples):
(num_samples, train_elapsed, examples_per_sec)) (num_samples, train_elapsed, examples_per_sec))
def print_paddle_envs():
print('----------- Configuration envs -----------')
for k in os.environ:
if "PADDLE_" in k:
print "ENV %s:%s" % (k, os.environ[k])
print('------------------------------------------------')
def main(): def main():
args = parse_args() args = parse_args()
print_arguments(args) print_arguments(args)
print_paddle_envs()
# the unique trainer id, starting from 0, needed by trainer # the unique trainer id, starting from 0, needed by trainer
# only # only
......
...@@ -17,6 +17,7 @@ import copy ...@@ -17,6 +17,7 @@ import copy
import argparse import argparse
import random import random
import os import os
import copy
from kube_templates import pserver, trainer, envs from kube_templates import pserver, trainer, envs
...@@ -108,10 +109,9 @@ def gen_job(): ...@@ -108,10 +109,9 @@ def gen_job():
tn_container["ports"][0]["containerPort"] = spreadport tn_container["ports"][0]["containerPort"] = spreadport
envs.append({"name": "PADDLE_JOB_NAME", "value": args.jobname}) envs.append({"name": "PADDLE_JOB_NAME", "value": args.jobname})
envs.append({"name": "TRAINERS", "value": str(args.trainers)}) envs.append({"name": "PADDLE_TRAINERS", "value": str(args.trainers)})
envs.append({"name": "PSERVERS", "value": str(args.pservers)}) envs.append({"name": "PADDLE_PSERVERS", "value": str(args.pservers)})
envs.append({"name": "ENTRY", "value": args.entry}) envs.append({"name": "ENTRY", "value": args.entry})
envs.append({"name": "PADDLE_INIT_PORT", "value": str(args.port)})
envs.append({"name": "PADDLE_PSERVER_PORT", "value": str(args.port)}) envs.append({"name": "PADDLE_PSERVER_PORT", "value": str(args.port)})
# NOTE: these directories below are cluster specific, please modify # NOTE: these directories below are cluster specific, please modify
# this settings before you run on your own cluster. # this settings before you run on your own cluster.
...@@ -166,17 +166,23 @@ def gen_job(): ...@@ -166,17 +166,23 @@ def gen_job():
tn["spec"]["template"]["spec"]["volumes"] = volumes tn["spec"]["template"]["spec"]["volumes"] = volumes
tn_container["volumeMounts"] = volumeMounts tn_container["volumeMounts"] = volumeMounts
ps_container["env"] = envs ps_container["env"] = copy.deepcopy(envs)
ps_container["env"].append({"name": "TRAINING_ROLE", "value": "PSERVER"}) ps_container["env"].append({
"name": "PADDLE_TRAINING_ROLE",
"value": "PSERVER"
})
tn_container["env"] = envs tn_container["env"] = envs
if args.disttype == "pserver": if args.disttype == "pserver":
tn_container["env"].append({ tn_container["env"].append({
"name": "TRAINING_ROLE", "name": "PADDLE_TRAINING_ROLE",
"value": "TRAINER" "value": "TRAINER"
}) })
elif args.disttype == "nccl2" or args.disttype == "local": elif args.disttype == "nccl2" or args.disttype == "local":
# NCCL2 have no training role, set to plain WORKER # NCCL2 have no training role, set to plain WORKER
tn_container["env"].append({"name": "TRAINING_ROLE", "value": "WORKER"}) tn_container["env"].append({
"name": "PADDLE_TRAINING_ROLE",
"value": "WORKER"
})
os.mkdir(args.jobname) os.mkdir(args.jobname)
if args.disttype == "pserver": if args.disttype == "pserver":
......
...@@ -45,7 +45,8 @@ IF(${CBLAS_PROVIDER} STREQUAL "MKLML") ...@@ -45,7 +45,8 @@ IF(${CBLAS_PROVIDER} STREQUAL "MKLML")
ELSE() ELSE()
MESSAGE(FATAL_ERROR "Should enable MKLML when build MKLDNN") MESSAGE(FATAL_ERROR "Should enable MKLML when build MKLDNN")
ENDIF() ENDIF()
SET(MKLDNN_FLAG "-Wno-error=strict-overflow -Wno-error=unused-result -Wno-unused-result") SET(MKLDNN_FLAG "-Wno-error=strict-overflow -Wno-error=unused-result")
SET(MKLDNN_FLAG "${MKLDNN_FLAG} -Wno-unused-result -Wno-unused-value")
SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} ${MKLDNN_FLAG}") SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} ${MKLDNN_FLAG}")
SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} ${MKLDNN_FLAG}") SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} ${MKLDNN_FLAG}")
ExternalProject_Add( ExternalProject_Add(
...@@ -53,7 +54,7 @@ ExternalProject_Add( ...@@ -53,7 +54,7 @@ ExternalProject_Add(
${EXTERNAL_PROJECT_LOG_ARGS} ${EXTERNAL_PROJECT_LOG_ARGS}
DEPENDS ${MKLDNN_DEPENDS} DEPENDS ${MKLDNN_DEPENDS}
GIT_REPOSITORY "https://github.com/01org/mkl-dnn.git" GIT_REPOSITORY "https://github.com/01org/mkl-dnn.git"
GIT_TAG "db3424ad44901513c03a1ea31ccaacdf633fbe9f" GIT_TAG "a29d8487a63afca3d5b8c5bbdbb473cf8ccc6e51"
PREFIX ${MKLDNN_SOURCES_DIR} PREFIX ${MKLDNN_SOURCES_DIR}
UPDATE_COMMAND "" UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR} CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR}
......
#!/bin/bash #!/bin/bash
python gen_doc.py layers --submodules control_flow device io nn ops tensor detection learning_rate_scheduler metric > layers.rst python gen_doc.py layers --submodules control_flow device io nn ops tensor detection learning_rate_scheduler metric > layers.rst
for module in data_feeder clip metrics executor initializer io nets optimizer param_attr profiler regularizer for module in data_feeder clip metrics executor initializer io nets optimizer param_attr profiler regularizer transpiler
do do
python gen_doc.py ${module} > ${module}.rst python gen_doc.py ${module} > ${module}.rst
done done
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
==========
transpiler
==========
DistributeTranspiler
--------------------
.. autoclass:: paddle.fluid.transpiler.DistributeTranspiler
:members:
:noindex:
InferenceTranspiler
-------------------
.. autoclass:: paddle.fluid.transpiler.InferenceTranspiler
:members:
:noindex:
memory_optimize
---------------
.. autofunction:: paddle.fluid.transpiler.memory_optimize
:noindex:
release_memory
--------------
.. autofunction:: paddle.fluid.transpiler.release_memory
:noindex:
HashName
--------
.. autoclass:: paddle.fluid.transpiler.HashName
:members:
:noindex:
RoundRobin
----------
.. autoclass:: paddle.fluid.transpiler.RoundRobin
:members:
:noindex:
...@@ -168,13 +168,13 @@ cd /paddle/python/paddle/fluid/tests/book ...@@ -168,13 +168,13 @@ cd /paddle/python/paddle/fluid/tests/book
Step 2: start the Parameter Server:
```bash ```bash
PADDLE_INIT_PORT=6174 PADDLE_INIT_PSERVERS=192.168.1.2 TRAINERS=2 POD_IP=192.168.1.2 PADDLE_INIT_TRAINER_ID=1 TRAINING_ROLE=PSERVER python test_fit_a_line.py PADDLE_PSERVER_PORT=6174 PADDLE_PSERVER_IPS=192.168.1.2 PADDLE_TRAINERS=2 PADDLE_CURRENT_IP=192.168.1.2 PADDLE_TRAINER_ID=1 PADDLE_TRAINING_ROLE=PSERVER python test_fit_a_line.py
``` ```
After running the command, wait for the message ```Server listening on 192.168.1.2:6174```, which indicates that the Parameter Server has started successfully.
Step 3: start the Trainer:
```bash ```bash
PADDLE_INIT_PORT=6174 PADDLE_INIT_PSERVERS=192.168.1.3 TRAINERS=2 POD_IP=192.168.1.3 PADDLE_INIT_TRAINER_ID=1 TRAINING_ROLE=TRAINER python test_fit_a_line.py PADDLE_PSERVER_PORT=6174 PADDLE_PSERVER_IPS=192.168.1.3 PADDLE_TRAINERS=2 PADDLE_CURRENT_IP=192.168.1.3 PADDLE_TRAINER_ID=1 PADDLE_TRAINING_ROLE=TRAINER python test_fit_a_line.py
``` ```
Since we configured 2 trainers, another Trainer needs to be started on a second compute node.
......
...@@ -114,8 +114,8 @@ def gen_train_list(file_pattern, trainers, trainer_id): ...@@ -114,8 +114,8 @@ def gen_train_list(file_pattern, trainers, trainer_id):
ret_list.append(f) ret_list.append(f)
return ret_list return ret_list
trainers = int(os.getenv("TRAINERS")) trainers = int(os.getenv("PADDLE_TRAINERS"))
trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
data_file = fluid.layers.io.open_files( data_file = fluid.layers.io.open_files(
filenames=gen_train_list("./mnist-[0-9]*.recordio", 2, 0), filenames=gen_train_list("./mnist-[0-9]*.recordio", 2, 0),
thread_num=1, thread_num=1,
......
...@@ -13,6 +13,7 @@ cpu_noavx_openblas `fluid.tgz <https://guest:@paddleci.ngrok.io/repository ...@@ -13,6 +13,7 @@ cpu_noavx_openblas `fluid.tgz <https://guest:@paddleci.ngrok.io/repository
cuda7.5_cudnn5_avx_mkl `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/fluid.tgz>`_ cuda7.5_cudnn5_avx_mkl `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/fluid.tgz>`_
cuda8.0_cudnn5_avx_mkl `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/fluid.tgz>`_ cuda8.0_cudnn5_avx_mkl `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/fluid.tgz>`_
cuda8.0_cudnn7_avx_mkl `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/fluid.tgz>`_ cuda8.0_cudnn7_avx_mkl `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/fluid.tgz>`_
cuda9.0_cudnn7_avx_mkl `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda90cudnn7avxMkl/.lastSuccessful/fluid.tgz>`_
====================== ======================================== ====================== ========================================
Build from source
......
...@@ -213,3 +213,12 @@ virtualenv本身也是Python的一个包,可以用pip进行安装: ...@@ -213,3 +213,12 @@ virtualenv本身也是Python的一个包,可以用pip进行安装:
Save and close the file.
From then on, the Python environment named 'paddle' will be activated automatically every time you open a terminal.
10. PaddlePaddle installed via pip reports that :code:`libmkldnn.so` or :code:`libmklml_intel.so` cannot be found on :code:`import paddle.fluid`
------------------------------------------------------------------------------------------------------------------------------------------------
This happens because importing :code:`paddle.fluid` needs to load :code:`libmkldnn.so` and :code:`libmklml_intel.so`,
but the system cannot find them. Installing PaddlePaddle via pip normally copies :code:`libmkldnn.so` and :code:`libmklml_intel.so`
to :code:`/usr/local/lib`, so the fix is to add that path to the :code:`LD_LIBRARY_PATH` environment variable,
i.e. :code:`export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH`.
**Note**: if PaddlePaddle was installed inside a virtual environment, :code:`libmkldnn.so` and :code:`libmklml_intel.so` may not be under :code:`/usr/local/lib`.
\ No newline at end of file
...@@ -14,4 +14,3 @@ ...@@ -14,4 +14,3 @@
# #
add_subdirectory(inference) add_subdirectory(inference)
add_subdirectory(tape)
...@@ -40,10 +40,9 @@ void Main(bool use_gpu) { ...@@ -40,10 +40,9 @@ void Main(bool use_gpu) {
//# 2. Prepare input. //# 2. Prepare input.
int64_t data[4] = {1, 2, 3, 4}; int64_t data[4] = {1, 2, 3, 4};
PaddleBuf buf{.data = data, .length = sizeof(data)};
PaddleTensor tensor{.name = "", PaddleTensor tensor{.name = "",
.shape = std::vector<int>({4, 1}), .shape = std::vector<int>({4, 1}),
.data = buf, .data = PaddleBuf(data, sizeof(data)),
.dtype = PaddleDType::INT64}; .dtype = PaddleDType::INT64};
// For simplicity, we set all the slots with the same data. // For simplicity, we set all the slots with the same data.
...@@ -55,14 +54,12 @@ void Main(bool use_gpu) { ...@@ -55,14 +54,12 @@ void Main(bool use_gpu) {
//# 4. Get output. //# 4. Get output.
ASSERT_EQ(outputs.size(), 1UL); ASSERT_EQ(outputs.size(), 1UL);
LOG(INFO) << "output buffer size: " << outputs.front().data.length; LOG(INFO) << "output buffer size: " << outputs.front().data.length();
const size_t num_elements = outputs.front().data.length / sizeof(float); const size_t num_elements = outputs.front().data.length() / sizeof(float);
// The outputs' buffers are in CPU memory. // The outputs' buffers are in CPU memory.
for (size_t i = 0; i < std::min(5UL, num_elements); i++) { for (size_t i = 0; i < std::min(5UL, num_elements); i++) {
LOG(INFO) << static_cast<float*>(outputs.front().data.data)[i]; LOG(INFO) << static_cast<float*>(outputs.front().data.data())[i];
} }
// TODO(Superjomn): this is should be free automatically
free(outputs[0].data.data);
} }
} }
...@@ -86,10 +83,9 @@ void MainThreads(int num_threads, bool use_gpu) { ...@@ -86,10 +83,9 @@ void MainThreads(int num_threads, bool use_gpu) {
for (int batch_id = 0; batch_id < num_batches; ++batch_id) { for (int batch_id = 0; batch_id < num_batches; ++batch_id) {
// 2. Dummy Input Data // 2. Dummy Input Data
int64_t data[4] = {1, 2, 3, 4}; int64_t data[4] = {1, 2, 3, 4};
PaddleBuf buf{.data = data, .length = sizeof(data)};
PaddleTensor tensor{.name = "", PaddleTensor tensor{.name = "",
.shape = std::vector<int>({4, 1}), .shape = std::vector<int>({4, 1}),
.data = buf, .data = PaddleBuf(data, sizeof(data)),
.dtype = PaddleDType::INT64}; .dtype = PaddleDType::INT64};
std::vector<PaddleTensor> inputs(4, tensor); std::vector<PaddleTensor> inputs(4, tensor);
std::vector<PaddleTensor> outputs; std::vector<PaddleTensor> outputs;
...@@ -99,13 +95,13 @@ void MainThreads(int num_threads, bool use_gpu) { ...@@ -99,13 +95,13 @@ void MainThreads(int num_threads, bool use_gpu) {
// 4. Get output. // 4. Get output.
ASSERT_EQ(outputs.size(), 1UL); ASSERT_EQ(outputs.size(), 1UL);
LOG(INFO) << "TID: " << tid << ", " LOG(INFO) << "TID: " << tid << ", "
<< "output buffer size: " << outputs.front().data.length; << "output buffer size: " << outputs.front().data.length();
const size_t num_elements = outputs.front().data.length / sizeof(float); const size_t num_elements =
outputs.front().data.length() / sizeof(float);
// The outputs' buffers are in CPU memory. // The outputs' buffers are in CPU memory.
for (size_t i = 0; i < std::min(5UL, num_elements); i++) { for (size_t i = 0; i < std::min(5UL, num_elements); i++) {
LOG(INFO) << static_cast<float*>(outputs.front().data.data)[i]; LOG(INFO) << static_cast<float*>(outputs.front().data.data())[i];
} }
free(outputs[0].data.data);
} }
}); });
} }
......
# Inference High-level APIs
This document describes the high-level inference APIs one can use to easily deploy a Paddle model for an application.
The APIs are declared in a single header file, `paddle_inference_api.h`; only two libraries, `libpaddle_fluid.so` and `libpaddle_fluid_api.so`, are needed.
## PaddleTensor
We provide the `PaddleTensor` data structure to give a general tensor interface.
The definition is
```c++
struct PaddleTensor {
std::string name; // variable name.
std::vector<int> shape;
PaddleBuf data; // blob of data.
PaddleDType dtype;
};
```
The data is stored in a contiguous block of memory managed by `PaddleBuf`, and the tensor's data type is specified by a `PaddleDType`.
The `name` field is used to specify the name of the input variable,
which matters when there are multiple inputs and the predictor needs to distinguish which variable to set.
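To make this concrete, here is a minimal sketch of filling a `PaddleTensor`, mirroring the demo and test code elsewhere in this change; the shapes, the empty input `name`, and the `prob_out` output name are taken from those examples rather than prescribed by the API.
```c++
// Input: wrap caller-owned host memory; PaddleBuf(void*, size_t) does not take ownership.
int64_t data[4] = {1, 2, 3, 4};
PaddleTensor input{.name = "",  // set to the feed variable's name when there are several inputs
                   .shape = std::vector<int>({4, 1}),
                   .data = PaddleBuf(data, sizeof(data)),
                   .dtype = PaddleDType::INT64};

// Output: start with an empty, owned buffer; the predictor resizes it as needed.
PaddleTensor output{.name = "prob_out",
                    .shape = std::vector<int>({}),
                    .data = PaddleBuf(),
                    .dtype = PaddleDType::FLOAT32};
```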
## Engine
The inference API has two different underlying implementations; currently there are two valid engines:
- the native engine, which consists of the native operators and framework,
- the Anakin engine, which embeds the Anakin library.
The native engine takes a native Paddle model as input and supports any model trained by Paddle,
while the Anakin engine only takes an Anakin model as input (the user needs to transform the format manually first), and currently not all Paddle models are supported.
```c++
enum class PaddleEngineKind {
kNative = 0, // Use the native Fluid facility.
kAnakin, // Use Anakin for inference.
};
```
## PaddlePredictor and how to create one
The main interface is `PaddlePredictor`, which has the following methods:
- `bool Run(const std::vector<PaddleTensor>& inputs, std::vector<PaddleTensor>* output_data)`
  - takes the inputs and fills `output_data`
- `Clone`, which clones a predictor from an existing one, sharing the model parameters.
There is a factory method to help create a predictor, and the user takes ownership of the returned object.
```c++
template <typename ConfigT, PaddleEngineKind engine = PaddleEngineKind::kNative>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config);
```
By specifying the engine kind and the config, one can get a specific implementation.
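As a usage sketch, the snippet below wires the pieces together. It assumes the native engine's config type is named `NativeConfig` and exposes a `model_dir` field (an assumption not spelled out in this document), and it uses a hypothetical `./mobilenet` model directory; error handling is omitted.
```c++
// A minimal sketch, assuming NativeConfig and a hypothetical model directory.
NativeConfig config;
config.model_dir = "./mobilenet";  // hypothetical path to a saved inference model

auto predictor =
    CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);

int64_t data[4] = {1, 2, 3, 4};
PaddleTensor input{.name = "",
                   .shape = std::vector<int>({4, 1}),
                   .data = PaddleBuf(data, sizeof(data)),  // non-owning view of `data`
                   .dtype = PaddleDType::INT64};

std::vector<PaddleTensor> inputs(1, input);
std::vector<PaddleTensor> outputs;  // output buffers are allocated/resized by the predictor
if (predictor->Run(inputs, &outputs)) {
  const float* result = static_cast<float*>(outputs.front().data.data());
  const size_t num_elements = outputs.front().data.length() / sizeof(float);
  // consume result[0 .. num_elements)
}
```
`Clone` can then be used to create additional predictors that share the same parameters, for example one per serving thread.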
## Reference
- [paddle_inference_api.h](./paddle_inference_api.h)
- [demos](./demo)
...@@ -13,3 +13,53 @@ See the License for the specific language governing permissions and ...@@ -13,3 +13,53 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/contrib/inference/paddle_inference_api.h" #include "paddle/contrib/inference/paddle_inference_api.h"
namespace paddle {
PaddleBuf::PaddleBuf(PaddleBuf&& other)
: data_(other.data_),
length_(other.length_),
memory_owned_(other.memory_owned_) {
other.memory_owned_ = false;
other.data_ = nullptr;
other.length_ = 0;
}
PaddleBuf::PaddleBuf(const PaddleBuf& other) { *this = other; }
PaddleBuf& PaddleBuf::operator=(const PaddleBuf& other) {
// only the buffer with external memory can be copied
assert(!other.memory_owned_);
data_ = other.data_;
length_ = other.length_;
memory_owned_ = other.memory_owned_;
return *this;
}
void PaddleBuf::Resize(size_t length) {
// Only the owned memory can be reset, the external memory can't be changed.
if (length_ == length) return;
assert(memory_owned_);
Free();
data_ = new char[length];
length_ = length;
memory_owned_ = true;
}
void PaddleBuf::Reset(void* data, size_t length) {
Free();
memory_owned_ = false;
data_ = data;
length_ = length;
}
void PaddleBuf::Free() {
if (memory_owned_ && data_) {
assert(length_ > 0);
delete static_cast<char*>(data_);
data_ = nullptr;
length_ = 0;
}
}
} // namespace paddle
\ No newline at end of file
...@@ -21,6 +21,7 @@ limitations under the License. */ ...@@ -21,6 +21,7 @@ limitations under the License. */
#pragma once #pragma once
#include <cassert>
#include <memory> #include <memory>
#include <string> #include <string>
#include <vector> #include <vector>
...@@ -32,12 +33,38 @@ enum PaddleDType { ...@@ -32,12 +33,38 @@ enum PaddleDType {
INT64, INT64,
}; };
struct PaddleBuf { class PaddleBuf {
void* data; // pointer to the data memory. public:
size_t length; // number of memory bytes. PaddleBuf() = default;
PaddleBuf(PaddleBuf&& other);
// Copy only available when memory is managed externally.
explicit PaddleBuf(const PaddleBuf&);
PaddleBuf& operator=(const PaddleBuf&);
// Do not own the memory.
PaddleBuf(void* data, size_t length)
: data_(data), length_(length), memory_owned_{false} {}
// Own memory.
PaddleBuf(size_t length)
: data_(new char[length]), length_(length), memory_owned_(true) {}
// Resize to `length` bytes.
void Resize(size_t length);
// Reset to external memory.
void Reset(void* data, size_t length);
bool empty() const { return length_ == 0; }
void* data() const { return data_; }
size_t length() const { return length_; }
~PaddleBuf() { Free(); }
private:
void Free();
void* data_{nullptr}; // pointer to the data memory.
size_t length_{0}; // number of memory bytes.
bool memory_owned_{true};
}; };
struct PaddleTensor { struct PaddleTensor {
PaddleTensor() = default;
std::string name; // variable name. std::string name; // variable name.
std::vector<int> shape; std::vector<int> shape;
// TODO(Superjomn) for LoD support, add a vector<vector<int>> field if needed. // TODO(Superjomn) for LoD support, add a vector<vector<int>> field if needed.
...@@ -67,8 +94,9 @@ class PaddlePredictor { ...@@ -67,8 +94,9 @@ class PaddlePredictor {
// Predict an record. // Predict an record.
// The caller should be responsible for allocating and releasing the memory of // The caller should be responsible for allocating and releasing the memory of
// `inputs`. `inputs` should be alive until Run returns. caller should be // `inputs`. `inputs` should be available until Run returns. Caller should be
// responsible for releasing the memory of `output_data`. // responsible for the output tensor's buffer, either allocated or passed from
// outside.
virtual bool Run(const std::vector<PaddleTensor>& inputs, virtual bool Run(const std::vector<PaddleTensor>& inputs,
std::vector<PaddleTensor>* output_data) = 0; std::vector<PaddleTensor>* output_data) = 0;
...@@ -82,7 +110,6 @@ class PaddlePredictor { ...@@ -82,7 +110,6 @@ class PaddlePredictor {
// The common configs for all the predictors. // The common configs for all the predictors.
struct Config { struct Config {
std::string model_dir; // path to the model directory. std::string model_dir; // path to the model directory.
bool enable_engine{false}; // Enable to execute (part of) the model on
}; };
}; };
......
...@@ -48,7 +48,7 @@ bool PaddleInferenceAnakinPredictor::Run( ...@@ -48,7 +48,7 @@ bool PaddleInferenceAnakinPredictor::Run(
auto d_tensor_in_p = executor_.get_in(input.name); auto d_tensor_in_p = executor_.get_in(input.name);
float *d_data_p = d_tensor_in_p->mutable_data(); float *d_data_p = d_tensor_in_p->mutable_data();
if (cudaMemcpy(d_data_p, if (cudaMemcpy(d_data_p,
static_cast<float *>(input.data.data), static_cast<float *>(input.data.data()),
d_tensor_in_p->valid_size() * sizeof(float), d_tensor_in_p->valid_size() * sizeof(float),
cudaMemcpyHostToDevice) != 0) { cudaMemcpyHostToDevice) != 0) {
LOG(ERROR) << "copy data from CPU to GPU error"; LOG(ERROR) << "copy data from CPU to GPU error";
...@@ -65,8 +65,11 @@ bool PaddleInferenceAnakinPredictor::Run( ...@@ -65,8 +65,11 @@ bool PaddleInferenceAnakinPredictor::Run(
for (auto &output : *output_data) { for (auto &output : *output_data) {
auto *tensor = executor_.get_out(output.name); auto *tensor = executor_.get_out(output.name);
output.shape = tensor->shape(); output.shape = tensor->shape();
if (output.data.length() < tensor->valid_size() * sizeof(float)) {
output.data.Resize(tensor->valid_size() * sizeof(float));
}
// Copy data from GPU -> CPU // Copy data from GPU -> CPU
if (cudaMemcpy(output.data.data, if (cudaMemcpy(output.data.data(),
tensor->mutable_data(), tensor->mutable_data(),
tensor->valid_size() * sizeof(float), tensor->valid_size() * sizeof(float),
cudaMemcpyDeviceToHost) != 0) { cudaMemcpyDeviceToHost) != 0) {
......
...@@ -37,28 +37,26 @@ TEST(inference, anakin) { ...@@ -37,28 +37,26 @@ TEST(inference, anakin) {
float data[1 * 3 * 224 * 224] = {1.0f}; float data[1 * 3 * 224 * 224] = {1.0f};
PaddleBuf buf{.data = data, .length = sizeof(data)};
PaddleTensor tensor{.name = "input_0", PaddleTensor tensor{.name = "input_0",
.shape = std::vector<int>({1, 3, 224, 224}), .shape = std::vector<int>({1, 3, 224, 224}),
.data = buf, .data = PaddleBuf(data, sizeof(data)),
.dtype = PaddleDType::FLOAT32}; .dtype = PaddleDType::FLOAT32};
// For simplicity, we set all the slots with the same data. // For simplicity, we set all the slots with the same data.
std::vector<PaddleTensor> paddle_tensor_feeds(1, tensor); std::vector<PaddleTensor> paddle_tensor_feeds;
paddle_tensor_feeds.emplace_back(std::move(tensor));
float data_out[1000];
PaddleBuf buf_out{.data = data_out, .length = sizeof(data)};
PaddleTensor tensor_out{.name = "prob_out", PaddleTensor tensor_out{.name = "prob_out",
.shape = std::vector<int>({1000, 1}), .shape = std::vector<int>({1000, 1}),
.data = buf_out, .data = PaddleBuf(),
.dtype = PaddleDType::FLOAT32}; .dtype = PaddleDType::FLOAT32};
std::vector<PaddleTensor> outputs(1, tensor_out); std::vector<PaddleTensor> outputs;
outputs.emplace_back(std::move(tensor_out));
ASSERT_TRUE(predictor->Run(paddle_tensor_feeds, &outputs)); ASSERT_TRUE(predictor->Run(paddle_tensor_feeds, &outputs));
float* data_o = static_cast<float*>(outputs[0].data.data); float* data_o = static_cast<float*>(outputs[0].data.data());
for (size_t j = 0; j < 1000; ++j) { for (size_t j = 0; j < 1000; ++j) {
LOG(INFO) << "output[" << j << "]: " << data_o[j]; LOG(INFO) << "output[" << j << "]: " << data_o[j];
} }
......
...@@ -178,8 +178,8 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs, ...@@ -178,8 +178,8 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
// TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy. // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
std::memcpy(static_cast<void *>(input_ptr), std::memcpy(static_cast<void *>(input_ptr),
inputs[i].data.data, inputs[i].data.data(),
inputs[i].data.length); inputs[i].data.length());
feeds->push_back(input); feeds->push_back(input);
} }
return true; return true;
...@@ -241,10 +241,11 @@ bool NativePaddlePredictor::GetFetch( ...@@ -241,10 +241,11 @@ bool NativePaddlePredictor::GetFetch(
} }
outputs->at(i).shape = shape; outputs->at(i).shape = shape;
outputs->at(i).data.length = sizeof(float) * data.size(); auto &buffer = outputs->at(i).data;
outputs->at(i).data.data = malloc(outputs->at(i).data.length); if (buffer.empty() || buffer.length() < sizeof(float) * data.size()) {
std::memcpy( buffer.Resize(sizeof(float) * data.size());
outputs->at(i).data.data, data.data(), outputs->at(i).data.length); }
std::memcpy(buffer.data(), data.data(), buffer.length());
outputs->at(i).dtype = PaddleDType::FLOAT32; outputs->at(i).dtype = PaddleDType::FLOAT32;
// TODO(panyx0718): support other types? fill tensor name? avoid a copy. // TODO(panyx0718): support other types? fill tensor name? avoid a copy.
} }
......
...@@ -27,13 +27,12 @@ namespace paddle { ...@@ -27,13 +27,12 @@ namespace paddle {
PaddleTensor LodTensorToPaddleTensor(framework::LoDTensor* t) { PaddleTensor LodTensorToPaddleTensor(framework::LoDTensor* t) {
PaddleTensor pt; PaddleTensor pt;
pt.data.data = t->data<void>();
if (t->type() == typeid(int64_t)) { if (t->type() == typeid(int64_t)) {
pt.data.length = t->numel() * sizeof(int64_t); pt.data.Reset(t->data<void>(), t->numel() * sizeof(int64_t));
pt.dtype = PaddleDType::INT64; pt.dtype = PaddleDType::INT64;
} else if (t->type() == typeid(float)) { } else if (t->type() == typeid(float)) {
pt.data.length = t->numel() * sizeof(float); pt.data.Reset(t->data<void>(), t->numel() * sizeof(float));
pt.dtype = PaddleDType::FLOAT32; pt.dtype = PaddleDType::FLOAT32;
} else { } else {
LOG(FATAL) << "unsupported type."; LOG(FATAL) << "unsupported type.";
...@@ -79,8 +78,8 @@ void MainWord2Vec(bool use_gpu) { ...@@ -79,8 +78,8 @@ void MainWord2Vec(bool use_gpu) {
std::vector<PaddleTensor> outputs; std::vector<PaddleTensor> outputs;
ASSERT_TRUE(predictor->Run(paddle_tensor_feeds, &outputs)); ASSERT_TRUE(predictor->Run(paddle_tensor_feeds, &outputs));
ASSERT_EQ(outputs.size(), 1UL); ASSERT_EQ(outputs.size(), 1UL);
size_t len = outputs[0].data.length; size_t len = outputs[0].data.length();
float* data = static_cast<float*>(outputs[0].data.data); float* data = static_cast<float*>(outputs[0].data.data());
for (size_t j = 0; j < len / sizeof(float); ++j) { for (size_t j = 0; j < len / sizeof(float); ++j) {
ASSERT_LT(data[j], 1.0); ASSERT_LT(data[j], 1.0);
ASSERT_GT(data[j], -1.0); ASSERT_GT(data[j], -1.0);
...@@ -103,8 +102,6 @@ void MainWord2Vec(bool use_gpu) { ...@@ -103,8 +102,6 @@ void MainWord2Vec(bool use_gpu) {
EXPECT_LT(lod_data[i] - data[i], 1e-3); EXPECT_LT(lod_data[i] - data[i], 1e-3);
EXPECT_GT(lod_data[i] - data[i], -1e-3); EXPECT_GT(lod_data[i] - data[i], -1e-3);
} }
free(outputs[0].data.data);
} }
void MainImageClassification(bool use_gpu) { void MainImageClassification(bool use_gpu) {
...@@ -143,13 +140,12 @@ void MainImageClassification(bool use_gpu) { ...@@ -143,13 +140,12 @@ void MainImageClassification(bool use_gpu) {
std::vector<PaddleTensor> outputs; std::vector<PaddleTensor> outputs;
ASSERT_TRUE(predictor->Run(paddle_tensor_feeds, &outputs)); ASSERT_TRUE(predictor->Run(paddle_tensor_feeds, &outputs));
ASSERT_EQ(outputs.size(), 1UL); ASSERT_EQ(outputs.size(), 1UL);
size_t len = outputs[0].data.length; size_t len = outputs[0].data.length();
float* data = static_cast<float*>(outputs[0].data.data); float* data = static_cast<float*>(outputs[0].data.data());
float* lod_data = output1.data<float>(); float* lod_data = output1.data<float>();
for (size_t j = 0; j < len / sizeof(float); ++j) { for (size_t j = 0; j < len / sizeof(float); ++j) {
EXPECT_NEAR(lod_data[j], data[j], 1e-3); EXPECT_NEAR(lod_data[j], data[j], 1e-3);
} }
free(data);
} }
void MainThreadsWord2Vec(bool use_gpu) { void MainThreadsWord2Vec(bool use_gpu) {
...@@ -192,8 +188,8 @@ void MainThreadsWord2Vec(bool use_gpu) { ...@@ -192,8 +188,8 @@ void MainThreadsWord2Vec(bool use_gpu) {
// check outputs range // check outputs range
ASSERT_EQ(local_outputs.size(), 1UL); ASSERT_EQ(local_outputs.size(), 1UL);
const size_t len = local_outputs[0].data.length; const size_t len = local_outputs[0].data.length();
float* data = static_cast<float*>(local_outputs[0].data.data); float* data = static_cast<float*>(local_outputs[0].data.data());
for (size_t j = 0; j < len / sizeof(float); ++j) { for (size_t j = 0; j < len / sizeof(float); ++j) {
ASSERT_LT(data[j], 1.0); ASSERT_LT(data[j], 1.0);
ASSERT_GT(data[j], -1.0); ASSERT_GT(data[j], -1.0);
...@@ -205,7 +201,6 @@ void MainThreadsWord2Vec(bool use_gpu) { ...@@ -205,7 +201,6 @@ void MainThreadsWord2Vec(bool use_gpu) {
for (int i = 0; i < refs[tid].numel(); ++i) { for (int i = 0; i < refs[tid].numel(); ++i) {
EXPECT_NEAR(ref_data[i], data[i], 1e-3); EXPECT_NEAR(ref_data[i], data[i], 1e-3);
} }
free(data);
}); });
} }
for (int i = 0; i < num_jobs; ++i) { for (int i = 0; i < num_jobs; ++i) {
...@@ -251,14 +246,13 @@ void MainThreadsImageClassification(bool use_gpu) { ...@@ -251,14 +246,13 @@ void MainThreadsImageClassification(bool use_gpu) {
// check outputs correctness // check outputs correctness
ASSERT_EQ(local_outputs.size(), 1UL); ASSERT_EQ(local_outputs.size(), 1UL);
const size_t len = local_outputs[0].data.length; const size_t len = local_outputs[0].data.length();
float* data = static_cast<float*>(local_outputs[0].data.data); float* data = static_cast<float*>(local_outputs[0].data.data());
float* ref_data = refs[tid].data<float>(); float* ref_data = refs[tid].data<float>();
EXPECT_EQ(refs[tid].numel(), len / sizeof(float)); EXPECT_EQ(refs[tid].numel(), len / sizeof(float));
for (int i = 0; i < refs[tid].numel(); ++i) { for (int i = 0; i < refs[tid].numel(); ++i) {
EXPECT_NEAR(ref_data[i], data[i], 1e-3); EXPECT_NEAR(ref_data[i], data[i], 1e-3);
} }
free(data);
}); });
} }
for (int i = 0; i < num_jobs; ++i) { for (int i = 0; i < num_jobs; ++i) {
......
# Dynamic Graph on Fluid
PaddlePaddle Fluid is targeting autodiff without a tape, which, however, is very
challenging and we are still a long way from there. DyNet and PyTorch provide a good design
idea, the *tape*, that significantly eases the challenge. DyNet also provides
a C++ API that is as convenient as Python but more efficient, and that can
conveniently integrate with industrial/production systems. This package, `tape`,
combines the best of:
1. the tape from PyTorch and DyNet
2. the C++ API and core from DyNet
3. the rich set of operators from PaddlePaddle
## Overview
We can implement a DyNet-like tape (see this [survey](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/survey/dynamic_graph.md))
by wrapping Paddle Fluid's `Operator` and `Variable`.
The user API is straightforward since
1. it is imperative and uses the host language's control flow logic.
1. it avoids extra concepts such as `Scope` and `Executor`.
All of these benefits come at the cost of just adding one line, `reset_global_tape`,
at every iteration.
## Code Structure
In short, the `Tape` contains a vector of `OpHandle`s, and an `OpHandle` contains its
`type`, pointers to its `Variable`s, and the necessary attributes.
```c++
class Variable {
public:
VariableHandle Grad(); // returns its gradient variable
private:
framework::VarDesc desc_; // compile time infershape, necessary for lazy execution
framework::Variable var_; // run time variable, holds data memory
};
using VariableHandle = shared_ptr<Variable>;
struct OpHandle {
string type_;
map<string, vector<VariableHandle>> inputs_;
map<string, vector<VariableHandle>> outputs_;
AttributeMap attrs_;
};
class Tape {
public:
void AddOp(OpHandle); // add op
void Forward(); // execute the tape_
void Backward(); // execute the backward of the tape_
private:
vector<OpHandle> tape_;
};
```
We use `Function` to represent layers. It takes care of parameter
initialization and calls `AddOp` on the tape when it is invoked.
```c++
class Linear {
public:
Linear(int in_dim, int out_dim, const std::string &act)
: w_(new Variable("LinearWeight")),
b_(new Variable("LinearBias")),
act_(act) {
Tape init_tape;
std::string initializer = "fill_constant";
framework::AttributeMap attrs;
attrs["dtype"] = paddle::framework::proto::VarType::Type::VarType_Type_FP32;
attrs["shape"] = std::vector<int>{in_dim, out_dim};
attrs["value"] = 1.0f;
init_tape.AddOp(initializer, {}, {{"Out", {w_}}}, attrs);
attrs["dtype"] = paddle::framework::proto::VarType::Type::VarType_Type_FP32;
attrs["shape"] = std::vector<int>{out_dim};
attrs["value"] = 1.0f;
init_tape.AddOp(initializer, {}, {{"Out", {b_}}}, attrs);
init_tape.Forward();
}
VariableHandle operator()(VariableHandle input) {
VariableHandle pre_bias(new Variable("linear"));
get_global_tape().AddOp("mul",
{{"X", {input}}, {"Y", {w_}}},
{{"Out", {pre_bias}}},
{{"x_num_col_dims", 1}, {"y_num_col_dims", 1}});
VariableHandle pre_act(new Variable("linear"));
get_global_tape().AddOp("elementwise_add",
{{"X", {pre_bias}}, {"Y", {b_}}},
{{"Out", {pre_act}}},
{{"axis", 1}});
VariableHandle post_act(new Variable("linear"));
get_global_tape().AddOp(act_,
{{"X", {pre_act}}},
{{"Out", {post_act}}},
{});
return post_act;
}
std::vector<VariableHandle> Params() { return {w_, b_}; }
private:
VariableHandle w_;
VariableHandle b_;
std::string act_;
};
```
## User API
```c++
// Model function
paddle::tape::Linear linear1(3, 3, "relu"); // init weight and bias
paddle::tape::Linear linear2(3, 3, "relu"); // init weight and bias
paddle::tape::Mean mean;
// Optimizer
paddle::tape::SGD sgd(0.001);
// Data Feeder
paddle::tape::Fill data_feeder(...);
VariableHandle input(new paddle::tape::Variable("input"));
VariableHandle label(new paddle::tape::Variable("label"));
for (int i = 0; i < 2; ++i) {
reset_global_tape();
data_feeder(input, label);
auto loss = softmax(linear2(linear1(input)), label); // compile time InferShape & InferVarType
LOG(INFO) << loss.value(); // Run forward up to loss
// Run backward, store gradient of w at w->Grad()
get_global_tape.Backward(loss);
// Update w
sgd(linear1.Params());
sgd(linear2.Params());
}
```
<details>
<summary></summary>
digraph G {
subgraph cluster_0 {
node [shape=record,style=filled];
style=filled;
color=lightgrey;
linear1 [label="{type: mul | {input | {<before_mul1>X: before_mul1 |<weight1> Y: weight1}} | {output |<before_bias1> Out: before_bias1}}"];
elementwise_add1 [label="{type: elementwise_add | {input | {<before_bias1>X: before_bias1 |<bias1> Y: bias1}} | {output |<before_act1> Out: before_act1}}"];
relu1 [label="{type: relu | {input | {<before_act1>X: before_act1 }} | {output |<after_act1> Out: after_act1}}"];
linear1 -> elementwise_add1->relu1;
label = "forward tape";
}
linear1:before_mul1->before_mul1
linear1:weight1->weight1
linear1:before_bias1->before_bias1
elementwise_add1:bias1->bias1
elementwise_add1:before_bias1->before_bias1
elementwise_add1:before_act1->before_act1
relu1:before_act1->before_act1
relu1:after_act1->after_act1
subgraph cluster_1 {
node [shape=record,style=filled];
style=filled;
color=lightgrey;
linear1_grad [label="{type: mul_grad | {input | {<before_mul1>X: before_mul1 |<weight1> Y: weight1|<before_bias1_grad> Out_grad: before_bias1_grad}} | {output |{<before_mul1_grad>X_grad: before_mul1_grad |<weight1_grad> Y_grad: weight1_grad}}}"];
elementwise_add1_grad [label="{type: elementwise_add_grad | {input | <before_act1_grad> Out_grad: before_act1_grad} | {output |{<before_bias1_grad>X_grad: before_bias1_grad |<bias1_grad> Y_grad: bias1_grad}}}"];
relu1_grad [label="{type: relu_grad | {input |<after_act1_grad> Out_grad: after_act1_grad} | {output | {<before_act1_grad>X_grad: before_act1_grad }}}"];
linear1_grad -> elementwise_add1_grad ->relu1_grad [dir=back];
label = "backward tape";
}
relu1_grad:after_act1_grad->after_act1_grad
relu1_grad:before_act1_grad->before_act1_grad
elementwise_add1_grad:before_act1_grad->before_act1_grad
elementwise_add1_grad:before_bias1_grad->before_bias1_grad
elementwise_add1_grad:bias1_grad->bias1_grad
linear1_grad:before_mul1->before_mul1
linear1_grad:weight1->weight1
linear1_grad:before_bias1_grad->before_bias1_grad
linear1_grad:before_mul1_grad->before_mul1_grad
linear1_grad:weight1_grad->weight1_grad
subgraph cluster_2 {
node [shape=record];
label = "Linear1";
weight1
bias1
}
weight1 -> weight1_grad [ label="Grad()", style="dashed" ];
bias1 -> bias1_grad [ label="Grad()", style="dashed"];
}
</details>
![Image](https://github.com/tonyyang-svail/Paddle/blob/cpp_tap/paddle/contrib/tape/computation_graph.png)
## Code Reuse
We want to stay as close to Paddle Fluid as possible.
### Reuse All Operators
As all Ops are registered in `OpInfoMap`, the effort of adding a new `Function`
is about 10 lines of code, similar to exposing an operator to Python; see the sketch below.
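For illustration, here is a hedged sketch of one such `Function`, modeled directly on the `Mean` class in `function.h` from this change; it assumes an operator named `relu` with input `X` and output `Out` is registered in `OpInfoMap`.
```c++
// Sketch of a ~10-line Function wrapping an existing operator (assumes "relu" is registered).
class Relu {
 public:
  VariableHandle operator()(VariableHandle var) {
    VariableHandle out(new Variable("relu"));
    get_global_tape().AddOp("relu", {{"X", {var}}}, {{"Out", {out}}}, {});
    return out;
  }
};
```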
### Reuse Compile Time InferShape and InferVarType
Note that all the symbolic information is stored in `tape::Variable::desc_` instead
of `ProgramDesc.block.vars`, so we create a temporary `BlockDesc` to do `InferShape` and
`InferVarType` every time we `AddOp` to the tape.
### Reuse Operator::Run
We use smart pointers, instead of a `Scope`, to manage memory, so we create a temporary
`Scope` for every `Operator::Run()`.
## Possible Feature
### Release Memory on Backward
We can release memory aggressively: during the backward pass, we can delete an `OpHandle` once
we have finished its backward step. Since all variables are managed by smart pointers, the
memory is automatically released when a variable's `ref_count` drops to 0.
### Kernel Fusion
Since a symbolic representation of the tape is constructed before the actual
execution, it is possible to perform graph optimization. One use case is kernel
fusion.
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include "paddle/contrib/tape/tape.h"
#include "paddle/contrib/tape/variable.h"
#include "paddle/fluid/framework/type_defs.h"
namespace paddle {
namespace tape {
class Function {};
class Fill {
public:
Fill(const std::string &initializer, const framework::AttributeMap &attrs)
: initializer_(initializer), attrs_(attrs) {}
void operator()(VariableHandle var) {
get_global_tape().AddOp(initializer_, {}, {{"Out", {var}}}, attrs_);
}
private:
const std::string initializer_;
const framework::AttributeMap attrs_;
};
class Mean {
public:
VariableHandle operator()(VariableHandle var) {
VariableHandle out(new Variable("mean"));
get_global_tape().AddOp("mean", {{"X", {var}}}, {{"Out", {out}}}, {});
return out;
}
};
class Linear {
public:
Linear(int in_dim, int out_dim, const std::string &act)
: w_(new Variable("LinearWeight")),
b_(new Variable("LinearBias")),
act_(act) {
Tape init_tape;
std::string initializer = "fill_constant";
framework::AttributeMap attrs;
attrs["dtype"] = paddle::framework::proto::VarType::Type::VarType_Type_FP32;
attrs["shape"] = std::vector<int>{in_dim, out_dim};
attrs["value"] = 1.0f;
init_tape.AddOp(initializer, {}, {{"Out", {w_}}}, attrs);
attrs["dtype"] = paddle::framework::proto::VarType::Type::VarType_Type_FP32;
attrs["shape"] = std::vector<int>{out_dim};
attrs["value"] = 1.0f;
init_tape.AddOp(initializer, {}, {{"Out", {b_}}}, attrs);
init_tape.Forward();
}
VariableHandle operator()(VariableHandle input) {
VariableHandle pre_bias(new Variable("linear"));
get_global_tape().AddOp("mul",
{{"X", {input}}, {"Y", {w_}}},
{{"Out", {pre_bias}}},
{{"x_num_col_dims", 1}, {"y_num_col_dims", 1}});
VariableHandle pre_act(new Variable("linear"));
get_global_tape().AddOp("elementwise_add",
{{"X", {pre_bias}}, {"Y", {b_}}},
{{"Out", {pre_act}}},
{{"axis", 1}});
VariableHandle post_act(new Variable("linear"));
get_global_tape().AddOp(
act_, {{"X", {pre_act}}}, {{"Out", {post_act}}}, {});
return post_act;
}
std::vector<VariableHandle> Params() { return {w_, b_}; }
private:
VariableHandle w_;
VariableHandle b_;
std::string act_;
};
class SGD {
public:
SGD(float learning_rate) : learning_rate_(new Variable("sgd")) {
Tape init_tape;
std::string initializer = "fill_constant";
framework::AttributeMap attrs;
attrs["dtype"] = paddle::framework::proto::VarType::Type::VarType_Type_FP32;
attrs["shape"] = std::vector<int>{1};
attrs["value"] = learning_rate;
init_tape.AddOp(initializer, {}, {{"Out", {learning_rate_}}}, attrs);
init_tape.Forward();
}
void operator()(VariableHandle input) {
PADDLE_ENFORCE(get_global_tape().HasBeenBackwarded(),
"optimization must happen after the backward");
Tape temp_tape;
temp_tape.AddOp("sgd",
{{"Param", {input}},
{"LearningRate", {learning_rate_}},
{"Grad", {input->Grad()}}},
{{"ParamOut", {input}}},
{});
temp_tape.Forward();
}
private:
VariableHandle learning_rate_;
};
}
}
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/contrib/tape/tape.h"
#include <list>
#include <map>
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/dim.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/pybind/pybind.h"
namespace paddle {
namespace tape {
// borrowed from
// https://stackoverflow.com/questions/874134/find-if-string-ends-with-another-string-in-c
inline bool ends_with(std::string const &value, std::string const &ending) {
if (ending.size() > value.size()) return false;
return std::equal(ending.rbegin(), ending.rend(), value.rbegin());
}
std::ostream &operator<<(std::ostream &os, const framework::VarDesc &var_desc) {
os << var_desc.Name();
os << "[" << var_desc.GetType() << "]";
os << "[" << var_desc.GetDataType() << "]";
os << "{";
for (auto &i : var_desc.GetShape()) {
os << i << ",";
}
os << "}";
return os;
}
std::string to_string(const std::string &type,
const VariableHandleMap &in_vars,
const VariableHandleMap &out_vars,
const framework::AttributeMap &attrs) {
std::stringstream ss;
ss << type << " ";
for (auto &param_name : in_vars) {
for (auto &var : param_name.second) {
ss << param_name.first << ":(" << var->Desc() << ") ";
}
}
for (auto &param_name : out_vars) {
for (auto &var : param_name.second) {
ss << param_name.first << ":(" << var->Desc() << ") ";
}
}
return ss.str();
}
framework::OpDesc CreateOpDesc(const std::string &type,
const VariableHandleMap &in_vars,
const VariableHandleMap &out_vars,
const framework::AttributeMap &attrs) {
framework::VariableNameMap inputs;
for (auto &param_name : in_vars) {
for (auto &var : param_name.second) {
inputs[param_name.first].emplace_back(var->Name());
}
}
framework::VariableNameMap outputs;
for (auto &param_name : out_vars) {
for (auto &var : param_name.second) {
outputs[param_name.first].emplace_back(var->Name());
}
}
return framework::OpDesc(type, inputs, outputs, attrs);
}
void InferShapeAndVarType(const std::string &type,
const VariableHandleMap &in_vars,
VariableHandleMap *out_vars,
const framework::AttributeMap &attrs) {
framework::OpDesc op_desc = CreateOpDesc(type, in_vars, *out_vars, attrs);
// Create a temporary block for compile-time
framework::ProgramDesc program_desc;
framework::BlockDesc *block_desc = program_desc.MutableBlock(0);
PADDLE_ENFORCE(block_desc);
for (auto &param_name : in_vars) {
for (auto &var : param_name.second) {
*block_desc->Var(var->Name())->Proto() = *var->MutableDesc()->Proto();
}
}
for (auto &param_name : *out_vars) {
for (auto &var : param_name.second) {
*block_desc->Var(var->Name())->Proto() = *var->MutableDesc()->Proto();
}
}
LOG(INFO) << "- " << to_string(type, in_vars, *out_vars, attrs);
op_desc.InferShape(*block_desc);
op_desc.InferVarType(block_desc);
for (auto &param_name : *out_vars) {
for (auto &var : param_name.second) {
*var->MutableDesc()->Proto() = *block_desc->Var(var->Name())->Proto();
}
}
LOG(INFO) << "+ " << to_string(type, in_vars, *out_vars, attrs);
}
void Tape::AddOp(const std::string &type,
const VariableHandleMap &in_vars,
VariableHandleMap out_vars,
const framework::AttributeMap &attrs) {
InferShapeAndVarType(type, in_vars, &out_vars, attrs);
tape_.emplace_back(type, in_vars, out_vars, attrs);
}
// Temporary Scope for Operator::Run()
class ScopeWrapper : public framework::Scope {
public:
ScopeWrapper(const VariableHandleMap &in_vars,
const VariableHandleMap &out_vars) {
for (auto &v : in_vars) {
for (auto &vv : v.second) {
if (!vars_.count(vv->Name())) {
vars_[vv->Name()].reset(vv->Var());
}
}
}
for (auto &v : out_vars) {
for (auto &vv : v.second) {
if (!vars_.count(vv->Name())) {
vars_[vv->Name()].reset(vv->Var());
}
}
}
}
~ScopeWrapper() {
for (auto &pair : vars_) {
pair.second.release();
}
}
};
void Tape::Forward() {
LOG(INFO) << "Starting forward -------------------------";
PADDLE_ENFORCE(!has_been_backwarded_);
while (current_position_ < tape_.size()) {
OpHandle &op = tape_[current_position_];
// Create Output Tensor, this is only necessary for OpWithKernel
for (auto &param2var : op.outputs_) {
for (auto &var : param2var.second) {
var->InitializeVariable();
}
}
framework::OpDesc op_desc =
CreateOpDesc(op.type_, op.inputs_, op.outputs_, op.attrs_);
ScopeWrapper scope(op.inputs_, op.outputs_);
framework::OpRegistry::CreateOp(op_desc)->Run(scope, platform::CPUPlace());
current_position_++;
}
LOG(INFO) << "Finishing forward -------------------------";
}
void Tape::Backward(VariableHandle target) {
PADDLE_ENFORCE(!has_been_backwarded_);
Forward();
// TODO(tonyyang-svail): check output of last op is target
backward_tape_.reset(new Tape());
framework::AttributeMap attrs;
// FIXME(tonyyang-svail): Need to infer_data_type
attrs["dtype"] = framework::proto::VarType::Type::VarType_Type_FP32;
attrs["shape"] = std::vector<int>{1};
attrs["value"] = 1.0f;
backward_tape_->AddOp(
"fill_constant", {}, {{"Out", {target->Grad()}}}, attrs);
for (auto it = tape_.rbegin(); it != tape_.rend(); ++it) {
framework::OpDesc op_desc =
CreateOpDesc(it->type_, it->inputs_, it->outputs_, it->attrs_);
std::unordered_map<std::string, std::string> grad_to_var;
std::vector<std::unique_ptr<framework::OpDesc>> grad_op_descs =
framework::OpInfoMap::Instance()
.Get(op_desc.Type())
.GradOpMaker()(op_desc, {}, &grad_to_var, {});
for (auto &op_desc : grad_op_descs) {
std::unordered_map<std::string, VariableHandle> name2var;
for (auto &param2vars : it->inputs_) {
for (auto &a : param2vars.second) {
name2var[a->Name()] = a;
}
}
for (auto &param2vars : it->outputs_) {
for (auto &a : param2vars.second) {
name2var[a->Name()] = a;
}
}
VariableHandleMap in_vars;
VariableHandleMap out_vars;
std::map<const framework::VariableNameMap *, VariableHandleMap *>
loop_over{{&op_desc->Inputs(), &in_vars},
{&op_desc->Outputs(), &out_vars}};
for (auto &each : loop_over) {
auto &vmp = *each.first;
auto &vhm = *each.second;
for (auto &p2a : vmp) {
for (auto &argu : p2a.second) {
if (name2var.count(argu)) {
vhm[p2a.first].push_back(name2var[argu]);
} else {
PADDLE_ENFORCE(ends_with(argu, framework::kGradVarSuffix),
argu.c_str());
std::string name = argu.substr(
0, argu.size() - std::strlen(framework::kGradVarSuffix));
PADDLE_ENFORCE(name2var.count(name), name.c_str());
vhm[p2a.first].push_back(name2var[name]->Grad());
}
}
}
}
backward_tape_->AddOp(
op_desc->Type(), in_vars, out_vars, op_desc->GetAttrMap());
}
// TODO(tonyyang-svail): how to fill empty grad?
// TODO(tonyyang-svail): Sum var grad is necessary
}
backward_tape_->Forward();
has_been_backwarded_ = true;
}
Tape &get_global_tape() {
static Tape T;
return T;
}
void reset_global_tape() { get_global_tape() = Tape(); }
}
}
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <map>
#include <memory>
#include <string>
#include <vector>
#include "paddle/contrib/tape/variable.h"
namespace paddle {
namespace tape {
using VariableHandleMap = std::map<std::string, std::vector<VariableHandle>>;
struct OpHandle {
OpHandle(const std::string &type,
const VariableHandleMap &in_vars,
const VariableHandleMap &out_vars,
const framework::AttributeMap &attrs)
: type_(type), inputs_(in_vars), outputs_(out_vars), attrs_(attrs) {}
std::string type_;
VariableHandleMap inputs_;
VariableHandleMap outputs_;
framework::AttributeMap attrs_;
};
class Tape {
public:
void AddOp(const std::string &type,
const VariableHandleMap &in_vars,
VariableHandleMap out_vars,
const framework::AttributeMap &attrs);
void Forward();
void Backward(VariableHandle target);
bool HasBeenBackwarded() { return has_been_backwarded_; }
private:
bool has_been_backwarded_ = false;
size_t current_position_ = 0;
std::vector<OpHandle> tape_;
std::shared_ptr<Tape> backward_tape_;
};
Tape &get_global_tape();
void reset_global_tape();
}
}
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "gtest/gtest.h"
#include "paddle/contrib/tape/function.h"
using namespace paddle::tape;
TEST(Tape, TestMLP) {
LOG(INFO) << "TestMLP";
Linear linear1(3, 3, "relu");
Linear linear2(3, 3, "relu");
Mean mean;
SGD sgd(0.001);
std::string initializer = "fill_constant";
paddle::framework::AttributeMap attrs;
attrs["dtype"] = paddle::framework::proto::VarType::Type::VarType_Type_FP32;
attrs["shape"] = std::vector<int>{3, 3};
attrs["value"] = 1.0f;
Fill filler(initializer, attrs);
for (int i = 0; i < 2; ++i) {
reset_global_tape();
VariableHandle input(new Variable("input"));
filler(input);
auto loss = mean(linear2(linear1(input)));
get_global_tape().Backward(loss);
for (auto w : linear1.Params()) {
sgd(w);
}
for (auto w : linear2.Params()) {
sgd(w);
}
}
}
int main(int argc, char** argv) {
std::vector<paddle::platform::Place> places;
places.emplace_back(paddle::platform::CPUPlace());
paddle::platform::DeviceContextPool::Init(places);
testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/contrib/tape/variable.h"
namespace paddle {
namespace tape {
void Variable::InitializeVariable() {
LOG(INFO) << "Initialzing " << desc_.Name() << " as " << desc_.GetType();
framework::proto::VarType::Type var_type = desc_.GetType();
if (var_type == framework::proto::VarType::LOD_TENSOR) {
var_.GetMutable<framework::LoDTensor>();
} else if (var_type == framework::proto::VarType::SELECTED_ROWS) {
var_.GetMutable<framework::SelectedRows>();
} else {
PADDLE_THROW("Variable type %d is not in [LOD_TENSOR, SELECTED_ROWS]",
var_type);
}
}
}  // namespace tape
}  // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include "paddle/fluid/framework/operator.h" // framework::kGradVarSuffix
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/variable.h"
namespace paddle {
namespace tape {
class Variable;
using VariableHandle = std::shared_ptr<Variable>;
/*
* Combination of
* framework::VarDesc desc_;
* framework::Variable var_;
*/
class Variable {
public:
Variable(const std::string pre_fix)
: desc_(pre_fix + std::to_string(count())) {}
Variable(const std::string pre_fix, bool is_grad)
: desc_(pre_fix + (is_grad ? framework::kGradVarSuffix
: std::to_string(count()))) {}
~Variable() { LOG(INFO) << "Deleting " << Name(); }
  // Instantiate LoDTensor/SelectedRows
void InitializeVariable();
VariableHandle Grad() {
if (grad_.expired()) {
VariableHandle new_grad(new Variable(desc_.Name(), true));
grad_ = new_grad;
return new_grad;
} else {
return VariableHandle(grad_);
}
}
// Stochastic Gradient Descent with Momentum
// VariableHandle Momentum ();
// void init(const std::string& initializer,
// const framework::AttributeMap& attrs);
// void value() {};
const framework::VarDesc& Desc() const { return desc_; }
framework::VarDesc* MutableDesc() { return &desc_; }
// TODO(tonyyang-svail): No need to expose name
std::string Name() const { return desc_.Name(); }
framework::Variable* Var() { return &var_; }
private:
int count() {
static int counter = 0;
return counter++;
}
framework::VarDesc desc_;
framework::Variable var_;
std::weak_ptr<Variable> grad_;
};
}  // namespace tape
}  // namespace paddle
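// A minimal standalone sketch (plain standard C++, toy types) of the weak_ptr
// pattern behind Variable::Grad() above: a node hands out shared handles to a
// lazily created gradient but only keeps a weak reference itself, so the
// gradient is freed once no caller holds it and recreated on the next request.
// The names Node and grad_ below are illustrative stand-ins, not framework types.
#include <cassert>
#include <memory>

struct Node {
  std::weak_ptr<Node> grad_;

  std::shared_ptr<Node> Grad() {
    if (grad_.expired()) {
      auto new_grad = std::make_shared<Node>();
      grad_ = new_grad;
      return new_grad;
    }
    return grad_.lock();
  }
};

int main() {
  Node n;
  auto g1 = n.Grad();
  auto g2 = n.Grad();
  assert(g1 == g2);           // the same gradient is reused while a handle is alive
  g1.reset();
  g2.reset();
  assert(n.grad_.expired());  // dropped once all handles are released
  auto g3 = n.Grad();         // and recreated on the next request
  (void)g3;
  return 0;
}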
...@@ -73,6 +73,9 @@ void BroadcastOpHandle::RunImpl() { ...@@ -73,6 +73,9 @@ void BroadcastOpHandle::RunImpl() {
int root_id = boost::get<platform::CUDAPlace>(in_tensor.place()).device; int root_id = boost::get<platform::CUDAPlace>(in_tensor.place()).device;
std::vector<std::function<void()>> broadcast_calls; std::vector<std::function<void()>> broadcast_calls;
int type = platform::ToNCCLDataType(in_tensor.type());
size_t numel = static_cast<size_t>(in_tensor.numel());
for (auto out_var_handle : out_var_handles) { for (auto out_var_handle : out_var_handles) {
Variable *out_var = var_scopes.at(out_var_handle->scope_idx_) Variable *out_var = var_scopes.at(out_var_handle->scope_idx_)
->FindVar(out_var_handle->name_); ->FindVar(out_var_handle->name_);
...@@ -87,13 +90,11 @@ void BroadcastOpHandle::RunImpl() { ...@@ -87,13 +90,11 @@ void BroadcastOpHandle::RunImpl() {
send_recv_buffer = const_cast<void *>(in_tensor.data<void>()); send_recv_buffer = const_cast<void *>(in_tensor.data<void>());
out_handle = out_var_handle; out_handle = out_var_handle;
} else { } else {
send_recv_buffer = send_recv_buffer = VariableVisitor::GetMutableTensor(out_var)
VariableVisitor::GetMutableTensor(out_var).mutable_data( .Resize(in_tensor.dims())
out_var_handle->place_); .mutable_data(out_var_handle->place_);
} }
int type = platform::ToNCCLDataType(in_tensor.type());
size_t numel = static_cast<size_t>(in_tensor.numel());
broadcast_calls.emplace_back( broadcast_calls.emplace_back(
[send_recv_buffer, numel, type, root_id, &nccl_ctx] { [send_recv_buffer, numel, type, root_id, &nccl_ctx] {
PADDLE_ENFORCE(platform::dynload::ncclBcast( PADDLE_ENFORCE(platform::dynload::ncclBcast(
......
...@@ -57,6 +57,7 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder( ...@@ -57,6 +57,7 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder(
for (auto &p : params) { for (auto &p : params) {
grad_names_.insert(GradVarName(p)); grad_names_.insert(GradVarName(p));
} }
balance_vars_.resize(places_.size(), 0);
} }
void MultiDevSSAGraphBuilder::CreateOpHandleIOs(SSAGraph *result, void MultiDevSSAGraphBuilder::CreateOpHandleIOs(SSAGraph *result,
...@@ -140,11 +141,30 @@ bool MultiDevSSAGraphBuilder::IsDistTrainOp( ...@@ -140,11 +141,30 @@ bool MultiDevSSAGraphBuilder::IsDistTrainOp(
checker(op.InputArgumentNames(), recv_vars); checker(op.InputArgumentNames(), recv_vars);
} }
size_t MultiDevSSAGraphBuilder::GetAppropriateDeviceID(
const std::vector<std::string> &var_names) const {
int64_t numel_sum = 0;
for (auto var_name : var_names) {
auto var_desc = all_vars_.at(var_name);
PADDLE_ENFORCE_NOT_NULL(var_desc);
auto dim = framework::make_ddim(var_desc->GetShape());
int64_t numel = framework::product(dim);
PADDLE_ENFORCE_GT(numel, 0);
numel_sum += numel;
}
auto smallest =
std::min_element(std::begin(balance_vars_), std::end(balance_vars_));
size_t dev_id =
static_cast<size_t>(std::distance(std::begin(balance_vars_), smallest));
balance_vars_[dev_id] += numel_sum;
return dev_id;
}
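// Standalone sketch of the load-balancing rule used by GetAppropriateDeviceID
// above: each variable (or group of variables) is placed on the device whose
// accumulated element count is currently the smallest, and that device's
// counter is then increased by the new size. PickDevice and the sizes below
// are made up for illustration only.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

size_t PickDevice(std::vector<int64_t>* balance, int64_t numel_sum) {
  auto smallest = std::min_element(balance->begin(), balance->end());
  size_t dev_id =
      static_cast<size_t>(std::distance(balance->begin(), smallest));
  (*balance)[dev_id] += numel_sum;
  return dev_id;
}

int main() {
  std::vector<int64_t> balance(4, 0);  // four devices, all empty
  for (int64_t numel : {1000, 300, 300, 300, 500}) {
    std::cout << "numel " << numel << " -> device "
              << PickDevice(&balance, numel) << "\n";
  }
  return 0;
}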
std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build( std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
const ProgramDesc &program) const { const ProgramDesc &program) const {
std::unordered_map<std::string, VarDesc *> all_vars;
for (auto *var : program.Block(0).AllVars()) { for (auto *var : program.Block(0).AllVars()) {
all_vars[var->Name()] = var; all_vars_.emplace(var->Name(), var);
} }
auto graph = new SSAGraph(); auto graph = new SSAGraph();
...@@ -161,35 +181,16 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build( ...@@ -161,35 +181,16 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
auto send_vars = FindDistTrainSendVars(program); auto send_vars = FindDistTrainSendVars(program);
auto recv_vars = FindDistTrainRecvVars(program); auto recv_vars = FindDistTrainRecvVars(program);
std::vector<std::unordered_set<std::string>> var_name_on_devices;
std::vector<std::unordered_set<std::string>> bcast_var_name_set; std::vector<std::unordered_set<std::string>> bcast_var_name_set;
var_name_on_devices.resize(places_.size());
bcast_var_name_set.resize(places_.size()); bcast_var_name_set.resize(places_.size());
size_t cur_device_id = 0; size_t cur_device_id = 0;
std::vector<int64_t> balance_grads(places_.size(), 0);
auto get_appropriate_dev = [&](std::string &g_name) -> size_t {
auto var_desc = all_vars.at(g_name);
PADDLE_ENFORCE_NOT_NULL(var_desc);
auto dim = framework::make_ddim(var_desc->GetShape());
int64_t numel = framework::product(dim);
PADDLE_ENFORCE_GE(numel, 0);
auto smallest =
std::min_element(std::begin(balance_grads), std::end(balance_grads));
size_t dev_id =
static_cast<size_t>(std::distance(std::begin(balance_grads), smallest));
balance_grads[dev_id] += numel;
return dev_id;
};
bool is_forwarding = true; bool is_forwarding = true;
for (auto *op : program.Block(0).AllOps()) { for (auto *op : program.Block(0).AllOps()) {
if (boost::get<int>( if (boost::get<int>(
op->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) == op->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) ==
static_cast<int>(OpRole::kRPC)) { static_cast<int>(OpRole::kRPC)) {
// append rpc op if program is distributed trainer main program.
// always use the first device
CreateRPCOp(&result, *op); CreateRPCOp(&result, *op);
} else if (IsDistTrainOp(*op, send_vars, recv_vars)) { } else if (IsDistTrainOp(*op, send_vars, recv_vars)) {
CreateDistTrainOp(&result, *op); CreateDistTrainOp(&result, *op);
...@@ -199,15 +200,19 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build( ...@@ -199,15 +200,19 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
BuildStrategy::GradientScaleStrategy::kCustomized) { BuildStrategy::GradientScaleStrategy::kCustomized) {
CreateScaleLossGradOp(&result); CreateScaleLossGradOp(&result);
} }
      // This assumes the backward generating code will ensure IsScaleLossOp
      // is true only for the op that scales the final scalar loss.
      // It also assumes backward ops will always follow the forward ops in
      // the block.
is_forwarding = false; is_forwarding = false;
} else { } else {
int op_dev_id = GetOpDeviceID(var_name_on_devices, *op); int op_dev_id = GetOpDeviceID(*op);
if (op_dev_id == -1) { // var on all device if (op_dev_id == -1) { // var on all device
CreateComputationalOps(&result, *op, places_.size()); CreateComputationalOps(&result, *op, places_.size());
} else { } else {
CreateComputationalOp(&result, *op, op_dev_id); CreateComputationalOp(&result, *op, op_dev_id);
for (auto &var_name : op->OutputArgumentNames()) { for (auto &var_name : op->OutputArgumentNames()) {
var_name_on_devices[op_dev_id].emplace(var_name); var_name_on_devices_.emplace(var_name, op_dev_id);
} }
} }
if (!is_forwarding && places_.size() > 1) { if (!is_forwarding && places_.size() > 1) {
...@@ -230,19 +235,22 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build( ...@@ -230,19 +235,22 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
switch (strategy_.reduce_) { switch (strategy_.reduce_) {
case BuildStrategy::ReduceStrategy::kReduce: case BuildStrategy::ReduceStrategy::kReduce:
cur_device_id = get_appropriate_dev(g_name); cur_device_id = GetAppropriateDeviceID({g_name});
CreateReduceOp(&result, g_name, cur_device_id); CreateReduceOp(&result, g_name, cur_device_id);
var_name_on_devices[cur_device_id].emplace(g_name); var_name_on_devices_.emplace(g_name, cur_device_id);
bcast_var_name_set[cur_device_id].emplace(p_name); bcast_var_name_set[cur_device_id].emplace(p_name);
break; break;
case BuildStrategy::ReduceStrategy::kAllReduce: case BuildStrategy::ReduceStrategy::kAllReduce:
if (IsSparseGradient(all_vars, g_name)) { if (IsSparseGradient(g_name)) {
CreateReduceOp(&result, g_name, 0); CreateReduceOp(&result, g_name, 0);
CreateBroadcastOp(&result, g_name, 0); CreateBroadcastOp(&result, g_name, 0);
} else { } else {
InsertAllReduceOp(&result, g_name); InsertAllReduceOp(&result, g_name);
} }
break; break;
default:
LOG(FATAL) << "Unknown reduce strategy ";
break;
} }
} }
} catch (boost::bad_get e) { } catch (boost::bad_get e) {
...@@ -261,7 +269,7 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build( ...@@ -261,7 +269,7 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
} }
/* /*
Dependency graph has been constructed. However, there are still data Dependency graph has been constructed. However, there are still data
harzaeds need to be handled. hazards need to be handled.
*/ */
PolishGraphToSupportDataHazards(&result); PolishGraphToSupportDataHazards(&result);
...@@ -273,11 +281,9 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build( ...@@ -273,11 +281,9 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
return std::unique_ptr<SSAGraph>(graph); return std::unique_ptr<SSAGraph>(graph);
} }
bool MultiDevSSAGraphBuilder::IsSparseGradient( bool MultiDevSSAGraphBuilder::IsSparseGradient(const std::string &og) const {
const std::unordered_map<std::string, VarDesc *> &all_vars, PADDLE_ENFORCE(all_vars_.count(og) != 0);
const std::string &og) const { if (all_vars_.at(og)->GetType() == proto::VarType::SELECTED_ROWS) {
PADDLE_ENFORCE(all_vars.count(og) != 0);
if (all_vars.at(og)->GetType() == proto::VarType::SELECTED_ROWS) {
return true; return true;
} }
return false; return false;
...@@ -345,7 +351,7 @@ void MultiDevSSAGraphBuilder::InsertAllReduceOp(SSAGraph *result, ...@@ -345,7 +351,7 @@ void MultiDevSSAGraphBuilder::InsertAllReduceOp(SSAGraph *result,
auto &prev_grad = vars.back(); auto &prev_grad = vars.back();
op_handle->AddInput(prev_grad.get()); op_handle->AddInput(prev_grad.get());
auto var = new VarHandle(vars.size() - 1, i, og, p); auto var = new VarHandle(vars.size(), i, og, p);
vars.emplace_back(var); vars.emplace_back(var);
op_handle->AddOutput(var); op_handle->AddOutput(var);
} }
...@@ -363,24 +369,23 @@ bool MultiDevSSAGraphBuilder::IsParameterGradientOnce( ...@@ -363,24 +369,23 @@ bool MultiDevSSAGraphBuilder::IsParameterGradientOnce(
return is_pg_once; return is_pg_once;
} }
int MultiDevSSAGraphBuilder::GetOpDeviceID( int MultiDevSSAGraphBuilder::GetOpDeviceID(const OpDesc &op) const {
const std::vector<std::unordered_set<std::string>> &var_name_on_devices,
const OpDesc &op) const {
if (strategy_.reduce_ != BuildStrategy::ReduceStrategy::kReduce) { if (strategy_.reduce_ != BuildStrategy::ReduceStrategy::kReduce) {
return -1; return -1;
} }
int var_dev_id = -1; for (auto &varname : op.InputArgumentNames()) {
for (auto &var_name : op.InputArgumentNames()) { int dev_id = GetVarDeviceID(varname);
if (var_dev_id != -1) break; if (dev_id != -1) {
for (size_t i = 0; i < var_name_on_devices.size(); ++i) { return dev_id;
if (var_name_on_devices[i].count(var_name)) {
var_dev_id = static_cast<int>(i);
break;
}
} }
} }
return var_dev_id; return -1;
}
int MultiDevSSAGraphBuilder::GetVarDeviceID(const std::string &varname) const {
auto got = var_name_on_devices_.find(varname);
return got == var_name_on_devices_.end() ? -1 : got->second;
} }
void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(SSAGraph *result) const { void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(SSAGraph *result) const {
...@@ -442,13 +447,14 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(SSAGraph *result, ...@@ -442,13 +447,14 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(SSAGraph *result,
op_handle->AddInput(prev_grad.get()); op_handle->AddInput(prev_grad.get());
} }
auto &vars = result->vars_[dst_dev_id][og]; auto &vars = result->vars_[dst_dev_id][og];
auto var = auto var = new VarHandle(vars.size(), dst_dev_id, og, places_[dst_dev_id]);
new VarHandle(vars.size() - 1, dst_dev_id, og, places_[dst_dev_id]);
vars.emplace_back(var); vars.emplace_back(var);
op_handle->AddOutput(var); op_handle->AddOutput(var);
return var; return var;
} }
// Find the first occurrence of `prev_op_name` and make current `op` depend
// on it.
void MultiDevSSAGraphBuilder::ConnectOp(SSAGraph *result, OpHandleBase *op, void MultiDevSSAGraphBuilder::ConnectOp(SSAGraph *result, OpHandleBase *op,
const std::string &prev_op_name) const { const std::string &prev_op_name) const {
for (auto &prev_op : result->ops_) { for (auto &prev_op : result->ops_) {
...@@ -463,16 +469,66 @@ void MultiDevSSAGraphBuilder::ConnectOp(SSAGraph *result, OpHandleBase *op, ...@@ -463,16 +469,66 @@ void MultiDevSSAGraphBuilder::ConnectOp(SSAGraph *result, OpHandleBase *op,
void MultiDevSSAGraphBuilder::CreateDistTrainOp(SSAGraph *result, void MultiDevSSAGraphBuilder::CreateDistTrainOp(SSAGraph *result,
const OpDesc &op) const { const OpDesc &op) const {
CreateComputationalOp(result, op, 0); int op_dev_id = -1;
if (op.Type() == "split_byref") {
op_dev_id = GetVarDeviceID(op.InputArgumentNames()[0]);
if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) {
op_dev_id = GetAppropriateDeviceID(op.InputArgumentNames());
for (auto &varname : op.InputArgumentNames()) {
var_name_on_devices_.emplace(varname, op_dev_id);
}
}
for (auto &varname : op.OutputArgumentNames()) {
var_name_on_devices_.emplace(varname, op_dev_id);
}
} else if (op.Type() == "concat") {
op_dev_id = GetVarDeviceID(op.InputArgumentNames()[0]);
} else {
    PADDLE_THROW(
        "the distributed training related op should be in [split_byref, "
        "concat].");
  }
  PADDLE_ENFORCE(op_dev_id != -1,
                 "can not find the right place for distributed op: %s",
                 op.Type());
CreateComputationalOp(result, op, op_dev_id);
if (op.Type() == "concat") { if (op.Type() == "concat") {
ConnectOp(result, result->ops_.back().get(), "fetch_barrier"); ConnectOp(result, result->ops_.back().get(), "fetch_barrier");
} }
} }
// Create RPC related op handles that connect their in ops and out ops.
void MultiDevSSAGraphBuilder::CreateRPCOp(SSAGraph *result, void MultiDevSSAGraphBuilder::CreateRPCOp(SSAGraph *result,
const OpDesc &op) const { const OpDesc &op) const {
result->ops_.emplace_back( int op_dev_id = -1;
new RPCOpHandle(op, local_scopes_[0], op.Type(), places_[0])); if (op.Type() == "send") {
op_dev_id = GetVarDeviceID(op.InputArgumentNames()[0]);
    // A variable name that contains ".block" means it was split by the
    // split_byref op, so we can balance the variable blocks across all the
    // pserver instances.
if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce &&
op.InputArgumentNames()[0].find(".block") == std::string::npos) {
op_dev_id = GetAppropriateDeviceID(op.InputArgumentNames());
for (auto &varname : op.InputArgumentNames()) {
var_name_on_devices_.emplace(varname, op_dev_id);
}
}
} else if (op.Type() == "recv") {
op_dev_id = GetAppropriateDeviceID(op.OutputArgumentNames());
for (auto &varname : op.OutputArgumentNames()) {
var_name_on_devices_.emplace(varname, op_dev_id);
}
} else {
// send_barrier and fetch_barrier op can be scheduled on device 0
op_dev_id = 0;
}
PADDLE_ENFORCE(op_dev_id != -1, "can not find the right place for rpc op: %s",
op.Type());
result->ops_.emplace_back(new RPCOpHandle(op, local_scopes_[op_dev_id],
op.Type(), places_[op_dev_id]));
if (op.Type() == "send_barrier") { if (op.Type() == "send_barrier") {
ConnectOp(result, result->ops_.back().get(), "send"); ConnectOp(result, result->ops_.back().get(), "send");
...@@ -488,9 +544,7 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(SSAGraph *result, ...@@ -488,9 +544,7 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(SSAGraph *result,
"send, send_barrier. recv, fetch_barrier]"); "send, send_barrier. recv, fetch_barrier]");
} }
// TODO(Yancey1989): schedule rpc op on different place may CreateOpHandleIOs(result, op, op_dev_id);
// increate throughput
CreateOpHandleIOs(result, op, 0);
} }
bool MultiDevSSAGraphBuilder::IsScaleLossOp(const OpDesc &op) const { bool MultiDevSSAGraphBuilder::IsScaleLossOp(const OpDesc &op) const {
......
...@@ -47,10 +47,11 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder { ...@@ -47,10 +47,11 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
#endif #endif
std::unique_ptr<SSAGraph> Build(const ProgramDesc &program) const override; std::unique_ptr<SSAGraph> Build(const ProgramDesc &program) const override;
int GetVarDeviceID(const std::string &varname) const override;
private: private:
void CreateOpHandleIOs(SSAGraph *result, const OpDesc &op, void CreateOpHandleIOs(SSAGraph *result, const OpDesc &op,
size_t place_id) const; size_t device_id) const;
private: private:
std::string loss_var_name_; std::string loss_var_name_;
...@@ -96,21 +97,23 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder { ...@@ -96,21 +97,23 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
const std::string &og, const std::string &og,
std::unordered_set<std::string> *og_has_been_broadcast) const; std::unordered_set<std::string> *og_has_been_broadcast) const;
int GetOpDeviceID( int GetOpDeviceID(const OpDesc &op) const;
const std::vector<std::unordered_set<std::string>> &var_name_on_devices,
const OpDesc &op) const;
void InsertAllReduceOp(SSAGraph *result, const std::string &og) const; void InsertAllReduceOp(SSAGraph *result, const std::string &og) const;
void CreateBroadcastOp(SSAGraph *result, const std::string &p_name, void CreateBroadcastOp(SSAGraph *result, const std::string &p_name,
size_t src_dev_id) const; size_t src_dev_id) const;
bool IsSparseGradient( bool IsSparseGradient(const std::string &og) const;
const std::unordered_map<std::string, VarDesc *> &all_vars,
const std::string &og) const; size_t GetAppropriateDeviceID(
const std::vector<std::string> &var_names) const;
private: private:
BuildStrategy strategy_; BuildStrategy strategy_;
mutable std::unordered_map<std::string, VarDesc *> all_vars_;
mutable std::unordered_map<std::string, int> var_name_on_devices_;
mutable std::vector<int64_t> balance_vars_;
void SetCommunicationContext(OpHandleBase *op_handle, void SetCommunicationContext(OpHandleBase *op_handle,
const platform::Place &p) const; const platform::Place &p) const;
......
...@@ -11,8 +11,8 @@ ...@@ -11,8 +11,8 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/details/op_handle_base.h"
#include <map>
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -122,11 +122,16 @@ void OpHandleBase::RunAndRecordEvent(const std::function<void()> &callback) { ...@@ -122,11 +122,16 @@ void OpHandleBase::RunAndRecordEvent(const std::function<void()> &callback) {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
if (!events_.empty()) { // Use event if (!events_.empty()) { // Use event
std::function<void()> method = callback; std::function<void()> method = callback;
    // NOTE(zcd): the device contexts must be ordered here because RecordEvent
    // uses a mutex to ensure thread safety.
std::map<platform::DeviceContext *, platform::Place> ordered_ctxes;
for (auto &p : dev_ctxes_) { for (auto &p : dev_ctxes_) {
ordered_ctxes.emplace(p.second, p.first);
}
for (auto &p : ordered_ctxes) {
method = [method, p, this]() { method = [method, p, this]() {
static_cast<platform::CUDADeviceContext *>(p.second)->RecordEvent( static_cast<platform::CUDADeviceContext *>(p.first)->RecordEvent(
events_.at(boost::get<platform::CUDAPlace>(p.first).device), events_.at(boost::get<platform::CUDAPlace>(p.second).device),
method); method);
}; };
} }
......
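// Standalone sketch of the re-ordering trick noted above: entries of an
// unordered map are copied into a std::map keyed by the context pointer, so
// every thread walks the contexts (and takes their locks / records events) in
// the same deterministic order, avoiding lock-order inversions. Ctx and the
// string "places" are toy stand-ins for the framework types.
#include <map>
#include <string>
#include <unordered_map>

struct Ctx { /* would own a mutex and RecordEvent() in the real code */ };

int main() {
  Ctx a, b, c;
  std::unordered_map<std::string, Ctx*> dev_ctxes = {
      {"place0", &a}, {"place1", &b}, {"place2", &c}};

  // A pointer-keyed std::map gives one fixed iteration order for all threads.
  std::map<Ctx*, std::string> ordered;
  for (auto& p : dev_ctxes) {
    ordered.emplace(p.second, p.first);
  }
  for (auto& p : ordered) {
    // lock p.first's mutex / record its event here, always in this order
    (void)p;
  }
  return 0;
}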
...@@ -30,6 +30,7 @@ class SSAGraphBuilder { ...@@ -30,6 +30,7 @@ class SSAGraphBuilder {
SSAGraphBuilder() {} SSAGraphBuilder() {}
virtual ~SSAGraphBuilder() {} virtual ~SSAGraphBuilder() {}
virtual std::unique_ptr<SSAGraph> Build(const ProgramDesc &program) const = 0; virtual std::unique_ptr<SSAGraph> Build(const ProgramDesc &program) const = 0;
virtual int GetVarDeviceID(const std::string &var_name) const { return -1; }
DISABLE_COPY_AND_ASSIGN(SSAGraphBuilder); DISABLE_COPY_AND_ASSIGN(SSAGraphBuilder);
......
...@@ -96,6 +96,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( ...@@ -96,6 +96,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
auto cur_ready_vars = ready_vars.PopAll(1, &timeout); auto cur_ready_vars = ready_vars.PopAll(1, &timeout);
if (timeout) { if (timeout) {
std::lock_guard<std::mutex> l(exception_mu_);
if (exception_) { if (exception_) {
auto exp = *exception_; auto exp = *exception_;
exception_.reset(); exception_.reset();
...@@ -199,6 +200,7 @@ void ThreadedSSAGraphExecutor::RunOp( ...@@ -199,6 +200,7 @@ void ThreadedSSAGraphExecutor::RunOp(
ready_var_q->Extend(op->Outputs()); ready_var_q->Extend(op->Outputs());
VLOG(10) << op << " " << op->Name() << "Signal posted"; VLOG(10) << op << " " << op->Name() << "Signal posted";
} catch (platform::EnforceNotMet ex) { } catch (platform::EnforceNotMet ex) {
std::lock_guard<std::mutex> l(exception_mu_);
exception_.reset(new platform::EnforceNotMet(ex)); exception_.reset(new platform::EnforceNotMet(ex));
} catch (...) { } catch (...) {
LOG(FATAL) << "Unknown exception catched"; LOG(FATAL) << "Unknown exception catched";
......
...@@ -56,6 +56,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor { ...@@ -56,6 +56,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
std::vector<Scope *> local_scopes_; std::vector<Scope *> local_scopes_;
std::vector<platform::Place> places_; std::vector<platform::Place> places_;
platform::DeviceContextPool fetch_ctxs_; platform::DeviceContextPool fetch_ctxs_;
std::mutex exception_mu_;
std::unique_ptr<platform::EnforceNotMet> exception_; std::unique_ptr<platform::EnforceNotMet> exception_;
std::atomic<int> running_ops_; std::atomic<int> running_ops_;
......
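// Standalone sketch of the pattern that the new exception_mu_ protects above:
// worker threads store the first exception they hit under a mutex, and the
// scheduling loop rethrows it when it notices one. std::exception_ptr is used
// here instead of platform::EnforceNotMet to keep the example self-contained.
#include <exception>
#include <mutex>
#include <stdexcept>
#include <thread>

std::mutex exception_mu;
std::exception_ptr captured;

void Worker() {
  try {
    throw std::runtime_error("op failed");
  } catch (...) {
    std::lock_guard<std::mutex> l(exception_mu);
    if (!captured) captured = std::current_exception();
  }
}

int main() {
  std::thread t(Worker);
  t.join();
  std::lock_guard<std::mutex> l(exception_mu);
  if (captured) {
    try {
      std::rethrow_exception(captured);
    } catch (const std::exception& e) {
      // the main loop would surface e.what() to the caller here
      (void)e;
    }
  }
  return 0;
}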
...@@ -21,7 +21,7 @@ limitations under the License. */ ...@@ -21,7 +21,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/reader.h"
#ifdef PADDLE_WITH_DISTRIBUTE #ifdef PADDLE_WITH_DISTRIBUTE
#include "paddle/fluid/operators/detail/grpc_client.h" #include "paddle/fluid/operators/distributed/grpc_client.h"
#endif #endif
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
...@@ -49,8 +49,8 @@ Executor::Executor(const platform::Place& place) : place_(place) {} ...@@ -49,8 +49,8 @@ Executor::Executor(const platform::Place& place) : place_(place) {}
#ifdef PADDLE_WITH_DISTRIBUTE #ifdef PADDLE_WITH_DISTRIBUTE
void Executor::Complete() { void Executor::Complete() {
::paddle::operators::detail::RPCClient::GetInstance< ::paddle::operators::distributed::RPCClient::GetInstance<
::paddle::operators::detail::GRPCClient>() ::paddle::operators::distributed::GRPCClient>()
->SendComplete(); ->SendComplete();
} }
#endif #endif
...@@ -295,13 +295,14 @@ void Executor::Run(const ProgramDesc& program, Scope* scope, ...@@ -295,13 +295,14 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
std::unique_ptr<ExecutorPrepareContext> Executor::Prepare( std::unique_ptr<ExecutorPrepareContext> Executor::Prepare(
const ProgramDesc& program, int block_id) { const ProgramDesc& program, int block_id) {
auto* ctx = new ExecutorPrepareContext(program, block_id); std::unique_ptr<ExecutorPrepareContext> ctx(
new ExecutorPrepareContext(program, block_id));
PADDLE_ENFORCE_LT(static_cast<size_t>(block_id), program.Size()); PADDLE_ENFORCE_LT(static_cast<size_t>(block_id), program.Size());
auto& block = program.Block(block_id); auto& block = program.Block(block_id);
for (auto& op_desc : block.AllOps()) { for (auto& op_desc : block.AllOps()) {
ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc)); ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc));
} }
return std::unique_ptr<ExecutorPrepareContext>(ctx); return ctx;
} }
std::vector<std::shared_ptr<ExecutorPrepareContext>> Executor::Prepare( std::vector<std::shared_ptr<ExecutorPrepareContext>> Executor::Prepare(
...@@ -320,7 +321,8 @@ std::vector<std::shared_ptr<ExecutorPrepareContext>> Executor::Prepare( ...@@ -320,7 +321,8 @@ std::vector<std::shared_ptr<ExecutorPrepareContext>> Executor::Prepare(
} }
void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
bool create_local_scope, bool create_vars) { bool create_local_scope, bool create_vars,
bool keep_kids) {
Scope* local_scope = scope; Scope* local_scope = scope;
if (create_vars) { if (create_vars) {
if (create_local_scope) { if (create_local_scope) {
...@@ -343,12 +345,20 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, ...@@ -343,12 +345,20 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
} }
} }
platform::DeviceContextPool::Instance().Get(place_)->Wait(); platform::DeviceContextPool::Instance().Get(place_)->Wait();
if (create_vars && create_local_scope) { if (local_scope != scope) {
scope->DeleteScope(local_scope); scope->DeleteScope(local_scope);
} else { } else {
// Delete the local scopes created in operators. if (!keep_kids) {
    // By default, we should delete all kid scopes after running the executor,
    // because some operators may create local scopes when running, such as
    // while_op. But when while_op also creates a local executor to run its
    // sub-block, the sub-scopes it creates should not be dropped immediately,
    // because while_grad_op will use some variables created during the
    // while_op run, so we need to keep the kids and wait for the outer
    // executor to drop them.
scope->DropKids(); scope->DropKids();
} }
}
if (FLAGS_benchmark) { if (FLAGS_benchmark) {
VLOG(2) << "-------------------------------------------------------"; VLOG(2) << "-------------------------------------------------------";
VLOG(2) << "Memory used after deleting local scope: " VLOG(2) << "Memory used after deleting local scope: "
......
...@@ -78,7 +78,7 @@ class Executor { ...@@ -78,7 +78,7 @@ class Executor {
void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
bool create_local_scope = true, bool create_local_scope = true,
bool create_vars = true); bool create_vars = true, bool keep_kids = false);
void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
std::map<std::string, const LoDTensor*>* feed_targets, std::map<std::string, const LoDTensor*>* feed_targets,
......
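// Standalone sketch of why the new keep_kids flag matters: a parent scope owns
// kid scopes created while ops run, and dropping them too early would
// invalidate variables a later backward op still needs. Scope and RunBlock
// below are toy stand-ins, not the framework::Scope class or executor API.
#include <memory>
#include <vector>

struct Scope {
  std::vector<std::unique_ptr<Scope>> kids;
  Scope* NewKid() {
    kids.emplace_back(new Scope());
    return kids.back().get();
  }
  void DropKids() { kids.clear(); }
};

void RunBlock(Scope* scope, bool keep_kids) {
  Scope* kid = scope->NewKid();  // e.g. created by while_op for one iteration
  (void)kid;
  if (!keep_kids) {
    scope->DropKids();  // default path: nothing outside needs the kid scopes
  }
  // keep_kids == true: leave them for while_grad_op; the outer executor drops
  // them once the whole program (forward + backward) has finished.
}

int main() {
  Scope global;
  RunBlock(&global, /*keep_kids=*/true);
  global.DropKids();
  return 0;
}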
...@@ -27,6 +27,7 @@ enum AttrType { ...@@ -27,6 +27,7 @@ enum AttrType {
BOOLEANS = 7; BOOLEANS = 7;
BLOCK = 8; BLOCK = 8;
LONG = 9; LONG = 9;
BLOCKS = 10;
} }
// OpDesc describes an instance of a C++ framework::OperatorBase // OpDesc describes an instance of a C++ framework::OperatorBase
...@@ -46,6 +47,7 @@ message OpDesc { ...@@ -46,6 +47,7 @@ message OpDesc {
repeated bool bools = 11; repeated bool bools = 11;
optional int32 block_idx = 12; optional int32 block_idx = 12;
optional int64 l = 13; optional int64 l = 13;
repeated int32 blocks_idx = 14;
}; };
message Var { message Var {
......
...@@ -51,8 +51,6 @@ std::ostream &operator<<(std::ostream &os, const LoD &lod) { ...@@ -51,8 +51,6 @@ std::ostream &operator<<(std::ostream &os, const LoD &lod) {
} }
std::ostream &operator<<(std::ostream &os, const LoDTensor &t) { std::ostream &operator<<(std::ostream &os, const LoDTensor &t) {
PADDLE_ENFORCE(t.type().hash_code() == typeid(float).hash_code());
if (!platform::is_cpu_place(t.place())) { if (!platform::is_cpu_place(t.place())) {
LoDTensor tt; LoDTensor tt;
framework::TensorCopy(t, platform::CPUPlace(), &tt); framework::TensorCopy(t, platform::CPUPlace(), &tt);
...@@ -70,7 +68,13 @@ std::ostream &operator<<(std::ostream &os, const LoDTensor &t) { ...@@ -70,7 +68,13 @@ std::ostream &operator<<(std::ostream &os, const LoDTensor &t) {
// only print first ten elements // only print first ten elements
int64_t size = t.numel() < 10 ? t.numel() : 10; int64_t size = t.numel() < 10 ? t.numel() : 10;
for (int64_t i = 0; i < size; ++i) { for (int64_t i = 0; i < size; ++i) {
if (t.type().hash_code() == typeid(float).hash_code()) {
os << t.data<float>()[i] << " "; os << t.data<float>()[i] << " ";
} else if (t.type().hash_code() == typeid(int64_t).hash_code()) {
os << t.data<int64_t>()[i] << " ";
} else {
PADDLE_THROW("LoDTensor data type not in [float, int64_t]");
}
} }
return os; return os;
......
...@@ -26,6 +26,20 @@ ...@@ -26,6 +26,20 @@
namespace paddle { namespace paddle {
namespace framework { namespace framework {
TEST(LoD, PrintLoDTensor) {
LoDTensor tensor1;
tensor1.mutable_data<float>(platform::CPUPlace());
tensor1.data<float>()[0] = 0.2;
tensor1.data<float>()[1] = 0.5;
LOG(INFO) << tensor1;
LoDTensor tensor2;
tensor2.mutable_data<int64_t>(platform::CPUPlace());
tensor2.data<int64_t>()[0] = 1;
tensor2.data<int64_t>()[1] = 2;
LOG(INFO) << tensor2;
}
TEST(LoD, data) { TEST(LoD, data) {
LoD lod{{0, 1, 2}}; LoD lod{{0, 1, 2}};
lod.push_back({0, 2, 4, 5}); lod.push_back({0, 2, 4, 5});
...@@ -37,7 +51,7 @@ TEST(LoD, data) { ...@@ -37,7 +51,7 @@ TEST(LoD, data) {
} }
} }
TEST(LodExpand, test) { TEST(LoD, ExpandLoD) {
LoD lod{{0, 2}}; LoD lod{{0, 2}};
LoDTensor tensor; LoDTensor tensor;
tensor.set_lod(lod); tensor.set_lod(lod);
......
...@@ -211,6 +211,12 @@ void OpDesc::SetBlockAttr(const std::string &name, BlockDesc *block) { ...@@ -211,6 +211,12 @@ void OpDesc::SetBlockAttr(const std::string &name, BlockDesc *block) {
need_update_ = true; need_update_ = true;
} }
void OpDesc::SetBlocksAttr(const std::string &name,
std::vector<BlockDesc *> blocks) {
this->attrs_[name] = blocks;
need_update_ = true;
}
void OpDesc::SetAttrMap( void OpDesc::SetAttrMap(
const std::unordered_map<std::string, Attribute> &attr_map) { const std::unordered_map<std::string, Attribute> &attr_map) {
attrs_ = attr_map; attrs_ = attr_map;
...@@ -305,6 +311,13 @@ struct SetAttrDescVisitor : public boost::static_visitor<void> { ...@@ -305,6 +311,13 @@ struct SetAttrDescVisitor : public boost::static_visitor<void> {
void operator()(const std::vector<bool> &v) const { void operator()(const std::vector<bool> &v) const {
VectorToRepeated(v, attr_->mutable_bools()); VectorToRepeated(v, attr_->mutable_bools());
} }
void operator()(const std::vector<BlockDesc *> &v) const {
std::vector<int> blocks_idx;
for (auto blk : v) {
blocks_idx.push_back(blk->ID());
}
VectorToRepeated(blocks_idx, attr_->mutable_blocks_idx());
}
void operator()(BlockDesc *desc) const { attr_->set_block_idx(desc->ID()); } void operator()(BlockDesc *desc) const { attr_->set_block_idx(desc->ID()); }
void operator()(int64_t v) const { attr_->set_l(v); } void operator()(int64_t v) const { attr_->set_l(v); }
void operator()(boost::blank) const { PADDLE_THROW("Unexpected branch"); } void operator()(boost::blank) const { PADDLE_THROW("Unexpected branch"); }
......
...@@ -77,6 +77,8 @@ class OpDesc { ...@@ -77,6 +77,8 @@ class OpDesc {
void SetBlockAttr(const std::string &name, BlockDesc *block); void SetBlockAttr(const std::string &name, BlockDesc *block);
void SetBlocksAttr(const std::string &name, std::vector<BlockDesc *> blocks);
Attribute GetAttr(const std::string &name) const; Attribute GetAttr(const std::string &name) const;
Attribute GetNullableAttr(const std::string &name) const; Attribute GetNullableAttr(const std::string &name) const;
......
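// Standalone sketch of how a "blocks" attribute is serialized above: the
// attribute holds pointers to blocks, and only their indices (blk->ID()) are
// written into the repeated blocks_idx field of the proto. Block and
// OpDescProto below are simplified stand-ins for the framework types.
#include <cassert>
#include <vector>

struct Block {
  int id;
  int ID() const { return id; }
};
struct OpDescProto {
  std::vector<int> blocks_idx;
};

void SerializeBlocksAttr(const std::vector<Block*>& v, OpDescProto* attr) {
  for (auto* blk : v) {
    attr->blocks_idx.push_back(blk->ID());
  }
}

int main() {
  Block b1{1}, b2{2};
  std::vector<Block*> blocks = {&b1, &b2};
  OpDescProto proto;
  SerializeBlocksAttr(blocks, &proto);
  assert(proto.blocks_idx.size() == 2 && proto.blocks_idx[1] == 2);
  return 0;
}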
...@@ -110,7 +110,6 @@ ParallelExecutor::ParallelExecutor( ...@@ -110,7 +110,6 @@ ParallelExecutor::ParallelExecutor(
// Step 3. Convert main_program to SSA form and dependency graph. Also, insert // Step 3. Convert main_program to SSA form and dependency graph. Also, insert
// ncclOp // ncclOp
details::SSAGraphBuilderFactory builder_factory( details::SSAGraphBuilderFactory builder_factory(
member_->places_, loss_var_name, params, member_->local_scopes_, member_->places_, loss_var_name, params, member_->local_scopes_,
build_strategy); build_strategy);
...@@ -122,9 +121,10 @@ ParallelExecutor::ParallelExecutor( ...@@ -122,9 +121,10 @@ ParallelExecutor::ParallelExecutor(
#endif #endif
} }
builder_ = builder_factory.Create();
member_->executor_.reset(new details::ThreadedSSAGraphExecutor( member_->executor_.reset(new details::ThreadedSSAGraphExecutor(
exec_strategy, member_->local_scopes_, places, exec_strategy, member_->local_scopes_, places,
builder_factory.Create()->Build(main_program))); builder_->Build(main_program)));
member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor( member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor(
exec_strategy, member_->local_scopes_, std::move(var_infos), exec_strategy, member_->local_scopes_, std::move(var_infos),
...@@ -133,10 +133,22 @@ ParallelExecutor::ParallelExecutor( ...@@ -133,10 +133,22 @@ ParallelExecutor::ParallelExecutor(
void ParallelExecutor::BCastParamsToGPUs( void ParallelExecutor::BCastParamsToGPUs(
const std::unordered_set<std::string> &vars) const { const std::unordered_set<std::string> &vars) const {
auto *main_scope = member_->local_scopes_[0]; // On the initial bcast, all vars are bcast from device(0); otherwise
// bcast from the specified device.
bool initialize = builder_.get() == nullptr ? true : false;
for (auto &var : vars) { for (auto &var : vars) {
auto *main_var = main_scope->FindVar(var); int var_dev_id =
builder_.get() == nullptr ? -1 : builder_->GetVarDeviceID(var);
if (!initialize && var_dev_id == -1) continue;
framework::Variable *main_var = nullptr;
if (initialize) {
main_var = member_->local_scopes_[0]->FindVar(var);
} else {
main_var = member_->local_scopes_[var_dev_id]->FindVar(var);
}
if (main_var == nullptr || !main_var->IsType<LoDTensor>()) { if (main_var == nullptr || !main_var->IsType<LoDTensor>()) {
continue; continue;
} }
...@@ -151,7 +163,8 @@ void ParallelExecutor::BCastParamsToGPUs( ...@@ -151,7 +163,8 @@ void ParallelExecutor::BCastParamsToGPUs(
for (size_t i = 0; i < member_->places_.size(); ++i) { for (size_t i = 0; i < member_->places_.size(); ++i) {
auto place = member_->places_[i]; auto place = member_->places_[i];
void *buffer; void *buffer;
if (i == 0) {
if ((initialize && i == 0) || (!initialize && i == var_dev_id)) {
buffer = const_cast<void *>(main_tensor.data<void>()); buffer = const_cast<void *>(main_tensor.data<void>());
} else { } else {
auto local_scope = member_->local_scopes_[i]; auto local_scope = member_->local_scopes_[i];
......
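// Standalone sketch of the broadcast-root selection above: on the first
// (initialization) broadcast every parameter is sent from device 0; afterwards
// a parameter is sent from the device that owns it according to the graph
// builder, and parameters with no recorded owner are skipped. BcastRoot and
// the owner list below are illustrative only.
#include <iostream>
#include <vector>

int BcastRoot(bool initialize, int var_dev_id) {
  if (initialize) return 0;  // initial bcast always roots at device 0
  return var_dev_id;         // later bcasts root at the owning device
}

int main() {
  std::vector<int> owners = {-1, 2, 0};  // -1: no device recorded for this var
  bool initialize = false;
  for (int var_dev_id : owners) {
    if (!initialize && var_dev_id == -1) continue;  // nothing to broadcast
    std::cout << "bcast from device " << BcastRoot(initialize, var_dev_id)
              << "\n";
  }
  return 0;
}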
...@@ -19,12 +19,14 @@ limitations under the License. */ ...@@ -19,12 +19,14 @@ limitations under the License. */
#include <unordered_set> #include <unordered_set>
#include <vector> #include <vector>
#include "paddle/fluid/framework/details/execution_strategy.h" #include "paddle/fluid/framework/details/execution_strategy.h"
#include "paddle/fluid/framework/details/multi_devices_graph_builder.h"
#include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -68,6 +70,7 @@ class ParallelExecutor { ...@@ -68,6 +70,7 @@ class ParallelExecutor {
private: private:
ParallelExecutorPrivate *member_; ParallelExecutorPrivate *member_;
std::unique_ptr<details::SSAGraphBuilder> builder_;
}; };
} // namespace framework } // namespace framework
......
...@@ -35,7 +35,8 @@ using VariableNameMap = std::map<std::string, std::vector<std::string>>; ...@@ -35,7 +35,8 @@ using VariableNameMap = std::map<std::string, std::vector<std::string>>;
using Attribute = using Attribute =
boost::variant<boost::blank, int, float, std::string, std::vector<int>, boost::variant<boost::blank, int, float, std::string, std::vector<int>,
std::vector<float>, std::vector<std::string>, bool, std::vector<float>, std::vector<std::string>, bool,
std::vector<bool>, BlockDesc*, int64_t>; std::vector<bool>, BlockDesc*, int64_t,
std::vector<BlockDesc*>>;
using AttributeMap = std::unordered_map<std::string, Attribute>; using AttributeMap = std::unordered_map<std::string, Attribute>;
......
...@@ -21,6 +21,8 @@ ...@@ -21,6 +21,8 @@
* big. * big.
*/ */
#pragma once
#include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/inference/analysis/data_flow_graph.h" #include "paddle/fluid/inference/analysis/data_flow_graph.h"
...@@ -43,7 +45,7 @@ struct Argument { ...@@ -43,7 +45,7 @@ struct Argument {
#define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0) #define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0)
#define ANALYSIS_ARGUMENT_CHECK_FIELD(field__) \ #define ANALYSIS_ARGUMENT_CHECK_FIELD(field__) \
if (!UNLIKELY(field__)) { \ if (UNLIKELY(!(field__))) { \
LOG(ERROR) << "field " << #field__ << " should be set."; \ LOG(ERROR) << "field " << #field__ << " should be set."; \
return false; \ return false; \
} }
......
...@@ -27,7 +27,7 @@ void TensorRTSubGraphPass::Run(DataFlowGraph *graph) { ...@@ -27,7 +27,7 @@ void TensorRTSubGraphPass::Run(DataFlowGraph *graph) {
SubGraphFuse(graph, node_inside_subgraph_teller_); SubGraphFuse(graph, node_inside_subgraph_teller_);
} }
} // analysis } // namespace analysis
} // inference } // namespace inference
} // paddle } // namespace paddle
...@@ -184,8 +184,8 @@ else() ...@@ -184,8 +184,8 @@ else()
set(DEPS_OPS ${DEPS_OPS} nccl_op) set(DEPS_OPS ${DEPS_OPS} nccl_op)
endif() endif()
add_subdirectory(detail)
if(WITH_DISTRIBUTE) if(WITH_DISTRIBUTE)
add_subdirectory(distributed)
set(DISTRIBUTE_DEPS "") set(DISTRIBUTE_DEPS "")
if(WITH_GRPC) if(WITH_GRPC)
...@@ -195,18 +195,11 @@ if(WITH_DISTRIBUTE) ...@@ -195,18 +195,11 @@ if(WITH_DISTRIBUTE)
endif() endif()
set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
op_library(prefetch_op DEPS ${DISTRIBUTE_DEPS}) foreach(dist_op "prefetch_op" "listen_and_serv_op" "send_op" "recv_op" "send_barrier_op" "fetch_barrier_op")
set_source_files_properties(prefetch_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) op_library(${dist_op} DEPS ${DISTRIBUTE_DEPS})
op_library(recv_op DEPS ${DISTRIBUTE_DEPS}) set_source_files_properties(${dist_op}.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(recv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) endforeach()
op_library(listen_and_serv_op DEPS ${DISTRIBUTE_DEPS})
set_source_files_properties(listen_and_serv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
op_library(send_op DEPS ${DISTRIBUTE_DEPS})
set_source_files_properties(send_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
op_library(send_barrier_op DEPS ${DISTRIBUTE_DEPS})
op_library(fetch_barrier_op DEPS ${DISTRIBUTE_DEPS})
set_source_files_properties(send_barrier_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(fetch_barrier_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
#set_source_files_properties(send_recv_op_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) #set_source_files_properties(send_recv_op_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
#cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS prefetch_op send_op #cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS prefetch_op send_op
# listen_and_serv_op sum_op executor SERIAL) # listen_and_serv_op sum_op executor SERIAL)
......
...@@ -12,16 +12,20 @@ ...@@ -12,16 +12,20 @@
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "mkldnn.hpp"
#include "paddle/fluid/operators/activation_op.h" #include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/operators/mkldnn_activation_op.h"
#include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/platform/mkldnn_helper.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
using paddle::framework::Tensor; using framework::DataLayout;
using paddle::platform::MKLDNNDeviceContext; using framework::Tensor;
using mkldnn::memory;
using mkldnn::primitive;
using mkldnn::stream;
using platform::GetMKLDNNFormat;
using platform::MKLDNNDeviceContext;
using platform::to_void_cast;
namespace { namespace {
std::string gethash(const mkldnn::memory::dims &operand_dims, std::string gethash(const mkldnn::memory::dims &operand_dims,
...@@ -35,188 +39,260 @@ std::string gethash(const mkldnn::memory::dims &operand_dims, ...@@ -35,188 +39,260 @@ std::string gethash(const mkldnn::memory::dims &operand_dims,
}; };
return dim2str(operand_dims) + std::to_string(algorithm); return dim2str(operand_dims) + std::to_string(algorithm);
} }
} // namespace
template <typename Functor>
class MKLDNNActivationKernel
: public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
const auto *x = ctx.Input<Tensor>("X");
PADDLE_ENFORCE(x->layout() == DataLayout::kMKLDNN &&
x->format() != memory::format::format_undef,
"Wrong layout/format set for Input x tensor");
Functor functor;
auto attrs = functor.GetAttrs();
for (auto &attr : attrs) {
*attr.second = ctx.Attr<float>(attr.first);
}
functor(ctx);
}
};
template <typename T, typename ExecContext> template <typename Functor>
void eltwise_forward(const ExecContext &ctx, mkldnn::algorithm algorithm, class MKLDNNActivationGradKernel
const T alpha = 0, const T beta = 0) { : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
const auto *diff_y = ctx.Input<Tensor>(framework::GradVarName("Out"));
PADDLE_ENFORCE(diff_y->layout() == DataLayout::kMKLDNN &&
diff_y->format() != memory::format::format_undef,
"Wrong layout/format set for Input OutGrad tensor");
Functor functor;
auto attrs = functor.GetAttrs();
for (auto &attr : attrs) {
*attr.second = ctx.Attr<float>(attr.first);
}
functor(ctx);
}
};
template <typename T>
void eltwise_forward(const framework::ExecutionContext &ctx,
mkldnn::algorithm algorithm, const T alpha = 0,
const T beta = 0) {
PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
"It must use CPUPlace."); "It must use CPUPlace.");
auto &dev_ctx = ctx.template device_context<MKLDNNDeviceContext>(); auto &dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
const auto &mkldnn_engine = dev_ctx.GetEngine(); const auto &mkldnn_engine = dev_ctx.GetEngine();
// get buffers const auto *x = ctx.Input<Tensor>("X");
const auto *src = ctx.template Input<Tensor>("X"); auto *y = ctx.Output<Tensor>("Out");
const auto *src_data = src->template data<T>();
auto *dst = ctx.template Output<Tensor>("Out"); const T *x_data = x->data<T>();
T *dst_data = dst->template mutable_data<T>(ctx.GetPlace()); T *y_data = y->mutable_data<T>(ctx.GetPlace());
// get memory dim PADDLE_ENFORCE(x->dims().size() == 2 || x->dims().size() == 4,
PADDLE_ENFORCE(src->dims().size() == 2 || src->dims().size() == 4,
"Input dim must be with 2 or 4"); "Input dim must be with 2 or 4");
std::vector<int> src_tz = framework::vectorize2int(src->dims());
std::vector<int> src_tz = framework::vectorize2int(x->dims());
auto src_format =
src_tz.size() == 2 ? mkldnn::memory::format::nc : x->format();
const std::string key = gethash(src_tz, algorithm); const std::string key = gethash(src_tz, algorithm);
const std::string key_src_data = const std::string key_src_data =
key + ctx.op().Output("Out") + "@eltwise_fwd_src_data"; key + ctx.op().Output("Out") + "@eltwise_fwd_src_data";
const std::string key_src_mem = key + "@eltwise_fwd_src_mem"; const std::string key_src_layout =
const std::string key_dst_mem = key + "@eltwise_fwd_dst_mem"; key + ctx.op().Output("Out") + "@eltwise_fwd_src_layout";
const std::string key_fwd = key + "@eltwise_fwd"; const std::string key_with_layout = key + std::to_string(src_format);
const std::string key_src_mem = key_with_layout + "@eltwise_fwd_src_mem";
const std::string key_dst_mem = key_with_layout + "@eltwise_fwd_dst_mem";
const std::string key_fwd = key_with_layout + "@eltwise_fwd";
const std::string key_fwd_pd = key_with_layout + "@eltwise_fwd_pd";
// save input data and layout to be referred in backward path
auto p_src_data = std::make_shared<const T *>(x_data);
dev_ctx.SetBlob(key_src_data, p_src_data);
auto p_src_layout = std::make_shared<memory::format>(src_format);
dev_ctx.SetBlob(key_src_layout, p_src_layout);
auto p_fwd = std::static_pointer_cast<mkldnn::eltwise_forward>( auto p_fwd = std::static_pointer_cast<mkldnn::eltwise_forward>(
dev_ctx.GetBlob(key_fwd)); dev_ctx.GetBlob(key_fwd));
// save input data to be referred in backward path std::shared_ptr<memory> dst_memory;
auto p_src_data = std::make_shared<const T *>(src_data);
dev_ctx.SetBlob(key_src_data, p_src_data);
if (p_fwd == nullptr) { if (p_fwd == nullptr) {
// create memory description // create mkldnn memory for input X
auto data_md = src_tz.size() == 2 auto src_md = platform::MKLDNNMemDesc(
? platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32, src_tz, platform::MKLDNNGetDataType<T>(), src_format);
mkldnn::memory::format::nc) auto src_memory = std::shared_ptr<memory>(
: platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32, new memory({src_md, mkldnn_engine}, to_void_cast(x_data)));
mkldnn::memory::format::nchw); // save src_memory to be referred in backward path
dev_ctx.SetBlob(key_src_mem, src_memory);
// create memory primitives
auto p_src_mem = std::make_shared<mkldnn::memory>(mkldnn::memory( // create primitive descriptor for activation forward and save it
{data_md, mkldnn_engine}, platform::to_void_cast(src_data))); auto forward_desc = mkldnn::eltwise_forward::desc(
dev_ctx.SetBlob(key_src_mem, p_src_mem); mkldnn::prop_kind::forward_training, algorithm,
src_memory->get_primitive_desc().desc(), alpha, beta);
auto p_dst_mem = std::make_shared<mkldnn::memory>(mkldnn::memory( auto forward_pd = std::make_shared<mkldnn::eltwise_forward::primitive_desc>(
{data_md, mkldnn_engine}, platform::to_void_cast(dst_data))); forward_desc, mkldnn_engine);
dev_ctx.SetBlob(key_dst_mem, p_dst_mem);
// save prim desc into global device context to be referred in backward path
auto fwd_desc = mkldnn::eltwise_forward::desc( dev_ctx.SetBlob(key_fwd_pd, forward_pd);
mkldnn::prop_kind::forward_training, algorithm, data_md, alpha, beta);
auto p_fwd_pd = std::make_shared<mkldnn::eltwise_forward::primitive_desc>( // create mkldnn memory for output y
fwd_desc, mkldnn_engine); dst_memory =
const std::string key_fwd_pd = key + "eltwise_fwd_pd"; std::make_shared<memory>(forward_pd->dst_primitive_desc(), y_data);
dev_ctx.SetBlob(key_fwd_pd, p_fwd_pd);
p_fwd = std::make_shared<mkldnn::eltwise_forward>( dev_ctx.SetBlob(key_dst_mem, dst_memory);
*p_fwd_pd, *(p_src_mem.get()), *(p_dst_mem.get()));
// create activation primitive
p_fwd = std::make_shared<mkldnn::eltwise_forward>(*forward_pd, *src_memory,
*dst_memory);
dev_ctx.SetBlob(key_fwd, p_fwd); dev_ctx.SetBlob(key_fwd, p_fwd);
} else { } else {
// primitives already exist // primitives already exist
auto p_src_mem = auto src_memory =
std::static_pointer_cast<mkldnn::memory>(dev_ctx.GetBlob(key_src_mem)); std::static_pointer_cast<mkldnn::memory>(dev_ctx.GetBlob(key_src_mem));
PADDLE_ENFORCE(p_src_mem != nullptr, PADDLE_ENFORCE(src_memory != nullptr,
"Fail to find eltwise p_src_mem in device context."); "Fail to find eltwise src_memory in device context.");
auto p_dst_mem = dst_memory =
std::static_pointer_cast<mkldnn::memory>(dev_ctx.GetBlob(key_dst_mem)); std::static_pointer_cast<mkldnn::memory>(dev_ctx.GetBlob(key_dst_mem));
PADDLE_ENFORCE(p_dst_mem != nullptr, PADDLE_ENFORCE(dst_memory != nullptr,
"Fail to find eltwise p_src_mem in device context."); "Fail to find eltwise dst_memory in device context.");
p_src_mem->set_data_handle(platform::to_void_reinterpret_cast(src_data)); src_memory->set_data_handle(platform::to_void_cast(x_data));
p_dst_mem->set_data_handle(dst_data); dst_memory->set_data_handle(y_data);
} }
// push primitive to stream and wait until it's executed // push primitive to stream and wait until it's executed
std::vector<mkldnn::primitive> pipeline = {*(p_fwd.get())}; std::vector<primitive> pipeline;
mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); pipeline.push_back(*p_fwd);
stream(stream::kind::eager).submit(pipeline).wait();
y->set_layout(DataLayout::kMKLDNN);
y->set_format(GetMKLDNNFormat(*dst_memory));
} }
template <typename T, typename ExecContext> template <typename T>
void eltwise_grad(const ExecContext &ctx, mkldnn::algorithm algorithm, void eltwise_grad(const framework::ExecutionContext &ctx,
const T alpha = 0, const T beta = 0) { mkldnn::algorithm algorithm, const T alpha = 0,
const T beta = 0) {
auto &dev_ctx = ctx.template device_context<MKLDNNDeviceContext>(); auto &dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
const auto &mkldnn_engine = dev_ctx.GetEngine(); const auto &mkldnn_engine = dev_ctx.GetEngine();
// get buffers const auto *diff_y = ctx.Input<Tensor>(framework::GradVarName("Out"));
  const auto *diff_y = ctx.Input<Tensor>(framework::GradVarName("Out"));
  auto *diff_x = ctx.Output<Tensor>(framework::GradVarName("X"));

  const T *diff_y_data = diff_y->data<T>();
  T *diff_x_data = diff_x->mutable_data<T>(ctx.GetPlace());

  std::vector<int> diff_dst_tz = framework::vectorize2int(diff_y->dims());

  auto diff_y_format =
      diff_dst_tz.size() == 2 ? mkldnn::memory::format::nc : diff_y->format();

  const std::string key = gethash(diff_dst_tz, algorithm);
  const std::string key_src_data =
      key + ctx.op().Input("Out") + "@eltwise_fwd_src_data";
  const std::string key_src_layout =
      key + ctx.op().Input("Out") + "@eltwise_fwd_src_layout";
  const auto p_src_layout =
      std::static_pointer_cast<memory::format>(dev_ctx.GetBlob(key_src_layout));
  const std::string key_src_mem =
      key + std::to_string(*p_src_layout) + "@eltwise_fwd_src_mem";
  const std::string key_fwd_pd =
      key + std::to_string(*p_src_layout) + "@eltwise_fwd_pd";
  const std::string key_with_layouts =
      key + std::to_string(*p_src_layout) + "-" + std::to_string(diff_y_format);
  const std::string key_diff_src_mem =
      key_with_layouts + "@eltwise_diff_src_mem";
  const std::string key_diff_dst_mem =
      key_with_layouts + "@eltwise_diff_dst_mem";
  const std::string key_grad = key_with_layouts + "@eltwise_grad";

  const auto p_src_data =
      std::static_pointer_cast<T *>(dev_ctx.GetBlob(key_src_data));

  auto src_memory =
      std::static_pointer_cast<mkldnn::memory>(dev_ctx.GetBlob(key_src_mem));
  PADDLE_ENFORCE(src_memory != nullptr,
                 "Fail to find src_memory in device context");
  src_memory->set_data_handle(*p_src_data.get());
  std::shared_ptr<memory> diff_src_memory;

  auto p_grad = std::static_pointer_cast<mkldnn::eltwise_backward>(
      dev_ctx.GetBlob(key_grad));

  if (p_grad == nullptr) {
    // create mkldnn memory for input diff_y
    auto diff_dst_md = platform::MKLDNNMemDesc(
        diff_dst_tz, platform::MKLDNNGetDataType<T>(), diff_y_format);
    auto diff_dst_memory = std::shared_ptr<memory>(
        new memory({diff_dst_md, mkldnn_engine}, to_void_cast(diff_y_data)));
    dev_ctx.SetBlob(key_diff_dst_mem, diff_dst_memory);

    // retrieve eltwise primitive desc from device context
    auto forward_pd =
        std::static_pointer_cast<mkldnn::eltwise_forward::primitive_desc>(
            dev_ctx.GetBlob(key_fwd_pd));
    PADDLE_ENFORCE(forward_pd != nullptr,
                   "Fail to find eltwise_fwd_pd in device context");

    // create primitive descriptor for activation backward
    auto backward_desc = mkldnn::eltwise_backward::desc(
        algorithm, diff_dst_memory->get_primitive_desc().desc(),
        src_memory->get_primitive_desc().desc(), alpha, beta);
    auto backward_pd = mkldnn::eltwise_backward::primitive_desc(
        backward_desc, mkldnn_engine, *forward_pd);

    // create mkldnn memory for output diff_src
    diff_src_memory = std::make_shared<memory>(
        backward_pd.diff_src_primitive_desc(), diff_x_data);
    dev_ctx.SetBlob(key_diff_src_mem, diff_src_memory);

    // create activation backward primitive
    p_grad = std::make_shared<mkldnn::eltwise_backward>(
        backward_pd, *src_memory, *diff_dst_memory, *diff_src_memory);
    dev_ctx.SetBlob(key_grad, p_grad);
  } else {
    // primitives already exist
    diff_src_memory = std::static_pointer_cast<mkldnn::memory>(
        dev_ctx.GetBlob(key_diff_src_mem));
    auto diff_dst_memory = std::static_pointer_cast<mkldnn::memory>(
        dev_ctx.GetBlob(key_diff_dst_mem));
    diff_src_memory->set_data_handle(
        platform::to_void_reinterpret_cast(diff_x_data));
    diff_dst_memory->set_data_handle(
        platform::to_void_reinterpret_cast(diff_y_data));
  }

  // push primitive to stream and wait until it's executed
  std::vector<primitive> pipeline;
  pipeline.push_back(*p_grad);
  stream(stream::kind::eager).submit(pipeline).wait();

  diff_x->set_layout(DataLayout::kMKLDNN);
  diff_x->set_format(GetMKLDNNFormat(*diff_src_memory));
}
}  // anonymous namespace
template <typename T, mkldnn::algorithm algorithm> template <typename T, mkldnn::algorithm algorithm>
struct MKLDNNActivationFunc : public BaseActivationFunctor<T> { struct MKLDNNActivationFunc : public BaseActivationFunctor<T> {
  void operator()(const framework::ExecutionContext &ctx) const {
eltwise_forward<T>(ctx, algorithm); eltwise_forward<T>(ctx, algorithm);
} }
}; };
template <typename T, mkldnn::algorithm algorithm> template <typename T, mkldnn::algorithm algorithm>
struct MKLDNNActivationGradFunc : public BaseActivationFunctor<T> { struct MKLDNNActivationGradFunc : public BaseActivationFunctor<T> {
  void operator()(const framework::ExecutionContext &ctx) const {
eltwise_grad<T>(ctx, algorithm); eltwise_grad<T>(ctx, algorithm);
} }
}; };
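Note on the refactor above: every reusable MKL-DNN object (the forward source memory, the forward primitive descriptor, and the backward primitive) is cached in the device context under a string key, and that key now also encodes the memory formats involved. The following is a minimal, self-contained sketch of this create-once/look-up-later pattern; BlobStore and FakePrimitive are hypothetical stand-ins for the real MKLDNNDeviceContext blob map and mkldnn primitives, not the actual Paddle classes.

#include <iostream>
#include <memory>
#include <string>
#include <unordered_map>

// Hypothetical stand-in for the device context's blob map: objects are stored
// type-erased as shared_ptr<void> under a string key and cast back on reuse.
class BlobStore {
 public:
  void SetBlob(const std::string& key, std::shared_ptr<void> blob) {
    blobs_[key] = std::move(blob);
  }
  std::shared_ptr<void> GetBlob(const std::string& key) const {
    auto it = blobs_.find(key);
    return it == blobs_.end() ? nullptr : it->second;
  }

 private:
  std::unordered_map<std::string, std::shared_ptr<void>> blobs_;
};

struct FakePrimitive {  // stands in for an mkldnn primitive
  int id;
};

int main() {
  BlobStore dev_ctx;
  // The key encodes everything the cached object depends on: shape, algorithm
  // and (after this patch) the memory formats of the tensors involved.
  const std::string key = "2x3-relu-nchw-nchw@eltwise_grad";

  auto p = std::static_pointer_cast<FakePrimitive>(dev_ctx.GetBlob(key));
  if (p == nullptr) {
    p = std::make_shared<FakePrimitive>(FakePrimitive{42});
    dev_ctx.SetBlob(key, p);  // create once, reuse on later iterations
  }
  std::cout << "primitive id: " << p->id << "\n";
  return 0;
}

Because the blobs are type-erased, the key alone has to determine the stored object's configuration, which is presumably why the diff_y format is folded into key_with_layouts in the change above.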
......
...@@ -19,6 +19,8 @@ limitations under the License. */ ...@@ -19,6 +19,8 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace operators { namespace operators {
using paddle::framework::Tensor;
#define REGISTER_ACTIVATION_OP_MAKER(OP_NAME, OP_COMMENT) \ #define REGISTER_ACTIVATION_OP_MAKER(OP_NAME, OP_COMMENT) \
class OP_NAME##OpMaker \ class OP_NAME##OpMaker \
: public ::paddle::framework::OpProtoAndCheckerMaker { \ : public ::paddle::framework::OpProtoAndCheckerMaker { \
...@@ -27,9 +29,9 @@ namespace operators { ...@@ -27,9 +29,9 @@ namespace operators {
AddInput("X", "Input of " #OP_NAME " operator"); \ AddInput("X", "Input of " #OP_NAME " operator"); \
AddOutput("Out", "Output of " #OP_NAME " operator").Reuse("X"); \ AddOutput("Out", "Output of " #OP_NAME " operator").Reuse("X"); \
AddAttr<bool>("use_mkldnn", \ AddAttr<bool>("use_mkldnn", \
"(default false) Only used in mkldnn kernel") \ "(bool, default false) Only used in mkldnn kernel") \
.SetDefault(false); \ .SetDefault(false); \
AddComment(OP_COMMENT); \ AddComment(#OP_COMMENT); \
} \ } \
} }
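The behavioural change in this hunk is AddComment(OP_COMMENT) becoming AddComment(#OP_COMMENT): the preprocessor # operator turns the macro argument into a string literal instead of evaluating it. A tiny standalone illustration follows; the DESCRIBE macro and its argument are made up for this sketch.

#include <iostream>
#include <string>

// '#' stringizes the macro argument, so DESCRIBE(Tanh Activation Operator.)
// yields the text "Tanh Activation Operator." rather than treating the
// tokens as identifiers.
#define DESCRIBE(COMMENT) std::string(#COMMENT)

int main() {
  std::cout << DESCRIBE(Tanh Activation Operator.) << std::endl;
  return 0;
}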
...@@ -58,7 +60,6 @@ framework::OpKernelType GetKernelType(const framework::ExecutionContext& ctx, ...@@ -58,7 +60,6 @@ framework::OpKernelType GetKernelType(const framework::ExecutionContext& ctx,
const framework::OperatorWithKernel& oper, const framework::OperatorWithKernel& oper,
const std::string& name) { const std::string& name) {
framework::LibraryType library{framework::LibraryType::kPlain}; framework::LibraryType library{framework::LibraryType::kPlain};
framework::DataLayout layout = framework::DataLayout::kAnyLayout; framework::DataLayout layout = framework::DataLayout::kAnyLayout;
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
auto it = oper.Attrs().find("use_mkldnn"); auto it = oper.Attrs().find("use_mkldnn");
...@@ -82,6 +83,7 @@ class ActivationOp : public framework::OperatorWithKernel { ...@@ -82,6 +83,7 @@ class ActivationOp : public framework::OperatorWithKernel {
ctx->ShareLoD("X", /*->*/ "Out"); ctx->ShareLoD("X", /*->*/ "Out");
} }
protected:
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
return GetKernelType(ctx, *this, "X"); return GetKernelType(ctx, *this, "X");
...@@ -96,6 +98,7 @@ class ActivationOpGrad : public framework::OperatorWithKernel { ...@@ -96,6 +98,7 @@ class ActivationOpGrad : public framework::OperatorWithKernel {
ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("Out")); ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("Out"));
} }
protected:
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
return GetKernelType(ctx, *this, "Out"); return GetKernelType(ctx, *this, "Out");
...@@ -140,7 +143,7 @@ $$out = \\frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$ ...@@ -140,7 +143,7 @@ $$out = \\frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$
__attribute__((unused)) constexpr char TanhShrinkDoc[] = R"DOC( __attribute__((unused)) constexpr char TanhShrinkDoc[] = R"DOC(
TanhShrink Activation Operator. TanhShrink Activation Operator.
$$out = x - \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$ $$out = x - \\frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$
)DOC"; )DOC";
...@@ -382,7 +385,7 @@ class STanhOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -382,7 +385,7 @@ class STanhOpMaker : public framework::OpProtoAndCheckerMaker {
AddComment(R"DOC( AddComment(R"DOC(
STanh Activation Operator. STanh Activation Operator.
$$out = b * \frac{e^{a * x} - e^{-a * x}}{e^{a * x} + e^{-a * x}}$$ $$out = b * \\frac{e^{a * x} - e^{-a * x}}{e^{a * x} + e^{-a * x}}$$
)DOC"); )DOC");
} }
......
...@@ -21,8 +21,6 @@ namespace operators { ...@@ -21,8 +21,6 @@ namespace operators {
using batch_norm_bwd = mkldnn::batch_normalization_backward; using batch_norm_bwd = mkldnn::batch_normalization_backward;
using batch_norm_fwd = mkldnn::batch_normalization_forward; using batch_norm_fwd = mkldnn::batch_normalization_forward;
using framework::DataLayout;
using framework::Tensor;
using mkldnn::memory; using mkldnn::memory;
using mkldnn::primitive; using mkldnn::primitive;
using mkldnn::reorder; using mkldnn::reorder;
...@@ -31,18 +29,6 @@ using paddle::platform::MKLDNNDeviceContext; ...@@ -31,18 +29,6 @@ using paddle::platform::MKLDNNDeviceContext;
using paddle::platform::MKLDNNMemDesc; using paddle::platform::MKLDNNMemDesc;
using platform::to_void_cast; using platform::to_void_cast;
template <typename T>
using EigenArrayMap =
Eigen::Map<Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
template <typename T>
using ConstEigenArrayMap =
Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
template <typename T>
using EigenVectorArrayMap = Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1>>;
template <typename T>
using ConstEigenVectorArrayMap =
Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, 1>>;
namespace { namespace {
template <typename T> template <typename T>
struct bn_type_traits { struct bn_type_traits {
......
...@@ -22,22 +22,6 @@ limitations under the License. */ ...@@ -22,22 +22,6 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace operators { namespace operators {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
using DataLayout = framework::DataLayout;
template <typename T>
using EigenArrayMap =
Eigen::Map<Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
template <typename T>
using ConstEigenArrayMap =
Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
template <typename T>
using EigenVectorArrayMap = Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1>>;
template <typename T>
using ConstEigenVectorArrayMap =
Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, 1>>;
class BatchNormOp : public framework::OperatorWithKernel { class BatchNormOp : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
......
...@@ -19,6 +19,22 @@ limitations under the License. */ ...@@ -19,6 +19,22 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace operators { namespace operators {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
using DataLayout = framework::DataLayout;
template <typename T>
using EigenArrayMap =
Eigen::Map<Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
template <typename T>
using ConstEigenArrayMap =
Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
template <typename T>
using EigenVectorArrayMap = Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1>>;
template <typename T>
using ConstEigenVectorArrayMap =
Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, 1>>;
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class BatchNormKernel : public framework::OpKernel<T> { class BatchNormKernel : public framework::OpKernel<T> {
public: public:
......
...@@ -110,6 +110,7 @@ REGISTER_OPERATOR(bilinear_interp, ops::BilinearInterpOp, ...@@ -110,6 +110,7 @@ REGISTER_OPERATOR(bilinear_interp, ops::BilinearInterpOp,
ops::BilinearInterpOpMaker, ops::BilinearInterpOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>); paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(bilinear_interp_grad, ops::BilinearInterpOpGrad); REGISTER_OPERATOR(bilinear_interp_grad, ops::BilinearInterpOpGrad);
REGISTER_OP_CPU_KERNEL(bilinear_interp, ops::BilinearInterpKernel<float>); REGISTER_OP_CPU_KERNEL(bilinear_interp, ops::BilinearInterpKernel<float>,
ops::BilinearInterpKernel<uint8_t>);
REGISTER_OP_CPU_KERNEL(bilinear_interp_grad, REGISTER_OP_CPU_KERNEL(bilinear_interp_grad,
ops::BilinearInterpGradKernel<float>); ops::BilinearInterpGradKernel<float>);
...@@ -46,8 +46,10 @@ class BilinearInterpKernel : public framework::OpKernel<T> { ...@@ -46,8 +46,10 @@ class BilinearInterpKernel : public framework::OpKernel<T> {
int in_chw = channels * in_hw; int in_chw = channels * in_hw;
int out_chw = channels * out_hw; int out_chw = channels * out_hw;
    T ratio_h = (out_h > 1) ? static_cast<T>(in_h - 1) / (out_h - 1) : 0.f; float ratio_h = (out_h > 1) ? static_cast<float>(in_h - 1) / (out_h - 1) : 0.f;
    T ratio_w = (out_w > 1) ? static_cast<T>(in_w - 1) / (out_w - 1) : 0.f; float ratio_w = (out_w > 1) ? static_cast<float>(in_w - 1) / (out_w - 1) : 0.f;
if (in_h == out_h && in_w == out_w) { if (in_h == out_h && in_w == out_w) {
memcpy(output, input, input_t->numel() * sizeof(T)); memcpy(output, input, input_t->numel() * sizeof(T));
...@@ -56,24 +58,24 @@ class BilinearInterpKernel : public framework::OpKernel<T> { ...@@ -56,24 +58,24 @@ class BilinearInterpKernel : public framework::OpKernel<T> {
for (int i = 0; i < out_h; ++i) { // loop for images for (int i = 0; i < out_h; ++i) { // loop for images
int h = ratio_h * i; int h = ratio_h * i;
int hid = (h < in_h - 1) ? 1 : 0; int hid = (h < in_h - 1) ? 1 : 0;
T h1lambda = ratio_h * i - h; float h1lambda = ratio_h * i - h;
T h2lambda = 1 - h1lambda; float h2lambda = 1.f - h1lambda;
for (int j = 0; j < out_w; ++j) { for (int j = 0; j < out_w; ++j) {
int w = ratio_w * j; int w = ratio_w * j;
int wid = (w < in_w - 1) ? 1 : 0; int wid = (w < in_w - 1) ? 1 : 0;
T w1lambda = ratio_w * j - w; float w1lambda = ratio_w * j - w;
T w2lambda = 1 - w1lambda; float w2lambda = 1.f - w1lambda;
// calculate four position for bilinear interpolation // calculate four position for bilinear interpolation
const T* in_pos = &input[k * in_chw + h * in_w + w]; const T* in_pos = &input[k * in_chw + h * in_w + w];
T* out_pos = &output[k * out_chw + i * out_w + j]; T* out_pos = &output[k * out_chw + i * out_w + j];
for (int c = 0; c < channels; ++c) { // loop for channels for (int c = 0; c < channels; ++c) { // loop for channels
// bilinear interpolation // bilinear interpolation
out_pos[0] = out_pos[0] = static_cast<T>(
h2lambda * (w2lambda * in_pos[0] + w1lambda * in_pos[wid]) + h2lambda * (w2lambda * in_pos[0] + w1lambda * in_pos[wid]) +
h1lambda * (w2lambda * in_pos[hid * in_w] + h1lambda * (w2lambda * in_pos[hid * in_w] +
w1lambda * in_pos[hid * in_w + wid]); w1lambda * in_pos[hid * in_w + wid]));
in_pos += in_hw; in_pos += in_hw;
out_pos += out_hw; out_pos += out_hw;
} }
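For reference, the weights computed in this loop implement standard bilinear interpolation. With $h = \lfloor ratio_h \cdot i \rfloor$, $\lambda_{h1} = ratio_h \cdot i - h$, $\lambda_{h2} = 1 - \lambda_{h1}$ (and likewise for $w$), each output pixel is

$$out(i, j) = \lambda_{h2}\big(\lambda_{w2}\,in(h, w) + \lambda_{w1}\,in(h, w{+}1)\big) + \lambda_{h1}\big(\lambda_{w2}\,in(h{+}1, w) + \lambda_{w1}\,in(h{+}1, w{+}1)\big)$$

where the hid/wid guards simply clamp the h+1 and w+1 neighbours at the image border. The patch keeps the data in type T but does the weight arithmetic in float, apparently so that integral element types such as the newly registered uint8_t kernel do not truncate the fractional weights.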
...@@ -117,8 +119,10 @@ class BilinearInterpGradKernel : public framework::OpKernel<T> { ...@@ -117,8 +119,10 @@ class BilinearInterpGradKernel : public framework::OpKernel<T> {
int in_chw = channels * in_hw; int in_chw = channels * in_hw;
int out_chw = channels * out_hw; int out_chw = channels * out_hw;
    T ratio_h = (out_h > 1) ? static_cast<T>(in_h - 1) / (out_h - 1) : 0.f; float ratio_h = (out_h > 1) ? static_cast<float>(in_h - 1) / (out_h - 1) : 0.f;
    T ratio_w = (out_w > 1) ? static_cast<T>(in_w - 1) / (out_w - 1) : 0.f; float ratio_w = (out_w > 1) ? static_cast<float>(in_w - 1) / (out_w - 1) : 0.f;
if (in_h == out_h && in_w == out_w) { if (in_h == out_h && in_w == out_w) {
memcpy(d_input, d_output, d_input_t->numel() * sizeof(T)); memcpy(d_input, d_output, d_input_t->numel() * sizeof(T));
...@@ -127,22 +131,24 @@ class BilinearInterpGradKernel : public framework::OpKernel<T> { ...@@ -127,22 +131,24 @@ class BilinearInterpGradKernel : public framework::OpKernel<T> {
for (int i = 0; i < out_h; ++i) { // loop for images for (int i = 0; i < out_h; ++i) { // loop for images
int h = ratio_h * i; int h = ratio_h * i;
int hid = (h < in_h - 1) ? 1 : 0; int hid = (h < in_h - 1) ? 1 : 0;
T h1lambda = ratio_h * i - h; float h1lambda = ratio_h * i - h;
T h2lambda = 1 - h1lambda; float h2lambda = 1 - h1lambda;
for (int j = 0; j < out_w; ++j) { for (int j = 0; j < out_w; ++j) {
int w = ratio_w * j; int w = ratio_w * j;
int wid = (w < in_w - 1) ? 1 : 0; int wid = (w < in_w - 1) ? 1 : 0;
T w1lambda = ratio_w * j - w; float w1lambda = ratio_w * j - w;
T w2lambda = 1 - w1lambda; float w2lambda = 1 - w1lambda;
T* in_pos = &d_input[k * in_chw + h * in_w + w]; T* in_pos = &d_input[k * in_chw + h * in_w + w];
const T* out_pos = &d_output[k * out_chw + i * out_w + j]; const T* out_pos = &d_output[k * out_chw + i * out_w + j];
for (int c = 0; c < channels; ++c) { // loop for channels for (int c = 0; c < channels; ++c) { // loop for channels
in_pos[0] += h2lambda * w2lambda * out_pos[0]; in_pos[0] += static_cast<T>(h2lambda * w2lambda * out_pos[0]);
in_pos[wid] += h2lambda * w1lambda * out_pos[0]; in_pos[wid] += static_cast<T>(h2lambda * w1lambda * out_pos[0]);
          in_pos[hid * in_w] += h1lambda * w2lambda * out_pos[0]; in_pos[hid * in_w] += static_cast<T>(h1lambda * w2lambda * out_pos[0]);
          in_pos[hid * in_w + wid] += h1lambda * w1lambda * out_pos[0]; in_pos[hid * in_w + wid] += static_cast<T>(h1lambda * w1lambda * out_pos[0]);
in_pos += in_hw; in_pos += in_hw;
out_pos += out_hw; out_pos += out_hw;
} }
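The backward kernel is the transpose of the same stencil: each upstream gradient value is scattered into its four source pixels with the identical weights,

$$d\,in(h{+}a,\, w{+}b) \;{+}{=}\; \lambda_{h,a}\,\lambda_{w,b}\; d\,out(i, j), \qquad a, b \in \{0, 1\}$$

with $\lambda_{h,0} = \lambda_{h2}$ and $\lambda_{h,1} = \lambda_{h1}$ as defined for the forward pass above.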
......
...@@ -60,34 +60,45 @@ template <typename DeviceContext, typename T> ...@@ -60,34 +60,45 @@ template <typename DeviceContext, typename T>
class ConcatGradKernel : public framework::OpKernel<T> { class ConcatGradKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const { void Compute(const framework::ExecutionContext& ctx) const {
    auto* out_grad =
        ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
    auto ins = ctx.MultiInput<framework::Tensor>("X");
    auto out_var_names = ctx.Outputs(framework::GradVarName("X"));
    auto outs = ctx.MultiOutput<framework::Tensor>(framework::GradVarName("X"));
    int64_t axis = static_cast<int64_t>(ctx.Attr<int>("axis"));

    // collect output tensors whose names are not kEmptyVarName
    std::vector<framework::Tensor*> outputs;
    for (size_t j = 0; j < outs.size(); ++j) {
      if (out_var_names[j] != framework::kEmptyVarName) {
        outs[j]->mutable_data<T>(ctx.GetPlace());
        outputs.push_back(outs[j]);
      } else {
        outputs.push_back(nullptr);
      }
    }

    // Sometimes direct copies will be faster; whether that always holds may
    // need deeper analysis.
    if (axis == 0 && outs.size() < 10) {
      size_t input_offset = 0;
      const auto in_stride = framework::stride_numel(out_grad->dims());

      for (size_t i = 0; i < outs.size(); ++i) {
        auto out_stride = framework::stride_numel(ins[i]->dims());
        auto* out = outputs[i];
        if (out != nullptr) {
          StridedNumelCopyWithAxis<T>(
              ctx.device_context(), axis, out->data<T>(), out_stride,
              out_grad->data<T>() + input_offset, in_stride, out_stride[axis]);
        }
        input_offset += out_stride[axis];
      }
    } else {
      auto& dev_ctx = ctx.template device_context<DeviceContext>();
      paddle::operators::math::ConcatGradFunctor<DeviceContext, T>
          concat_grad_functor;
      concat_grad_functor(dev_ctx, *out_grad, ins, static_cast<int>(axis),
                          &outputs);
    }
  }
}; };
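A standalone sketch of the axis-0 fast path taken above: when the concatenation was along the leading axis, each input's gradient is one contiguous block of the output gradient, so a single offset copy per input is enough. All shapes and values below are made up for illustration.

#include <iostream>
#include <vector>

// Hypothetical shapes: concat of three inputs with 2, 3 and 1 rows along
// axis 0; the concatenated gradient therefore has 6 rows of `cols` floats.
int main() {
  const int cols = 4;
  const std::vector<int> rows = {2, 3, 1};
  std::vector<float> out_grad(6 * cols);
  for (size_t i = 0; i < out_grad.size(); ++i) {
    out_grad[i] = static_cast<float>(i);
  }

  // Split along axis 0: each input gradient is a contiguous block, so one
  // offset-and-copy per input suffices (the fast path in the kernel above).
  size_t offset = 0;
  for (size_t k = 0; k < rows.size(); ++k) {
    std::vector<float> in_grad(out_grad.begin() + offset,
                               out_grad.begin() + offset + rows[k] * cols);
    offset += rows[k] * cols;
    std::cout << "input " << k << " gets " << in_grad.size() << " values\n";
  }
  return 0;
}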
......
...@@ -15,13 +15,13 @@ ...@@ -15,13 +15,13 @@
#pragma once #pragma once
#ifdef PADDLE_WITH_GRPC #ifdef PADDLE_WITH_GRPC
#include "paddle/fluid/operators/detail/grpc_client.h" #include "paddle/fluid/operators/distributed/grpc_client.h"
#include "paddle/fluid/operators/detail/grpc_server.h" #include "paddle/fluid/operators/distributed/grpc_server.h"
#define RPCSERVER_T detail::AsyncGRPCServer #define RPCSERVER_T distributed::AsyncGRPCServer
#define RPCCLIENT_T detail::GRPCClient #define RPCCLIENT_T distributed::GRPCClient
#else #else
#include "paddle/fluid/operators/detail/brpc_client.h" #include "paddle/fluid/operators/distributed/brpc_client.h"
#include "paddle/fluid/operators/detail/brpc_server.h" #include "paddle/fluid/operators/distributed/brpc_server.h"
#define RPCSERVER_T detail::AsyncBRPCServer #define RPCSERVER_T distributed::AsyncBRPCServer
#define RPCCLIENT_T detail::BRPCClient #define RPCCLIENT_T distributed::BRPCClient
#endif #endif
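With these macros, operator code never names a transport directly: the build flag selects gRPC or bRPC once, and everything else goes through the common RPCServer/RPCClient interfaces. The following self-contained analogue shows the same pattern; GrpcServer, BrpcServer, USE_GRPC and RPC_SERVER_T are invented names for this sketch, not the Paddle macros.

#include <iostream>
#include <memory>
#include <string>

// A preprocessor flag picks the concrete class; callers only see the alias.
struct GrpcServer { std::string name() const { return "grpc"; } };
struct BrpcServer { std::string name() const { return "brpc"; } };

#ifdef USE_GRPC
#define RPC_SERVER_T GrpcServer
#else
#define RPC_SERVER_T BrpcServer
#endif

int main() {
  std::unique_ptr<RPC_SERVER_T> server(new RPC_SERVER_T());
  std::cout << "selected backend: " << server->name() << "\n";
  return 0;
}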
...@@ -175,12 +175,12 @@ class DetectionMAPOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -175,12 +175,12 @@ class DetectionMAPOpMaker : public framework::OpProtoAndCheckerMaker {
AddComment(R"DOC( AddComment(R"DOC(
Detection mAP evaluate operator. Detection mAP evaluate operator.
The general steps are as follows. First, calculate the true positive and The general steps are as follows. First, calculate the true positive and
false positive according to the input of detection and labels, then false positive according to the input of detection and labels, then
calculate the mAP evaluate value. calculate the mAP evaluate value.
Supporting '11 point' and 'integral' mAP algorithm. Please get more information Supporting '11 point' and 'integral' mAP algorithm. Please get more information
from the following articles: from the following articles:
https://sanchom.wordpress.com/tag/average-precision/ https://sanchom.wordpress.com/tag/average-precision/
https://arxiv.org/abs/1512.02325 https://arxiv.org/abs/1512.02325
)DOC"); )DOC");
} }
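For context on the two modes named in this comment: precision and recall are computed from the accumulated true/false positives, $p = TP/(TP{+}FP)$ and $r = TP/(TP{+}FN)$, and the '11 point' metric averages the best attainable precision at eleven fixed recall levels,

$$AP = \frac{1}{11} \sum_{r \in \{0,\, 0.1,\, \ldots,\, 1.0\}} \max_{\tilde r \ge r} p(\tilde r)$$

while the 'integral' variant integrates precision over the full precision-recall curve instead of sampling it at eleven points.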
......
if(NOT WITH_DISTRIBUTE)
return()
endif()
if(WITH_GRPC) if(WITH_GRPC)
grpc_library(sendrecvop_grpc SRCS bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc grpc_library(sendrecvop_grpc SRCS bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc
request_handler_impl.cc rpc_client.cc rpc_server.cc grpc_server.cc variable_response.cc PROTO send_recv.proto DEPS lod_tensor request_handler_impl.cc rpc_client.cc rpc_server.cc grpc_server.cc variable_response.cc PROTO send_recv.proto DEPS lod_tensor
......
...@@ -12,12 +12,12 @@ ...@@ -12,12 +12,12 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/operators/detail/brpc_client.h" #include "paddle/fluid/operators/distributed/brpc_client.h"
#include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/framework/threadpool.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace detail { namespace distributed {
DEFINE_int32(brpc_channel_num, 24, DEFINE_int32(brpc_channel_num, 24,
"Number of channels to send requests connected to one server"); "Number of channels to send requests connected to one server");
...@@ -175,6 +175,6 @@ ChannelQueuePtr BRPCClient::GetChannel(const std::string& ep) { ...@@ -175,6 +175,6 @@ ChannelQueuePtr BRPCClient::GetChannel(const std::string& ep) {
return q; return q;
} }
} // namespace detail } // namespace distributed
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -31,13 +31,13 @@ limitations under the License. */ ...@@ -31,13 +31,13 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/operators/detail/rpc_client.h" #include "paddle/fluid/operators/distributed/rpc_client.h"
#include "paddle/fluid/operators/detail/send_recv.pb.h" #include "paddle/fluid/operators/distributed/send_recv.pb.h"
#include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN #include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace detail { namespace distributed {
struct ChannelContext { struct ChannelContext {
brpc::Channel channel; brpc::Channel channel;
...@@ -95,6 +95,6 @@ class BRPCClient : public RPCClient { ...@@ -95,6 +95,6 @@ class BRPCClient : public RPCClient {
DISABLE_COPY_AND_ASSIGN(BRPCClient); DISABLE_COPY_AND_ASSIGN(BRPCClient);
}; };
} // namespace detail } // namespace distributed
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -12,13 +12,13 @@ ...@@ -12,13 +12,13 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/operators/detail/brpc_server.h" #include "paddle/fluid/operators/distributed/brpc_server.h"
#include "paddle/fluid/operators/detail/request_handler.h" #include "paddle/fluid/operators/distributed/request_handler.h"
namespace sendrecv { namespace sendrecv {
typedef std::unordered_map<std::string, typedef std::unordered_map<std::string,
paddle::operators::detail::RequestHandler*> paddle::operators::distributed::RequestHandler*>
HandlerMap; HandlerMap;
class BRPCServiceImpl : public SendRecvService { class BRPCServiceImpl : public SendRecvService {
...@@ -27,17 +27,17 @@ class BRPCServiceImpl : public SendRecvService { ...@@ -27,17 +27,17 @@ class BRPCServiceImpl : public SendRecvService {
: request_send_h_(nullptr), : request_send_h_(nullptr),
request_get_h_(nullptr), request_get_h_(nullptr),
request_prefetch_h_(nullptr) { request_prefetch_h_(nullptr) {
auto it = rpc_call_map.find(paddle::operators::detail::kRequestSend); auto it = rpc_call_map.find(paddle::operators::distributed::kRequestSend);
if (it != rpc_call_map.end()) { if (it != rpc_call_map.end()) {
request_send_h_ = it->second; request_send_h_ = it->second;
} }
it = rpc_call_map.find(paddle::operators::detail::kRequestSend); it = rpc_call_map.find(paddle::operators::distributed::kRequestSend);
if (it != rpc_call_map.end()) { if (it != rpc_call_map.end()) {
request_get_h_ = it->second; request_get_h_ = it->second;
} }
it = rpc_call_map.find(paddle::operators::detail::kRequestPrefetch); it = rpc_call_map.find(paddle::operators::distributed::kRequestPrefetch);
if (it != rpc_call_map.end()) { if (it != rpc_call_map.end()) {
request_prefetch_h_ = it->second; request_prefetch_h_ = it->second;
} }
...@@ -88,15 +88,15 @@ class BRPCServiceImpl : public SendRecvService { ...@@ -88,15 +88,15 @@ class BRPCServiceImpl : public SendRecvService {
} }
private: private:
paddle::operators::detail::RequestHandler* request_send_h_; paddle::operators::distributed::RequestHandler* request_send_h_;
paddle::operators::detail::RequestHandler* request_get_h_; paddle::operators::distributed::RequestHandler* request_get_h_;
paddle::operators::detail::RequestHandler* request_prefetch_h_; paddle::operators::distributed::RequestHandler* request_prefetch_h_;
}; };
} // namespace sendrecv } // namespace sendrecv
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace detail { namespace distributed {
void AsyncBRPCServer::StartServer() { void AsyncBRPCServer::StartServer() {
// Instance of your service. // Instance of your service.
...@@ -139,6 +139,6 @@ void AsyncBRPCServer::WaitServerReady() { ...@@ -139,6 +139,6 @@ void AsyncBRPCServer::WaitServerReady() {
VLOG(3) << "AsyncGRPCServer WaitSeverReady"; VLOG(3) << "AsyncGRPCServer WaitSeverReady";
} }
}; // namespace detail }; // namespace distributed
}; // namespace operators }; // namespace operators
}; // namespace paddle }; // namespace paddle
...@@ -19,12 +19,12 @@ limitations under the License. */ ...@@ -19,12 +19,12 @@ limitations under the License. */
#include <string> #include <string>
#include "brpc/server.h" #include "brpc/server.h"
#include "paddle/fluid/operators/detail/rpc_server.h" #include "paddle/fluid/operators/distributed/rpc_server.h"
#include "paddle/fluid/operators/detail/send_recv.pb.h" #include "paddle/fluid/operators/distributed/send_recv.pb.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace detail { namespace distributed {
class AsyncBRPCServer final : public RPCServer { class AsyncBRPCServer final : public RPCServer {
public: public:
...@@ -48,6 +48,6 @@ class AsyncBRPCServer final : public RPCServer { ...@@ -48,6 +48,6 @@ class AsyncBRPCServer final : public RPCServer {
int ready_; int ready_;
}; };
}; // namespace detail }; // namespace distributed
}; // namespace operators }; // namespace operators
}; // namespace paddle }; // namespace paddle
...@@ -17,11 +17,11 @@ limitations under the License. */ ...@@ -17,11 +17,11 @@ limitations under the License. */
// file and did some modifications so that we can send gRPC // file and did some modifications so that we can send gRPC
// requests without too much copying of the tensor data. // requests without too much copying of the tensor data.
#include "paddle/fluid/operators/detail/bytebuffer_stream.h" #include "paddle/fluid/operators/distributed/bytebuffer_stream.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace detail { namespace distributed {
GrpcByteBufferSource::GrpcByteBufferSource() {} GrpcByteBufferSource::GrpcByteBufferSource() {}
...@@ -83,6 +83,6 @@ google::protobuf::int64 GrpcByteBufferSource::ByteCount() const { ...@@ -83,6 +83,6 @@ google::protobuf::int64 GrpcByteBufferSource::ByteCount() const {
return byte_count_; return byte_count_;
} }
} // namespace detail } // namespace distributed
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -106,7 +106,7 @@ class GrpcBufferReader final ...@@ -106,7 +106,7 @@ class GrpcBufferReader final
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace detail { namespace distributed {
// Source provides a way for a particular RPC implementation to provide // Source provides a way for a particular RPC implementation to provide
// received data to ParseFrom. // received data to ParseFrom.
class Source { class Source {
...@@ -183,6 +183,6 @@ class GrpcByteSource : public Source { ...@@ -183,6 +183,6 @@ class GrpcByteSource : public Source {
char space_[sizeof(Reader)]; char space_[sizeof(Reader)];
}; };
} // namespace detail } // namespace distributed
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -12,19 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,19 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/detail/grpc_client.h" #include "paddle/fluid/operators/distributed/grpc_client.h"
#include <sys/time.h> #include <sys/time.h>
#include <limits> #include <limits>
#include "glog/logging.h" // For VLOG
#include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/framework/threadpool.h"
#include "paddle/fluid/operators/detail/request_handler.h" #include "paddle/fluid/operators/distributed/request_handler.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace detail { namespace distributed {
void GRPCClient::InitImpl() { InitEventLoop(); } void GRPCClient::InitImpl() { InitEventLoop(); }
...@@ -75,6 +76,9 @@ bool GRPCClient::AsyncSendVar(const std::string& ep, ...@@ -75,6 +76,9 @@ bool GRPCClient::AsyncSendVar(const std::string& ep,
var_h.scope = p_scope; var_h.scope = p_scope;
var_h.name = var_name_val; var_h.name = var_name_val;
var_h.ctx = p_ctx; var_h.ctx = p_ctx;
var_h.method = "Send";
VLOG(3) << var_h.String() << " begin";
// stub context // stub context
SendProcessor* s = new SendProcessor(ch); SendProcessor* s = new SendProcessor(ch);
...@@ -129,6 +133,9 @@ bool GRPCClient::AsyncGetVar(const std::string& ep, ...@@ -129,6 +133,9 @@ bool GRPCClient::AsyncGetVar(const std::string& ep,
var_h.scope = p_scope; var_h.scope = p_scope;
var_h.name = var_name_val; var_h.name = var_name_val;
var_h.ctx = p_ctx; var_h.ctx = p_ctx;
var_h.method = "Get";
VLOG(3) << var_h.String() << " begin";
// stub context // stub context
GetProcessor* s = new GetProcessor(ch); GetProcessor* s = new GetProcessor(ch);
...@@ -172,6 +179,9 @@ bool GRPCClient::AsyncPrefetchVar(const std::string& ep, ...@@ -172,6 +179,9 @@ bool GRPCClient::AsyncPrefetchVar(const std::string& ep,
var_h.scope = p_scope; var_h.scope = p_scope;
var_h.name = out_var_name_val; var_h.name = out_var_name_val;
var_h.ctx = p_ctx; var_h.ctx = p_ctx;
var_h.method = "Prefetch";
VLOG(3) << var_h.String() << " begin";
// stub context // stub context
GetProcessor* s = new GetProcessor(ch); GetProcessor* s = new GetProcessor(ch);
...@@ -243,10 +253,11 @@ void GRPCClient::Proceed() { ...@@ -243,10 +253,11 @@ void GRPCClient::Proceed() {
GPR_ASSERT(ok); GPR_ASSERT(ok);
PADDLE_ENFORCE(c); PADDLE_ENFORCE(c);
if (c->status_.ok()) { if (c->status_.ok()) {
VLOG(3) << c->var_h_.String() << " process";
c->Process(); c->Process();
} else { } else {
LOG(FATAL) << "var: " << c->var_h_.String() LOG(FATAL) << c->var_h_.String()
<< " grpc error:" << c->status_.error_message(); << " meets grpc error:" << c->status_.error_message();
} }
delete c; delete c;
{ {
...@@ -276,6 +287,6 @@ std::shared_ptr<grpc::Channel> GRPCClient::GetChannel(const std::string& ep) { ...@@ -276,6 +287,6 @@ std::shared_ptr<grpc::Channel> GRPCClient::GetChannel(const std::string& ep) {
return ch; return ch;
} }
} // namespace detail } // namespace distributed
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -38,23 +38,27 @@ limitations under the License. */ ...@@ -38,23 +38,27 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/operators/detail/rpc_client.h" #include "paddle/fluid/operators/distributed/rpc_client.h"
#include "paddle/fluid/operators/detail/sendrecvop_utils.h" #include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
#include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN #include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace detail { namespace distributed {
struct VarHandle { struct VarHandle {
// RPC endpoint.
std::string ep; std::string ep;
const platform::DeviceContext* ctx; const platform::DeviceContext* ctx;
const framework::Scope* scope; const framework::Scope* scope;
// Variable name.
std::string name; std::string name;
// RPC method name.
std::string method;
std::string String() const { std::string String() const {
std::ostringstream s; std::ostringstream s;
s << "name:[" << name << "] ep:[" << ep << "]"; s << method << " name:[" << name << "], ep:[" << ep << "]";
return s.str(); return s.str();
} }
}; };
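Adding the method field to VarHandle makes the client-side log lines self-describing. A minimal reproduction of what String() now produces follows; the variable name and endpoint are invented for this sketch.

#include <iostream>
#include <sstream>
#include <string>

// Reproduces VarHandle::String() after this change, with made-up values.
int main() {
  std::string method = "Send", name = "w@GRAD", ep = "127.0.0.1:6174";
  std::ostringstream s;
  s << method << " name:[" << name << "], ep:[" << ep << "]";
  std::cout << s.str() << " begin" << std::endl;  // as emitted via VLOG(3)
  return 0;
}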
...@@ -226,6 +230,6 @@ class GRPCClient : public RPCClient { ...@@ -226,6 +230,6 @@ class GRPCClient : public RPCClient {
DISABLE_COPY_AND_ASSIGN(GRPCClient); DISABLE_COPY_AND_ASSIGN(GRPCClient);
}; };
} // namespace detail } // namespace distributed
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -21,8 +21,8 @@ limitations under the License. */ ...@@ -21,8 +21,8 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/framework/variable.h" #include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/operators/detail/sendrecvop_utils.h" #include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
#include "paddle/fluid/operators/detail/variable_response.h" #include "paddle/fluid/operators/distributed/variable_response.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
#include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/printf.h"
...@@ -50,7 +50,7 @@ void RunSerdeTestSelectedRows(platform::Place place) { ...@@ -50,7 +50,7 @@ void RunSerdeTestSelectedRows(platform::Place place) {
for (int i = 0; i < 564; ++i) rows->push_back(i); for (int i = 0; i < 564; ++i) rows->push_back(i);
::grpc::ByteBuffer msg; ::grpc::ByteBuffer msg;
operators::detail::SerializeToByteBuffer("myvar", &var, ctx, &msg); operators::distributed::SerializeToByteBuffer("myvar", &var, ctx, &msg);
EXPECT_GT(msg.Length(), static_cast<size_t>(0)); EXPECT_GT(msg.Length(), static_cast<size_t>(0));
// deserialize // deserialize
...@@ -81,10 +81,10 @@ void RunSerdeTestSelectedRows(platform::Place place) { ...@@ -81,10 +81,10 @@ void RunSerdeTestSelectedRows(platform::Place place) {
// deserialize zero-copy // deserialize zero-copy
// framework::Variable var2; // framework::Variable var2;
// operators::detail::DeserializeFromByteBuffer(msg, ctx, &var2); // operators::distributed::DeserializeFromByteBuffer(msg, ctx, &var2);
framework::Scope scope; framework::Scope scope;
scope.Var("myvar"); scope.Var("myvar");
operators::detail::VariableResponse resp(&scope, &ctx); operators::distributed::VariableResponse resp(&scope, &ctx);
EXPECT_EQ(resp.Parse(msg), 0); EXPECT_EQ(resp.Parse(msg), 0);
framework::Variable* var2 = resp.GetVar(); framework::Variable* var2 = resp.GetVar();
...@@ -128,7 +128,7 @@ void RunTestLodTensor(platform::Place place, int from_type = 0) { ...@@ -128,7 +128,7 @@ void RunTestLodTensor(platform::Place place, int from_type = 0) {
math::set_constant(ctx, tensor, 31.9); math::set_constant(ctx, tensor, 31.9);
::grpc::ByteBuffer msg; ::grpc::ByteBuffer msg;
operators::detail::SerializeToByteBuffer("myvar", &var, ctx, &msg); operators::distributed::SerializeToByteBuffer("myvar", &var, ctx, &msg);
EXPECT_GT(msg.Length(), static_cast<size_t>(0)); EXPECT_GT(msg.Length(), static_cast<size_t>(0));
// deserialize // deserialize
...@@ -171,7 +171,7 @@ void RunTestLodTensor(platform::Place place, int from_type = 0) { ...@@ -171,7 +171,7 @@ void RunTestLodTensor(platform::Place place, int from_type = 0) {
// deserialize zero-copy // deserialize zero-copy
framework::Scope scope; framework::Scope scope;
scope.Var("myvar"); scope.Var("myvar");
operators::detail::VariableResponse resp(&scope, &ctx); operators::distributed::VariableResponse resp(&scope, &ctx);
if (from_type == 0) { if (from_type == 0) {
EXPECT_EQ(resp.Parse(msg), 0); EXPECT_EQ(resp.Parse(msg), 0);
} else { } else {
......
...@@ -15,13 +15,13 @@ limitations under the License. */ ...@@ -15,13 +15,13 @@ limitations under the License. */
#include <limits> #include <limits>
#include <string> #include <string>
#include "paddle/fluid/operators/detail/grpc_server.h" #include "paddle/fluid/operators/distributed/grpc_server.h"
using ::grpc::ServerAsyncResponseWriter; using ::grpc::ServerAsyncResponseWriter;
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace detail { namespace distributed {
enum CallStatus { PROCESS = 0, FINISH }; enum CallStatus { PROCESS = 0, FINISH };
// reference: // reference:
...@@ -41,6 +41,19 @@ class RequestBase { ...@@ -41,6 +41,19 @@ class RequestBase {
virtual ~RequestBase() {} virtual ~RequestBase() {}
virtual void Process() = 0; virtual void Process() = 0;
std::string Status2String(const std::string& method) {
std::string status = "Process";
if (status_ == FINISH) {
status = "Finish";
}
std::ostringstream s;
s << method << " name:[" << GetReqName() << "]"
<< ", ep:[" << ctx_.peer() << "]"
<< " " << status << " using req_id:" << req_id_;
return s.str();
}
CallStatus Status() const { CallStatus Status() const {
std::lock_guard<std::mutex> l(status_mu_); std::lock_guard<std::mutex> l(status_mu_);
return status_; return status_;
...@@ -74,7 +87,7 @@ class RequestSend final : public RequestBase { ...@@ -74,7 +87,7 @@ class RequestSend final : public RequestBase {
request_.reset(new VariableResponse(request_handler->scope(), request_.reset(new VariableResponse(request_handler->scope(),
request_handler->dev_ctx(), request_handler->dev_ctx(),
!request_handler->sync_mode())); !request_handler->sync_mode()));
int method_id = static_cast<int>(detail::GrpcMethod::kSendVariable); int method_id = static_cast<int>(distributed::GrpcMethod::kSendVariable);
service_->RequestAsyncUnary( service_->RequestAsyncUnary(
method_id, &ctx_, request_.get(), &responder_, cq_, cq_, method_id, &ctx_, request_.get(), &responder_, cq_, cq_,
reinterpret_cast<void*>(static_cast<intptr_t>(req_id))); reinterpret_cast<void*>(static_cast<intptr_t>(req_id)));
...@@ -106,7 +119,7 @@ class RequestGet final : public RequestBase { ...@@ -106,7 +119,7 @@ class RequestGet final : public RequestBase {
::grpc::ServerCompletionQueue* cq, ::grpc::ServerCompletionQueue* cq,
RequestHandler* request_handler, int req_id) RequestHandler* request_handler, int req_id)
: RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) { : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) {
auto method_id = static_cast<int>(detail::GrpcMethod::kGetVariable); auto method_id = static_cast<int>(distributed::GrpcMethod::kGetVariable);
service_->RequestAsyncUnary( service_->RequestAsyncUnary(
method_id, &ctx_, &request_, &responder_, cq_, cq_, method_id, &ctx_, &request_, &responder_, cq_, cq_,
reinterpret_cast<void*>(static_cast<intptr_t>(req_id))); reinterpret_cast<void*>(static_cast<intptr_t>(req_id)));
...@@ -150,7 +163,8 @@ class RequestPrefetch final : public RequestBase { ...@@ -150,7 +163,8 @@ class RequestPrefetch final : public RequestBase {
local_scope_(nullptr) { local_scope_(nullptr) {
request_.reset(new VariableResponse(request_handler->scope(), request_.reset(new VariableResponse(request_handler->scope(),
request_handler->dev_ctx(), true)); request_handler->dev_ctx(), true));
    int method_id = static_cast<int>(detail::GrpcMethod::kPrefetchVariable); int method_id = static_cast<int>(distributed::GrpcMethod::kPrefetchVariable);
service_->RequestAsyncUnary( service_->RequestAsyncUnary(
method_id, &ctx_, request_.get(), &responder_, cq_, cq_, method_id, &ctx_, request_.get(), &responder_, cq_, cq_,
reinterpret_cast<void*>(static_cast<intptr_t>(req_id))); reinterpret_cast<void*>(static_cast<intptr_t>(req_id)));
...@@ -271,7 +285,7 @@ void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name, ...@@ -271,7 +285,7 @@ void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name,
int req_id) { int req_id) {
std::unique_lock<std::mutex> lock(cq_mutex_); std::unique_lock<std::mutex> lock(cq_mutex_);
if (is_shut_down_) { if (is_shut_down_) {
VLOG(3) << "shutdown, do not TryToRegisterNewSendOne"; LOG(WARNING) << "shutdown, do not TryToRegisterNewSendOne";
return; return;
} }
...@@ -305,14 +319,14 @@ void AsyncGRPCServer::HandleRequest( ...@@ -305,14 +319,14 @@ void AsyncGRPCServer::HandleRequest(
bool ok = false; bool ok = false;
while (true) { while (true) {
VLOG(3) << "HandleRequest " << rpc_name << " wait next"; VLOG(4) << "HandleRequest " << rpc_name << " wait next";
if (!cq->Next(&tag, &ok)) { if (!cq->Next(&tag, &ok)) {
LOG(INFO) << "CompletionQueue " << rpc_name << " shutdown!"; LOG(INFO) << "CompletionQueue " << rpc_name << " shutdown!";
break; break;
} }
int req_id = static_cast<int>(reinterpret_cast<intptr_t>(tag)); int req_id = static_cast<int>(reinterpret_cast<intptr_t>(tag));
VLOG(3) << "HandleRequest " << rpc_name << ", req_id:" << req_id VLOG(4) << "HandleRequest " << rpc_name << ", req_id:" << req_id
<< " get next"; << " get next";
auto& reqs = rpc_reqs_[rpc_name]; auto& reqs = rpc_reqs_[rpc_name];
...@@ -323,22 +337,21 @@ void AsyncGRPCServer::HandleRequest( ...@@ -323,22 +337,21 @@ void AsyncGRPCServer::HandleRequest(
base = reqs[req_id]; base = reqs[req_id];
} }
VLOG(3) << base->Status2String(rpc_name);
// reference: // reference:
// https://github.com/tensorflow/tensorflow/issues/5596 // https://github.com/tensorflow/tensorflow/issues/5596
// https://groups.google.com/forum/#!topic/grpc-io/xftlRy-IQwM // https://groups.google.com/forum/#!topic/grpc-io/xftlRy-IQwM
// https://groups.google.com/forum/#!topic/grpc-io/ywATt88Ef_I // https://groups.google.com/forum/#!topic/grpc-io/ywATt88Ef_I
if (!ok) { if (!ok) {
LOG(WARNING) << "completion queue:" << rpc_name LOG(WARNING) << "completion queue:" << rpc_name
<< " recv no regular event:argument name[" << " recv no regular event"
<< base->GetReqName() << "]"; << " context:" << base->Status2String(rpc_name);
TryToRegisterNewOne(rpc_name, req_id); TryToRegisterNewOne(rpc_name, req_id);
delete base; delete base;
continue; continue;
} }
VLOG(3) << "queue id:" << rpc_name << ", req_id:" << req_id
<< ", status:" << base->Status();
switch (base->Status()) { switch (base->Status()) {
case PROCESS: { case PROCESS: {
base->Process(); base->Process();
...@@ -354,6 +367,6 @@ void AsyncGRPCServer::HandleRequest( ...@@ -354,6 +367,6 @@ void AsyncGRPCServer::HandleRequest(
} }
} }
} // namespace detail } // namespace distributed
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -29,17 +29,17 @@ limitations under the License. */ ...@@ -29,17 +29,17 @@ limitations under the License. */
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/operators/detail/grpc_service.h" #include "paddle/fluid/operators/distributed/grpc_service.h"
#include "paddle/fluid/operators/detail/request_handler.h" #include "paddle/fluid/operators/distributed/request_handler.h"
#include "paddle/fluid/operators/detail/rpc_server.h" #include "paddle/fluid/operators/distributed/rpc_server.h"
#include "paddle/fluid/operators/detail/send_recv.grpc.pb.h" #include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h"
#include "paddle/fluid/operators/detail/send_recv.pb.h" #include "paddle/fluid/operators/distributed/send_recv.pb.h"
#include "paddle/fluid/operators/detail/sendrecvop_utils.h" #include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace detail { namespace distributed {
class RequestBase; class RequestBase;
...@@ -84,6 +84,6 @@ class AsyncGRPCServer final : public RPCServer { ...@@ -84,6 +84,6 @@ class AsyncGRPCServer final : public RPCServer {
std::map<std::string, std::vector<RequestBase*>> rpc_reqs_; std::map<std::string, std::vector<RequestBase*>> rpc_reqs_;
}; };
}; // namespace detail }; // namespace distributed
}; // namespace operators }; // namespace operators
}; // namespace paddle }; // namespace paddle
...@@ -23,7 +23,7 @@ ...@@ -23,7 +23,7 @@
#include <grpc++/impl/codegen/stub_options.h> #include <grpc++/impl/codegen/stub_options.h>
#include <grpc++/impl/codegen/sync_stream.h> #include <grpc++/impl/codegen/sync_stream.h>
#include <grpc++/support/byte_buffer.h> #include <grpc++/support/byte_buffer.h>
#include "paddle/fluid/operators/detail/variable_response.h" #include "paddle/fluid/operators/distributed/variable_response.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
...@@ -42,16 +42,17 @@ class ServerContext; ...@@ -42,16 +42,17 @@ class ServerContext;
// Support parsing/unparsing of tensorflow::VariableResponse. // Support parsing/unparsing of tensorflow::VariableResponse.
// Wire-format is identical to RecvVariableResponse. // Wire-format is identical to RecvVariableResponse.
template <> template <>
class SerializationTraits<paddle::operators::detail::VariableResponse> { class SerializationTraits<paddle::operators::distributed::VariableResponse> {
public: public:
static Status Serialize( static Status Serialize(
const paddle::operators::detail::VariableResponse& msg, const paddle::operators::distributed::VariableResponse& msg,
grpc_byte_buffer** bp, bool* own_buffer) { grpc_byte_buffer** bp, bool* own_buffer) {
PADDLE_ENFORCE(false, "SerializationTraits::Serialize not implemented!"); PADDLE_ENFORCE(false, "SerializationTraits::Serialize not implemented!");
return Status(); return Status();
} }
static Status Deserialize(grpc_byte_buffer* buffer, static Status Deserialize(
paddle::operators::detail::VariableResponse* msg, grpc_byte_buffer* buffer,
paddle::operators::distributed::VariableResponse* msg,
int max_message_size = INT_MAX) { int max_message_size = INT_MAX) {
if (buffer == nullptr) { if (buffer == nullptr) {
return Status(StatusCode::INTERNAL, "No payload"); return Status(StatusCode::INTERNAL, "No payload");
...@@ -59,7 +60,7 @@ class SerializationTraits<paddle::operators::detail::VariableResponse> { ...@@ -59,7 +60,7 @@ class SerializationTraits<paddle::operators::detail::VariableResponse> {
Status result = g_core_codegen_interface->ok(); Status result = g_core_codegen_interface->ok();
if (result.ok()) { if (result.ok()) {
paddle::operators::detail::GrpcByteSource source(buffer); paddle::operators::distributed::GrpcByteSource source(buffer);
int ret = msg->Parse(&source); int ret = msg->Parse(&source);
if (ret != 0) { if (ret != 0) {
result = Status(StatusCode::INTERNAL, "VariableResponse parse error"); result = Status(StatusCode::INTERNAL, "VariableResponse parse error");
...@@ -73,7 +74,7 @@ class SerializationTraits<paddle::operators::detail::VariableResponse> { ...@@ -73,7 +74,7 @@ class SerializationTraits<paddle::operators::detail::VariableResponse> {
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace detail { namespace distributed {
enum class GrpcMethod { enum class GrpcMethod {
kSendVariable, kSendVariable,
...@@ -118,6 +119,6 @@ class GrpcService final { ...@@ -118,6 +119,6 @@ class GrpcService final {
}; };
}; };
} // namespace detail } // namespace distributed
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -26,7 +26,7 @@ limitations under the License. */ ...@@ -26,7 +26,7 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace detail { namespace distributed {
char* EncodeVarint32(char* dst, uint32_t v) { char* EncodeVarint32(char* dst, uint32_t v) {
// Operate on characters as unsigneds // Operate on characters as unsigneds
...@@ -144,6 +144,6 @@ class ProtoEncodeHelper { ...@@ -144,6 +144,6 @@ class ProtoEncodeHelper {
char* limit_; // Just for CHECKs char* limit_; // Just for CHECKs
}; };
} // namespace detail } // namespace distributed
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -31,7 +31,7 @@ ...@@ -31,7 +31,7 @@
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace detail { namespace distributed {
constexpr char kRequestSend[] = "RequestSend"; constexpr char kRequestSend[] = "RequestSend";
constexpr char kRequestGet[] = "RequestGet"; constexpr char kRequestGet[] = "RequestGet";
...@@ -124,6 +124,6 @@ class RequestHandler { ...@@ -124,6 +124,6 @@ class RequestHandler {
RPCServer* rpc_server_; RPCServer* rpc_server_;
}; };
} // namespace detail } // namespace distributed
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -20,12 +20,12 @@ ...@@ -20,12 +20,12 @@
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/operators/detail/request_handler_impl.h" #include "paddle/fluid/operators/distributed/request_handler_impl.h"
#include "paddle/fluid/operators/detail/rpc_server.h" #include "paddle/fluid/operators/distributed/rpc_server.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace detail { namespace distributed {
bool RequestSendHandler::Handle(const std::string& varname, bool RequestSendHandler::Handle(const std::string& varname,
framework::Scope* scope, framework::Scope* scope,
...@@ -119,6 +119,6 @@ bool RequestPrefetchHandler::Handle(const std::string& varname, ...@@ -119,6 +119,6 @@ bool RequestPrefetchHandler::Handle(const std::string& varname,
return true; return true;
} }
} // namespace detail } // namespace distributed
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -28,11 +28,11 @@ ...@@ -28,11 +28,11 @@
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/operators/detail/request_handler.h" #include "paddle/fluid/operators/distributed/request_handler.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace detail { namespace distributed {
class RequestSendHandler final : public RequestHandler { class RequestSendHandler final : public RequestHandler {
public: public:
...@@ -66,6 +66,6 @@ class RequestPrefetchHandler final : public RequestHandler { ...@@ -66,6 +66,6 @@ class RequestPrefetchHandler final : public RequestHandler {
const std::string& out_var_name = "") override; const std::string& out_var_name = "") override;
}; };
} // namespace detail } // namespace distributed
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -12,15 +12,15 @@ ...@@ -12,15 +12,15 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/operators/detail/rpc_client.h" #include "paddle/fluid/operators/distributed/rpc_client.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace detail { namespace distributed {
std::once_flag RPCClient::init_flag_; std::once_flag RPCClient::init_flag_;
std::unique_ptr<RPCClient> RPCClient::rpc_client_(nullptr); std::unique_ptr<RPCClient> RPCClient::rpc_client_(nullptr);
} // namespace detail } // namespace distributed
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace detail { namespace distributed {
class RPCClient { class RPCClient {
public: public:
...@@ -84,6 +84,6 @@ class RPCClient { ...@@ -84,6 +84,6 @@ class RPCClient {
static std::once_flag init_flag_; static std::once_flag init_flag_;
static std::unique_ptr<RPCClient> rpc_client_; static std::unique_ptr<RPCClient> rpc_client_;
}; };
} // namespace detail } // namespace distributed
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -17,11 +17,11 @@ ...@@ -17,11 +17,11 @@
#include <limits> #include <limits>
#include <string> #include <string>
#include "paddle/fluid/operators/detail/rpc_server.h" #include "paddle/fluid/operators/distributed/rpc_server.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace detail { namespace distributed {
void RPCServer::ShutDown() { void RPCServer::ShutDown() {
LOG(INFO) << "RPCServer ShutDown "; LOG(INFO) << "RPCServer ShutDown ";
...@@ -112,6 +112,6 @@ void RPCServer::WaitCond(const std::string& rpc_name) { ...@@ -112,6 +112,6 @@ void RPCServer::WaitCond(const std::string& rpc_name) {
lock, [=] { return (cur_cond_.load() == cond || exit_flag_.load()); }); lock, [=] { return (cur_cond_.load() == cond || exit_flag_.load()); });
} }
} // namespace detail } // namespace distributed
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -19,11 +19,11 @@ ...@@ -19,11 +19,11 @@
#include <thread> // NOLINT #include <thread> // NOLINT
#include <utility> #include <utility>
#include <vector> #include <vector>
#include "paddle/fluid/operators/detail/request_handler.h" #include "paddle/fluid/operators/distributed/request_handler.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace detail { namespace distributed {
class RPCServer { class RPCServer {
public: public:
...@@ -86,6 +86,6 @@ class RPCServer { ...@@ -86,6 +86,6 @@ class RPCServer {
friend class RequestHandler; friend class RequestHandler;
}; };
}; // namespace detail }; // namespace distributed
}; // namespace operators }; // namespace operators
}; // namespace paddle }; // namespace paddle
...@@ -22,18 +22,18 @@ limitations under the License. */ ...@@ -22,18 +22,18 @@ limitations under the License. */
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/operators/detail/macros.h"
#include "paddle/fluid/operators/detail/request_handler_impl.h" #include "paddle/fluid/operators/distributed/request_handler_impl.h"
#include "paddle/fluid/operators/detail/rpc_client.h" #include "paddle/fluid/operators/distributed/rpc_client.h"
#include "paddle/fluid/operators/detail/rpc_server.h" #include "paddle/fluid/operators/distributed/rpc_server.h"
namespace framework = paddle::framework; namespace framework = paddle::framework;
namespace platform = paddle::platform; namespace platform = paddle::platform;
namespace detail = paddle::operators::detail; namespace distributed = paddle::operators::distributed;
USE_OP(lookup_table); USE_OP(lookup_table);
std::unique_ptr<detail::RPCServer> g_rpc_service; std::unique_ptr<distributed::RPCServer> g_rpc_service;
std::unique_ptr<detail::RequestHandler> g_req_handler; std::unique_ptr<distributed::RequestHandler> g_req_handler;
framework::BlockDesc* AppendPrefetchBlcok(framework::ProgramDesc* program) { framework::BlockDesc* AppendPrefetchBlcok(framework::ProgramDesc* program) {
auto root_block = program->MutableBlock(0); auto root_block = program->MutableBlock(0);
...@@ -113,19 +113,21 @@ void StartServer() { ...@@ -113,19 +113,21 @@ void StartServer() {
g_req_handler->SetScope(&scope); g_req_handler->SetScope(&scope);
g_req_handler->SetExecutor(&exe); g_req_handler->SetExecutor(&exe);
g_rpc_service->RegisterRPC(detail::kRequestPrefetch, g_req_handler.get()); g_rpc_service->RegisterRPC(distributed::kRequestPrefetch,
g_req_handler.get());
g_req_handler->SetRPCServer(g_rpc_service.get()); g_req_handler->SetRPCServer(g_rpc_service.get());
std::thread server_thread( std::thread server_thread(
std::bind(&detail::RPCServer::StartServer, g_rpc_service.get())); std::bind(&distributed::RPCServer::StartServer, g_rpc_service.get()));
server_thread.join(); server_thread.join();
} }
TEST(PREFETCH, CPU) { TEST(PREFETCH, CPU) {
g_req_handler.reset(new detail::RequestPrefetchHandler(true)); g_req_handler.reset(new distributed::RequestPrefetchHandler(true));
g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 1)); g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 1));
detail::RPCClient* client = detail::RPCClient::GetInstance<RPCCLIENT_T>(); distributed::RPCClient* client =
distributed::RPCClient::GetInstance<RPCCLIENT_T>();
std::thread server_thread(StartServer); std::thread server_thread(StartServer);
g_rpc_service->WaitServerReady(); g_rpc_service->WaitServerReady();
......
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/detail/sendrecvop_utils.h" #include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include <nccl.h> #include <nccl.h>
...@@ -23,14 +23,14 @@ limitations under the License. */ ...@@ -23,14 +23,14 @@ limitations under the License. */
#include "google/protobuf/io/coded_stream.h" #include "google/protobuf/io/coded_stream.h"
#include "google/protobuf/io/zero_copy_stream.h" #include "google/protobuf/io/zero_copy_stream.h"
#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/operators/detail/bytebuffer_stream.h" #include "paddle/fluid/operators/distributed/bytebuffer_stream.h"
#include "paddle/fluid/operators/detail/proto_encoder_helper.h" #include "paddle/fluid/operators/distributed/proto_encoder_helper.h"
#include "paddle/fluid/operators/detail/variable_response.h" #include "paddle/fluid/operators/distributed/variable_response.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace detail { namespace distributed {
using VarMsg = sendrecv::VariableMessage; using VarMsg = sendrecv::VariableMessage;
...@@ -222,11 +222,11 @@ void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg, ...@@ -222,11 +222,11 @@ void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
const platform::DeviceContext& ctx, const platform::DeviceContext& ctx,
const framework::Scope* scope, const framework::Scope* scope,
framework::Variable** var) { framework::Variable** var) {
operators::detail::VariableResponse resp(scope, &ctx); operators::distributed::VariableResponse resp(scope, &ctx);
PADDLE_ENFORCE(resp.Parse(msg) == 0, "parse bytebuffer to tensor error!"); PADDLE_ENFORCE(resp.Parse(msg) == 0, "parse bytebuffer to tensor error!");
*var = resp.GetVar(); *var = resp.GetVar();
} }
} // namespace detail } // namespace distributed
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -25,12 +25,12 @@ limitations under the License. */ ...@@ -25,12 +25,12 @@ limitations under the License. */
#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/operators/detail/send_recv.grpc.pb.h" #include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h"
#include "paddle/fluid/operators/detail/send_recv.pb.h" #include "paddle/fluid/operators/distributed/send_recv.pb.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace detail { namespace distributed {
typedef void (*DestroyCallback)(void*); typedef void (*DestroyCallback)(void*);
...@@ -61,6 +61,6 @@ inline std::type_index ToTypeIndex(sendrecv::VariableMessage::Type type) { ...@@ -61,6 +61,6 @@ inline std::type_index ToTypeIndex(sendrecv::VariableMessage::Type type) {
} }
} }
} // namespace detail } // namespace distributed
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/operators/detail/variable_response.h" #include "paddle/fluid/operators/distributed/variable_response.h"
#include <string> #include <string>
#include <utility> #include <utility>
...@@ -22,12 +22,12 @@ ...@@ -22,12 +22,12 @@
#endif #endif
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/operators/detail/send_recv.pb.h" #include "paddle/fluid/operators/distributed/send_recv.pb.h"
#include "paddle/fluid/operators/detail/sendrecvop_utils.h" #include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace detail { namespace distributed {
enum WireType { enum WireType {
WIRETYPE_VARINT = 0, WIRETYPE_VARINT = 0,
...@@ -76,6 +76,8 @@ bool ReadRaw(::google::protobuf::io::CodedInputStream* input, ...@@ -76,6 +76,8 @@ bool ReadRaw(::google::protobuf::io::CodedInputStream* input,
if (total_written + size_to_write > length) { if (total_written + size_to_write > length) {
size_to_write = length - total_written; size_to_write = length - total_written;
} }
// This log is useful for checking how large an internal RPC block is.
VLOG(7) << "copy " << size_to_write << " data to CUDAPlace";
memory::Copy(boost::get<platform::CUDAPlace>(place), memory::Copy(boost::get<platform::CUDAPlace>(place),
reinterpret_cast<void*>(p), cpu, data, size_to_write, reinterpret_cast<void*>(p), cpu, data, size_to_write,
gpu_dev_ctx.stream()); gpu_dev_ctx.stream());
...@@ -103,6 +105,8 @@ bool ReadRaw(::google::protobuf::io::CodedInputStream* input, ...@@ -103,6 +105,8 @@ bool ReadRaw(::google::protobuf::io::CodedInputStream* input,
} }
// TODO(gongwb): can we avoid copy? // TODO(gongwb): can we avoid copy?
platform::CPUPlace cpu; platform::CPUPlace cpu;
// This log is useful for checking how large an internal RPC block is.
VLOG(7) << "copy " << size_to_write << " data to CPUPlace";
memory::Copy(cpu, reinterpret_cast<void*>(p), cpu, data, size_to_write); memory::Copy(cpu, reinterpret_cast<void*>(p), cpu, data, size_to_write);
p += size_to_write; p += size_to_write;
...@@ -158,13 +162,13 @@ bool VariableResponse::CopySelectRowsTensorData( ...@@ -158,13 +162,13 @@ bool VariableResponse::CopySelectRowsTensorData(
slr->set_height(meta_.slr_height()); slr->set_height(meta_.slr_height());
auto* tensor = slr->mutable_value(); auto* tensor = slr->mutable_value();
tensor->Resize(dims); tensor->Resize(dims);
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(static_cast<size_t>(tensor->numel()),
static_cast<size_t>(tensor->numel()),
length / framework::SizeOfType( length / framework::SizeOfType(
paddle::operators::detail::ToTypeIndex(meta_.data_type()))); paddle::operators::distributed::ToTypeIndex(
meta_.data_type())));
void* tensor_data = tensor->mutable_data( void* tensor_data = tensor->mutable_data(
ctx.GetPlace(), ctx.GetPlace(),
paddle::operators::detail::ToTypeIndex(meta_.data_type())); paddle::operators::distributed::ToTypeIndex(meta_.data_type()));
if (!ReadRaw(input, ctx, tensor->place(), tensor_data, length)) { if (!ReadRaw(input, ctx, tensor->place(), tensor_data, length)) {
return false; return false;
...@@ -480,6 +484,6 @@ int VariableResponse::Parse(Source* source) { ...@@ -480,6 +484,6 @@ int VariableResponse::Parse(Source* source) {
return 0; return 0;
} }
}; // namespace detail }; // namespace distributed
}; // namespace operators }; // namespace operators
}; // namespace paddle }; // namespace paddle
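ReadRaw in variable_response.cc copies the incoming payload in chunks and clamps the final chunk so total_written never exceeds length; the new VLOG(7) lines only report each chunk size. A host-only sketch of that clamping loop, using memcpy in place of memory::Copy and invented buffer sizes:
#include <cstring>
#include <iostream>
#include <vector>
int main() {
  // Source arrives in fixed-size chunks, as from a coded input stream.
  std::vector<char> src(1000, 'x');
  std::vector<char> dst(1000);
  const size_t kChunk = 256;  // hypothetical chunk size
  size_t total_written = 0;
  const size_t length = dst.size();
  char* p = dst.data();
  while (total_written < length) {
    size_t size_to_write = kChunk;
    // Clamp the final chunk so we never write past `length`,
    // mirroring the check in ReadRaw above.
    if (total_written + size_to_write > length) {
      size_to_write = length - total_written;
    }
    std::memcpy(p, src.data() + total_written, size_to_write);
    std::cout << "copy " << size_to_write << " bytes" << std::endl;
    p += size_to_write;
    total_written += size_to_write;
  }
  return 0;
}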
...@@ -22,17 +22,17 @@ ...@@ -22,17 +22,17 @@
#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/operators/detail/send_recv.grpc.pb.h" #include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h"
#include "paddle/fluid/operators/detail/send_recv.pb.h" #include "paddle/fluid/operators/distributed/send_recv.pb.h"
#include "google/protobuf/io/coded_stream.h" #include "google/protobuf/io/coded_stream.h"
#include "google/protobuf/io/zero_copy_stream.h" #include "google/protobuf/io/zero_copy_stream.h"
#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/detail/bytebuffer_stream.h" #include "paddle/fluid/operators/distributed/bytebuffer_stream.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace detail { namespace distributed {
class VariableResponse { class VariableResponse {
public: public:
...@@ -99,6 +99,6 @@ class VariableResponse { ...@@ -99,6 +99,6 @@ class VariableResponse {
sendrecv::VariableMessage meta_; sendrecv::VariableMessage meta_;
}; };
}; // namespace detail }; // namespace distributed
}; // namespace operators }; // namespace operators
}; // namespace paddle }; // namespace paddle
...@@ -42,8 +42,8 @@ class FetchBarrierOp : public framework::OperatorBase { ...@@ -42,8 +42,8 @@ class FetchBarrierOp : public framework::OperatorBase {
// For profiling // For profiling
platform::RecordEvent record_event(Type(), &ctx); platform::RecordEvent record_event(Type(), &ctx);
detail::RPCClient* rpc_client = distributed::RPCClient* rpc_client =
detail::RPCClient::GetInstance<RPCCLIENT_T>(); distributed::RPCClient::GetInstance<RPCCLIENT_T>();
rpc_client->Wait(); rpc_client->Wait();
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <string>
#include "paddle/fluid/operators/mean_op.h"
namespace paddle {
namespace operators {
using framework::DataLayout;
template <typename T>
class GaussianMKLDNNKernel : public paddle::framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
float mean = context.Attr<float>("mean");
float std = context.Attr<float>("std");
auto* tensor = context.Output<framework::Tensor>("Out");
T* data = tensor->mutable_data<T>(context.GetPlace());
unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
std::minstd_rand engine;
if (seed == 0) {
seed = std::random_device()();
}
engine.seed(seed);
std::normal_distribution<T> dist(mean, std);
int64_t size = tensor->numel();
for (int64_t i = 0; i < size; ++i) {
data[i] = dist(engine);
}
// The format of output is set as the mkldnn's format
// TODO(@mozga-intel) The matrix format should be set inside the other layers.
tensor->set_layout(DataLayout::kMKLDNN);
tensor->set_format(mkldnn::memory::format::oihw);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_KERNEL(gaussian_random, MKLDNN, ::paddle::platform::CPUPlace,
ops::GaussianMKLDNNKernel<float>);
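The new MKLDNN kernel fills the output with samples from N(mean, std), seeding std::minstd_rand from the seed attribute, or from std::random_device when the seed is 0. The sampling logic in isolation, stripped of Paddle and MKL-DNN types (FillGaussian is an invented helper name):
#include <iostream>
#include <random>
#include <vector>
// Fill `data` with normally distributed values; seed == 0 means
// "pick a fresh seed", matching the kernel above.
void FillGaussian(std::vector<float>* data, float mean, float std_dev,
                  unsigned int seed) {
  std::minstd_rand engine;
  if (seed == 0) {
    seed = std::random_device()();
  }
  engine.seed(seed);
  std::normal_distribution<float> dist(mean, std_dev);
  for (auto& v : *data) {
    v = dist(engine);
  }
}
int main() {
  std::vector<float> out(8);
  FillGaussian(&out, 0.0f, 1.0f, /*seed=*/0);
  for (float v : out) std::cout << v << " ";
  std::cout << std::endl;
  return 0;
}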
...@@ -15,6 +15,10 @@ limitations under the License. */ ...@@ -15,6 +15,10 @@ limitations under the License. */
#include <random> #include <random>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -62,9 +66,20 @@ class GaussianRandomOp : public framework::OperatorWithKernel { ...@@ -62,9 +66,20 @@ class GaussianRandomOp : public framework::OperatorWithKernel {
protected: protected:
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
framework::LibraryType library{framework::LibraryType::kPlain};
framework::DataLayout layout{framework::DataLayout::kAnyLayout};
#ifdef PADDLE_WITH_MKLDNN
if (library == framework::LibraryType::kPlain &&
platform::CanMKLDNNBeUsed(ctx)) {
library = framework::LibraryType::kMKLDNN;
layout = framework::DataLayout::kMKLDNN;
}
#endif
return framework::OpKernelType( return framework::OpKernelType(
static_cast<framework::proto::VarType::Type>(ctx.Attr<int>("dtype")), static_cast<framework::proto::VarType::Type>(ctx.Attr<int>("dtype")),
ctx.device_context()); ctx.device_context(), layout, library);
} }
}; };
...@@ -95,7 +110,9 @@ class GaussianRandomOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -95,7 +110,9 @@ class GaussianRandomOpMaker : public framework::OpProtoAndCheckerMaker {
"(int, default 5(FP32)) " "(int, default 5(FP32)) "
"Output data type.") "Output data type.")
.SetDefault(framework::proto::VarType::FP32); .SetDefault(framework::proto::VarType::FP32);
AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddComment(R"DOC( AddComment(R"DOC(
GaussianRandom Operator. GaussianRandom Operator.
......
...@@ -22,7 +22,7 @@ limitations under the License. */ ...@@ -22,7 +22,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/framework/threadpool.h"
#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/operators/detail/macros.h"
#include "paddle/fluid/operators/detail/request_handler_impl.h" #include "paddle/fluid/operators/distributed/request_handler_impl.h"
#include "paddle/fluid/platform/nccl_helper.h" #include "paddle/fluid/platform/nccl_helper.h"
namespace paddle { namespace paddle {
...@@ -60,7 +60,8 @@ class GenNCCLIdOp : public framework::OperatorBase { ...@@ -60,7 +60,8 @@ class GenNCCLIdOp : public framework::OperatorBase {
std::vector<std::string> endpoint_list = std::vector<std::string> endpoint_list =
Attr<std::vector<std::string>>("endpoint_list"); Attr<std::vector<std::string>>("endpoint_list");
detail::RPCClient* client = detail::RPCClient::GetInstance<RPCCLIENT_T>(); distributed::RPCClient* client =
distributed::RPCClient::GetInstance<RPCCLIENT_T>();
for (auto& ep : endpoint_list) { for (auto& ep : endpoint_list) {
VLOG(3) << "sending nccl id to " << ep; VLOG(3) << "sending nccl id to " << ep;
...@@ -80,11 +81,11 @@ class GenNCCLIdOp : public framework::OperatorBase { ...@@ -80,11 +81,11 @@ class GenNCCLIdOp : public framework::OperatorBase {
// NOTE: Can not use unique_ptr here because the default // NOTE: Can not use unique_ptr here because the default
// deleter will call GRPC Server's base class's dtor and // deleter will call GRPC Server's base class's dtor and
// that will cause a weird crash. // that will cause a weird crash.
detail::RequestSendHandler rpc_h(true); distributed::RequestSendHandler rpc_h(true);
std::unique_ptr<detail::RPCServer> rpc_service( std::unique_ptr<distributed::RPCServer> rpc_service(
new RPCSERVER_T(endpoint, 1)); new RPCSERVER_T(endpoint, 1));
rpc_service->RegisterRPC(detail::kRequestSend, &rpc_h); rpc_service->RegisterRPC(distributed::kRequestSend, &rpc_h);
rpc_h.SetRPCServer(rpc_service.get()); rpc_h.SetRPCServer(rpc_service.get());
framework::ProgramDesc empty_program; framework::ProgramDesc empty_program;
...@@ -95,11 +96,11 @@ class GenNCCLIdOp : public framework::OperatorBase { ...@@ -95,11 +96,11 @@ class GenNCCLIdOp : public framework::OperatorBase {
rpc_h.SetExecutor(&executor); rpc_h.SetExecutor(&executor);
std::thread server_thread( std::thread server_thread(
std::bind(&detail::RPCServer::StartServer, rpc_service.get())); std::bind(&distributed::RPCServer::StartServer, rpc_service.get()));
rpc_service->SetCond(detail::kRequestSend); rpc_service->SetCond(distributed::kRequestSend);
VLOG(3) << "start getting nccl id from trainer 0..."; VLOG(3) << "start getting nccl id from trainer 0...";
rpc_service->WaitBarrier(detail::kRequestSend); rpc_service->WaitBarrier(distributed::kRequestSend);
VLOG(3) << "got nccl id and stop server..."; VLOG(3) << "got nccl id and stop server...";
rpc_service->ShutDown(); rpc_service->ShutDown();
VLOG(3) << "rpc server stopped"; VLOG(3) << "rpc server stopped";
......
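gen_nccl_id_op above spins up a temporary RPC server, sets the condition to kRequestSend, and blocks on WaitBarrier until trainer 0 has delivered the NCCL id. A toy condition-variable sketch of that wait-until-received handshake (FakeTrainer0 and nccl_id_received are invented names, not Paddle APIs):
#include <chrono>
#include <condition_variable>
#include <iostream>
#include <mutex>
#include <thread>
// Toy stand-in for RPCServer::SetCond/WaitBarrier: the main thread blocks
// until another thread signals that the expected request has arrived.
std::mutex mu;
std::condition_variable cv;
bool nccl_id_received = false;
void FakeTrainer0() {
  std::this_thread::sleep_for(std::chrono::milliseconds(100));
  {
    std::lock_guard<std::mutex> lock(mu);
    nccl_id_received = true;  // pretend the send RPC delivered the id
  }
  cv.notify_all();
}
int main() {
  std::thread trainer(FakeTrainer0);
  std::cout << "start getting nccl id from trainer 0..." << std::endl;
  {
    std::unique_lock<std::mutex> lock(mu);
    cv.wait(lock, [] { return nccl_id_received; });
  }
  std::cout << "got nccl id and stop server..." << std::endl;
  trainer.join();
  return 0;
}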
...@@ -21,14 +21,14 @@ limitations under the License. */ ...@@ -21,14 +21,14 @@ limitations under the License. */
#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/operators/detail/macros.h"
#include "paddle/fluid/operators/detail/request_handler_impl.h" #include "paddle/fluid/operators/distributed/request_handler_impl.h"
#include "paddle/fluid/operators/listen_and_serv_op.h" #include "paddle/fluid/operators/listen_and_serv_op.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
void RunServer(std::shared_ptr<detail::RPCServer> service) { void RunServer(std::shared_ptr<distributed::RPCServer> service) {
service->StartServer(); service->StartServer();
VLOG(4) << "RunServer thread end"; VLOG(4) << "RunServer thread end";
} }
...@@ -101,17 +101,16 @@ void ListenAndServOp::RunSyncLoop( ...@@ -101,17 +101,16 @@ void ListenAndServOp::RunSyncLoop(
framework::Scope *recv_scope, framework::Scope *recv_scope,
const std::vector<int> &prefetch_block_id_list) const { const std::vector<int> &prefetch_block_id_list) const {
size_t num_blocks = program->Size(); size_t num_blocks = program->Size();
auto optimize_blocks =
Attr<std::vector<framework::BlockDesc *>>(kOptimizeBlocks);
PADDLE_ENFORCE_GE(num_blocks, 2, PADDLE_ENFORCE_GE(num_blocks, 2,
"server program should have at least 2 blocks"); "server program should have at least 2 blocks");
std::vector<int> optimize_block_id_list; std::vector<int> optimize_blocks_idx;
for (int blkid = 1; blkid < num_blocks; ++blkid) { for (auto blk : optimize_blocks) {
if (std::find(prefetch_block_id_list.begin(), prefetch_block_id_list.end(), optimize_blocks_idx.push_back(blk->ID());
blkid) == prefetch_block_id_list.end()) {
optimize_block_id_list.push_back(blkid);
} }
} auto optimize_prepared = executor->Prepare(*program, optimize_blocks_idx);
auto optimize_prepared = executor->Prepare(*program, optimize_block_id_list);
// Insert placeholder for block0 which holds current op itself. // Insert placeholder for block0 which holds current op itself.
optimize_prepared.insert( optimize_prepared.insert(
optimize_prepared.begin(), optimize_prepared.begin(),
...@@ -121,12 +120,12 @@ void ListenAndServOp::RunSyncLoop( ...@@ -121,12 +120,12 @@ void ListenAndServOp::RunSyncLoop(
while (true) { while (true) {
// Get from multiple trainers, we don't care about the order in which // Get from multiple trainers, we don't care about the order in which
// the gradients arrive, just add suffix 0~n and merge the gradient. // the gradients arrive, just add suffix 0~n and merge the gradient.
rpc_service_->SetCond(detail::kRequestSend); rpc_service_->SetCond(distributed::kRequestSend);
rpc_service_->WaitBarrier(detail::kRequestSend); rpc_service_->WaitBarrier(distributed::kRequestSend);
if (rpc_service_->IsExit()) { if (rpc_service_->IsExit()) {
LOG(WARNING) << "get exit!rpc_processor break!"; LOG(WARNING) << "get exit!rpc_processor break!";
rpc_service_->SetCond(detail::kRequestGet); rpc_service_->SetCond(distributed::kRequestGet);
break; break;
} }
...@@ -134,14 +133,14 @@ void ListenAndServOp::RunSyncLoop( ...@@ -134,14 +133,14 @@ void ListenAndServOp::RunSyncLoop(
// and this will still work. // and this will still work.
// The optimize blocks which have the same parent ID would run parallel // The optimize blocks which have the same parent ID would run parallel
// TODO(Yancey1989): need to use ParallelExecutor for future // TODO(Yancey1989): need to use ParallelExecutor for future
int32_t last_parent_blkid = program->Block(1).Parent(); int32_t last_parent_blkid = optimize_blocks[0]->Parent();
std::vector<size_t> parallel_blkids; std::vector<size_t> parallel_blkids;
parallel_blkids.push_back(1); parallel_blkids.push_back(optimize_blocks[0]->ID());
double ts = GetTimestamp(); double ts = GetTimestamp();
for (size_t i = 1; i < optimize_block_id_list.size(); ++i) { for (size_t i = 1; i < optimize_blocks.size(); ++i) {
// skip the first optimize block because it is already in the // skip the first optimize block because it is already in the
// parallel_blkids. // parallel_blkids.
int blkid = optimize_block_id_list[i]; int blkid = optimize_blocks[i]->ID();
if (program->Block(blkid).Parent() != last_parent_blkid) { if (program->Block(blkid).Parent() != last_parent_blkid) {
ParallelExecuteBlocks(parallel_blkids, executor, optimize_prepared, ParallelExecuteBlocks(parallel_blkids, executor, optimize_prepared,
program, recv_scope); program, recv_scope);
...@@ -154,11 +153,11 @@ void ListenAndServOp::RunSyncLoop( ...@@ -154,11 +153,11 @@ void ListenAndServOp::RunSyncLoop(
recv_scope); recv_scope);
VLOG(2) << "run all blocks spent " << GetTimestamp() - ts << "(ms)"; VLOG(2) << "run all blocks spent " << GetTimestamp() - ts << "(ms)";
rpc_service_->SetCond(detail::kRequestGet); rpc_service_->SetCond(distributed::kRequestGet);
rpc_service_->WaitBarrier(detail::kRequestGet); rpc_service_->WaitBarrier(distributed::kRequestGet);
rpc_service_->ResetBarrierCounter(); rpc_service_->ResetBarrierCounter();
// reset received sparse vars to avoid reuse it in the next mini-batch // reset received sparse vars to avoid reuse it in the next mini-batch
dynamic_cast<detail::RequestSendHandler *>(request_send_handler_.get()) dynamic_cast<distributed::RequestSendHandler *>(request_send_handler_.get())
->ResetSparseVarRecorder(); ->ResetSparseVarRecorder();
} // while(true) } // while(true)
} }
...@@ -215,13 +214,13 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor, ...@@ -215,13 +214,13 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
} }
static void FillRequestCtx( static void FillRequestCtx(
detail::RequestHandler *h, framework::Scope *scope, distributed::RequestHandler *h, framework::Scope *scope,
platform::DeviceContext *dev_ctx, framework::Executor *executor, platform::DeviceContext *dev_ctx, framework::Executor *executor,
framework::ProgramDesc *program, framework::ProgramDesc *program,
std::unordered_map<std::string, std::unordered_map<std::string,
std::shared_ptr<framework::ExecutorPrepareContext>> std::shared_ptr<framework::ExecutorPrepareContext>>
*prefetch_ctx, *prefetch_ctx,
detail::RPCServer *rpc_server) { distributed::RPCServer *rpc_server) {
h->SetScope(scope); h->SetScope(scope);
h->SetDevCtx(dev_ctx); h->SetDevCtx(dev_ctx);
h->SetExecutor(executor); h->SetExecutor(executor);
...@@ -249,18 +248,23 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, ...@@ -249,18 +248,23 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
rpc_service_.reset(new RPCSERVER_T(endpoint, fan_in)); rpc_service_.reset(new RPCSERVER_T(endpoint, fan_in));
request_send_handler_.reset(new detail::RequestSendHandler(sync_mode)); request_send_handler_.reset(new distributed::RequestSendHandler(sync_mode));
request_get_handler_.reset(new detail::RequestGetHandler(sync_mode)); request_get_handler_.reset(new distributed::RequestGetHandler(sync_mode));
request_prefetch_handler_.reset( request_prefetch_handler_.reset(
new detail::RequestPrefetchHandler(sync_mode)); new distributed::RequestPrefetchHandler(sync_mode));
rpc_service_->RegisterRPC(detail::kRequestSend, request_send_handler_.get()); rpc_service_->RegisterRPC(distributed::kRequestSend,
rpc_service_->RegisterRPC(detail::kRequestGet, request_get_handler_.get()); request_send_handler_.get());
rpc_service_->RegisterRPC(detail::kRequestPrefetch, rpc_service_->RegisterRPC(distributed::kRequestGet,
request_get_handler_.get());
rpc_service_->RegisterRPC(distributed::kRequestPrefetch,
request_prefetch_handler_.get()); request_prefetch_handler_.get());
auto *optimize_block = Attr<framework::BlockDesc *>(kOptimizeBlock); auto optimize_blocks =
auto *program = optimize_block->Program(); Attr<std::vector<framework::BlockDesc *>>(kOptimizeBlocks);
PADDLE_ENFORCE(optimize_blocks.size() >= 1,
"optimize blocks should be 1 at least on the pserver side.");
auto *program = optimize_blocks[0]->Program();
framework::Executor executor(dev_place); framework::Executor executor(dev_place);
// prepare for prefetch // prepare for prefetch
...@@ -337,8 +341,9 @@ class ListenAndServOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -337,8 +341,9 @@ class ListenAndServOpMaker : public framework::OpProtoAndCheckerMaker {
"a map from grad name to it's optimize block id") "a map from grad name to it's optimize block id")
.SetDefault({}); .SetDefault({});
AddAttr<bool>("sync_mode", "if works at sync_mode or not").SetDefault(true); AddAttr<bool>("sync_mode", "if works at sync_mode or not").SetDefault(true);
AddAttr<framework::BlockDesc *>(kOptimizeBlock, AddAttr<std::vector<framework::BlockDesc *>>(
"BlockID to run on server side."); kOptimizeBlocks, "Optimize blocks to run on server side.")
.SetDefault({});
AddAttr<std::vector<std::string>>(kPrefetchVarNameToBlockId, AddAttr<std::vector<std::string>>(kPrefetchVarNameToBlockId,
"prefetch blocks to run on server side.") "prefetch blocks to run on server side.")
.SetDefault({}); .SetDefault({});
......
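With kOptimizeBlocks, the sync loop above walks the blocks named by the attribute, batching consecutive blocks that share a parent block and executing each batch in parallel. A standalone sketch of that grouping step over invented (block id, parent id) pairs:
#include <iostream>
#include <utility>
#include <vector>
int main() {
  // (block id, parent id) for a hypothetical pserver program; consecutive
  // blocks with the same parent are collected and run as one parallel batch.
  std::vector<std::pair<int, int>> optimize_blocks = {
      {1, 0}, {2, 0}, {3, 1}, {4, 1}, {5, 0}};
  int last_parent = optimize_blocks[0].second;
  std::vector<int> parallel_blkids = {optimize_blocks[0].first};
  auto flush = [&]() {
    std::cout << "run in parallel:";
    for (int id : parallel_blkids) std::cout << " " << id;
    std::cout << std::endl;
    parallel_blkids.clear();
  };
  for (size_t i = 1; i < optimize_blocks.size(); ++i) {
    if (optimize_blocks[i].second != last_parent) {
      flush();  // parent changed: execute the batch collected so far
      last_parent = optimize_blocks[i].second;
    }
    parallel_blkids.push_back(optimize_blocks[i].first);
  }
  flush();  // run the final batch
  return 0;
}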
...@@ -24,16 +24,16 @@ limitations under the License. */ ...@@ -24,16 +24,16 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/framework/threadpool.h"
#include "paddle/fluid/operators/detail/request_handler.h" #include "paddle/fluid/operators/distributed/request_handler.h"
#include "paddle/fluid/operators/detail/rpc_server.h" #include "paddle/fluid/operators/distributed/rpc_server.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
constexpr char kOptimizeBlock[] = "OptimizeBlock"; constexpr char kOptimizeBlocks[] = "optimize_blocks";
constexpr char kPrefetchVarNameToBlockId[] = "prefetch_var_name_to_block_id"; constexpr char kPrefetchVarNameToBlockId[] = "prefetch_var_name_to_block_id";
void RunServer(std::shared_ptr<detail::RPCServer> service); void RunServer(std::shared_ptr<distributed::RPCServer> service);
class ListenAndServOp : public framework::OperatorBase { class ListenAndServOp : public framework::OperatorBase {
public: public:
...@@ -62,10 +62,11 @@ class ListenAndServOp : public framework::OperatorBase { ...@@ -62,10 +62,11 @@ class ListenAndServOp : public framework::OperatorBase {
const platform::Place& dev_place) const override; const platform::Place& dev_place) const override;
protected: protected:
mutable std::shared_ptr<detail::RPCServer> rpc_service_; mutable std::shared_ptr<distributed::RPCServer> rpc_service_;
mutable std::shared_ptr<detail::RequestHandler> request_send_handler_; mutable std::shared_ptr<distributed::RequestHandler> request_send_handler_;
mutable std::shared_ptr<detail::RequestHandler> request_get_handler_; mutable std::shared_ptr<distributed::RequestHandler> request_get_handler_;
mutable std::shared_ptr<detail::RequestHandler> request_prefetch_handler_; mutable std::shared_ptr<distributed::RequestHandler>
request_prefetch_handler_;
mutable std::shared_ptr<std::thread> server_thread_; mutable std::shared_ptr<std::thread> server_thread_;
}; };
......
...@@ -146,6 +146,6 @@ REGISTER_UNARY_LOGICAL_OP(logical_not, "$$Out = !X$$"); ...@@ -146,6 +146,6 @@ REGISTER_UNARY_LOGICAL_OP(logical_not, "$$Out = !X$$");
REGISTER_UNARY_LOGICAL_KERNEL(logical_not, CPU, REGISTER_UNARY_LOGICAL_KERNEL(logical_not, CPU,
paddle::operators::LogicalNotFunctor); paddle::operators::LogicalNotFunctor);
REGISTER_BINARY_LOGICAL_OP(logical_xor, REGISTER_BINARY_LOGICAL_OP(logical_xor,
"$$Out = (X || Y) \\, \\&\\& \\, !(X \\&\\& Y)$$"); "$$Out = (X || Y) \\&\\& !(X \\&\\& Y)$$");
REGISTER_BINARY_LOGICAL_KERNEL(logical_xor, CPU, REGISTER_BINARY_LOGICAL_KERNEL(logical_xor, CPU,
paddle::operators::LogicalXorFunctor); paddle::operators::LogicalXorFunctor);
...@@ -70,21 +70,23 @@ template <typename T> ...@@ -70,21 +70,23 @@ template <typename T>
class ConcatGradFunctor<platform::CPUDeviceContext, T> { class ConcatGradFunctor<platform::CPUDeviceContext, T> {
public: public:
void operator()(const platform::CPUDeviceContext& context, void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& input, const int axis, const framework::Tensor& input,
std::vector<framework::Tensor>* outputs) { const std::vector<const framework::Tensor*>& ref_inputs,
const int axis, std::vector<framework::Tensor*>* outputs) {
// TODO(zcd): Add input data validity checking // TODO(zcd): Add input data validity checking
int num = outputs->size(); size_t num = outputs->size();
int input_rows = 1; int input_rows = 1;
auto dim_0 = outputs->at(0).dims(); auto dim_0 = ref_inputs[0]->dims();
for (int i = 0; i < axis; ++i) { for (int i = 0; i < axis; ++i) {
input_rows *= dim_0[i]; input_rows *= dim_0[i];
} }
int input_cols = 0; int input_cols = 0;
std::vector<int64_t> output_cols(outputs->size()); std::vector<int64_t> output_cols(outputs->size());
for (int i = 0; i < num; ++i) { for (size_t i = 0; i < num; ++i) {
int t_cols = outputs->at(i).numel() / input_rows; int t_cols = ref_inputs[i]->numel() / input_rows;
input_cols += t_cols; input_cols += t_cols;
output_cols[i] = t_cols; output_cols[i] = t_cols;
} }
...@@ -94,11 +96,14 @@ class ConcatGradFunctor<platform::CPUDeviceContext, T> { ...@@ -94,11 +96,14 @@ class ConcatGradFunctor<platform::CPUDeviceContext, T> {
for (int k = 0; k < input_rows; ++k) { for (int k = 0; k < input_rows; ++k) {
const T* src_ptr = input.data<T>() + k * input_cols; const T* src_ptr = input.data<T>() + k * input_cols;
int col_idx = 0; int col_idx = 0;
for (int j = 0; j < num; ++j) { for (size_t j = 0; j < num; ++j) {
int col_len = output_cols[j]; int col_len = output_cols[j];
T* dst_ptr = outputs->at(j).data<T>() + k * col_len; auto* out_tensor = outputs->at(j);
if (out_tensor != nullptr) {
T* dst_ptr = out_tensor->data<T>() + k * col_len;
memory::Copy(cpu_place, dst_ptr, cpu_place, src_ptr + col_idx, memory::Copy(cpu_place, dst_ptr, cpu_place, src_ptr + col_idx,
sizeof(T) * col_len); sizeof(T) * col_len);
}
col_idx += col_len; col_idx += col_len;
} }
} }
......
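The CPU ConcatGradFunctor now sizes each output segment from ref_inputs and skips outputs passed as nullptr (gradients that are not needed), while still advancing the column offset. The same skip-but-advance idea in a tiny host example with made-up widths:
#include <cstring>
#include <iostream>
#include <vector>
int main() {
  // One input row of 6 values split into segments of width 2, 3 and 1.
  // A null output pointer means "this gradient is not wanted", so that
  // segment is skipped but the source column offset still advances.
  std::vector<float> input = {0, 1, 2, 3, 4, 5};
  std::vector<int> cols = {2, 3, 1};
  std::vector<float> out0(2), out2(1);
  std::vector<float*> outputs = {out0.data(), nullptr, out2.data()};
  int col_idx = 0;
  for (size_t j = 0; j < outputs.size(); ++j) {
    if (outputs[j] != nullptr) {
      std::memcpy(outputs[j], input.data() + col_idx, sizeof(float) * cols[j]);
    }
    col_idx += cols[j];
  }
  std::cout << "out0: " << out0[0] << " " << out0[1]
            << ", out2: " << out2[0] << std::endl;
  return 0;
}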
...@@ -22,43 +22,24 @@ namespace paddle { ...@@ -22,43 +22,24 @@ namespace paddle {
namespace operators { namespace operators {
namespace math { namespace math {
template <typename T>
__device__ T upper_bound(const T* first, T count, T val) {
const T* orig = first;
const T* it = nullptr;
T step = 0;
while (count > 0) {
it = first;
step = count / 2;
it += step;
if (!(val < *it)) {
first = ++it;
count -= step + 1;
} else {
count = step;
}
}
return first - orig;
}
template <typename T> template <typename T>
__global__ void KernelConcat(T** inputs, const int* input_cols, int col_size, __global__ void KernelConcat(T** inputs, const int* input_cols, int col_size,
const int output_rows, const int output_cols, const int output_rows, const int output_cols,
T* output) { T* output) {
int tid_x = blockIdx.x * blockDim.x + threadIdx.x; int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
int segment = upper_bound<int>(input_cols, col_size, tid_x) - 1; int curr_segment = 0;
int curr_offset = input_cols[0];
int curr_offset = input_cols[segment];
int curr_segment = segment;
for (; tid_x < output_cols; tid_x += blockDim.x * gridDim.x) { for (; tid_x < output_cols; tid_x += blockDim.x * gridDim.x) {
T curr_col_offset; int curr_col_offset = input_cols[curr_segment + 1];
while ((curr_col_offset = input_cols[curr_segment + 1]) <= tid_x) { while (curr_col_offset <= tid_x) {
curr_offset = curr_col_offset; curr_offset = curr_col_offset;
++curr_segment; ++curr_segment;
curr_col_offset = input_cols[curr_segment + 1];
} }
int local_col = tid_x - curr_offset; int local_col = tid_x - curr_offset;
int segment_width = curr_col_offset - curr_offset; int segment_width = curr_col_offset - curr_offset;
T* input_ptr = inputs[curr_segment]; T* input_ptr = inputs[curr_segment];
int tid_y = blockIdx.y * blockDim.y + threadIdx.y; int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
for (; tid_y < output_rows; tid_y += blockDim.y * gridDim.y) for (; tid_y < output_rows; tid_y += blockDim.y * gridDim.y)
...@@ -89,24 +70,26 @@ __global__ void KernelConcatGrad(const T* input_data, const int in_row, ...@@ -89,24 +70,26 @@ __global__ void KernelConcatGrad(const T* input_data, const int in_row,
const int in_col, const int* out_cols, const int in_col, const int* out_cols,
int out_cols_size, T** outputs_data) { int out_cols_size, T** outputs_data) {
int tid_x = blockIdx.x * blockDim.x + threadIdx.x; int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
int segment = upper_bound<int>(out_cols, out_cols_size, tid_x) - 1; int curr_segment = 0;
int curr_offset = out_cols[segment]; int curr_offset = out_cols[0];
int curr_segment = segment;
for (; tid_x < in_col; tid_x += blockDim.x * gridDim.x) { for (; tid_x < in_col; tid_x += blockDim.x * gridDim.x) {
T curr_col_offset; int curr_col_offset = out_cols[curr_segment + 1];
while ((curr_col_offset = out_cols[curr_segment + 1]) <= tid_x) { while (curr_col_offset <= tid_x) {
curr_offset = curr_col_offset; curr_offset = curr_col_offset;
++curr_segment; ++curr_segment;
curr_col_offset = out_cols[curr_segment + 1];
} }
int local_col = tid_x - curr_offset; int local_col = tid_x - curr_offset;
int segment_width = curr_col_offset - curr_offset; int segment_width = curr_col_offset - curr_offset;
T* output_ptr = outputs_data[curr_segment]; T* output_ptr = outputs_data[curr_segment];
if (output_ptr != nullptr) {
int tid_y = blockIdx.y * blockDim.y + threadIdx.y; int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y) for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y)
output_ptr[tid_y * segment_width + local_col] = output_ptr[tid_y * segment_width + local_col] =
input_data[tid_y * in_col + tid_x]; input_data[tid_y * in_col + tid_x];
} }
}
} }
template <typename T> template <typename T>
...@@ -118,11 +101,13 @@ __global__ void KernelConcatGrad(const T* input_data, const int in_row, ...@@ -118,11 +101,13 @@ __global__ void KernelConcatGrad(const T* input_data, const int in_row,
int split = tid_x / fixed_out_col; int split = tid_x / fixed_out_col;
int in_offset = tid_x - split * fixed_out_col; int in_offset = tid_x - split * fixed_out_col;
T* output_ptr = outputs_data[split]; T* output_ptr = outputs_data[split];
if (output_ptr != nullptr) {
int tid_y = blockIdx.y * blockDim.y + threadIdx.y; int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y) for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y)
output_ptr[tid_y * fixed_out_col + in_offset] = output_ptr[tid_y * fixed_out_col + in_offset] =
input_data[tid_y * in_col + tid_x]; input_data[tid_y * in_col + tid_x];
} }
}
} }
/* /*
...@@ -203,17 +188,18 @@ template <typename T> ...@@ -203,17 +188,18 @@ template <typename T>
class ConcatGradFunctor<platform::CUDADeviceContext, T> { class ConcatGradFunctor<platform::CUDADeviceContext, T> {
public: public:
void operator()(const platform::CUDADeviceContext& context, void operator()(const platform::CUDADeviceContext& context,
const framework::Tensor& input, const int axis, const framework::Tensor& input,
std::vector<framework::Tensor>* outputs) { const std::vector<const framework::Tensor*>& ref_inputs,
const int axis, std::vector<framework::Tensor*>* outputs) {
// TODO(zcd): Add input data validity checking // TODO(zcd): Add input data validity checking
int o_num = outputs->size(); int o_num = outputs->size();
int out_row = 1; int out_row = 1;
auto dim_0 = outputs->at(0).dims(); auto dim_0 = ref_inputs[0]->dims();
for (int i = 0; i < axis; ++i) { for (int i = 0; i < axis; ++i) {
out_row *= dim_0[i]; out_row *= dim_0[i];
} }
int out_col = outputs->at(0).numel() / out_row; int out0_col = ref_inputs[0]->numel() / out_row;
int in_col = 0, in_row = out_row; int in_col = 0, in_row = out_row;
bool sameShape = true; bool sameShape = true;
...@@ -223,13 +209,17 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> { ...@@ -223,13 +209,17 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> {
outputs_cols[0] = 0; outputs_cols[0] = 0;
for (int i = 0; i < o_num; ++i) { for (int i = 0; i < o_num; ++i) {
int t_col = outputs->at(i).numel() / out_row; int t_col = ref_inputs.at(i)->numel() / out_row;
if (sameShape) { if (sameShape) {
if (t_col != out_col) sameShape = false; if (t_col != out0_col) sameShape = false;
} }
in_col += t_col; in_col += t_col;
outputs_cols[i + 1] = in_col; outputs_cols[i + 1] = in_col;
outputs_ptr[i] = outputs->at(i).data<T>(); if (outputs->at(i) != nullptr) {
outputs_ptr[i] = outputs->at(i)->data<T>();
} else {
outputs_ptr[i] = nullptr;
}
} }
T** dev_out_gpu_data = T** dev_out_gpu_data =
...@@ -255,7 +245,7 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> { ...@@ -255,7 +245,7 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> {
if (sameShape) { if (sameShape) {
KernelConcatGrad<<<grid_size, block_size, 0, context.stream()>>>( KernelConcatGrad<<<grid_size, block_size, 0, context.stream()>>>(
input.data<T>(), in_row, in_col, out_col, dev_out_gpu_data); input.data<T>(), in_row, in_col, out0_col, dev_out_gpu_data);
} else { } else {
const int* dev_outs_col_data = outputs_cols.CUDAData(context.GetPlace()); const int* dev_outs_col_data = outputs_cols.CUDAData(context.GetPlace());
KernelConcatGrad<<<grid_size, block_size, 0, context.stream()>>>( KernelConcatGrad<<<grid_size, block_size, 0, context.stream()>>>(
......
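Both CUDA kernels above drop the per-thread upper_bound binary search in favor of carrying curr_segment across the grid-stride loop, so consecutive columns usually advance the segment index by at most one step. A host-side sketch of that incremental lookup over a prefix-sum offsets array (values invented):
#include <iostream>
#include <vector>
int main() {
  // offsets[i] is the starting column of segment i; the last entry is the
  // total column count (a prefix sum, like in_cols/out_cols in the kernels).
  std::vector<int> offsets = {0, 2, 5, 9};
  int curr_segment = 0;
  int curr_offset = offsets[0];
  for (int col = 0; col < offsets.back(); ++col) {
    int next_offset = offsets[curr_segment + 1];
    // Columns are visited in increasing order, so the segment index
    // only ever moves forward.
    while (next_offset <= col) {
      curr_offset = next_offset;
      ++curr_segment;
      next_offset = offsets[curr_segment + 1];
    }
    std::cout << "col " << col << " -> segment " << curr_segment
              << ", local col " << (col - curr_offset) << std::endl;
  }
  return 0;
}
Because each thread's columns are strictly increasing, the forward-only scan amortizes to constant work per column, which is what the rewritten kernels rely on.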
...@@ -57,7 +57,8 @@ template <typename DeviceContext, typename T> ...@@ -57,7 +57,8 @@ template <typename DeviceContext, typename T>
class ConcatGradFunctor { class ConcatGradFunctor {
public: public:
void operator()(const DeviceContext& context, const framework::Tensor& input, void operator()(const DeviceContext& context, const framework::Tensor& input,
const int axis, std::vector<framework::Tensor>* outputs); const std::vector<const framework::Tensor*>& ref_inputs,
const int axis, std::vector<framework::Tensor*>* outputs);
}; };
} // namespace math } // namespace math
......
...@@ -30,6 +30,7 @@ template struct SetConstant<platform::CPUDeviceContext, double>; ...@@ -30,6 +30,7 @@ template struct SetConstant<platform::CPUDeviceContext, double>;
template struct SetConstant<platform::CPUDeviceContext, int>; template struct SetConstant<platform::CPUDeviceContext, int>;
template struct SetConstant<platform::CPUDeviceContext, int64_t>; template struct SetConstant<platform::CPUDeviceContext, int64_t>;
template struct SetConstant<platform::CPUDeviceContext, bool>; template struct SetConstant<platform::CPUDeviceContext, bool>;
template struct SetConstant<platform::CPUDeviceContext, uint8_t>;
#define DEFINE_CPU_TRANS(RANK) \ #define DEFINE_CPU_TRANS(RANK) \
template struct Transpose<platform::CPUDeviceContext, platform::float16, \ template struct Transpose<platform::CPUDeviceContext, platform::float16, \
......
...@@ -295,7 +295,7 @@ class ParallelDoGradOp : public framework::OperatorBase { ...@@ -295,7 +295,7 @@ class ParallelDoGradOp : public framework::OperatorBase {
auto sum_op = framework::OpRegistry::CreateOp( auto sum_op = framework::OpRegistry::CreateOp(
"sum", {{"X", {s, tmp_name}}}, {{"Out", {s}}}, "sum", {{"X", {s, tmp_name}}}, {{"Out", {s}}},
framework::AttributeMap{}); framework::AttributeMap{{"use_mkldnn", {false}}});
VLOG(10) << sum_op->DebugStringEx(sub_scopes[0]); VLOG(10) << sum_op->DebugStringEx(sub_scopes[0]);
sum_op->Run(*sub_scopes[0], places[0]); sum_op->Run(*sub_scopes[0], places[0]);
WaitOnPlace(places[0]); WaitOnPlace(places[0]);
......
...@@ -41,8 +41,8 @@ class PrefetchOp : public framework::OperatorBase { ...@@ -41,8 +41,8 @@ class PrefetchOp : public framework::OperatorBase {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto& ctx = *pool.Get(place); auto& ctx = *pool.Get(place);
detail::RPCClient* rpc_client = distributed::RPCClient* rpc_client =
detail::RPCClient::GetInstance<RPCCLIENT_T>(); distributed::RPCClient::GetInstance<RPCCLIENT_T>();
for (size_t i = 0; i < ins.size(); i++) { for (size_t i = 0; i < ins.size(); i++) {
if (NeedSend(scope, ins[i])) { if (NeedSend(scope, ins[i])) {
......
...@@ -429,7 +429,8 @@ class RecurrentGradOp : public RecurrentBase { ...@@ -429,7 +429,8 @@ class RecurrentGradOp : public RecurrentBase {
auto sum_op = framework::OpRegistry::CreateOp( auto sum_op = framework::OpRegistry::CreateOp(
"sum", {{"X", {pg_names[param_id], new_inside_name}}}, "sum", {{"X", {pg_names[param_id], new_inside_name}}},
{{"Out", {pg_names[param_id]}}}, framework::AttributeMap{}); {{"Out", {pg_names[param_id]}}},
framework::AttributeMap{{"use_mkldnn", {false}}});
sum_op->Run(cur_scope, place); sum_op->Run(cur_scope, place);
cur_scope.Rename(new_inside_name, inside_grad_name); cur_scope.Rename(new_inside_name, inside_grad_name);
......
(The remaining file diffs in this commit are collapsed in this view.)