diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index eeda759ff18ccb86ce6a585fe41cb972ea3ae295..e718b32cb6c48d11e73600509a17db107f438708 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -23,7 +23,7 @@ repos:
     -   id: clang-format-with-version-check
         name: clang-format
         description: Format files with ClangFormat.
-        entry: bash ./.clang_format.hook -i
+        entry: bash ./tools/codestyle/clang_format.hook -i
         language: system
         files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto)$
 -   repo: local
@@ -52,7 +52,7 @@ repos:
     hooks:
     -   id: copyright_checker
         name: copyright_checker
-        entry: python ./.copyright.hook
+        entry: python ./tools/codestyle/copyright.hook
         language: system
         files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py)$
         exclude: (?!.*third_party)^.*$ | (?!.*book)^.*$
diff --git a/benchmark/fluid/Dockerfile b/benchmark/fluid/Dockerfile
index b9eaca5ee6b487bb37bb954b3c606c3096d37aeb..707fadb1fae97cefe8a41715cd57d71754abda41 100644
--- a/benchmark/fluid/Dockerfile
+++ b/benchmark/fluid/Dockerfile
@@ -1,11 +1,18 @@
 FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04
+
+# Use UBUNTU_MIRROR can speed up apt-get speed.
+# ARG UBUNTU_MIRROR
+# RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'
+
 RUN apt-get update && apt-get install -y python python-pip iputils-ping libgtk2.0-dev wget vim net-tools iftop python-opencv
 RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.7 /usr/lib/libcudnn.so && ln -s /usr/lib/x86_64-linux-gnu/libnccl.so.2 /usr/lib/libnccl.so
-RUN pip install -U pip
-RUN pip install -U kubernetes paddlepaddle
 
 # IMPORTANT:
 # Add "ENV http_proxy=http://ip:port" if your download is slow, and don't forget to unset it at runtime.
+# exmaple: unset http_proxy && unset https_proxy && python fluid_benchmark.py ...
+
+RUN pip install -U pip
+RUN pip install -U kubernetes paddlepaddle
 
 RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()\npaddle.dataset.flowers.fetch()" | python'
 RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.mnist.train()\npaddle.dataset.mnist.test()\npaddle.dataset.imdb.fetch()" | python'
@@ -14,9 +21,11 @@ RUN pip uninstall -y paddlepaddle && mkdir /workspace
 
 ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
 ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root
+RUN chmod +x /usr/bin/paddle_k8s
 
 ADD *.whl /
-RUN pip install /*.whl && rm -f /*.whl && chmod +x /usr/bin/paddle_k8s
+RUN pip install /*.whl && rm -f /*.whl 
 
 ENV LD_LIBRARY_PATH=/usr/local/lib
-ADD fluid_benchmark.py recordio_converter.py models/ /workspace/
+ADD fluid_benchmark.py recordio_converter.py args.py recordio_converter.py run.sh run_fluid_benchmark.sh /workspace/
+ADD models/ /workspace/models/
diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py
index acd803ddeee89b43c26ed2da506291f8a10aff7d..ece1102dce987cda994ff086b07f756498ce26e6 100644
--- a/benchmark/fluid/fluid_benchmark.py
+++ b/benchmark/fluid/fluid_benchmark.py
@@ -264,8 +264,6 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
                     break
             else:
                 loss, = exe.run([avg_loss.name], feed=feeder.feed(data))
-            if args.update_method == "pserver":
-                exe.bcast_params()
             if args.use_reader_op:
                 num_samples += args.batch_size * args.gpus
             else:
@@ -301,9 +299,18 @@ def print_train_time(start_time, end_time, num_samples):
           (num_samples, train_elapsed, examples_per_sec))
 
 
+def print_paddle_envs():
+    print('----------- Configuration envs -----------')
+    for k in os.environ:
+        if "PADDLE_" in k:
+            print "ENV %s:%s" % (k, os.environ[k])
+    print('------------------------------------------------')
+
+
 def main():
     args = parse_args()
     print_arguments(args)
+    print_paddle_envs()
 
     # the unique trainer id, starting from 0, needed by trainer
     # only
diff --git a/benchmark/fluid/kube_gen_job.py b/benchmark/fluid/kube_gen_job.py
index f8afa3e9efd9b8780a08d937c9df3670baa735db..dfe8b5cdd58456902fa8ec355e9837dface3f7be 100644
--- a/benchmark/fluid/kube_gen_job.py
+++ b/benchmark/fluid/kube_gen_job.py
@@ -17,6 +17,7 @@ import copy
 import argparse
 import random
 import os
+import copy
 from kube_templates import pserver, trainer, envs
 
 
@@ -109,10 +110,9 @@ def gen_job():
 
     envs.append({"name": "PADDLE_JOB_NAME", "value": args.jobname})
     envs.append({"name": "PADDLE_TRAINERS", "value": str(args.trainers)})
-    envs.append({"name": "PSERVERS", "value": str(args.pservers)})
+    envs.append({"name": "PADDLE_PSERVERS", "value": str(args.pservers)})
     envs.append({"name": "ENTRY", "value": args.entry})
     envs.append({"name": "PADDLE_PSERVER_PORT", "value": str(args.port)})
-    envs.append({"name": "PADDLE_PSERVER_PORT", "value": str(args.port)})
     # NOTE: these directories below are cluster specific, please modify
     # this settings before you run on your own cluster.
     envs.append({
@@ -166,7 +166,7 @@ def gen_job():
     tn["spec"]["template"]["spec"]["volumes"] = volumes
     tn_container["volumeMounts"] = volumeMounts
 
-    ps_container["env"] = envs
+    ps_container["env"] = copy.deepcopy(envs)
     ps_container["env"].append({
         "name": "PADDLE_TRAINING_ROLE",
         "value": "PSERVER"
diff --git a/doc/fluid/api/gen_doc.sh b/doc/fluid/api/gen_doc.sh
index acc8b4aa3fb258e5beef2d1e54919d429cf7ea6f..9ce6a9a7c329055a755cdb0a40c8c1c2af09a61c 100755
--- a/doc/fluid/api/gen_doc.sh
+++ b/doc/fluid/api/gen_doc.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 python gen_doc.py layers --submodules control_flow device io nn ops tensor detection learning_rate_scheduler metric > layers.rst
 
-for module in data_feeder clip metrics executor initializer io nets optimizer param_attr profiler regularizer
+for module in data_feeder clip metrics executor initializer io nets optimizer param_attr profiler regularizer transpiler
 do
   python gen_doc.py ${module} > ${module}.rst
 done
diff --git a/doc/fluid/api/transpiler.rst b/doc/fluid/api/transpiler.rst
new file mode 100644
index 0000000000000000000000000000000000000000..b3535b449eb0e5ac6563256ddac3bf4a27fd8ce6
--- /dev/null
+++ b/doc/fluid/api/transpiler.rst
@@ -0,0 +1,46 @@
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+
+==========
+transpiler
+==========
+
+DistributeTranspiler
+--------------------
+
+..  autoclass:: paddle.fluid.transpiler.DistributeTranspiler
+    :members:
+    :noindex:
+
+InferenceTranspiler
+-------------------
+
+..  autoclass:: paddle.fluid.transpiler.InferenceTranspiler
+    :members:
+    :noindex:
+
+memory_optimize
+---------------
+
+..  autofunction:: paddle.fluid.transpiler.memory_optimize
+    :noindex:
+
+release_memory
+--------------
+
+..  autofunction:: paddle.fluid.transpiler.release_memory
+    :noindex:
+
+HashName
+--------
+
+..  autoclass:: paddle.fluid.transpiler.HashName
+    :members:
+    :noindex:
+
+RoundRobin
+----------
+
+..  autoclass:: paddle.fluid.transpiler.RoundRobin
+    :members:
+    :noindex:
diff --git a/doc/fluid/howto/inference/build_and_install_lib_cn.rst b/doc/fluid/howto/inference/build_and_install_lib_cn.rst
index c8d9992fcc92c25f8c14f71c79bde9f79fd92b1f..84005b54e07cf810649370d2c1f6b6c522434bf6 100644
--- a/doc/fluid/howto/inference/build_and_install_lib_cn.rst
+++ b/doc/fluid/howto/inference/build_and_install_lib_cn.rst
@@ -13,6 +13,7 @@ cpu_noavx_openblas       `fluid.tgz <https://guest:@paddleci.ngrok.io/repository
 cuda7.5_cudnn5_avx_mkl   `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/fluid.tgz>`_
 cuda8.0_cudnn5_avx_mkl   `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/fluid.tgz>`_
 cuda8.0_cudnn7_avx_mkl   `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/fluid.tgz>`_
+cuda9.0_cudnn7_avx_mkl   `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda90cudnn7avxMkl/.lastSuccessful/fluid.tgz>`_
 ======================   ========================================
 
 从源码编译
diff --git a/paddle/contrib/inference/demo/simple_on_word2vec.cc b/paddle/contrib/inference/demo/simple_on_word2vec.cc
index 192a6414260ce06048b8c765402d89882cabc51b..2a4bfc87069b9fd8ece58dde210a6cb8344da536 100644
--- a/paddle/contrib/inference/demo/simple_on_word2vec.cc
+++ b/paddle/contrib/inference/demo/simple_on_word2vec.cc
@@ -40,10 +40,9 @@ void Main(bool use_gpu) {
     //# 2. Prepare input.
     int64_t data[4] = {1, 2, 3, 4};
 
-    PaddleBuf buf{.data = data, .length = sizeof(data)};
     PaddleTensor tensor{.name = "",
                         .shape = std::vector<int>({4, 1}),
-                        .data = buf,
+                        .data = PaddleBuf(data, sizeof(data)),
                         .dtype = PaddleDType::INT64};
 
     // For simplicity, we set all the slots with the same data.
@@ -55,14 +54,12 @@ void Main(bool use_gpu) {
 
     //# 4. Get output.
     ASSERT_EQ(outputs.size(), 1UL);
-    LOG(INFO) << "output buffer size: " << outputs.front().data.length;
-    const size_t num_elements = outputs.front().data.length / sizeof(float);
+    LOG(INFO) << "output buffer size: " << outputs.front().data.length();
+    const size_t num_elements = outputs.front().data.length() / sizeof(float);
     // The outputs' buffers are in CPU memory.
     for (size_t i = 0; i < std::min(5UL, num_elements); i++) {
-      LOG(INFO) << static_cast<float*>(outputs.front().data.data)[i];
+      LOG(INFO) << static_cast<float*>(outputs.front().data.data())[i];
     }
-    // TODO(Superjomn): this is should be free automatically
-    free(outputs[0].data.data);
   }
 }
 
@@ -86,10 +83,9 @@ void MainThreads(int num_threads, bool use_gpu) {
       for (int batch_id = 0; batch_id < num_batches; ++batch_id) {
         // 2. Dummy Input Data
         int64_t data[4] = {1, 2, 3, 4};
-        PaddleBuf buf{.data = data, .length = sizeof(data)};
         PaddleTensor tensor{.name = "",
                             .shape = std::vector<int>({4, 1}),
-                            .data = buf,
+                            .data = PaddleBuf(data, sizeof(data)),
                             .dtype = PaddleDType::INT64};
         std::vector<PaddleTensor> inputs(4, tensor);
         std::vector<PaddleTensor> outputs;
@@ -99,13 +95,13 @@ void MainThreads(int num_threads, bool use_gpu) {
         // 4. Get output.
         ASSERT_EQ(outputs.size(), 1UL);
         LOG(INFO) << "TID: " << tid << ", "
-                  << "output buffer size: " << outputs.front().data.length;
-        const size_t num_elements = outputs.front().data.length / sizeof(float);
+                  << "output buffer size: " << outputs.front().data.length();
+        const size_t num_elements =
+            outputs.front().data.length() / sizeof(float);
         // The outputs' buffers are in CPU memory.
         for (size_t i = 0; i < std::min(5UL, num_elements); i++) {
-          LOG(INFO) << static_cast<float*>(outputs.front().data.data)[i];
+          LOG(INFO) << static_cast<float*>(outputs.front().data.data())[i];
         }
-        free(outputs[0].data.data);
       }
     });
   }
diff --git a/paddle/contrib/inference/paddle_inference_api.cc b/paddle/contrib/inference/paddle_inference_api.cc
index d67e1e7667800d6dd00cb8915b0d6dc7c664970b..dc2842ae0eeb5592b6d4571b70df162886aee7a2 100644
--- a/paddle/contrib/inference/paddle_inference_api.cc
+++ b/paddle/contrib/inference/paddle_inference_api.cc
@@ -13,3 +13,53 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/contrib/inference/paddle_inference_api.h"
+
+namespace paddle {
+
+PaddleBuf::PaddleBuf(PaddleBuf&& other)
+    : data_(other.data_),
+      length_(other.length_),
+      memory_owned_(other.memory_owned_) {
+  other.memory_owned_ = false;
+  other.data_ = nullptr;
+  other.length_ = 0;
+}
+
+PaddleBuf::PaddleBuf(const PaddleBuf& other) { *this = other; }
+
+PaddleBuf& PaddleBuf::operator=(const PaddleBuf& other) {
+  // only the buffer with external memory can be copied
+  assert(!other.memory_owned_);
+  data_ = other.data_;
+  length_ = other.length_;
+  memory_owned_ = other.memory_owned_;
+  return *this;
+}
+
+void PaddleBuf::Resize(size_t length) {
+  // Only the owned memory can be reset, the external memory can't be changed.
+  if (length_ == length) return;
+  assert(memory_owned_);
+  Free();
+  data_ = new char[length];
+  length_ = length;
+  memory_owned_ = true;
+}
+
+void PaddleBuf::Reset(void* data, size_t length) {
+  Free();
+  memory_owned_ = false;
+  data_ = data;
+  length_ = length;
+}
+
+void PaddleBuf::Free() {
+  if (memory_owned_ && data_) {
+    assert(length_ > 0);
+    delete static_cast<char*>(data_);
+    data_ = nullptr;
+    length_ = 0;
+  }
+}
+
+}  // namespace paddle
\ No newline at end of file
diff --git a/paddle/contrib/inference/paddle_inference_api.h b/paddle/contrib/inference/paddle_inference_api.h
index 77e2d77b6b7fe3eeed865c8de0818d059cfa6c6e..bd4530fcf9518cb3bf06179d8f60a1dde38ff7dd 100644
--- a/paddle/contrib/inference/paddle_inference_api.h
+++ b/paddle/contrib/inference/paddle_inference_api.h
@@ -21,6 +21,7 @@ limitations under the License. */
 
 #pragma once
 
+#include <cassert>
 #include <memory>
 #include <string>
 #include <vector>
@@ -32,12 +33,38 @@ enum PaddleDType {
   INT64,
 };
 
-struct PaddleBuf {
-  void* data;     // pointer to the data memory.
-  size_t length;  // number of memory bytes.
+class PaddleBuf {
+ public:
+  PaddleBuf() = default;
+  PaddleBuf(PaddleBuf&& other);
+  // Copy only available when memory is managed externally.
+  explicit PaddleBuf(const PaddleBuf&);
+  PaddleBuf& operator=(const PaddleBuf&);
+  // Do not own the memory.
+  PaddleBuf(void* data, size_t length)
+      : data_(data), length_(length), memory_owned_{false} {}
+  // Own memory.
+  PaddleBuf(size_t length)
+      : data_(new char[length]), length_(length), memory_owned_(true) {}
+  // Resize to `length` bytes.
+  void Resize(size_t length);
+  // Reset to external memory.
+  void Reset(void* data, size_t length);
+  bool empty() const { return length_ == 0; }
+  void* data() const { return data_; }
+  size_t length() const { return length_; }
+
+  ~PaddleBuf() { Free(); }
+
+ private:
+  void Free();
+  void* data_{nullptr};  // pointer to the data memory.
+  size_t length_{0};     // number of memory bytes.
+  bool memory_owned_{true};
 };
 
 struct PaddleTensor {
+  PaddleTensor() = default;
   std::string name;  // variable name.
   std::vector<int> shape;
   // TODO(Superjomn) for LoD support, add a vector<vector<int>> field if needed.
@@ -67,8 +94,9 @@ class PaddlePredictor {
 
   // Predict an record.
   // The caller should be responsible for allocating and releasing the memory of
-  // `inputs`. `inputs` should be alive until Run returns. caller should be
-  // responsible for releasing the memory of `output_data`.
+  // `inputs`. `inputs` should be available until Run returns. Caller should be
+  // responsible for the output tensor's buffer, either allocated or passed from
+  // outside.
   virtual bool Run(const std::vector<PaddleTensor>& inputs,
                    std::vector<PaddleTensor>* output_data) = 0;
 
diff --git a/paddle/contrib/inference/paddle_inference_api_anakin_engine.cc b/paddle/contrib/inference/paddle_inference_api_anakin_engine.cc
index 5bafc58fa53f7d99de571f66b6224f0f2de66e32..ba2d30314715a57c5ab85e5ae1d8ac0512bbc74f 100644
--- a/paddle/contrib/inference/paddle_inference_api_anakin_engine.cc
+++ b/paddle/contrib/inference/paddle_inference_api_anakin_engine.cc
@@ -48,7 +48,7 @@ bool PaddleInferenceAnakinPredictor::Run(
     auto d_tensor_in_p = executor_.get_in(input.name);
     float *d_data_p = d_tensor_in_p->mutable_data();
     if (cudaMemcpy(d_data_p,
-                   static_cast<float *>(input.data.data),
+                   static_cast<float *>(input.data.data()),
                    d_tensor_in_p->valid_size() * sizeof(float),
                    cudaMemcpyHostToDevice) != 0) {
       LOG(ERROR) << "copy data from CPU to GPU error";
@@ -65,8 +65,11 @@ bool PaddleInferenceAnakinPredictor::Run(
   for (auto &output : *output_data) {
     auto *tensor = executor_.get_out(output.name);
     output.shape = tensor->shape();
+    if (output.data.length() < tensor->valid_size() * sizeof(float)) {
+      output.data.Resize(tensor->valid_size() * sizeof(float));
+    }
     // Copy data from GPU -> CPU
-    if (cudaMemcpy(output.data.data,
+    if (cudaMemcpy(output.data.data(),
                    tensor->mutable_data(),
                    tensor->valid_size() * sizeof(float),
                    cudaMemcpyDeviceToHost) != 0) {
diff --git a/paddle/contrib/inference/paddle_inference_api_anakin_engine_tester.cc b/paddle/contrib/inference/paddle_inference_api_anakin_engine_tester.cc
index 1d41a5c73e75723f8614d810eae09ed8cdc8cf2b..f92e9d4190412f5847e353ef1dc0324cad668c9a 100644
--- a/paddle/contrib/inference/paddle_inference_api_anakin_engine_tester.cc
+++ b/paddle/contrib/inference/paddle_inference_api_anakin_engine_tester.cc
@@ -37,28 +37,26 @@ TEST(inference, anakin) {
 
   float data[1 * 3 * 224 * 224] = {1.0f};
 
-  PaddleBuf buf{.data = data, .length = sizeof(data)};
   PaddleTensor tensor{.name = "input_0",
                       .shape = std::vector<int>({1, 3, 224, 224}),
-                      .data = buf,
+                      .data = PaddleBuf(data, sizeof(data)),
                       .dtype = PaddleDType::FLOAT32};
 
   // For simplicity, we set all the slots with the same data.
-  std::vector<PaddleTensor> paddle_tensor_feeds(1, tensor);
+  std::vector<PaddleTensor> paddle_tensor_feeds;
+  paddle_tensor_feeds.emplace_back(std::move(tensor));
 
-  float data_out[1000];
-
-  PaddleBuf buf_out{.data = data_out, .length = sizeof(data)};
   PaddleTensor tensor_out{.name = "prob_out",
                           .shape = std::vector<int>({1000, 1}),
-                          .data = buf_out,
+                          .data = PaddleBuf(),
                           .dtype = PaddleDType::FLOAT32};
 
-  std::vector<PaddleTensor> outputs(1, tensor_out);
+  std::vector<PaddleTensor> outputs;
+  outputs.emplace_back(std::move(tensor_out));
 
   ASSERT_TRUE(predictor->Run(paddle_tensor_feeds, &outputs));
 
-  float* data_o = static_cast<float*>(outputs[0].data.data);
+  float* data_o = static_cast<float*>(outputs[0].data.data());
   for (size_t j = 0; j < 1000; ++j) {
     LOG(INFO) << "output[" << j << "]: " << data_o[j];
   }
diff --git a/paddle/contrib/inference/paddle_inference_api_impl.cc b/paddle/contrib/inference/paddle_inference_api_impl.cc
index bda2981a14482e2c4a29773d37b074506cc344b1..d9129a704bc289ce1d416474537fc9234a07e5b8 100644
--- a/paddle/contrib/inference/paddle_inference_api_impl.cc
+++ b/paddle/contrib/inference/paddle_inference_api_impl.cc
@@ -178,8 +178,8 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
 
     // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
     std::memcpy(static_cast<void *>(input_ptr),
-                inputs[i].data.data,
-                inputs[i].data.length);
+                inputs[i].data.data(),
+                inputs[i].data.length());
     feeds->push_back(input);
   }
   return true;
@@ -241,10 +241,11 @@ bool NativePaddlePredictor::GetFetch(
     }
 
     outputs->at(i).shape = shape;
-    outputs->at(i).data.length = sizeof(float) * data.size();
-    outputs->at(i).data.data = malloc(outputs->at(i).data.length);
-    std::memcpy(
-        outputs->at(i).data.data, data.data(), outputs->at(i).data.length);
+    auto &buffer = outputs->at(i).data;
+    if (buffer.empty() || buffer.length() < sizeof(float) * data.size()) {
+      buffer.Resize(sizeof(float) * data.size());
+    }
+    std::memcpy(buffer.data(), data.data(), buffer.length());
     outputs->at(i).dtype = PaddleDType::FLOAT32;
     // TODO(panyx0718): support other types? fill tensor name? avoid a copy.
   }
diff --git a/paddle/contrib/inference/test_paddle_inference_api_impl.cc b/paddle/contrib/inference/test_paddle_inference_api_impl.cc
index 5d843010e02b09087e6b328428e80fb40eb5bb97..88c4e665a3daed0ed34b23b75d360acbd586401f 100644
--- a/paddle/contrib/inference/test_paddle_inference_api_impl.cc
+++ b/paddle/contrib/inference/test_paddle_inference_api_impl.cc
@@ -27,13 +27,12 @@ namespace paddle {
 
 PaddleTensor LodTensorToPaddleTensor(framework::LoDTensor* t) {
   PaddleTensor pt;
-  pt.data.data = t->data<void>();
 
   if (t->type() == typeid(int64_t)) {
-    pt.data.length = t->numel() * sizeof(int64_t);
+    pt.data.Reset(t->data<void>(), t->numel() * sizeof(int64_t));
     pt.dtype = PaddleDType::INT64;
   } else if (t->type() == typeid(float)) {
-    pt.data.length = t->numel() * sizeof(float);
+    pt.data.Reset(t->data<void>(), t->numel() * sizeof(float));
     pt.dtype = PaddleDType::FLOAT32;
   } else {
     LOG(FATAL) << "unsupported type.";
@@ -79,8 +78,8 @@ void MainWord2Vec(bool use_gpu) {
   std::vector<PaddleTensor> outputs;
   ASSERT_TRUE(predictor->Run(paddle_tensor_feeds, &outputs));
   ASSERT_EQ(outputs.size(), 1UL);
-  size_t len = outputs[0].data.length;
-  float* data = static_cast<float*>(outputs[0].data.data);
+  size_t len = outputs[0].data.length();
+  float* data = static_cast<float*>(outputs[0].data.data());
   for (size_t j = 0; j < len / sizeof(float); ++j) {
     ASSERT_LT(data[j], 1.0);
     ASSERT_GT(data[j], -1.0);
@@ -103,8 +102,6 @@ void MainWord2Vec(bool use_gpu) {
     EXPECT_LT(lod_data[i] - data[i], 1e-3);
     EXPECT_GT(lod_data[i] - data[i], -1e-3);
   }
-
-  free(outputs[0].data.data);
 }
 
 void MainImageClassification(bool use_gpu) {
@@ -143,13 +140,12 @@ void MainImageClassification(bool use_gpu) {
   std::vector<PaddleTensor> outputs;
   ASSERT_TRUE(predictor->Run(paddle_tensor_feeds, &outputs));
   ASSERT_EQ(outputs.size(), 1UL);
-  size_t len = outputs[0].data.length;
-  float* data = static_cast<float*>(outputs[0].data.data);
+  size_t len = outputs[0].data.length();
+  float* data = static_cast<float*>(outputs[0].data.data());
   float* lod_data = output1.data<float>();
   for (size_t j = 0; j < len / sizeof(float); ++j) {
     EXPECT_NEAR(lod_data[j], data[j], 1e-3);
   }
-  free(data);
 }
 
 void MainThreadsWord2Vec(bool use_gpu) {
@@ -192,8 +188,8 @@ void MainThreadsWord2Vec(bool use_gpu) {
 
       // check outputs range
       ASSERT_EQ(local_outputs.size(), 1UL);
-      const size_t len = local_outputs[0].data.length;
-      float* data = static_cast<float*>(local_outputs[0].data.data);
+      const size_t len = local_outputs[0].data.length();
+      float* data = static_cast<float*>(local_outputs[0].data.data());
       for (size_t j = 0; j < len / sizeof(float); ++j) {
         ASSERT_LT(data[j], 1.0);
         ASSERT_GT(data[j], -1.0);
@@ -205,7 +201,6 @@ void MainThreadsWord2Vec(bool use_gpu) {
       for (int i = 0; i < refs[tid].numel(); ++i) {
         EXPECT_NEAR(ref_data[i], data[i], 1e-3);
       }
-      free(data);
     });
   }
   for (int i = 0; i < num_jobs; ++i) {
@@ -251,14 +246,13 @@ void MainThreadsImageClassification(bool use_gpu) {
 
       // check outputs correctness
       ASSERT_EQ(local_outputs.size(), 1UL);
-      const size_t len = local_outputs[0].data.length;
-      float* data = static_cast<float*>(local_outputs[0].data.data);
+      const size_t len = local_outputs[0].data.length();
+      float* data = static_cast<float*>(local_outputs[0].data.data());
       float* ref_data = refs[tid].data<float>();
       EXPECT_EQ(refs[tid].numel(), len / sizeof(float));
       for (int i = 0; i < refs[tid].numel(); ++i) {
         EXPECT_NEAR(ref_data[i], data[i], 1e-3);
       }
-      free(data);
     });
   }
   for (int i = 0; i < num_jobs; ++i) {
diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
index 78356cb1be3bd089c26dde663275e2c8109df951..99dcaf27134e879fa85f57e5a675382442e9edf2 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
@@ -57,6 +57,7 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder(
   for (auto &p : params) {
     grad_names_.insert(GradVarName(p));
   }
+  balance_vars_.resize(places_.size(), 0);
 }
 
 void MultiDevSSAGraphBuilder::CreateOpHandleIOs(SSAGraph *result,
@@ -140,11 +141,30 @@ bool MultiDevSSAGraphBuilder::IsDistTrainOp(
          checker(op.InputArgumentNames(), recv_vars);
 }
 
+size_t MultiDevSSAGraphBuilder::GetAppropriateDeviceID(
+    const std::vector<std::string> &var_names) const {
+  int64_t numel_sum = 0;
+  for (auto var_name : var_names) {
+    auto var_desc = all_vars_.at(var_name);
+    PADDLE_ENFORCE_NOT_NULL(var_desc);
+    auto dim = framework::make_ddim(var_desc->GetShape());
+    int64_t numel = framework::product(dim);
+    PADDLE_ENFORCE_GT(numel, 0);
+    numel_sum += numel;
+  }
+
+  auto smallest =
+      std::min_element(std::begin(balance_vars_), std::end(balance_vars_));
+  size_t dev_id =
+      static_cast<size_t>(std::distance(std::begin(balance_vars_), smallest));
+  balance_vars_[dev_id] += numel_sum;
+  return dev_id;
+}
+
 std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
     const ProgramDesc &program) const {
-  std::unordered_map<std::string, VarDesc *> all_vars;
   for (auto *var : program.Block(0).AllVars()) {
-    all_vars[var->Name()] = var;
+    all_vars_.emplace(var->Name(), var);
   }
 
   auto graph = new SSAGraph();
@@ -161,35 +181,16 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
   auto send_vars = FindDistTrainSendVars(program);
   auto recv_vars = FindDistTrainRecvVars(program);
 
-  std::vector<std::unordered_set<std::string>> var_name_on_devices;
   std::vector<std::unordered_set<std::string>> bcast_var_name_set;
-  var_name_on_devices.resize(places_.size());
   bcast_var_name_set.resize(places_.size());
 
   size_t cur_device_id = 0;
-  std::vector<int64_t> balance_grads(places_.size(), 0);
-
-  auto get_appropriate_dev = [&](std::string &g_name) -> size_t {
-    auto var_desc = all_vars.at(g_name);
-    PADDLE_ENFORCE_NOT_NULL(var_desc);
-    auto dim = framework::make_ddim(var_desc->GetShape());
-    int64_t numel = framework::product(dim);
-    PADDLE_ENFORCE_GE(numel, 0);
-    auto smallest =
-        std::min_element(std::begin(balance_grads), std::end(balance_grads));
-    size_t dev_id =
-        static_cast<size_t>(std::distance(std::begin(balance_grads), smallest));
-    balance_grads[dev_id] += numel;
-    return dev_id;
-  };
-
   bool is_forwarding = true;
+
   for (auto *op : program.Block(0).AllOps()) {
     if (boost::get<int>(
             op->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) ==
         static_cast<int>(OpRole::kRPC)) {
-      // append rpc op if program is distributed trainer main program.
-      // always use the first device
       CreateRPCOp(&result, *op);
     } else if (IsDistTrainOp(*op, send_vars, recv_vars)) {
       CreateDistTrainOp(&result, *op);
@@ -199,15 +200,19 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
           BuildStrategy::GradientScaleStrategy::kCustomized) {
         CreateScaleLossGradOp(&result);
       }
+      // This assumes the backward generating code will ensure IsScaleLossOp
+      // is true only for the op that scale the final scalar loss.
+      // It also assumes backward op will always follow the forward op in
+      // the block.
       is_forwarding = false;
     } else {
-      int op_dev_id = GetOpDeviceID(var_name_on_devices, *op);
+      int op_dev_id = GetOpDeviceID(*op);
       if (op_dev_id == -1) {  // var on all device
         CreateComputationalOps(&result, *op, places_.size());
       } else {
         CreateComputationalOp(&result, *op, op_dev_id);
         for (auto &var_name : op->OutputArgumentNames()) {
-          var_name_on_devices[op_dev_id].emplace(var_name);
+          var_name_on_devices_.emplace(var_name, op_dev_id);
         }
       }
       if (!is_forwarding && places_.size() > 1) {
@@ -230,19 +235,22 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
 
               switch (strategy_.reduce_) {
                 case BuildStrategy::ReduceStrategy::kReduce:
-                  cur_device_id = get_appropriate_dev(g_name);
+                  cur_device_id = GetAppropriateDeviceID({g_name});
                   CreateReduceOp(&result, g_name, cur_device_id);
-                  var_name_on_devices[cur_device_id].emplace(g_name);
+                  var_name_on_devices_.emplace(g_name, cur_device_id);
                   bcast_var_name_set[cur_device_id].emplace(p_name);
                   break;
                 case BuildStrategy::ReduceStrategy::kAllReduce:
-                  if (IsSparseGradient(all_vars, g_name)) {
+                  if (IsSparseGradient(g_name)) {
                     CreateReduceOp(&result, g_name, 0);
                     CreateBroadcastOp(&result, g_name, 0);
                   } else {
                     InsertAllReduceOp(&result, g_name);
                   }
                   break;
+                default:
+                  LOG(FATAL) << "Unknown reduce strategy ";
+                  break;
               }
             }
           } catch (boost::bad_get e) {
@@ -261,7 +269,7 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
   }
   /*
     Dependency graph has been constructed. However, there are still data
-    harzaeds need to be handled.
+    hazards need to be handled.
    */
   PolishGraphToSupportDataHazards(&result);
 
@@ -273,11 +281,9 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
   return std::unique_ptr<SSAGraph>(graph);
 }
 
-bool MultiDevSSAGraphBuilder::IsSparseGradient(
-    const std::unordered_map<std::string, VarDesc *> &all_vars,
-    const std::string &og) const {
-  PADDLE_ENFORCE(all_vars.count(og) != 0);
-  if (all_vars.at(og)->GetType() == proto::VarType::SELECTED_ROWS) {
+bool MultiDevSSAGraphBuilder::IsSparseGradient(const std::string &og) const {
+  PADDLE_ENFORCE(all_vars_.count(og) != 0);
+  if (all_vars_.at(og)->GetType() == proto::VarType::SELECTED_ROWS) {
     return true;
   }
   return false;
@@ -363,24 +369,23 @@ bool MultiDevSSAGraphBuilder::IsParameterGradientOnce(
   return is_pg_once;
 }
 
-int MultiDevSSAGraphBuilder::GetOpDeviceID(
-    const std::vector<std::unordered_set<std::string>> &var_name_on_devices,
-    const OpDesc &op) const {
+int MultiDevSSAGraphBuilder::GetOpDeviceID(const OpDesc &op) const {
   if (strategy_.reduce_ != BuildStrategy::ReduceStrategy::kReduce) {
     return -1;
   }
 
-  int var_dev_id = -1;
-  for (auto &var_name : op.InputArgumentNames()) {
-    if (var_dev_id != -1) break;
-    for (size_t i = 0; i < var_name_on_devices.size(); ++i) {
-      if (var_name_on_devices[i].count(var_name)) {
-        var_dev_id = static_cast<int>(i);
-        break;
-      }
+  for (auto &varname : op.InputArgumentNames()) {
+    int dev_id = GetVarDeviceID(varname);
+    if (dev_id != -1) {
+      return dev_id;
     }
   }
-  return var_dev_id;
+  return -1;
+}
+
+int MultiDevSSAGraphBuilder::GetVarDeviceID(const std::string &varname) const {
+  auto got = var_name_on_devices_.find(varname);
+  return got == var_name_on_devices_.end() ? -1 : got->second;
 }
 
 void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(SSAGraph *result) const {
@@ -449,6 +454,8 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(SSAGraph *result,
   return var;
 }
 
+// Find the first occurence of `prev_op_name` and make current `op` depend
+// on it.
 void MultiDevSSAGraphBuilder::ConnectOp(SSAGraph *result, OpHandleBase *op,
                                         const std::string &prev_op_name) const {
   for (auto &prev_op : result->ops_) {
@@ -463,16 +470,66 @@ void MultiDevSSAGraphBuilder::ConnectOp(SSAGraph *result, OpHandleBase *op,
 
 void MultiDevSSAGraphBuilder::CreateDistTrainOp(SSAGraph *result,
                                                 const OpDesc &op) const {
-  CreateComputationalOp(result, op, 0);
+  int op_dev_id = -1;
+  if (op.Type() == "split_byref") {
+    op_dev_id = GetVarDeviceID(op.InputArgumentNames()[0]);
+    if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) {
+      op_dev_id = GetAppropriateDeviceID(op.InputArgumentNames());
+      for (auto &varname : op.InputArgumentNames()) {
+        var_name_on_devices_.emplace(varname, op_dev_id);
+      }
+    }
+    for (auto &varname : op.OutputArgumentNames()) {
+      var_name_on_devices_.emplace(varname, op_dev_id);
+    }
+  } else if (op.Type() == "concat") {
+    op_dev_id = GetVarDeviceID(op.InputArgumentNames()[0]);
+  } else {
+    PADDLE_ENFORCE(
+        "the distribute training related op should be in [split_byref, "
+        "concat].");
+  }
+
+  PADDLE_ENFORCE(op_dev_id != -1,
+                 "can not find right place for distributed op: %s", op.Type());
+
+  CreateComputationalOp(result, op, op_dev_id);
   if (op.Type() == "concat") {
     ConnectOp(result, result->ops_.back().get(), "fetch_barrier");
   }
 }
 
+// Create RPC related op handles that connects its in ops and out ops.
 void MultiDevSSAGraphBuilder::CreateRPCOp(SSAGraph *result,
                                           const OpDesc &op) const {
-  result->ops_.emplace_back(
-      new RPCOpHandle(op, local_scopes_[0], op.Type(), places_[0]));
+  int op_dev_id = -1;
+  if (op.Type() == "send") {
+    op_dev_id = GetVarDeviceID(op.InputArgumentNames()[0]);
+    // the variable name which contains .block means it was splited by
+    // split_byref op
+    // so that we can balance the variable blocks to all the pserver instances.
+    if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce &&
+        op.InputArgumentNames()[0].find(".block") == std::string::npos) {
+      op_dev_id = GetAppropriateDeviceID(op.InputArgumentNames());
+      for (auto &varname : op.InputArgumentNames()) {
+        var_name_on_devices_.emplace(varname, op_dev_id);
+      }
+    }
+  } else if (op.Type() == "recv") {
+    op_dev_id = GetAppropriateDeviceID(op.OutputArgumentNames());
+    for (auto &varname : op.OutputArgumentNames()) {
+      var_name_on_devices_.emplace(varname, op_dev_id);
+    }
+  } else {
+    // send_barrier and fetch_barrier op can be scheduled on device 0
+    op_dev_id = 0;
+  }
+
+  PADDLE_ENFORCE(op_dev_id != -1, "can not find the right place for rpc op: %s",
+                 op.Type());
+
+  result->ops_.emplace_back(new RPCOpHandle(op, local_scopes_[op_dev_id],
+                                            op.Type(), places_[op_dev_id]));
 
   if (op.Type() == "send_barrier") {
     ConnectOp(result, result->ops_.back().get(), "send");
@@ -488,9 +545,7 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(SSAGraph *result,
         "send, send_barrier. recv, fetch_barrier]");
   }
 
-  // TODO(Yancey1989): schedule rpc op on different place may
-  // increate throughput
-  CreateOpHandleIOs(result, op, 0);
+  CreateOpHandleIOs(result, op, op_dev_id);
 }
 
 bool MultiDevSSAGraphBuilder::IsScaleLossOp(const OpDesc &op) const {
diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.h b/paddle/fluid/framework/details/multi_devices_graph_builder.h
index 78581755fe4890800636944d6cd89875a852cc19..eb1c07630ab665a90d76b810a421cffb0ce673c2 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.h
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.h
@@ -47,10 +47,11 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
 #endif
 
   std::unique_ptr<SSAGraph> Build(const ProgramDesc &program) const override;
+  int GetVarDeviceID(const std::string &varname) const;
 
  private:
   void CreateOpHandleIOs(SSAGraph *result, const OpDesc &op,
-                         size_t place_id) const;
+                         size_t device_id) const;
 
  private:
   std::string loss_var_name_;
@@ -96,21 +97,23 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
       const std::string &og,
       std::unordered_set<std::string> *og_has_been_broadcast) const;
 
-  int GetOpDeviceID(
-      const std::vector<std::unordered_set<std::string>> &var_name_on_devices,
-      const OpDesc &op) const;
+  int GetOpDeviceID(const OpDesc &op) const;
 
   void InsertAllReduceOp(SSAGraph *result, const std::string &og) const;
 
   void CreateBroadcastOp(SSAGraph *result, const std::string &p_name,
                          size_t src_dev_id) const;
 
-  bool IsSparseGradient(
-      const std::unordered_map<std::string, VarDesc *> &all_vars,
-      const std::string &og) const;
+  bool IsSparseGradient(const std::string &og) const;
+
+  size_t GetAppropriateDeviceID(
+      const std::vector<std::string> &var_names) const;
 
  private:
   BuildStrategy strategy_;
+  mutable std::unordered_map<std::string, VarDesc *> all_vars_;
+  mutable std::unordered_map<std::string, int> var_name_on_devices_;
+  mutable std::vector<int64_t> balance_vars_;
 
   void SetCommunicationContext(OpHandleBase *op_handle,
                                const platform::Place &p) const;
diff --git a/paddle/fluid/framework/details/ssa_graph_builder.h b/paddle/fluid/framework/details/ssa_graph_builder.h
index 5fc12a44b51fae26e5a8f5fdba952d3879e82d0f..9eb23c46264f9036f009b0ae9aeeb34ec70c0e53 100644
--- a/paddle/fluid/framework/details/ssa_graph_builder.h
+++ b/paddle/fluid/framework/details/ssa_graph_builder.h
@@ -30,6 +30,7 @@ class SSAGraphBuilder {
   SSAGraphBuilder() {}
   virtual ~SSAGraphBuilder() {}
   virtual std::unique_ptr<SSAGraph> Build(const ProgramDesc &program) const = 0;
+  virtual int GetVarDeviceID(const std::string &var_name) const { return -1; }
 
   DISABLE_COPY_AND_ASSIGN(SSAGraphBuilder);
 
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
index 6c5098ce85b784a3edcf8f48d2cc828aabd8e161..b1706eb12d080364d04108c7ef4da31e1e7c1deb 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -96,6 +96,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
     auto cur_ready_vars = ready_vars.PopAll(1, &timeout);
 
     if (timeout) {
+      std::lock_guard<std::mutex> l(exception_mu_);
       if (exception_) {
         auto exp = *exception_;
         exception_.reset();
@@ -199,6 +200,7 @@ void ThreadedSSAGraphExecutor::RunOp(
       ready_var_q->Extend(op->Outputs());
       VLOG(10) << op << " " << op->Name() << "Signal posted";
     } catch (platform::EnforceNotMet ex) {
+      std::lock_guard<std::mutex> l(exception_mu_);
       exception_.reset(new platform::EnforceNotMet(ex));
     } catch (...) {
       LOG(FATAL) << "Unknown exception catched";
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
index 4a2075f1cccb3211316567197da56c01d26f35ce..90430be996758364387b552019762d9c2e9dfe45 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
@@ -56,6 +56,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
   std::vector<Scope *> local_scopes_;
   std::vector<platform::Place> places_;
   platform::DeviceContextPool fetch_ctxs_;
+  std::mutex exception_mu_;
   std::unique_ptr<platform::EnforceNotMet> exception_;
   std::atomic<int> running_ops_;
 
diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc
index b30a9806eb19ee12d2a70afe3ca806224b0f75d6..ae98fccc9600a2a75f12fa516c982bec0ef13f9f 100644
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -21,7 +21,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/reader.h"
 #ifdef PADDLE_WITH_DISTRIBUTE
-#include "paddle/fluid/operators/detail/grpc_client.h"
+#include "paddle/fluid/operators/distributed/grpc_client.h"
 #endif
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/profiler.h"
@@ -49,8 +49,8 @@ Executor::Executor(const platform::Place& place) : place_(place) {}
 
 #ifdef PADDLE_WITH_DISTRIBUTE
 void Executor::Complete() {
-  ::paddle::operators::detail::RPCClient::GetInstance<
-      ::paddle::operators::detail::GRPCClient>()
+  ::paddle::operators::distributed::RPCClient::GetInstance<
+      ::paddle::operators::distributed::GRPCClient>()
       ->SendComplete();
 }
 #endif
@@ -321,7 +321,8 @@ std::vector<std::shared_ptr<ExecutorPrepareContext>> Executor::Prepare(
 }
 
 void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
-                                  bool create_local_scope, bool create_vars) {
+                                  bool create_local_scope, bool create_vars,
+                                  bool keep_kids) {
   Scope* local_scope = scope;
   if (create_vars) {
     if (create_local_scope) {
@@ -344,12 +345,20 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
     }
   }
   platform::DeviceContextPool::Instance().Get(place_)->Wait();
-  if (create_vars && create_local_scope) {
+  if (local_scope != scope) {
     scope->DeleteScope(local_scope);
   } else {
-    // Delete the local scopes created in operators.
-    scope->DropKids();
+    if (!keep_kids) {
+      // By default, we should delete all kid scopes after run executor because
+      // some operators may create local scope when running, such as while_op.
+      // But when while_op also create a local executor to run it's sub block,
+      // the sub scopes it created should not be dropped immediately, because
+      // while_grad_op will use some variables created during while_op run, so
+      // we need to keep the kids and wait for the outer executor to drop them.
+      scope->DropKids();
+    }
   }
+
   if (FLAGS_benchmark) {
     VLOG(2) << "-------------------------------------------------------";
     VLOG(2) << "Memory used after deleting local scope: "
diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h
index 67a0761dac2a9adcdd0ce2b218c4aa505d688d56..3aa5ffef69cd29681f248e915575c5715ad0d3fa 100644
--- a/paddle/fluid/framework/executor.h
+++ b/paddle/fluid/framework/executor.h
@@ -78,7 +78,7 @@ class Executor {
 
   void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
                           bool create_local_scope = true,
-                          bool create_vars = true);
+                          bool create_vars = true, bool keep_kids = false);
 
   void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
                           std::map<std::string, const LoDTensor*>* feed_targets,
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index 9406c6155da860c90739bddac1e81403b094e619..d478865fa8f24c653a4185cabd05747a5410ceaa 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -110,7 +110,6 @@ ParallelExecutor::ParallelExecutor(
 
   // Step 3. Convert main_program to SSA form and dependency graph. Also, insert
   // ncclOp
-
   details::SSAGraphBuilderFactory builder_factory(
       member_->places_, loss_var_name, params, member_->local_scopes_,
       build_strategy);
@@ -122,9 +121,10 @@ ParallelExecutor::ParallelExecutor(
 #endif
   }
 
+  builder_ = std::move(builder_factory.Create());
   member_->executor_.reset(new details::ThreadedSSAGraphExecutor(
       exec_strategy, member_->local_scopes_, places,
-      builder_factory.Create()->Build(main_program)));
+      builder_->Build(main_program)));
 
   member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor(
       exec_strategy, member_->local_scopes_, std::move(var_infos),
@@ -133,10 +133,22 @@ ParallelExecutor::ParallelExecutor(
 
 void ParallelExecutor::BCastParamsToGPUs(
     const std::unordered_set<std::string> &vars) const {
-  auto *main_scope = member_->local_scopes_[0];
+  // the the initialize bcast, all vars would be bcast from device(0), otherwise
+  // bcast from the specified device.
+  bool initialize = builder_.get() == nullptr ? true : false;
 
   for (auto &var : vars) {
-    auto *main_var = main_scope->FindVar(var);
+    int var_dev_id =
+        builder_.get() == nullptr ? -1 : builder_->GetVarDeviceID(var);
+    if (!initialize && var_dev_id == -1) continue;
+
+    framework::Variable *main_var = nullptr;
+    if (initialize) {
+      main_var = member_->local_scopes_[0]->FindVar(var);
+    } else {
+      main_var = member_->local_scopes_[var_dev_id]->FindVar(var);
+    }
+
     if (main_var == nullptr || !main_var->IsType<LoDTensor>()) {
       continue;
     }
@@ -151,7 +163,8 @@ void ParallelExecutor::BCastParamsToGPUs(
       for (size_t i = 0; i < member_->places_.size(); ++i) {
         auto place = member_->places_[i];
         void *buffer;
-        if (i == 0) {
+
+        if ((initialize && i == 0) || (!initialize && i == var_dev_id)) {
           buffer = const_cast<void *>(main_tensor.data<void>());
         } else {
           auto local_scope = member_->local_scopes_[i];
diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h
index 5247e790649e76567f4527d54499d6e95dac5c27..058f83f07c26224e3180d140630c08a24c40cd80 100644
--- a/paddle/fluid/framework/parallel_executor.h
+++ b/paddle/fluid/framework/parallel_executor.h
@@ -19,12 +19,14 @@ limitations under the License. */
 #include <unordered_set>
 #include <vector>
 #include "paddle/fluid/framework/details/execution_strategy.h"
+#include "paddle/fluid/framework/details/multi_devices_graph_builder.h"
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/device_context.h"
+
 namespace paddle {
 namespace framework {
 
@@ -68,6 +70,7 @@ class ParallelExecutor {
 
  private:
   ParallelExecutorPrivate *member_;
+  std::unique_ptr<details::SSAGraphBuilder> builder_;
 };
 
 }  // namespace framework
diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc
index b75df33b71311acd0e626e5a13c18469b19ef136..c7f40d43c922a328febd343cea7240fcb09f3d02 100644
--- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc
@@ -27,7 +27,7 @@ void TensorRTSubGraphPass::Run(DataFlowGraph *graph) {
   SubGraphFuse(graph, node_inside_subgraph_teller_);
 }
 
-}  // analysis
-}  // inference
+}  // namespace analysis
+}  // namespace inference
 
-}  // paddle
+}  // namespace paddle
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 8c08ae34306744f86d71e370fad47233c61ac8f8..4c338c67d34fa229de17019ce97e8b8dc39ea737 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -184,9 +184,9 @@ else()
     set(DEPS_OPS ${DEPS_OPS} nccl_op)
 endif()
 
-add_subdirectory(detail)
 if(WITH_DISTRIBUTE)
-
+    add_subdirectory(distributed)
+    
     set(DISTRIBUTE_DEPS "")
     if(WITH_GRPC)
         set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf)
@@ -195,20 +195,11 @@ if(WITH_DISTRIBUTE)
     endif()
 
     set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
-    op_library(prefetch_op DEPS ${DISTRIBUTE_DEPS})
-    set_source_files_properties(prefetch_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-    op_library(checkpoint_notify_op DEPS ${DISTRIBUTE_DEPS})
-    set_source_files_properties(checkpoint_notify_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-    op_library(recv_op DEPS ${DISTRIBUTE_DEPS})
-    set_source_files_properties(recv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-    op_library(listen_and_serv_op DEPS ${DISTRIBUTE_DEPS})
-    set_source_files_properties(listen_and_serv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-    op_library(send_op DEPS ${DISTRIBUTE_DEPS})
-    set_source_files_properties(send_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-    op_library(send_barrier_op DEPS ${DISTRIBUTE_DEPS})
-    op_library(fetch_barrier_op DEPS ${DISTRIBUTE_DEPS})
-    set_source_files_properties(send_barrier_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-    set_source_files_properties(fetch_barrier_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+    foreach(dist_op "prefetch_op" "checkpoint_notify_op" "listen_and_serv_op" "send_op" "recv_op" "send_barrier_op" "fetch_barrier_op")
+        op_library(${dist_op} DEPS ${DISTRIBUTE_DEPS})
+        set_source_files_properties(${dist_op}.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+    endforeach()
+    
     #set_source_files_properties(send_recv_op_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
     #cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS prefetch_op send_op
     #        listen_and_serv_op sum_op executor SERIAL)
diff --git a/paddle/fluid/operators/batch_norm_mkldnn_op.cc b/paddle/fluid/operators/batch_norm_mkldnn_op.cc
index 8206cc9890160da756efb13c991020f09b20126a..cc158e57f7140c84f02bc7e091d8eac0d2b672e1 100644
--- a/paddle/fluid/operators/batch_norm_mkldnn_op.cc
+++ b/paddle/fluid/operators/batch_norm_mkldnn_op.cc
@@ -21,8 +21,6 @@ namespace operators {
 
 using batch_norm_bwd = mkldnn::batch_normalization_backward;
 using batch_norm_fwd = mkldnn::batch_normalization_forward;
-using framework::DataLayout;
-using framework::Tensor;
 using mkldnn::memory;
 using mkldnn::primitive;
 using mkldnn::reorder;
@@ -31,18 +29,6 @@ using paddle::platform::MKLDNNDeviceContext;
 using paddle::platform::MKLDNNMemDesc;
 using platform::to_void_cast;
 
-template <typename T>
-using EigenArrayMap =
-    Eigen::Map<Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
-template <typename T>
-using ConstEigenArrayMap =
-    Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
-template <typename T>
-using EigenVectorArrayMap = Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1>>;
-template <typename T>
-using ConstEigenVectorArrayMap =
-    Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, 1>>;
-
 namespace {
 template <typename T>
 struct bn_type_traits {
diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc
index 625ca2d7c4c70d1098b0fb28380d8d1eb24cb338..52b0bf85c07fee380f9e7ba1c703b56367628644 100644
--- a/paddle/fluid/operators/batch_norm_op.cc
+++ b/paddle/fluid/operators/batch_norm_op.cc
@@ -22,22 +22,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-using DataLayout = framework::DataLayout;
-
-template <typename T>
-using EigenArrayMap =
-    Eigen::Map<Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
-template <typename T>
-using ConstEigenArrayMap =
-    Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
-template <typename T>
-using EigenVectorArrayMap = Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1>>;
-template <typename T>
-using ConstEigenVectorArrayMap =
-    Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, 1>>;
-
 class BatchNormOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
diff --git a/paddle/fluid/operators/batch_norm_op.h b/paddle/fluid/operators/batch_norm_op.h
index 9e5fc41598f29336074335f3624a2300ad018d09..5e3d630d6889e445c5e84fa836d2d81bb7266779 100644
--- a/paddle/fluid/operators/batch_norm_op.h
+++ b/paddle/fluid/operators/batch_norm_op.h
@@ -19,6 +19,22 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+using DataLayout = framework::DataLayout;
+
+template <typename T>
+using EigenArrayMap =
+    Eigen::Map<Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
+template <typename T>
+using ConstEigenArrayMap =
+    Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
+template <typename T>
+using EigenVectorArrayMap = Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1>>;
+template <typename T>
+using ConstEigenVectorArrayMap =
+    Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, 1>>;
+
 template <typename DeviceContext, typename T>
 class BatchNormKernel : public framework::OpKernel<T> {
  public:
diff --git a/paddle/fluid/operators/bilinear_interp_op.cc b/paddle/fluid/operators/bilinear_interp_op.cc
index 2572e813d656353a2187c29da89266733a32f3ce..2dc3399da183fbcf7664066f6f7ce12db3dc6d5e 100644
--- a/paddle/fluid/operators/bilinear_interp_op.cc
+++ b/paddle/fluid/operators/bilinear_interp_op.cc
@@ -110,6 +110,7 @@ REGISTER_OPERATOR(bilinear_interp, ops::BilinearInterpOp,
                   ops::BilinearInterpOpMaker,
                   paddle::framework::DefaultGradOpDescMaker<true>);
 REGISTER_OPERATOR(bilinear_interp_grad, ops::BilinearInterpOpGrad);
-REGISTER_OP_CPU_KERNEL(bilinear_interp, ops::BilinearInterpKernel<float>);
+REGISTER_OP_CPU_KERNEL(bilinear_interp, ops::BilinearInterpKernel<float>,
+                       ops::BilinearInterpKernel<uint8_t>);
 REGISTER_OP_CPU_KERNEL(bilinear_interp_grad,
                        ops::BilinearInterpGradKernel<float>);
diff --git a/paddle/fluid/operators/bilinear_interp_op.h b/paddle/fluid/operators/bilinear_interp_op.h
index 8b03cd5a0635584a45782fe5a4823c37fe4fa8e8..70847cb8c1abe2e94bc844ab8117d1f23fea533b 100644
--- a/paddle/fluid/operators/bilinear_interp_op.h
+++ b/paddle/fluid/operators/bilinear_interp_op.h
@@ -46,8 +46,10 @@ class BilinearInterpKernel : public framework::OpKernel<T> {
     int in_chw = channels * in_hw;
     int out_chw = channels * out_hw;
 
-    T ratio_h = (out_h > 1) ? static_cast<T>(in_h - 1) / (out_h - 1) : 0.f;
-    T ratio_w = (out_w > 1) ? static_cast<T>(in_w - 1) / (out_w - 1) : 0.f;
+    float ratio_h =
+        (out_h > 1) ? static_cast<float>(in_h - 1) / (out_h - 1) : 0.f;
+    float ratio_w =
+        (out_w > 1) ? static_cast<float>(in_w - 1) / (out_w - 1) : 0.f;
 
     if (in_h == out_h && in_w == out_w) {
       memcpy(output, input, input_t->numel() * sizeof(T));
@@ -56,24 +58,24 @@ class BilinearInterpKernel : public framework::OpKernel<T> {
         for (int i = 0; i < out_h; ++i) {     // loop for images
           int h = ratio_h * i;
           int hid = (h < in_h - 1) ? 1 : 0;
-          T h1lambda = ratio_h * i - h;
-          T h2lambda = 1 - h1lambda;
+          float h1lambda = ratio_h * i - h;
+          float h2lambda = 1.f - h1lambda;
 
           for (int j = 0; j < out_w; ++j) {
             int w = ratio_w * j;
             int wid = (w < in_w - 1) ? 1 : 0;
-            T w1lambda = ratio_w * j - w;
-            T w2lambda = 1 - w1lambda;
+            float w1lambda = ratio_w * j - w;
+            float w2lambda = 1.f - w1lambda;
             // calculate four position for bilinear interpolation
             const T* in_pos = &input[k * in_chw + h * in_w + w];
             T* out_pos = &output[k * out_chw + i * out_w + j];
 
             for (int c = 0; c < channels; ++c) {  // loop for channels
               // bilinear interpolation
-              out_pos[0] =
+              out_pos[0] = static_cast<T>(
                   h2lambda * (w2lambda * in_pos[0] + w1lambda * in_pos[wid]) +
                   h1lambda * (w2lambda * in_pos[hid * in_w] +
-                              w1lambda * in_pos[hid * in_w + wid]);
+                              w1lambda * in_pos[hid * in_w + wid]));
               in_pos += in_hw;
               out_pos += out_hw;
             }
@@ -117,8 +119,10 @@ class BilinearInterpGradKernel : public framework::OpKernel<T> {
     int in_chw = channels * in_hw;
     int out_chw = channels * out_hw;
 
-    T ratio_h = (out_h > 1) ? static_cast<T>(in_h - 1) / (out_h - 1) : 0.f;
-    T ratio_w = (out_w > 1) ? static_cast<T>(in_w - 1) / (out_w - 1) : 0.f;
+    float ratio_h =
+        (out_h > 1) ? static_cast<float>(in_h - 1) / (out_h - 1) : 0.f;
+    float ratio_w =
+        (out_w > 1) ? static_cast<float>(in_w - 1) / (out_w - 1) : 0.f;
 
     if (in_h == out_h && in_w == out_w) {
       memcpy(d_input, d_output, d_input_t->numel() * sizeof(T));
@@ -127,22 +131,24 @@ class BilinearInterpGradKernel : public framework::OpKernel<T> {
         for (int i = 0; i < out_h; ++i) {     // loop for images
           int h = ratio_h * i;
           int hid = (h < in_h - 1) ? 1 : 0;
-          T h1lambda = ratio_h * i - h;
-          T h2lambda = 1 - h1lambda;
+          float h1lambda = ratio_h * i - h;
+          float h2lambda = 1 - h1lambda;
 
           for (int j = 0; j < out_w; ++j) {
             int w = ratio_w * j;
             int wid = (w < in_w - 1) ? 1 : 0;
-            T w1lambda = ratio_w * j - w;
-            T w2lambda = 1 - w1lambda;
+            float w1lambda = ratio_w * j - w;
+            float w2lambda = 1 - w1lambda;
             T* in_pos = &d_input[k * in_chw + h * in_w + w];
             const T* out_pos = &d_output[k * out_chw + i * out_w + j];
 
             for (int c = 0; c < channels; ++c) {  // loop for channels
-              in_pos[0] += h2lambda * w2lambda * out_pos[0];
-              in_pos[wid] += h2lambda * w1lambda * out_pos[0];
-              in_pos[hid * in_w] += h1lambda * w2lambda * out_pos[0];
-              in_pos[hid * in_w + wid] += h1lambda * w1lambda * out_pos[0];
+              in_pos[0] += static_cast<T>(h2lambda * w2lambda * out_pos[0]);
+              in_pos[wid] += static_cast<T>(h2lambda * w1lambda * out_pos[0]);
+              in_pos[hid * in_w] +=
+                  static_cast<T>(h1lambda * w2lambda * out_pos[0]);
+              in_pos[hid * in_w + wid] +=
+                  static_cast<T>(h1lambda * w1lambda * out_pos[0]);
               in_pos += in_hw;
               out_pos += out_hw;
             }
diff --git a/paddle/fluid/operators/detail/macros.h b/paddle/fluid/operators/detail/macros.h
index da1de72dad00db3ffe609e17bd198ef0a56bbfcd..b9e385994efcea0388756e8bd780ebfc719ed08d 100644
--- a/paddle/fluid/operators/detail/macros.h
+++ b/paddle/fluid/operators/detail/macros.h
@@ -15,13 +15,13 @@
 #pragma once
 
 #ifdef PADDLE_WITH_GRPC
-#include "paddle/fluid/operators/detail/grpc_client.h"
-#include "paddle/fluid/operators/detail/grpc_server.h"
-#define RPCSERVER_T detail::AsyncGRPCServer
-#define RPCCLIENT_T detail::GRPCClient
+#include "paddle/fluid/operators/distributed/grpc_client.h"
+#include "paddle/fluid/operators/distributed/grpc_server.h"
+#define RPCSERVER_T distributed::AsyncGRPCServer
+#define RPCCLIENT_T distributed::GRPCClient
 #else
-#include "paddle/fluid/operators/detail/brpc_client.h"
-#include "paddle/fluid/operators/detail/brpc_server.h"
-#define RPCSERVER_T detail::AsyncBRPCServer
-#define RPCCLIENT_T detail::BRPCClient
+#include "paddle/fluid/operators/distributed/brpc_client.h"
+#include "paddle/fluid/operators/distributed/brpc_server.h"
+#define RPCSERVER_T distributed::AsyncBRPCServer
+#define RPCCLIENT_T distributed::BRPCClient
 #endif
diff --git a/paddle/fluid/operators/detail/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt
similarity index 97%
rename from paddle/fluid/operators/detail/CMakeLists.txt
rename to paddle/fluid/operators/distributed/CMakeLists.txt
index abc5aad0430e71928a441c9488dda16dfdd63b9c..312f80e09077f21a47985c1c936c2ac41c292ead 100644
--- a/paddle/fluid/operators/detail/CMakeLists.txt
+++ b/paddle/fluid/operators/distributed/CMakeLists.txt
@@ -1,8 +1,3 @@
-if(NOT WITH_DISTRIBUTE)
-    return()
-endif()
-
-
 if(WITH_GRPC)
   grpc_library(sendrecvop_grpc SRCS bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc
       request_handler_impl.cc rpc_client.cc rpc_server.cc grpc_server.cc variable_response.cc PROTO send_recv.proto DEPS lod_tensor
diff --git a/paddle/fluid/operators/detail/brpc_client.cc b/paddle/fluid/operators/distributed/brpc_client.cc
similarity index 98%
rename from paddle/fluid/operators/detail/brpc_client.cc
rename to paddle/fluid/operators/distributed/brpc_client.cc
index 9a4e410f1d83e93883438fae116c38eb60787673..b394c678fb6503eb73a1e11e6feb814251e9e940 100644
--- a/paddle/fluid/operators/detail/brpc_client.cc
+++ b/paddle/fluid/operators/distributed/brpc_client.cc
@@ -12,12 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/operators/detail/brpc_client.h"
+#include "paddle/fluid/operators/distributed/brpc_client.h"
 #include "paddle/fluid/framework/threadpool.h"
 
 namespace paddle {
 namespace operators {
-namespace detail {
+namespace distributed {
 
 DEFINE_int32(brpc_channel_num, 24,
              "Number of channels to send requests connected to one server");
@@ -175,6 +175,6 @@ ChannelQueuePtr BRPCClient::GetChannel(const std::string& ep) {
   return q;
 }
 
-}  // namespace detail
+}  // namespace distributed
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/detail/brpc_client.h b/paddle/fluid/operators/distributed/brpc_client.h
similarity index 94%
rename from paddle/fluid/operators/detail/brpc_client.h
rename to paddle/fluid/operators/distributed/brpc_client.h
index 1e953ea431d51a9586bfd0b352c7f27d079ff1a8..34f140687f91d866536f5e2b647c7445a6624736 100644
--- a/paddle/fluid/operators/detail/brpc_client.h
+++ b/paddle/fluid/operators/distributed/brpc_client.h
@@ -31,13 +31,13 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/operators/detail/rpc_client.h"
-#include "paddle/fluid/operators/detail/send_recv.pb.h"
+#include "paddle/fluid/operators/distributed/rpc_client.h"
+#include "paddle/fluid/operators/distributed/send_recv.pb.h"
 #include "paddle/fluid/platform/macros.h"  // for DISABLE_COPY_AND_ASSIGN
 
 namespace paddle {
 namespace operators {
-namespace detail {
+namespace distributed {
 
 struct ChannelContext {
   brpc::Channel channel;
@@ -95,6 +95,6 @@ class BRPCClient : public RPCClient {
   DISABLE_COPY_AND_ASSIGN(BRPCClient);
 };
 
-}  // namespace detail
+}  // namespace distributed
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/detail/brpc_server.cc b/paddle/fluid/operators/distributed/brpc_server.cc
similarity index 86%
rename from paddle/fluid/operators/detail/brpc_server.cc
rename to paddle/fluid/operators/distributed/brpc_server.cc
index 2170abe679f9ededff3b53e3139e56f8aad227cb..862167f02084cfe81db1c0936bbfb0415fa85721 100644
--- a/paddle/fluid/operators/detail/brpc_server.cc
+++ b/paddle/fluid/operators/distributed/brpc_server.cc
@@ -12,13 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/operators/detail/brpc_server.h"
-#include "paddle/fluid/operators/detail/request_handler.h"
+#include "paddle/fluid/operators/distributed/brpc_server.h"
+#include "paddle/fluid/operators/distributed/request_handler.h"
 
 namespace sendrecv {
 
 typedef std::unordered_map<std::string,
-                           paddle::operators::detail::RequestHandler*>
+                           paddle::operators::distributed::RequestHandler*>
     HandlerMap;
 
 class BRPCServiceImpl : public SendRecvService {
@@ -27,17 +27,17 @@ class BRPCServiceImpl : public SendRecvService {
       : request_send_h_(nullptr),
         request_get_h_(nullptr),
         request_prefetch_h_(nullptr) {
-    auto it = rpc_call_map.find(paddle::operators::detail::kRequestSend);
+    auto it = rpc_call_map.find(paddle::operators::distributed::kRequestSend);
     if (it != rpc_call_map.end()) {
       request_send_h_ = it->second;
     }
 
-    it = rpc_call_map.find(paddle::operators::detail::kRequestSend);
+    it = rpc_call_map.find(paddle::operators::distributed::kRequestSend);
     if (it != rpc_call_map.end()) {
       request_get_h_ = it->second;
     }
 
-    it = rpc_call_map.find(paddle::operators::detail::kRequestPrefetch);
+    it = rpc_call_map.find(paddle::operators::distributed::kRequestPrefetch);
     if (it != rpc_call_map.end()) {
       request_prefetch_h_ = it->second;
     }
@@ -88,15 +88,15 @@ class BRPCServiceImpl : public SendRecvService {
   }
 
  private:
-  paddle::operators::detail::RequestHandler* request_send_h_;
-  paddle::operators::detail::RequestHandler* request_get_h_;
-  paddle::operators::detail::RequestHandler* request_prefetch_h_;
+  paddle::operators::distributed::RequestHandler* request_send_h_;
+  paddle::operators::distributed::RequestHandler* request_get_h_;
+  paddle::operators::distributed::RequestHandler* request_prefetch_h_;
 };
 }  // namespace sendrecv
 
 namespace paddle {
 namespace operators {
-namespace detail {
+namespace distributed {
 
 void AsyncBRPCServer::StartServer() {
   // Instance of your service.
@@ -139,6 +139,6 @@ void AsyncBRPCServer::WaitServerReady() {
   VLOG(3) << "AsyncGRPCServer WaitSeverReady";
 }
 
-};  // namespace detail
+};  // namespace distributed
 };  // namespace operators
 };  // namespace paddle
diff --git a/paddle/fluid/operators/detail/brpc_server.h b/paddle/fluid/operators/distributed/brpc_server.h
similarity index 88%
rename from paddle/fluid/operators/detail/brpc_server.h
rename to paddle/fluid/operators/distributed/brpc_server.h
index 0105c8074a46849031d8fa9c21a5507a982ec3c3..85a7ad0dfe843dad483d43631b69a79d75211ce9 100644
--- a/paddle/fluid/operators/detail/brpc_server.h
+++ b/paddle/fluid/operators/distributed/brpc_server.h
@@ -19,12 +19,12 @@ limitations under the License. */
 #include <string>
 
 #include "brpc/server.h"
-#include "paddle/fluid/operators/detail/rpc_server.h"
-#include "paddle/fluid/operators/detail/send_recv.pb.h"
+#include "paddle/fluid/operators/distributed/rpc_server.h"
+#include "paddle/fluid/operators/distributed/send_recv.pb.h"
 
 namespace paddle {
 namespace operators {
-namespace detail {
+namespace distributed {
 
 class AsyncBRPCServer final : public RPCServer {
  public:
@@ -48,6 +48,6 @@ class AsyncBRPCServer final : public RPCServer {
   int ready_;
 };
 
-};  // namespace detail
+};  // namespace distributed
 };  // namespace operators
 };  // namespace paddle
diff --git a/paddle/fluid/operators/detail/bytebuffer_stream.cc b/paddle/fluid/operators/distributed/bytebuffer_stream.cc
similarity index 94%
rename from paddle/fluid/operators/detail/bytebuffer_stream.cc
rename to paddle/fluid/operators/distributed/bytebuffer_stream.cc
index a14171563edb0ac9a22b7ae493c965de3efb7823..6e91b447db838c9095432eda22e9e1171e938d31 100644
--- a/paddle/fluid/operators/detail/bytebuffer_stream.cc
+++ b/paddle/fluid/operators/distributed/bytebuffer_stream.cc
@@ -17,11 +17,11 @@ limitations under the License. */
 //       file and did some modifications so that we can send gRPC
 //       requests without too much copying of the tensor data.
 
-#include "paddle/fluid/operators/detail/bytebuffer_stream.h"
+#include "paddle/fluid/operators/distributed/bytebuffer_stream.h"
 
 namespace paddle {
 namespace operators {
-namespace detail {
+namespace distributed {
 
 GrpcByteBufferSource::GrpcByteBufferSource() {}
 
@@ -83,6 +83,6 @@ google::protobuf::int64 GrpcByteBufferSource::ByteCount() const {
   return byte_count_;
 }
 
-}  // namespace detail
+}  // namespace distributed
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/detail/bytebuffer_stream.h b/paddle/fluid/operators/distributed/bytebuffer_stream.h
similarity index 99%
rename from paddle/fluid/operators/detail/bytebuffer_stream.h
rename to paddle/fluid/operators/distributed/bytebuffer_stream.h
index 054dd4ff294414cca55d7e033f2c5403bbb85526..e7de172c79c30761483b5d96f5bad19860208832 100644
--- a/paddle/fluid/operators/detail/bytebuffer_stream.h
+++ b/paddle/fluid/operators/distributed/bytebuffer_stream.h
@@ -106,7 +106,7 @@ class GrpcBufferReader final
 
 namespace paddle {
 namespace operators {
-namespace detail {
+namespace distributed {
 // Source provides a way for a particular RPC implementation to provide
 // received data to ParseFrom.
 class Source {
@@ -183,6 +183,6 @@ class GrpcByteSource : public Source {
   char space_[sizeof(Reader)];
 };
 
-}  // namespace detail
+}  // namespace distributed
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/detail/grpc_client.cc b/paddle/fluid/operators/distributed/grpc_client.cc
similarity index 98%
rename from paddle/fluid/operators/detail/grpc_client.cc
rename to paddle/fluid/operators/distributed/grpc_client.cc
index c2e58c65ab18c66dfde229242022d294c0c12742..b6ff54a3e46f2d2eab4166094b29132581ed3257 100644
--- a/paddle/fluid/operators/detail/grpc_client.cc
+++ b/paddle/fluid/operators/distributed/grpc_client.cc
@@ -12,19 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/detail/grpc_client.h"
+#include "paddle/fluid/operators/distributed/grpc_client.h"
 
 #include <sys/time.h>
 
 #include <limits>
 
 #include "paddle/fluid/framework/threadpool.h"
-#include "paddle/fluid/operators/detail/request_handler.h"
+#include "paddle/fluid/operators/distributed/request_handler.h"
 #include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace operators {
-namespace detail {
+namespace distributed {
 
 void GRPCClient::InitImpl() { InitEventLoop(); }
 
@@ -293,6 +293,6 @@ std::shared_ptr<grpc::Channel> GRPCClient::GetChannel(const std::string& ep) {
   return ch;
 }
 
-}  // namespace detail
+}  // namespace distributed
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/detail/grpc_client.h b/paddle/fluid/operators/distributed/grpc_client.h
similarity index 97%
rename from paddle/fluid/operators/detail/grpc_client.h
rename to paddle/fluid/operators/distributed/grpc_client.h
index 0c54ec0efefcc889440a257d59f6ab123ab8b06a..ae7b5e068bfc253eadd48cf2df3f070060e18b35 100644
--- a/paddle/fluid/operators/detail/grpc_client.h
+++ b/paddle/fluid/operators/distributed/grpc_client.h
@@ -38,13 +38,13 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/operators/detail/rpc_client.h"
-#include "paddle/fluid/operators/detail/sendrecvop_utils.h"
+#include "paddle/fluid/operators/distributed/rpc_client.h"
+#include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
 #include "paddle/fluid/platform/macros.h"  // for DISABLE_COPY_AND_ASSIGN
 
 namespace paddle {
 namespace operators {
-namespace detail {
+namespace distributed {
 
 struct VarHandle {
   std::string ep;
@@ -244,6 +244,6 @@ class GRPCClient : public RPCClient {
   DISABLE_COPY_AND_ASSIGN(GRPCClient);
 };
 
-}  // namespace detail
+}  // namespace distributed
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/detail/grpc_serde_test.cc b/paddle/fluid/operators/distributed/grpc_serde_test.cc
similarity index 93%
rename from paddle/fluid/operators/detail/grpc_serde_test.cc
rename to paddle/fluid/operators/distributed/grpc_serde_test.cc
index 15892295e6901fe649788c9e34604008fc8cbdfa..3d107b533bcb7bfef3f9b13ec99afbd579a62e52 100644
--- a/paddle/fluid/operators/detail/grpc_serde_test.cc
+++ b/paddle/fluid/operators/distributed/grpc_serde_test.cc
@@ -21,8 +21,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/framework/variable.h"
-#include "paddle/fluid/operators/detail/sendrecvop_utils.h"
-#include "paddle/fluid/operators/detail/variable_response.h"
+#include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
+#include "paddle/fluid/operators/distributed/variable_response.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/string/printf.h"
@@ -50,7 +50,7 @@ void RunSerdeTestSelectedRows(platform::Place place) {
   for (int i = 0; i < 564; ++i) rows->push_back(i);
 
   ::grpc::ByteBuffer msg;
-  operators::detail::SerializeToByteBuffer("myvar", &var, ctx, &msg);
+  operators::distributed::SerializeToByteBuffer("myvar", &var, ctx, &msg);
   EXPECT_GT(msg.Length(), static_cast<size_t>(0));
 
   // deserialize
@@ -81,10 +81,10 @@ void RunSerdeTestSelectedRows(platform::Place place) {
 
   // deserialize zero-copy
   // framework::Variable var2;
-  // operators::detail::DeserializeFromByteBuffer(msg, ctx, &var2);
+  // operators::distributed::DeserializeFromByteBuffer(msg, ctx, &var2);
   framework::Scope scope;
   scope.Var("myvar");
-  operators::detail::VariableResponse resp(&scope, &ctx);
+  operators::distributed::VariableResponse resp(&scope, &ctx);
   EXPECT_EQ(resp.Parse(msg), 0);
 
   framework::Variable* var2 = resp.GetVar();
@@ -128,7 +128,7 @@ void RunTestLodTensor(platform::Place place, int from_type = 0) {
   math::set_constant(ctx, tensor, 31.9);
 
   ::grpc::ByteBuffer msg;
-  operators::detail::SerializeToByteBuffer("myvar", &var, ctx, &msg);
+  operators::distributed::SerializeToByteBuffer("myvar", &var, ctx, &msg);
   EXPECT_GT(msg.Length(), static_cast<size_t>(0));
 
   // deserialize
@@ -171,7 +171,7 @@ void RunTestLodTensor(platform::Place place, int from_type = 0) {
   // deserialize zero-copy
   framework::Scope scope;
   scope.Var("myvar");
-  operators::detail::VariableResponse resp(&scope, &ctx);
+  operators::distributed::VariableResponse resp(&scope, &ctx);
   if (from_type == 0) {
     EXPECT_EQ(resp.Parse(msg), 0);
   } else {
diff --git a/paddle/fluid/operators/detail/grpc_server.cc b/paddle/fluid/operators/distributed/grpc_server.cc
similarity index 97%
rename from paddle/fluid/operators/detail/grpc_server.cc
rename to paddle/fluid/operators/distributed/grpc_server.cc
index 9f4971dc12cfb56afc18d8f09cc8266842894f2c..b0f27042504248e47b5af38f86c02a5f0765fd63 100644
--- a/paddle/fluid/operators/detail/grpc_server.cc
+++ b/paddle/fluid/operators/distributed/grpc_server.cc
@@ -15,13 +15,13 @@ limitations under the License. */
 #include <limits>
 #include <string>
 
-#include "paddle/fluid/operators/detail/grpc_server.h"
+#include "paddle/fluid/operators/distributed/grpc_server.h"
 
 using ::grpc::ServerAsyncResponseWriter;
 
 namespace paddle {
 namespace operators {
-namespace detail {
+namespace distributed {
 enum CallStatus { PROCESS = 0, FINISH };
 
 // reference:
@@ -74,7 +74,7 @@ class RequestSend final : public RequestBase {
     request_.reset(new VariableResponse(request_handler->scope(),
                                         request_handler->dev_ctx(),
                                         !request_handler->sync_mode()));
-    int method_id = static_cast<int>(detail::GrpcMethod::kSendVariable);
+    int method_id = static_cast<int>(distributed::GrpcMethod::kSendVariable);
     service_->RequestAsyncUnary(
         method_id, &ctx_, request_.get(), &responder_, cq_, cq_,
         reinterpret_cast<void*>(static_cast<intptr_t>(req_id)));
@@ -106,7 +106,7 @@ class RequestGet final : public RequestBase {
                       ::grpc::ServerCompletionQueue* cq,
                       RequestHandler* request_handler, int req_id)
       : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) {
-    auto method_id = static_cast<int>(detail::GrpcMethod::kGetVariable);
+    auto method_id = static_cast<int>(distributed::GrpcMethod::kGetVariable);
     service_->RequestAsyncUnary(
         method_id, &ctx_, &request_, &responder_, cq_, cq_,
         reinterpret_cast<void*>(static_cast<intptr_t>(req_id)));
@@ -150,7 +150,8 @@ class RequestPrefetch final : public RequestBase {
         local_scope_(nullptr) {
     request_.reset(new VariableResponse(request_handler->scope(),
                                         request_handler->dev_ctx(), true));
-    int method_id = static_cast<int>(detail::GrpcMethod::kPrefetchVariable);
+    int method_id =
+        static_cast<int>(distributed::GrpcMethod::kPrefetchVariable);
     service_->RequestAsyncUnary(
         method_id, &ctx_, request_.get(), &responder_, cq_, cq_,
         reinterpret_cast<void*>(static_cast<intptr_t>(req_id)));
@@ -399,6 +400,6 @@ void AsyncGRPCServer::HandleRequest(
   }
 }
 
-}  // namespace detail
+}  // namespace distributed
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/detail/grpc_server.h b/paddle/fluid/operators/distributed/grpc_server.h
similarity index 85%
rename from paddle/fluid/operators/detail/grpc_server.h
rename to paddle/fluid/operators/distributed/grpc_server.h
index f1db7590f6f14d5d44acc12453861a446e278cd2..d2524f5e65db6dedab78f45e17380359b58a3d11 100644
--- a/paddle/fluid/operators/detail/grpc_server.h
+++ b/paddle/fluid/operators/distributed/grpc_server.h
@@ -29,17 +29,17 @@ limitations under the License. */
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/framework/var_type.h"
-#include "paddle/fluid/operators/detail/grpc_service.h"
-#include "paddle/fluid/operators/detail/request_handler.h"
-#include "paddle/fluid/operators/detail/rpc_server.h"
-#include "paddle/fluid/operators/detail/send_recv.grpc.pb.h"
-#include "paddle/fluid/operators/detail/send_recv.pb.h"
-#include "paddle/fluid/operators/detail/sendrecvop_utils.h"
+#include "paddle/fluid/operators/distributed/grpc_service.h"
+#include "paddle/fluid/operators/distributed/request_handler.h"
+#include "paddle/fluid/operators/distributed/rpc_server.h"
+#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h"
+#include "paddle/fluid/operators/distributed/send_recv.pb.h"
+#include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
 #include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace operators {
-namespace detail {
+namespace distributed {
 
 class RequestBase;
 
@@ -84,6 +84,6 @@ class AsyncGRPCServer final : public RPCServer {
   std::map<std::string, std::vector<RequestBase*>> rpc_reqs_;
 };
 
-};  // namespace detail
+};  // namespace distributed
 };  // namespace operators
 };  // namespace paddle
diff --git a/paddle/fluid/operators/detail/grpc_service.h b/paddle/fluid/operators/distributed/grpc_service.h
similarity index 87%
rename from paddle/fluid/operators/detail/grpc_service.h
rename to paddle/fluid/operators/distributed/grpc_service.h
index cb745e125a9a5a6755dd98d46ee7aa6ed16da2ea..cdc4e7b79276d6aac55aeac8ac121ca28d2cc1f0 100644
--- a/paddle/fluid/operators/detail/grpc_service.h
+++ b/paddle/fluid/operators/distributed/grpc_service.h
@@ -23,7 +23,7 @@
 #include <grpc++/impl/codegen/stub_options.h>
 #include <grpc++/impl/codegen/sync_stream.h>
 #include <grpc++/support/byte_buffer.h>
-#include "paddle/fluid/operators/detail/variable_response.h"
+#include "paddle/fluid/operators/distributed/variable_response.h"
 
 #include "paddle/fluid/platform/profiler.h"
 
@@ -42,24 +42,25 @@ class ServerContext;
 // Support parsing/unparsing of tensorflow::VariableResponse.
 // Wire-format is identical to RecvVariableResponse.
 template <>
-class SerializationTraits<paddle::operators::detail::VariableResponse> {
+class SerializationTraits<paddle::operators::distributed::VariableResponse> {
  public:
   static Status Serialize(
-      const paddle::operators::detail::VariableResponse& msg,
+      const paddle::operators::distributed::VariableResponse& msg,
       grpc_byte_buffer** bp, bool* own_buffer) {
     PADDLE_ENFORCE(false, "SerializationTraits::Serialize not implemented!");
     return Status();
   }
-  static Status Deserialize(grpc_byte_buffer* buffer,
-                            paddle::operators::detail::VariableResponse* msg,
-                            int max_message_size = INT_MAX) {
+  static Status Deserialize(
+      grpc_byte_buffer* buffer,
+      paddle::operators::distributed::VariableResponse* msg,
+      int max_message_size = INT_MAX) {
     if (buffer == nullptr) {
       return Status(StatusCode::INTERNAL, "No payload");
     }
 
     Status result = g_core_codegen_interface->ok();
     if (result.ok()) {
-      paddle::operators::detail::GrpcByteSource source(buffer);
+      paddle::operators::distributed::GrpcByteSource source(buffer);
       int ret = msg->Parse(&source);
       if (ret != 0) {
         result = Status(StatusCode::INTERNAL, "VariableResponse parse error");
@@ -73,7 +74,7 @@ class SerializationTraits<paddle::operators::detail::VariableResponse> {
 
 namespace paddle {
 namespace operators {
-namespace detail {
+namespace distributed {
 
 enum class GrpcMethod {
   kSendVariable,
@@ -121,6 +122,6 @@ class GrpcService final {
   };
 };
 
-}  // namespace detail
+}  // namespace distributed
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/detail/proto_encoder_helper.h b/paddle/fluid/operators/distributed/proto_encoder_helper.h
similarity index 98%
rename from paddle/fluid/operators/detail/proto_encoder_helper.h
rename to paddle/fluid/operators/distributed/proto_encoder_helper.h
index d91d054b2507f32d1e948dde33da06a70cabe775..2fab02e32fe18ee04f86a69bb5bae1cbe7c6762c 100644
--- a/paddle/fluid/operators/detail/proto_encoder_helper.h
+++ b/paddle/fluid/operators/distributed/proto_encoder_helper.h
@@ -26,7 +26,7 @@ limitations under the License. */
 
 namespace paddle {
 namespace operators {
-namespace detail {
+namespace distributed {
 
 char* EncodeVarint32(char* dst, uint32_t v) {
   // Operate on characters as unsigneds
@@ -144,6 +144,6 @@ class ProtoEncodeHelper {
   char* limit_;  // Just for CHECKs
 };
 
-}  // namespace detail
+}  // namespace distributed
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/detail/request_handler.h b/paddle/fluid/operators/distributed/request_handler.h
similarity index 98%
rename from paddle/fluid/operators/detail/request_handler.h
rename to paddle/fluid/operators/distributed/request_handler.h
index 387a6b119030c8d7a629db9850adcb9b826b35f4..90742a201ad46447d6fbbe2137aa40fabc2f9983 100644
--- a/paddle/fluid/operators/detail/request_handler.h
+++ b/paddle/fluid/operators/distributed/request_handler.h
@@ -31,7 +31,7 @@
 
 namespace paddle {
 namespace operators {
-namespace detail {
+namespace distributed {
 
 constexpr char kRequestSend[] = "RequestSend";
 constexpr char kRequestGet[] = "RequestGet";
@@ -135,6 +135,6 @@ class RequestHandler {
   RPCServer* rpc_server_;
 };
 
-}  // namespace detail
+}  // namespace distributed
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/detail/request_handler_impl.cc b/paddle/fluid/operators/distributed/request_handler_impl.cc
similarity index 96%
rename from paddle/fluid/operators/detail/request_handler_impl.cc
rename to paddle/fluid/operators/distributed/request_handler_impl.cc
index 859f6a75781cd325f3efeee59d9a1ae6a30dff88..b6e4e156080a67735d5cb2cbd6194bb4e52ea32d 100644
--- a/paddle/fluid/operators/detail/request_handler_impl.cc
+++ b/paddle/fluid/operators/distributed/request_handler_impl.cc
@@ -20,13 +20,13 @@
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/operators/detail/request_handler_impl.h"
-#include "paddle/fluid/operators/detail/rpc_server.h"
+#include "paddle/fluid/operators/distributed/request_handler_impl.h"
+#include "paddle/fluid/operators/distributed/rpc_server.h"
 #include "paddle/fluid/string/printf.h"
 
 namespace paddle {
 namespace operators {
-namespace detail {
+namespace distributed {
 
 bool RequestSendHandler::Handle(const std::string& varname,
                                 framework::Scope* scope,
@@ -138,6 +138,6 @@ bool RequestCheckpointHandler::Handle(const std::string& varname,
   return true;
 }
 
-}  // namespace detail
+}  // namespace distributed
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/detail/request_handler_impl.h b/paddle/fluid/operators/distributed/request_handler_impl.h
similarity index 96%
rename from paddle/fluid/operators/detail/request_handler_impl.h
rename to paddle/fluid/operators/distributed/request_handler_impl.h
index 689c6893cf86b4d9bc27572e5267ff6fdb1dc6d3..87185500f2ffc3a8578eea339cc7a1e2b0e46631 100644
--- a/paddle/fluid/operators/detail/request_handler_impl.h
+++ b/paddle/fluid/operators/distributed/request_handler_impl.h
@@ -28,11 +28,11 @@
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/framework/var_type.h"
-#include "paddle/fluid/operators/detail/request_handler.h"
+#include "paddle/fluid/operators/distributed/request_handler.h"
 
 namespace paddle {
 namespace operators {
-namespace detail {
+namespace distributed {
 
 class RequestSendHandler final : public RequestHandler {
  public:
@@ -81,6 +81,6 @@ class RequestCheckpointHandler final : public RequestHandler {
   int checkpoint_notify_id;
 };
 
-}  // namespace detail
+}  // namespace distributed
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/detail/rpc_client.cc b/paddle/fluid/operators/distributed/rpc_client.cc
similarity index 88%
rename from paddle/fluid/operators/detail/rpc_client.cc
rename to paddle/fluid/operators/distributed/rpc_client.cc
index 9a791403e3d6b99c5d4de5183e83e1af655d7d4c..c71edf977c18e554c502732e9bf4bb4ea99f8f99 100644
--- a/paddle/fluid/operators/detail/rpc_client.cc
+++ b/paddle/fluid/operators/distributed/rpc_client.cc
@@ -12,15 +12,15 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/operators/detail/rpc_client.h"
+#include "paddle/fluid/operators/distributed/rpc_client.h"
 
 namespace paddle {
 namespace operators {
-namespace detail {
+namespace distributed {
 
 std::once_flag RPCClient::init_flag_;
 std::unique_ptr<RPCClient> RPCClient::rpc_client_(nullptr);
 
-}  // namespace detail
+}  // namespace distributed
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/detail/rpc_client.h b/paddle/fluid/operators/distributed/rpc_client.h
similarity index 98%
rename from paddle/fluid/operators/detail/rpc_client.h
rename to paddle/fluid/operators/distributed/rpc_client.h
index 71fdc864a611b05b93c3b54d5190934fddb7fd7a..b824da382625b2a584918c2f2b9bd90ab1c74035 100644
--- a/paddle/fluid/operators/detail/rpc_client.h
+++ b/paddle/fluid/operators/distributed/rpc_client.h
@@ -22,7 +22,7 @@
 
 namespace paddle {
 namespace operators {
-namespace detail {
+namespace distributed {
 
 class RPCClient {
  public:
@@ -88,6 +88,6 @@ class RPCClient {
   static std::once_flag init_flag_;
   static std::unique_ptr<RPCClient> rpc_client_;
 };
-}  // namespace detail
+}  // namespace distributed
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/detail/rpc_server.cc b/paddle/fluid/operators/distributed/rpc_server.cc
similarity index 96%
rename from paddle/fluid/operators/detail/rpc_server.cc
rename to paddle/fluid/operators/distributed/rpc_server.cc
index cd0fe96e2301ee3304fe9a2967df58b9f7072d8d..fa0cb71b3056de92f65139c5402132fc8cbb7a87 100644
--- a/paddle/fluid/operators/detail/rpc_server.cc
+++ b/paddle/fluid/operators/distributed/rpc_server.cc
@@ -17,11 +17,11 @@
 #include <limits>
 #include <string>
 
-#include "paddle/fluid/operators/detail/rpc_server.h"
+#include "paddle/fluid/operators/distributed/rpc_server.h"
 
 namespace paddle {
 namespace operators {
-namespace detail {
+namespace distributed {
 
 void RPCServer::ShutDown() {
   LOG(INFO) << "RPCServer ShutDown ";
@@ -112,6 +112,6 @@ void RPCServer::WaitCond(const std::string& rpc_name) {
       lock, [=] { return (cur_cond_.load() == cond || exit_flag_.load()); });
 }
 
-}  // namespace detail
+}  // namespace distributed
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/detail/rpc_server.h b/paddle/fluid/operators/distributed/rpc_server.h
similarity index 95%
rename from paddle/fluid/operators/detail/rpc_server.h
rename to paddle/fluid/operators/distributed/rpc_server.h
index 2e3342428cb56c34abaca655d5906668cda8f140..cf25e78435bb470b25a46db647ca818571cc83a5 100644
--- a/paddle/fluid/operators/detail/rpc_server.h
+++ b/paddle/fluid/operators/distributed/rpc_server.h
@@ -19,11 +19,11 @@
 #include <thread>  // NOLINT
 #include <utility>
 #include <vector>
-#include "paddle/fluid/operators/detail/request_handler.h"
+#include "paddle/fluid/operators/distributed/request_handler.h"
 
 namespace paddle {
 namespace operators {
-namespace detail {
+namespace distributed {
 
 class RPCServer {
  public:
@@ -86,6 +86,6 @@ class RPCServer {
   friend class RequestHandler;
 };
 
-};  // namespace detail
+};  // namespace distributed
 };  // namespace operators
 };  // namespace paddle
diff --git a/paddle/fluid/operators/detail/rpc_server_test.cc b/paddle/fluid/operators/distributed/rpc_server_test.cc
similarity index 87%
rename from paddle/fluid/operators/detail/rpc_server_test.cc
rename to paddle/fluid/operators/distributed/rpc_server_test.cc
index 463a7b80cfac280de5afe91ee85caaaf074cef32..a0693cffabcc561b0adfafc2c49027a890dd5efc 100644
--- a/paddle/fluid/operators/detail/rpc_server_test.cc
+++ b/paddle/fluid/operators/distributed/rpc_server_test.cc
@@ -22,18 +22,18 @@ limitations under the License. */
 #include "paddle/fluid/framework/operator.h"
 
 #include "paddle/fluid/operators/detail/macros.h"
-#include "paddle/fluid/operators/detail/request_handler_impl.h"
-#include "paddle/fluid/operators/detail/rpc_client.h"
-#include "paddle/fluid/operators/detail/rpc_server.h"
+#include "paddle/fluid/operators/distributed/request_handler_impl.h"
+#include "paddle/fluid/operators/distributed/rpc_client.h"
+#include "paddle/fluid/operators/distributed/rpc_server.h"
 
 namespace framework = paddle::framework;
 namespace platform = paddle::platform;
-namespace detail = paddle::operators::detail;
+namespace distributed = paddle::operators::distributed;
 
 USE_OP(lookup_table);
 
-std::unique_ptr<detail::RPCServer> g_rpc_service;
-std::unique_ptr<detail::RequestHandler> g_req_handler;
+std::unique_ptr<distributed::RPCServer> g_rpc_service;
+std::unique_ptr<distributed::RequestHandler> g_req_handler;
 
 framework::BlockDesc* AppendPrefetchBlcok(framework::ProgramDesc* program) {
   auto root_block = program->MutableBlock(0);
@@ -113,19 +113,21 @@ void StartServer() {
   g_req_handler->SetScope(&scope);
   g_req_handler->SetExecutor(&exe);
 
-  g_rpc_service->RegisterRPC(detail::kRequestPrefetch, g_req_handler.get());
+  g_rpc_service->RegisterRPC(distributed::kRequestPrefetch,
+                             g_req_handler.get());
   g_req_handler->SetRPCServer(g_rpc_service.get());
 
   std::thread server_thread(
-      std::bind(&detail::RPCServer::StartServer, g_rpc_service.get()));
+      std::bind(&distributed::RPCServer::StartServer, g_rpc_service.get()));
 
   server_thread.join();
 }
 
 TEST(PREFETCH, CPU) {
-  g_req_handler.reset(new detail::RequestPrefetchHandler(true));
+  g_req_handler.reset(new distributed::RequestPrefetchHandler(true));
   g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 1));
-  detail::RPCClient* client = detail::RPCClient::GetInstance<RPCCLIENT_T>();
+  distributed::RPCClient* client =
+      distributed::RPCClient::GetInstance<RPCCLIENT_T>();
 
   std::thread server_thread(StartServer);
   g_rpc_service->WaitServerReady();
diff --git a/paddle/fluid/operators/detail/send_recv.proto b/paddle/fluid/operators/distributed/send_recv.proto
similarity index 100%
rename from paddle/fluid/operators/detail/send_recv.proto
rename to paddle/fluid/operators/distributed/send_recv.proto
diff --git a/paddle/fluid/operators/detail/sendrecvop_utils.cc b/paddle/fluid/operators/distributed/sendrecvop_utils.cc
similarity index 95%
rename from paddle/fluid/operators/detail/sendrecvop_utils.cc
rename to paddle/fluid/operators/distributed/sendrecvop_utils.cc
index 507b465435609a91ebca97dd70b176c3b79bee02..98129d9f1014c39347e3409533f2bc10092611d2 100644
--- a/paddle/fluid/operators/detail/sendrecvop_utils.cc
+++ b/paddle/fluid/operators/distributed/sendrecvop_utils.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/detail/sendrecvop_utils.h"
+#include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
 
 #ifdef PADDLE_WITH_CUDA
 #include <nccl.h>
@@ -23,14 +23,14 @@ limitations under the License. */
 #include "google/protobuf/io/coded_stream.h"
 #include "google/protobuf/io/zero_copy_stream.h"
 #include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/operators/detail/bytebuffer_stream.h"
-#include "paddle/fluid/operators/detail/proto_encoder_helper.h"
-#include "paddle/fluid/operators/detail/variable_response.h"
+#include "paddle/fluid/operators/distributed/bytebuffer_stream.h"
+#include "paddle/fluid/operators/distributed/proto_encoder_helper.h"
+#include "paddle/fluid/operators/distributed/variable_response.h"
 #include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace operators {
-namespace detail {
+namespace distributed {
 
 using VarMsg = sendrecv::VariableMessage;
 
@@ -222,11 +222,11 @@ void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
                                const platform::DeviceContext& ctx,
                                const framework::Scope* scope,
                                framework::Variable** var) {
-  operators::detail::VariableResponse resp(scope, &ctx);
+  operators::distributed::VariableResponse resp(scope, &ctx);
   PADDLE_ENFORCE(resp.Parse(msg) == 0, "parse bytebuffer to tensor error!");
   *var = resp.GetVar();
 }
 
-}  // namespace detail
+}  // namespace distributed
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/detail/sendrecvop_utils.h b/paddle/fluid/operators/distributed/sendrecvop_utils.h
similarity index 92%
rename from paddle/fluid/operators/detail/sendrecvop_utils.h
rename to paddle/fluid/operators/distributed/sendrecvop_utils.h
index bd16bf1dab8d933ffd18b6d6d9e3ce1c7d73029b..fe25e73fa608727ba0bb912a82776b330ec8d83a 100644
--- a/paddle/fluid/operators/detail/sendrecvop_utils.h
+++ b/paddle/fluid/operators/distributed/sendrecvop_utils.h
@@ -25,12 +25,12 @@ limitations under the License. */
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/framework/var_type.h"
 
-#include "paddle/fluid/operators/detail/send_recv.grpc.pb.h"
-#include "paddle/fluid/operators/detail/send_recv.pb.h"
+#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h"
+#include "paddle/fluid/operators/distributed/send_recv.pb.h"
 
 namespace paddle {
 namespace operators {
-namespace detail {
+namespace distributed {
 
 typedef void (*DestroyCallback)(void*);
 
@@ -61,6 +61,6 @@ inline std::type_index ToTypeIndex(sendrecv::VariableMessage::Type type) {
   }
 }
 
-}  // namespace detail
+}  // namespace distributed
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/detail/variable_response.cc b/paddle/fluid/operators/distributed/variable_response.cc
similarity index 96%
rename from paddle/fluid/operators/detail/variable_response.cc
rename to paddle/fluid/operators/distributed/variable_response.cc
index 24cb91a3bb820a0e5d51aaa49154434919080f69..619890b1939be8777b89b94e415a3c2d63376658 100644
--- a/paddle/fluid/operators/detail/variable_response.cc
+++ b/paddle/fluid/operators/distributed/variable_response.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/operators/detail/variable_response.h"
+#include "paddle/fluid/operators/distributed/variable_response.h"
 
 #include <string>
 #include <utility>
@@ -22,12 +22,12 @@
 #endif
 #include "paddle/fluid/platform/profiler.h"
 
-#include "paddle/fluid/operators/detail/send_recv.pb.h"
-#include "paddle/fluid/operators/detail/sendrecvop_utils.h"
+#include "paddle/fluid/operators/distributed/send_recv.pb.h"
+#include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
 
 namespace paddle {
 namespace operators {
-namespace detail {
+namespace distributed {
 
 enum WireType {
   WIRETYPE_VARINT = 0,
@@ -158,13 +158,13 @@ bool VariableResponse::CopySelectRowsTensorData(
   slr->set_height(meta_.slr_height());
   auto* tensor = slr->mutable_value();
   tensor->Resize(dims);
-  PADDLE_ENFORCE_EQ(
-      static_cast<size_t>(tensor->numel()),
-      length / framework::SizeOfType(
-                   paddle::operators::detail::ToTypeIndex(meta_.data_type())));
+  PADDLE_ENFORCE_EQ(static_cast<size_t>(tensor->numel()),
+                    length / framework::SizeOfType(
+                                 paddle::operators::distributed::ToTypeIndex(
+                                     meta_.data_type())));
   void* tensor_data = tensor->mutable_data(
       ctx.GetPlace(),
-      paddle::operators::detail::ToTypeIndex(meta_.data_type()));
+      paddle::operators::distributed::ToTypeIndex(meta_.data_type()));
 
   if (!ReadRaw(input, ctx, tensor->place(), tensor_data, length)) {
     return false;
@@ -480,6 +480,6 @@ int VariableResponse::Parse(Source* source) {
   return 0;
 }
 
-};  // namespace detail
+};  // namespace distributed
 };  // namespace operators
 };  // namespace paddle
diff --git a/paddle/fluid/operators/detail/variable_response.h b/paddle/fluid/operators/distributed/variable_response.h
similarity index 92%
rename from paddle/fluid/operators/detail/variable_response.h
rename to paddle/fluid/operators/distributed/variable_response.h
index 69cfd784f8dd4f129f50c6882061e53e8535b949..1db4a0a522654ff2497b8bd9ee1381b5ab64067a 100644
--- a/paddle/fluid/operators/detail/variable_response.h
+++ b/paddle/fluid/operators/distributed/variable_response.h
@@ -22,17 +22,17 @@
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/framework/var_type.h"
 
-#include "paddle/fluid/operators/detail/send_recv.grpc.pb.h"
-#include "paddle/fluid/operators/detail/send_recv.pb.h"
+#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h"
+#include "paddle/fluid/operators/distributed/send_recv.pb.h"
 
 #include "google/protobuf/io/coded_stream.h"
 #include "google/protobuf/io/zero_copy_stream.h"
 #include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/operators/detail/bytebuffer_stream.h"
+#include "paddle/fluid/operators/distributed/bytebuffer_stream.h"
 
 namespace paddle {
 namespace operators {
-namespace detail {
+namespace distributed {
 
 class VariableResponse {
  public:
@@ -99,6 +99,6 @@ class VariableResponse {
   sendrecv::VariableMessage meta_;
 };
 
-};  // namespace detail
+};  // namespace distributed
 };  // namespace operators
 };  // namespace paddle
diff --git a/paddle/fluid/operators/fetch_barrier_op.cc b/paddle/fluid/operators/fetch_barrier_op.cc
index 98b051afb551f373009d2bd3df1a8daa64b7e6c7..02beb80fc8a9f451393dcdd54492c4f88f908497 100644
--- a/paddle/fluid/operators/fetch_barrier_op.cc
+++ b/paddle/fluid/operators/fetch_barrier_op.cc
@@ -42,8 +42,8 @@ class FetchBarrierOp : public framework::OperatorBase {
     // For profiling
     platform::RecordEvent record_event(Type(), &ctx);
 
-    detail::RPCClient* rpc_client =
-        detail::RPCClient::GetInstance<RPCCLIENT_T>();
+    distributed::RPCClient* rpc_client =
+        distributed::RPCClient::GetInstance<RPCCLIENT_T>();
 
     rpc_client->Wait();
 
diff --git a/paddle/fluid/operators/gen_nccl_id_op.cc b/paddle/fluid/operators/gen_nccl_id_op.cc
index f824eee4e7d1ef19c9a38fd5d3369265f9c549a0..697c239e59d158428ae9ba9f7feded19637dff28 100644
--- a/paddle/fluid/operators/gen_nccl_id_op.cc
+++ b/paddle/fluid/operators/gen_nccl_id_op.cc
@@ -22,7 +22,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/threadpool.h"
 #include "paddle/fluid/operators/detail/macros.h"
-#include "paddle/fluid/operators/detail/request_handler_impl.h"
+#include "paddle/fluid/operators/distributed/request_handler_impl.h"
 #include "paddle/fluid/platform/nccl_helper.h"
 
 namespace paddle {
@@ -60,7 +60,8 @@ class GenNCCLIdOp : public framework::OperatorBase {
 
     std::vector<std::string> endpoint_list =
         Attr<std::vector<std::string>>("endpoint_list");
-    detail::RPCClient* client = detail::RPCClient::GetInstance<RPCCLIENT_T>();
+    distributed::RPCClient* client =
+        distributed::RPCClient::GetInstance<RPCCLIENT_T>();
 
     for (auto& ep : endpoint_list) {
       VLOG(3) << "sending nccl id to " << ep;
@@ -80,11 +81,11 @@ class GenNCCLIdOp : public framework::OperatorBase {
     // NOTE: Can not use unique_ptr here because the default
     // deleter will call GRPC Server's base class's dtor and
     // that will cause a wired crash.
-    detail::RequestSendHandler rpc_h(true);
-    std::unique_ptr<detail::RPCServer> rpc_service(
+    distributed::RequestSendHandler rpc_h(true);
+    std::unique_ptr<distributed::RPCServer> rpc_service(
         new RPCSERVER_T(endpoint, 1));
 
-    rpc_service->RegisterRPC(detail::kRequestSend, &rpc_h);
+    rpc_service->RegisterRPC(distributed::kRequestSend, &rpc_h);
     rpc_h.SetRPCServer(rpc_service.get());
 
     framework::ProgramDesc empty_program;
@@ -95,11 +96,11 @@ class GenNCCLIdOp : public framework::OperatorBase {
     rpc_h.SetExecutor(&executor);
 
     std::thread server_thread(
-        std::bind(&detail::RPCServer::StartServer, rpc_service.get()));
+        std::bind(&distributed::RPCServer::StartServer, rpc_service.get()));
 
-    rpc_service->SetCond(detail::kRequestSend);
+    rpc_service->SetCond(distributed::kRequestSend);
     VLOG(3) << "start getting nccl id from trainer 0...";
-    rpc_service->WaitBarrier(detail::kRequestSend);
+    rpc_service->WaitBarrier(distributed::kRequestSend);
     VLOG(3) << "got nccl id and stop server...";
     rpc_service->ShutDown();
     VLOG(3) << "rpc server stopped";
diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc
index 3d67b2d2ea2f2a908fffd0bc08cdc515c2d50316..420c4e9e4ed2ef0b4a3b917f013360ad5a5dfa5b 100644
--- a/paddle/fluid/operators/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/listen_and_serv_op.cc
@@ -21,14 +21,14 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/detail/macros.h"
 
-#include "paddle/fluid/operators/detail/request_handler_impl.h"
+#include "paddle/fluid/operators/distributed/request_handler_impl.h"
 #include "paddle/fluid/operators/listen_and_serv_op.h"
 #include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace operators {
 
-void RunServer(std::shared_ptr<detail::RPCServer> service) {
+void RunServer(std::shared_ptr<distributed::RPCServer> service) {
   service->StartServer();
   VLOG(4) << "RunServer thread end";
 }
@@ -123,12 +123,12 @@ void ListenAndServOp::RunSyncLoop(
   while (true) {
     // Get from multiple trainers, we don't care about the order in which
     // the gradients arrives, just add suffix 0~n and merge the gradient.
-    rpc_service_->SetCond(detail::kRequestSend);
-    rpc_service_->WaitBarrier(detail::kRequestSend);
+    rpc_service_->SetCond(distributed::kRequestSend);
+    rpc_service_->WaitBarrier(distributed::kRequestSend);
 
     if (rpc_service_->IsExit()) {
       LOG(WARNING) << "get exit!rpc_processor break!";
-      rpc_service_->SetCond(detail::kRequestGet);
+      rpc_service_->SetCond(distributed::kRequestGet);
       break;
     }
 
@@ -156,11 +156,11 @@ void ListenAndServOp::RunSyncLoop(
                           recv_scope);
     VLOG(2) << "run all blocks spent " << GetTimestamp() - ts << "(ms)";
 
-    rpc_service_->SetCond(detail::kRequestGet);
-    rpc_service_->WaitBarrier(detail::kRequestGet);
+    rpc_service_->SetCond(distributed::kRequestGet);
+    rpc_service_->WaitBarrier(distributed::kRequestGet);
     rpc_service_->ResetBarrierCounter();
     // reset received sparse vars to avoid reuse it in the next mini-batch
-    dynamic_cast<detail::RequestSendHandler *>(request_send_handler_.get())
+    dynamic_cast<distributed::RequestSendHandler *>(request_send_handler_.get())
         ->ResetSparseVarRecorder();
   }  // while(true)
 }
@@ -217,14 +217,14 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
 }
 
 static void FillRequestCtx(
-    detail::RequestHandler *h, framework::Scope *scope,
+    distributed::RequestHandler *h, framework::Scope *scope,
     platform::DeviceContext *dev_ctx, framework::Executor *executor,
     framework::ProgramDesc *program,
     std::unordered_map<std::string,
                        std::shared_ptr<framework::ExecutorPrepareContext>>
         *prefetch_ctx,
     std::shared_ptr<framework::ExecutorPrepareContext> checkpoint_ctx,
-    detail::RPCServer *rpc_server) {
+    distributed::RPCServer *rpc_server) {
   h->SetScope(scope);
   h->SetDevCtx(dev_ctx);
   h->SetExecutor(executor);
@@ -255,16 +255,18 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
 
   rpc_service_.reset(new RPCSERVER_T(endpoint, fan_in));
 
-  request_send_handler_.reset(new detail::RequestSendHandler(sync_mode));
-  request_get_handler_.reset(new detail::RequestGetHandler(sync_mode));
+  request_send_handler_.reset(new distributed::RequestSendHandler(sync_mode));
+  request_get_handler_.reset(new distributed::RequestGetHandler(sync_mode));
   request_prefetch_handler_.reset(
-      new detail::RequestPrefetchHandler(sync_mode));
-  request_checkpoint_handler_.reset(
-      new detail::RequestCheckpointHandler(sync_mode, checkpoint_notify_id));
-
-  rpc_service_->RegisterRPC(detail::kRequestSend, request_send_handler_.get());
-  rpc_service_->RegisterRPC(detail::kRequestGet, request_get_handler_.get());
-  rpc_service_->RegisterRPC(detail::kRequestPrefetch,
+      new distributed::RequestPrefetchHandler(sync_mode));
+  request_checkpoint_handler_.reset(new distributed::RequestCheckpointHandler(
+      sync_mode, checkpoint_notify_id));
+
+  rpc_service_->RegisterRPC(distributed::kRequestSend,
+                            request_send_handler_.get());
+  rpc_service_->RegisterRPC(distributed::kRequestGet,
+                            request_get_handler_.get());
+  rpc_service_->RegisterRPC(distributed::kRequestPrefetch,
                             request_prefetch_handler_.get());
   rpc_service_->RegisterRPC(detail::kRequestCheckpoint,
                             request_checkpoint_handler_.get());
diff --git a/paddle/fluid/operators/listen_and_serv_op.h b/paddle/fluid/operators/listen_and_serv_op.h
index ca2dafb737a826cd1ac98987a2bf6b2cc910a03e..a3dc6fc6220357ce0db1d7b8106f8a4abe0f3be0 100644
--- a/paddle/fluid/operators/listen_and_serv_op.h
+++ b/paddle/fluid/operators/listen_and_serv_op.h
@@ -24,8 +24,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/threadpool.h"
-#include "paddle/fluid/operators/detail/request_handler.h"
-#include "paddle/fluid/operators/detail/rpc_server.h"
+#include "paddle/fluid/operators/distributed/request_handler.h"
+#include "paddle/fluid/operators/distributed/rpc_server.h"
 
 namespace paddle {
 namespace operators {
@@ -34,7 +34,7 @@ constexpr char kOptimizeBlock[] = "OptimizeBlock";
 constexpr char kPrefetchVarNameToBlockId[] = "prefetch_var_name_to_block_id";
 constexpr char kCheckpointBlockId[] = "checkpint_block_id";
 
-void RunServer(std::shared_ptr<detail::RPCServer> service);
+void RunServer(std::shared_ptr<distributed::RPCServer> service);
 
 class ListenAndServOp : public framework::OperatorBase {
  public:
@@ -64,11 +64,13 @@ class ListenAndServOp : public framework::OperatorBase {
                const platform::Place& dev_place) const override;
 
  protected:
-  mutable std::shared_ptr<detail::RPCServer> rpc_service_;
-  mutable std::shared_ptr<detail::RequestHandler> request_send_handler_;
-  mutable std::shared_ptr<detail::RequestHandler> request_get_handler_;
-  mutable std::shared_ptr<detail::RequestHandler> request_prefetch_handler_;
-  mutable std::shared_ptr<detail::RequestHandler> request_checkpoint_handler_;
+  mutable std::shared_ptr<distributed::RPCServer> rpc_service_;
+  mutable std::shared_ptr<distributed::RequestHandler> request_send_handler_;
+  mutable std::shared_ptr<distributed::RequestHandler> request_get_handler_;
+  mutable std::shared_ptr<distributed::RequestHandler>
+      request_prefetch_handler_;
+  mutable std::shared_ptr<distributed::RequestHandler>
+      request_checkpoint_handler_;
 
   mutable std::shared_ptr<std::thread> server_thread_;
 };
diff --git a/paddle/fluid/operators/logical_op.cc b/paddle/fluid/operators/logical_op.cc
index db109f5cd053d84718ac85bd4693ecece12ce172..26970db8d2af62bb06fce4eb1a1f21fd41617bd1 100644
--- a/paddle/fluid/operators/logical_op.cc
+++ b/paddle/fluid/operators/logical_op.cc
@@ -146,6 +146,6 @@ REGISTER_UNARY_LOGICAL_OP(logical_not, "$$Out = !X$$");
 REGISTER_UNARY_LOGICAL_KERNEL(logical_not, CPU,
                               paddle::operators::LogicalNotFunctor);
 REGISTER_BINARY_LOGICAL_OP(logical_xor,
-                           "$$Out = (X || Y) \\, \\&\\& \\, !(X \\&\\& Y)$$");
+                           "$$Out = (X || Y) \\&\\& !(X \\&\\& Y)$$");
 REGISTER_BINARY_LOGICAL_KERNEL(logical_xor, CPU,
                                paddle::operators::LogicalXorFunctor);
diff --git a/paddle/fluid/operators/math/concat.cc b/paddle/fluid/operators/math/concat.cc
index 14964fc62af6dd947e49a2054511780a1bb20cf2..55c8a472aca7fe700ef6a3f96bed1496d7b12b80 100644
--- a/paddle/fluid/operators/math/concat.cc
+++ b/paddle/fluid/operators/math/concat.cc
@@ -93,10 +93,10 @@ class ConcatGradFunctor<platform::CPUDeviceContext, T> {
     auto cpu_place = boost::get<platform::CPUPlace>(context.GetPlace());
 
     // computation
-    for (size_t k = 0; k < input_rows; ++k) {
+    for (int k = 0; k < input_rows; ++k) {
       const T* src_ptr = input.data<T>() + k * input_cols;
       int col_idx = 0;
-      for (int j = 0; j < num; ++j) {
+      for (size_t j = 0; j < num; ++j) {
         int col_len = output_cols[j];
         auto* out_tensor = outputs->at(j);
         if (out_tensor != nullptr) {
diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc
index d39154c6f88d6d17c1719eb9a5b048211f4bb52b..c3387be6daa3bd34a6e3410ced23fce5d65f2cf7 100644
--- a/paddle/fluid/operators/math/math_function.cc
+++ b/paddle/fluid/operators/math/math_function.cc
@@ -30,6 +30,7 @@ template struct SetConstant<platform::CPUDeviceContext, double>;
 template struct SetConstant<platform::CPUDeviceContext, int>;
 template struct SetConstant<platform::CPUDeviceContext, int64_t>;
 template struct SetConstant<platform::CPUDeviceContext, bool>;
+template struct SetConstant<platform::CPUDeviceContext, uint8_t>;
 
 #define DEFINE_CPU_TRANS(RANK)                                             \
   template struct Transpose<platform::CPUDeviceContext, platform::float16, \
diff --git a/paddle/fluid/operators/prefetch_op.cc b/paddle/fluid/operators/prefetch_op.cc
index f71ba84b318c1f8b0604310f3db8a0826124e207..8734282fe496b8e90af19abd5549566d62316fc3 100644
--- a/paddle/fluid/operators/prefetch_op.cc
+++ b/paddle/fluid/operators/prefetch_op.cc
@@ -41,8 +41,8 @@ class PrefetchOp : public framework::OperatorBase {
     platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
     auto& ctx = *pool.Get(place);
 
-    detail::RPCClient* rpc_client =
-        detail::RPCClient::GetInstance<RPCCLIENT_T>();
+    distributed::RPCClient* rpc_client =
+        distributed::RPCClient::GetInstance<RPCCLIENT_T>();
 
     for (size_t i = 0; i < ins.size(); i++) {
       if (NeedSend(scope, ins[i])) {
diff --git a/paddle/fluid/operators/recv_op.cc b/paddle/fluid/operators/recv_op.cc
index 15dfb5469bf51330b98d6699fb3ce708222212ed..9854a31f5b10f5ecd940c0d41c2c3e468fc17bad 100644
--- a/paddle/fluid/operators/recv_op.cc
+++ b/paddle/fluid/operators/recv_op.cc
@@ -43,8 +43,8 @@ class RecvOp : public framework::OperatorBase {
     // For profiling
     platform::RecordEvent record_event(Type(), &ctx);
 
-    detail::RPCClient* rpc_client =
-        detail::RPCClient::GetInstance<RPCCLIENT_T>();
+    distributed::RPCClient* rpc_client =
+        distributed::RPCClient::GetInstance<RPCCLIENT_T>();
 
     for (size_t i = 0; i < outs.size(); i++) {
       VLOG(3) << "getting " << outs[i] << " from " << epmap[i];
diff --git a/paddle/fluid/operators/send_barrier_op.cc b/paddle/fluid/operators/send_barrier_op.cc
index c6c975a23ce846464388c72af5d8902144ceb16a..6b4572dcccc21e783f1df0b9bcde11d532ff4ba8 100644
--- a/paddle/fluid/operators/send_barrier_op.cc
+++ b/paddle/fluid/operators/send_barrier_op.cc
@@ -44,8 +44,8 @@ class SendBarrierOp : public framework::OperatorBase {
     // For profiling
     platform::RecordEvent record_event(Type(), &ctx);
 
-    detail::RPCClient* rpc_client =
-        detail::RPCClient::GetInstance<RPCCLIENT_T>();
+    distributed::RPCClient* rpc_client =
+        distributed::RPCClient::GetInstance<RPCCLIENT_T>();
 
     VLOG(3) << "SendBarrierOp sync_mode:" << sync_mode;
 
diff --git a/paddle/fluid/operators/send_op.cc b/paddle/fluid/operators/send_op.cc
index 84ec36625314572d16e5c537884b6efec420cc60..0cac329aafa8c4c67cae48ba62a48575f5edba92 100644
--- a/paddle/fluid/operators/send_op.cc
+++ b/paddle/fluid/operators/send_op.cc
@@ -45,8 +45,8 @@ class SendOp : public framework::OperatorBase {
     // For profiling
     platform::RecordEvent record_event(Type(), &ctx);
 
-    detail::RPCClient* rpc_client =
-        detail::RPCClient::GetInstance<RPCCLIENT_T>();
+    distributed::RPCClient* rpc_client =
+        distributed::RPCClient::GetInstance<RPCCLIENT_T>();
 
     for (size_t i = 0; i < ins.size(); i++) {
       if (NeedSend(scope, ins[i])) {
diff --git a/paddle/fluid/operators/tensorrt_engine_op.cc b/paddle/fluid/operators/tensorrt_engine_op.cc
index 0ea273af9d5a5c8f1ae112232a9187675031b360..647cfc0a0af2be85e2868c6f68cab962c6631a8d 100644
--- a/paddle/fluid/operators/tensorrt_engine_op.cc
+++ b/paddle/fluid/operators/tensorrt_engine_op.cc
@@ -14,11 +14,14 @@
 
 #ifdef PADDLE_WITH_CUDA
 
-#include "paddle/fluid/operators/tensorrt_engine_op.h"
+#include <string>
+#include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/fluid/inference/tensorrt/engine.h"
 #include "paddle/fluid/inference/utils/singleton.h"
+#include "paddle/fluid/operators/tensorrt_engine_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt_engine_op.h
index 8455d24ddf47382b235edda10cb9b2e8934c5f06..295d6ba0395b68cabab3bd4117cedd912df48f5d 100644
--- a/paddle/fluid/operators/tensorrt_engine_op.h
+++ b/paddle/fluid/operators/tensorrt_engine_op.h
@@ -16,10 +16,12 @@
 
 #ifdef PADDLE_WITH_CUDA
 
+#include <string>
+#include <vector>
+
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/inference/analysis/helper.h"
 #include "paddle/fluid/inference/tensorrt/engine.h"
-#include "paddle/fluid/inference/tensorrt/engine.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt_engine_op_test.cc
index 3a2fef48052ae3943abad14bf87c14ca79251c94..358e2d151bb8f990503ea8a51ba5f81e0a1dc816 100644
--- a/paddle/fluid/operators/tensorrt_engine_op_test.cc
+++ b/paddle/fluid/operators/tensorrt_engine_op_test.cc
@@ -179,7 +179,6 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
                         const std::string& z_name, bool x_created,
                         const shape_t& x_shape, const shape_t& y_shape,
                         const shape_t& z_shape) {
-
     LOG(INFO) << "create fc op";
     auto* fc = block_desc.AppendOp();
     fc->SetType("mul");
diff --git a/paddle/fluid/operators/test_send_nccl_id.cc b/paddle/fluid/operators/test_send_nccl_id.cc
index 5015b1005569ba70b147ebb795243e24ab81ea5c..e2b7b6b8e447381229e4ad594b7974bc0aa159d5 100644
--- a/paddle/fluid/operators/test_send_nccl_id.cc
+++ b/paddle/fluid/operators/test_send_nccl_id.cc
@@ -21,7 +21,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/operators/detail/macros.h"
-#include "paddle/fluid/operators/detail/request_handler_impl.h"
+#include "paddle/fluid/operators/distributed/request_handler_impl.h"
 #include "paddle/fluid/operators/listen_and_serv_op.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/math/selected_rows_functor.h"
@@ -37,11 +37,11 @@ USE_NO_KERNEL_OP(listen_and_serv);
 namespace f = paddle::framework;
 namespace p = paddle::platform;
 namespace m = paddle::operators::math;
-namespace detail = paddle::operators::detail;
+namespace distributed = paddle::operators::distributed;
 namespace string = paddle::string;
 
-std::unique_ptr<detail::RPCServer> g_rpc_service;
-std::unique_ptr<detail::RequestHandler> g_req_handler;
+std::unique_ptr<distributed::RPCServer> g_rpc_service;
+std::unique_ptr<distributed::RequestHandler> g_req_handler;
 
 void StartServer() {
   f::Scope scope;
@@ -57,14 +57,14 @@ void StartServer() {
   g_req_handler->SetProgram(&empty_program);
   g_req_handler->SetExecutor(&executor);
 
-  g_rpc_service->RegisterRPC(detail::kRequestSend, g_req_handler.get());
+  g_rpc_service->RegisterRPC(distributed::kRequestSend, g_req_handler.get());
   g_req_handler->SetRPCServer(g_rpc_service.get());
 
   std::thread server_thread(
-      std::bind(&detail::RPCServer::StartServer, g_rpc_service.get()));
+      std::bind(&distributed::RPCServer::StartServer, g_rpc_service.get()));
 
-  g_rpc_service->SetCond(detail::kRequestSend);
-  g_rpc_service->WaitBarrier(detail::kRequestSend);
+  g_rpc_service->SetCond(distributed::kRequestSend);
+  g_rpc_service->WaitBarrier(distributed::kRequestSend);
 
   LOG(INFO) << "got nccl id and stop server...";
   g_rpc_service->ShutDown();
@@ -72,7 +72,7 @@ void StartServer() {
 }
 
 TEST(SendNcclId, RPCServer) {
-  g_req_handler.reset(new detail::RequestSendHandler(true));
+  g_req_handler.reset(new distributed::RequestSendHandler(true));
   g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 1));
 
   std::thread server_thread(StartServer);
@@ -91,7 +91,8 @@ TEST(SendNcclId, RPCServer) {
 
   std::string ep = string::Sprintf("127.0.0.1:%d", port);
 
-  detail::RPCClient* client = detail::RPCClient::GetInstance<RPCCLIENT_T>();
+  distributed::RPCClient* client =
+      distributed::RPCClient::GetInstance<RPCCLIENT_T>();
 
   LOG(INFO) << "connect to server" << ep;
   client->AsyncSendVar(ep, dev_ctx, scope, NCCL_ID_VARNAME);
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 74036bcb3114df8fc4613bd9f4dc327463397dba..dc02c6632e2a5265daf0c2f9949bdb94beec4232 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -159,6 +159,11 @@ PYBIND11_PLUGIN(core) {
              new (&instance) LoDTensor(new_offset_lod);
            })
       .def("__init__", [](LoDTensor &instance) { new (&instance) LoDTensor(); })
+      // We implement offset based LOD in C++ while we use length based with
+      // Python API. So we changed set_lod to set_recursive_sequence_lengths to
+      // avoid misuse.
+      // The discussion is here:
+      // https://github.com/PaddlePaddle/Paddle/issues/10855
       .def("set_lod",
            [](LoDTensor &self, const std::vector<std::vector<size_t>> &lod) {
              // the input lod is offset-based level-of-detail info
@@ -199,6 +204,7 @@ PYBIND11_PLUGIN(core) {
              std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
              return new_lod;
            })
+      // Set above comments of set_lod.
       .def("recursive_sequence_lengths",
            [](LoDTensor &self) -> std::vector<std::vector<size_t>> {
              // output the length-based lod info
diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h
index 93b09ed6922b32a5531224acc470daf0d97f95bd..6da3846ac69980daac4f0fb7401b2573c21c89bf 100644
--- a/paddle/fluid/pybind/tensor_py.h
+++ b/paddle/fluid/pybind/tensor_py.h
@@ -97,7 +97,7 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
 inline pybind11::buffer_info CastToPyBuffer(const framework::Tensor &tensor) {
   auto buffer_info =
       details::CastToPyBufferImpl<true, 0, float, int, double, int64_t, bool,
-                                  platform::float16>()(tensor);
+                                  uint8_t, platform::float16>()(tensor);
   return buffer_info;
 }
 
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index 5af5bc9c4731317075b3912a4749a0b358bdd56e..45af83708ea63fc1b6aa86f1e8423bb44b7388a6 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -44,7 +44,7 @@ import metrics
 import transpiler
 from param_attr import ParamAttr, WeightNormParamAttr
 from data_feeder import DataFeeder
-from core import LoDTensor, CPUPlace, CUDAPlace, CUDAPinnedPlace
+from core import LoDTensor, CPUPlace, CUDAPlace, CUDAPinnedPlace, Scope
 from transpiler import DistributeTranspiler, InferenceTranspiler, \
     memory_optimize, release_memory
 from concurrency import (Go, make_channel, channel_send, channel_recv,
@@ -83,6 +83,7 @@ __all__ = framework.__all__ + executor.__all__ + concurrency.__all__ + \
               'profiler',
               'unique_name',
               'recordio_writer',
+              'Scope',
           ]
 
 
diff --git a/python/paddle/fluid/data_feeder.py b/python/paddle/fluid/data_feeder.py
index f96a2d282740dc943ae0fac2427e8f8efdaf5252..c859778b3757f638ac531620f241e684522add57 100644
--- a/python/paddle/fluid/data_feeder.py
+++ b/python/paddle/fluid/data_feeder.py
@@ -79,6 +79,61 @@ class DataToLoDTensorConverter(object):
 
 
 class DataFeeder(object):
+    """
+    DataFeeder converts the data that returned by a reader into a data
+    structure that can feed into Executor and ParallelExecutor. The reader
+    usually returns a list of mini-batch data entries. Each data entry in
+    the list is one sample. Each sample is a list or a tuple with one
+    feature or multiple features.
+
+    The simple usage shows below:
+
+    ..  code-block:: python
+
+        place = fluid.CPUPlace()
+        img = fluid.layers.data(name='image', shape=[1, 28, 28])
+        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+        feeder = fluid.DataFeeder([img, label], fluid.CPUPlace())
+        result = feeder.feed([([0] * 784, [9]), ([1] * 784, [1])])
+
+
+    If you want to feed data into GPU side separately in advance when you
+    use multi-GPU to train a model, you can use `decorate_reader` function.
+
+    ..  code-block:: python
+
+        place=fluid.CUDAPlace(0)
+        feeder = fluid.DataFeeder(place=place, feed_list=[data, label])
+        reader = feeder.decorate_reader(
+            paddle.batch(flowers.train(), batch_size=16))
+
+    Args:
+        feed_list(list): The Variables or Variables'name that will
+            feed into model.
+        place(Place): place indicates feed data into CPU or GPU, if you want to
+            feed data into GPU, please using `fluid.CUDAPlace(i)` (`i` represents
+            the GPU id), or if you want to feed data into CPU, please using
+            `fluid.CPUPlace()`.
+        program(Program): The Program that will feed data into, if program
+            is None, it will use default_main_program(). Default None.
+
+    Raises:
+        ValueError: If some Variable is not in this Program.
+
+    Examples:
+        .. code-block:: python
+
+            # ...
+            place = fluid.CPUPlace()
+            feed_list = [
+                main_program.global_block().var(var_name) for var_name in feed_vars_name
+            ] # feed_vars_name is a list of variables' name.
+            feeder = fluid.DataFeeder(feed_list, place)
+            for data in reader():
+                outs = exe.run(program=main_program,
+                               feed=feeder.feed(data))
+    """
+
     def __init__(self, feed_list, place, program=None):
         self.feed_dtypes = []
         self.feed_names = []
@@ -108,6 +163,16 @@ class DataFeeder(object):
         self.place = place
 
     def feed(self, iterable):
+        """
+        According to feed_list and iterable, converters the input into
+        a data structure that can feed into Executor and ParallelExecutor.
+
+        Args:
+            iterable(list|tuple): the input data.
+
+        Returns:
+            dict: the result of conversion.
+        """
         converter = []
         for lod_level, shape, dtype in six.zip(
                 self.feed_lod_level, self.feed_shapes, self.feed_dtypes):
@@ -130,6 +195,20 @@ class DataFeeder(object):
         return ret_dict
 
     def feed_parallel(self, iterable, num_places=None):
+        """
+        Takes multiple mini-batches. Each mini-batch will be feed on each
+        device in advance.
+
+        Args:
+            iterable(list|tuple): the input data.
+            num_places(int): the number of devices. Default None.
+
+        Returns:
+            dict: the result of conversion.
+
+        Notes:
+            The number of devices and number of mini-batches must be same.
+        """
         if isinstance(self.place, core.CUDAPlace):
             places = [
                 core.CUDAPlace(i)
@@ -168,6 +247,24 @@ class DataFeeder(object):
                         multi_devices,
                         num_places=None,
                         drop_last=True):
+        """
+        Converter the input data into a data that returned by reader into
+        multiple mini-batches. Each mini-batch will be feed on each device.
+
+        Args:
+            reader(fun): the input data.
+            multi_devices(bool): the number of places. Default None.
+            num_places(int): the number of places. Default None.
+            drop_last(bool): the number of places. Default None.
+
+        Returns:
+            dict: the result of conversion.
+
+        Raises:
+            ValueError: If drop_last is False and the data batch which cannot
+            fit for devices.
+        """
+
         def __reader_creator__():
             if not multi_devices:
                 for item in reader():
diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py
index 33d8f709412b25d29c6618272500dd7b953d6645..159b0ca39eed547e4f3448e7ebf4807299d465b2 100644
--- a/python/paddle/fluid/executor.py
+++ b/python/paddle/fluid/executor.py
@@ -25,6 +25,13 @@ g_scope = core.Scope()
 
 
 def global_scope():
+    """
+    Get the global/default scope instance. There are a lot of APIs use
+    :code:`global_scope` as its default value, e.g., :code:`Executor.run`
+
+    Returns:
+        Scope: The global/default scope instance.
+    """
     return g_scope
 
 
@@ -37,6 +44,19 @@ def switch_scope(scope):
 
 @contextlib.contextmanager
 def scope_guard(scope):
+    """
+    Change the global/default scope instance by Python `with` statement. All
+    variable in runtime will assigned to the new scope.
+
+    Examples:
+        >>> import paddle.fluid as fluid
+        >>> new_scope = fluid.Scope()
+        >>> with fluid.scope_guard(new_scope):
+        >>>     ...
+
+    Args:
+        scope: The new global/default scope.
+    """
     ex = switch_scope(scope)
     yield
     switch_scope(ex)
@@ -135,14 +155,18 @@ def has_fetch_operators(block, fetch_targets, fetch_holder_name):
 
 def fetch_var(name, scope=None, return_numpy=True):
     """
-    Fetch the value of the variable with the given name from the given scope
+    Fetch the value of the variable with the given name from the
+    given scope.
+
     Args:
         name(str): name of the variable. Typically, only persistable variables
             can be found in the scope used for running the program.
         scope(core.Scope|None): scope object. It should be the scope where
             you pass to Executor.run() when running your program.
-            If None, global_scope() will be used.
-        return_numpy(bool): whether convert the tensor to numpy.ndarray
+            If None, global_scope() will be used. Default None.
+        return_numpy(bool): whether convert the tensor to numpy.ndarray.
+            Default True.
+
     Returns:
        LodTensor|numpy.ndarray
     """
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index ca118452711bd423f74e67c960e9a0bd9a69edc4..8ad78443fdeaa6a1cf28380f9aa71e21cdad0a35 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -30,8 +30,6 @@ __all__ = [
     'default_startup_program',
     'default_main_program',
     'program_guard',
-    'switch_startup_program',
-    'switch_main_program',
     'get_var',
 ]
 
@@ -43,7 +41,8 @@ ZERO_VAR_SUFFIX = core.kZeroVarSuffix()
 
 def grad_var_name(var_name):
     """
-    return gradient name for a certain var name
+    Returns:
+        str: gradient name for a certain var name
     """
     return var_name + GRAD_VAR_SUFFIX
 
@@ -51,10 +50,12 @@ def grad_var_name(var_name):
 def convert_np_dtype_to_dtype_(np_dtype):
     """
     Convert the data type in numpy to the data type in Paddle
+
     Args:
-        np_dtype(np.dtype): the data type in numpy
+        np_dtype(np.dtype): the data type in numpy.
 
-    Returns(core.VarDesc.VarType): the data type in Paddle
+    Returns:
+        core.VarDesc.VarType: the data type in Paddle.
 
     """
     dtype = np.dtype(np_dtype)
@@ -120,37 +121,53 @@ def _debug_string_(proto, throw_on_error=True):
 
 class Variable(object):
     """
-    Python variable. Every input and output of an operator is a variable. Every
-    variable belongs to a block. The variable has a name and two variables in
-    different blocks could have the same name.
+    In Fluid, every input and output of an operator is a variable. In most 
+    cases, variables are used for holding different kinds of data or training 
+    labels. A variable belongs to a block. All variable has its own name and 
+    two variables in different blocks could have the same name.
 
-    There are many kinds of variables. Please reference the framework.proto for
-    details.
+    There are many kinds of variables. Each kind of them has its own attributes 
+    and usages. Please reference the framework.proto for details. 
 
-    Notes: The constructor of Variable should not be invoked directly. Please
-    use `Block.create_var` to create a variable.
-
-    >>> cur_program = Program()
-    >>> cur_block = cur_program.current_block()
-    >>> new_variable = cur_block.create_var(
-    >>>                    name="X", shape=[-1, 23, 48], dtype='float32')
+    Most of a Variable's member variables can be setted to be None. It mean 
+    it is not available or will be specified later.
 
     Args:
-        block(Block): The associated block. It will be passed by
-            `Block.create_var` automatically.
+        block(Block): The block that the variable belongs to.
         type(core.VarDesc.VarType): Variable type. Please reference the
             framework.proto for details.
-        shape(tuple|list|None): The shape of variable. -1 means the batch size.
+        name(str|None): The name of the variable. If setted None, it will be
+            generated automatically. Default: None
+        shape(tuple|list|None): The shape of the variable. -1 means the batch size.
             Some kinds of variable do not contain shape, just set it to None.
-        dtype(np.dtype|core.VarDesc.VarType|str): The data type of variable.
-        lod_level(int): The level of lod tensor. 0 means it is not a time
+            Default: None
+        dtype(np.dtype|core.VarDesc.VarType|str|None): The data type of variable.
+            Default: None
+        lod_level (int|None): The level of lod tensor. 0 means it is not a time
             series data.
-        capacity(int): The capacity of Channel variable. Ignored
-            for other types.
-        persistable(bool): True if the variable should be saved as check point.
-            Defaults to False.
-        stop_gradient(bool): True if the variable will stop to calculate
-            gradients when backward. Defaults to False.
+            Default: None
+        capacity (int|None): The capacity of Channel variable. Ignored for other
+            types. Default: None
+        persistable (bool|None): True if the variable is persistable. A persistable
+            variable will not be deleted after an iteration ending. Defaults: None.
+        error_clip (BaseErrorClipAttr|None): The error clip attributes of the
+            corresponding gradient variable. Default: None
+        stop_gradient (bool): True if the variable will stop to calculate its
+            gradients when backward. Default: False.
+        is_data (bool): True if the variable is an input data. Default: False
+
+    Notes:
+        The constructor of Variable should not be invoked directly. Please
+        use `Block.create_var` to create a variable.
+
+    Examples:
+        .. code-block:: python
+
+            cur_program = Program()
+            cur_block = cur_program.current_block()
+            new_variable = cur_block.create_var(name="X",
+                                                shape=[-1, 23, 48],
+                                                dtype='float32')
     """
 
     def __init__(self,
@@ -253,13 +270,14 @@ class Variable(object):
         Get debug string.
 
         Args:
-            throw_on_error(bool): True if raise an exception when self is not
-                intialized.
+            throw_on_error(bool): True if raise an exception when self is
+                not initialized.
             with_details(bool): more details about variables and parameters
-                (e.g. trainable, optimize_attr, ...) will be printed when with_details is True
-
-        Returns(str): The debug string.
+                (e.g. trainable, optimize_attr, ...) will be printed when
+                with_details is True. Default False;
 
+        Returns:
+            str: The debug string.
         """
         assert isinstance(throw_on_error, bool) and isinstance(with_details,
                                                                bool)
@@ -276,6 +294,15 @@ class Variable(object):
     __repr__ = __str__
 
     def set_desc(self, input):
+        """
+        Set the variable description.
+
+        Args:
+            input(core.VarDesc): The new VarDesc.
+
+        Returns:
+            None
+        """
         self.desc = input
 
     @property
@@ -312,6 +339,15 @@ class Variable(object):
         return self.desc.type()
 
     def set_error_clip(self, error_clip):
+        """
+        Set the error_clip.
+
+        Args:
+            error_clip(BaseErrorClipAttr) : The new error_clip.
+
+        Returns:
+            None
+        """
         self.error_clip = error_clip
 
 
@@ -319,8 +355,8 @@ def get_all_op_protos():
     """
     Get all registered op proto from PaddlePaddle C++ end.
 
-    Returns(list): list of OpProto
-
+    Returns:
+       list: list of OpProto.
     """
     protostrs = core.get_all_op_protos()
     ret_values = []
@@ -373,9 +409,45 @@ class OpProtoHolder(object):
 
 class Operator(object):
     """
-    Python Operator class. The operator represents the build in instructions in a
-    Block. Users can use the build in instructions to describe their neural
-    network.
+    In Fluid, all the operation are represented by Operator, and Operator
+    is regarded as a build in an instruction of a Block. Users can use the
+    build in instructions to describe their neural network.
+
+    Args:
+        block(Block): The block has the current operator.
+        desc(core.OpDesc): The protobuf description of Operator.
+        type(str): The type of operator. Default None.
+        inputs(dict): The input of this Operator. it is a dictionary, for every
+            element, key is the input parameter name, and value is a list of
+            variables. Default None.
+        outputs(dict): The output of this Operator. it is a dictionary, for
+            every element, key is the input parameter name, and value is a list
+            of variables. Default None.
+        attrs(dict): The attributes of this Operator. it is a dictionary, for
+            every element, key is attribute name, and value is the attribute value.
+            The attribute type should be as same as the type registered in C++ side.
+            Default None.
+
+    Returns:
+        Operator: The initialized Operator.
+
+    Raises:
+        ValueError: If the passed input, output and attrs doesn't match the
+            initializing Operator's that registered in C++ side.
+
+    Notes:
+        The constructor of operator should not be invoked directly. Use
+        Block.append_op or Block.prepend_op instead.
+
+    Examples:
+        .. code-block:: python
+
+            cur_program = Program()
+            cur_block = cur_program.current_block()
+            # var1 += var2 + var3
+            cur_block.append_op(type="sum",
+                                inputs={"X": [var1, var2, var3]},
+                                outputs={"Out": [var1]})
     """
     OP_WITHOUT_KERNEL_SET = {
         'feed', 'fetch', 'save', 'load', 'recurrent', 'go',
@@ -392,31 +464,7 @@ class Operator(object):
                  inputs=None,
                  outputs=None,
                  attrs=None):
-        """
-        Constructor.
 
-        Notes: The constructor of operator should not be invoked directly. Use
-        Block.append_op or Block.prepend_op instead.
-
-        >>> cur_program = Program()
-        >>> cur_block = cur_program.current_block()
-        >>> # var1 += var2 + var3
-        >>> cur_block.append_op(type="sum",
-        >>>                     inputs={"X": [var1, var2, var3]},
-        >>>                     outputs={"Out": [var1]})
-
-        Args:
-            block(Block): The block has the current operator.
-            desc(core.OpDesc): The protobuf description.
-            type(str): The type of operator.
-            inputs(dict): The input dictionary. Key is the input parameter name.
-                Value is a list of variables.
-            outputs(dict): The output dictionary which has the same format with
-                           inputs.
-            attrs(dict): The attributes dictionary. Key is attribute name. Value
-                is the attribute value. The attribute type should be as same as
-                the type registered in C++
-        """
         self.block = block
         self.desc = desc
         self.attrs = attrs
@@ -529,12 +577,14 @@ class Operator(object):
 
     def to_string(self, throw_on_error):
         """
-        To debug string.
+        Get debug string.
+
         Args:
-            throw_on_error(bool): raise exception when self is not initialized
-                when throw_on_error is True
+            throw_on_error(bool): Whether to raise exception if self is not
+                initialized.
 
-        Returns(str): The debug string.
+        Returns:
+            str: The debug string.
 
         """
         protostr = self.desc.serialize_to_string()
@@ -552,29 +602,45 @@ class Operator(object):
 
     def input(self, name):
         """
-        Get input arguments by the input parameter name
-        Args:
-            name(str): The input parameter name
+        Get the input arguments according to the input parameter name.
 
-        Returns(list): return the list of argument names associated with the
-            specific parameter name.
+        Args:
+            name(str): The input parameter name.
 
+        Returns:
+            list: return the list of argument names that associated with \
+                the specific parameter name.
         """
         return self.desc.input(name)
 
     def rename_input(self, old_name, new_name):
+        """
+        Rename the `old_name` to `new_name`.
+
+        Args:
+            old_name(str): The old name of the Operator's input.
+            new_name(str): The new name of the Operator's input.
+
+        Returns:
+            None
+        """
         self.desc.rename_input(old_name, new_name)
 
     def rename_output(self, old_name, new_name):
+        """
+        Rename the `old_name` to `new_name`.
+
+        Args:
+            old_name(str): The old name of the Operator's output.
+            new_name(str): The new name of the Operator's output.
+
+        Returns:
+            None
+        """
         self.desc.rename_output(old_name, new_name)
 
     @property
     def input_names(self):
-        """
-        Get all input parameter names
-        Returns(list): return a list of input parameter names
-
-        """
         return self.desc.input_names()
 
     @property
@@ -587,33 +653,23 @@ class Operator(object):
 
     def output(self, name):
         """
-        Get output arguments by the output parameter name
-        Args:
-            name(str): The output parameter name
+        Get output arguments by the output parameter name.
 
-        Returns(list): return the list of argument names associated with the
-            specific parameter name.
+        Args:
+            name(str): The output parameter name.
 
+        Returns:
+            list: return the list of argument names associated with \
+                the specific parameter name.
         """
         return self.desc.output(name)
 
     @property
     def output_names(self):
-        """
-        Get all output parameter names
-        Returns(list): return a list of output parameter names
-
-        """
         return self.desc.output_names()
 
     @property
     def idx(self):
-        """
-        Return the array index of current operator.
-        Returns(int): The array index in block.ops array
-        Raises:
-            ValueError: when the operator is not found.
-        """
         for i, op in enumerate(self.block.ops):
             if op == self:
                 return i
@@ -622,27 +678,40 @@ class Operator(object):
 
     def has_attr(self, name):
         """
-        operator has the attribute with name or not.
+        Whether this Operator has the attribute with name or not.
+
         Args:
-            name(str): the attribute name
+            name(str): the attribute name.
 
-        Returns(bool): True if has this attribute.
+        Returns:
+            bool: True if has this attribute.
 
         """
         return self.desc.has_attr(name)
 
     def attr_type(self, name):
         """
-        Get the type of attribute by attribute name
-        Args:
-            name(str): the attribute name
+        Get the type of attribute by attribute's name.
 
-        Returns(core.AttrType): the attribute type
+        Args:
+            name(str): the attribute name.
 
+        Returns:
+            core.AttrType: the attribute type.
         """
         return self.desc.attr_type(name)
 
     def set_attr(self, name, val):
+        """
+        Set the value of attribute by attribute's name.
+
+        Args:
+            name(str): the attribute name.
+            val(bool|int|str|float|list): the value of the attribute.
+
+        Raises:
+            ValueError: If the type of value doesn't match with desc.attr_type(name).
+        """
         self.attrs[name] = val
         if isinstance(val, Block):
             self.desc.set_block_attr(name, val.desc)
@@ -654,40 +723,39 @@ class Operator(object):
 
     @property
     def attr_names(self):
-        """
-        Get all attribute names
-        Returns(list): The list of attribute name
-
-        """
         return self.desc.attr_names()
 
     def attr(self, name):
         """
-        Get attribute by name
+        Get the attribute by name.
+
         Args:
-            name(str): the attribute name
+            name(str): the attribute name.
 
-        Returns(bool|int|str|float|list): The attribute value. The return value
+        Returns:
+            bool|int|str|float|list: The attribute value. The return value
             can be any valid attribute type.
-
         """
         return self.desc.attr(name)
 
     def block_attr(self, name):
         """
-        Get the block attribute by name
-        Args:
-            name(str): the attribute name
+        Get the block attribute by name.
 
-        Returns(int): the block index
+        Args:
+            name(str): the attribute name.
 
+        Returns:
+            int: the block index.
         """
         return self.desc.block_attr(name)
 
     def all_attrs(self):
         """
-        Get the attribute dict
-        Returns(dict): The Operator's attribute dict
+        Get the attribute dict.
+
+        Returns:
+            dict: The Operator's attribute dict.
         """
         attr_names = self.attr_names
         attr_map = {}
@@ -700,6 +768,35 @@ class Operator(object):
 
 
 class Block(object):
+    """
+    In Fluid, a Program is consistence of multi-Block, and Block stores
+    VarDesc and OpDesc. In a specific Block, a VarDesc have a unique name.
+    One block could have some child blocks, and child block's name scopes
+    should inherit the parent's so that OpDesc in child block can reference
+    a VarDesc that is stored in the parent block.
+    Please reference the framework.proto for details.
+
+    Args:
+        program(Program): The Program that the Block belongs to.
+        idx(int): The block's id in the Program.
+
+    Notes:
+        The constructor of Block should not be invoked directly. Please
+        use `Program.create_block()` to create a block.
+
+    Examples:
+        .. code-block:: python
+
+            cur_program = Program()
+            cur_block = cur_program.current_block()
+            var = cur_block.create_var(name="X",
+                                       shape=[-1, 23, 48],
+                                       dtype='float32')
+            cur_block.append_op(type="abs",
+                                inputs={"X": [var]},
+                                outputs={"Out": [var]})
+    """
+
     def __init__(self, program, idx):
         self.desc = program.desc.block(idx)
         self.vars = collections.OrderedDict()  # var_name --> var
@@ -712,15 +809,17 @@ class Block(object):
 
     def to_string(self, throw_on_error, with_details=False):
         """
-        To debug string.
+        Get debug string.
+
         Args:
             throw_on_error(bool): raise exception when self is not initialized
-                when throw_on_error is True
+                when throw_on_error is True.
             with_details(bool): more details about variables and parameters
-                (e.g. trainable, optimize_attr, ...) will be printed when with_details is True
-
-        Returns(str): The debug string.
+                (e.g. trainable, optimize_attr, ...) will be printed when
+                with_details is True. Default False.
 
+        Returns:
+            str: The debug string.
         """
         assert isinstance(throw_on_error, bool) and isinstance(with_details,
                                                                bool)
@@ -752,6 +851,15 @@ class Block(object):
         return self.desc.get_forward_block_idx()
 
     def set_forward_block_idx(self, idx):
+        """
+        Set the forward block Idx.
+
+        Args:
+            idx(int): the block index.
+
+        Returns:
+            None
+        """
         self.desc.set_forward_block_idx(idx)
 
     @property
@@ -759,6 +867,19 @@ class Block(object):
         return self.desc.id
 
     def var(self, name):
+        """
+        Get a Variable by name from this block.
+
+        Args:
+            name(str): the Variable's name.
+
+        Raises:
+            ValueError: The If input's type is not str, or this block
+                doesn't have a Variable with the giving name.
+
+        Returns:
+            Variable: the Variable with the giving name.
+        """
         if not isinstance(name, basestring):
             raise TypeError(
                 "var require string as parameter, but get %s instead." %
@@ -769,6 +890,19 @@ class Block(object):
         return v
 
     def var_recursive(self, name):
+        """
+        Get a Variable by name from this block recursively.
+
+        Args:
+            name(str): the Variable's name.
+
+        Raises:
+            ValueError: this block and this parent block doesn't
+                have a Variable with the giving name.
+
+        Returns:
+            Variable: the Variable with the giving name.
+        """
         frontier = list()
         visited = set()
 
@@ -815,6 +949,18 @@ class Block(object):
     def rename_var(self, name, new_name):
         """
         Rename variable in vars and ops' inputs and outputs
+
+        Args:
+            name(str): the name that need to be renamed.
+            new_name(str): the name that need to rename to.
+
+        Raises:
+            ValueError: If this block doesn't have this the giving name,
+                or the type of the var with the giving name is not Parameter
+                or Variable.
+
+        Returns:
+            Variable: the Variable with the giving name.
         """
         if not self.has_var(name):
             raise ValueError("var %s is not in current block" % name)
@@ -878,12 +1024,27 @@ class Block(object):
         return param
 
     def append_op(self, *args, **kwargs):
+        """
+        Appends a new Operator according to the giving arguments.
+
+        Returns:
+            Operator: the append Operator.
+        """
         op_desc = self.desc.append_op()
         op = Operator(block=self, desc=op_desc, *args, **kwargs)
         self.ops.append(op)
         return op
 
     def insert_op(self, index, *args, **kwargs):
+        """
+        Insert a Operator according to the giving arguments.
+
+        Args:
+            index(int): the place that the operator to insert.
+
+        Returns:
+            Operator: the insert Operator.
+        """
         self.sync_with_cpp()
         op_desc = self.desc.insert_op(index)
         op = Operator(block=self, desc=op_desc, *args, **kwargs)
@@ -891,11 +1052,30 @@ class Block(object):
         return op
 
     def remove_op(self, index):
+        """
+        Remove the specific position operator.
+
+        Args:
+            index(int): the position that the operator to insert.
+
+        Returns:
+            None
+        """
         self.sync_with_cpp()
         self.desc.remove_op(index, index + 1)
         del self.ops[index]
 
     def slice_ops(self, start, end):
+        """
+        Return the Operator between start and end.
+
+        Args:
+            start(int): the start position.
+            end(int): the end position.
+
+        Returns:
+            list: the Operators between start and end.
+        """
         return self.ops[start:end]
 
     def prepend_op(self, *args, **kwargs):
@@ -906,9 +1086,8 @@ class Block(object):
 
     def sync_with_cpp(self):
         """
-        Sync from the desc on the c++ end.
-
-        This method is used to synchronize the c++ desc instance generated by backward.
+        Sync from the desc on the c++ end. This method is used to synchronize
+        the c++ desc instance generated by backward.
         """
         # sync variables from cpp
         for var in self.desc.all_vars():
@@ -973,9 +1152,14 @@ class Block(object):
 
     def copy_param_info_from(self, other):
         """
-        Copy the information of parameters from the other block
+        Copy the information of parameters from the other block.
+
         Args:
-            other(Block): the other block
+            other(Block): the other block.
+
+        Raises:
+            ValueError: If type of input is not Block, or the `other` and this
+                block is not in the same topology.
 
         Returns:
             None
@@ -1007,11 +1191,12 @@ class Block(object):
     def clone_variable(self, var):
         """
         Clone a variable into current block.
+
         Args:
             var: the variable to be cloned.
 
         Returns:
-            The new  variable cloned from 'var' in current block.
+            Variable: the new  variable cloned from 'var' in current block.
         """
         assert isinstance(var, Variable)
         ret_var = None
@@ -1054,23 +1239,18 @@ class Program(object):
     Notes: we have default_startup_program and default_main_program
     by default, a pair of them will shared the parameters.
     The default_startup_program only run once to initialize parameters,
-    default_main_program run in every minibatch and adjust the weights.
-
-    Args:
-        None
+    default_main_program run in every mini batch and adjust the weights.
 
     Returns:
-        Python Program
+        A empty program.
 
     Examples:
-       .. code-block:: python
-
-         main_program = Program()
-         startup_program = Program()
-         with fluid.program_guard(main_program=main_program, startup_program=startup_program):
-            fluid.layers.data(name="x", shape=[-1, 784], dtype='float32')
-            fluid.layers.data(name="y", shape=[-1, 1], dtype='int32')
-            fluid.layers.fc(name="fc", shape=[10], dtype='float32', act="relu")
+        >>> main_program = fluid.Program()
+        >>> startup_program = fluid.Program()
+        >>> with fluid.program_guard(main_program=main_program, startup_program=startup_program):
+        >>>     fluid.layers.data(name="x", shape=[-1, 784], dtype='float32')
+        >>>     fluid.layers.data(name="y", shape=[-1, 1], dtype='int32')
+        >>>     fluid.layers.fc(name="fc", shape=[10], dtype='float32', act="relu")
 
     """
 
@@ -1084,6 +1264,19 @@ class Program(object):
 
     @property
     def op_role(self):
+        """
+        The operator role. In a enum {Forward, Backward, Optimize}.
+
+        Notes: this is a low level API. It is used only for ParallelExecutor to
+        duplicate or schedule operator to devices.
+
+        For example, the forward operator should be executed on every device.
+        The backward operator should be executed on every device and the
+        parameter gradient of backward (use :code:`op_role_var` to get this
+        variable) operator should be merged to one device. The optimization
+        operators should be executed on only one device and broadcast the
+        optimization result, i.e., the new parameter, to every other device.
+        """
         return self._current_role
 
     @op_role.setter
@@ -1092,6 +1285,13 @@ class Program(object):
 
     @property
     def op_role_var(self):
+        """
+        The auxiliary variables for :code:`op_role` property.
+
+        See Also: :code:`Program.op_role`'s documentation for details.
+
+        Notes: This is a very low-level API. Users should not use it directly.
+        """
         return self._op_role_var
 
     @op_role_var.setter
@@ -1100,6 +1300,21 @@ class Program(object):
 
     @contextlib.contextmanager
     def optimized_guard(self, var):
+        """
+        A with guard to set :code:`Optimization` :code:`OpRole` and
+        :code:`OpRoleVar` automatically.
+
+        Notes: This is a very low level API. Users should not use it directly.
+
+        Args:
+            var(Variable|str): The variable (name) to be optimized.
+
+        Examples:
+
+            >>> p, g = backward(...)
+            >>> with program.optimized_guard(p):
+            >>>     p = p - 0.001 * g
+        """
         OpRole = core.op_proto_and_checker_maker.OpRole
         self._current_role = OpRole.Optimize
         self._op_role_var = [var.name if isinstance(var, Variable) else var]
@@ -1108,18 +1323,35 @@ class Program(object):
         self._current_role = OpRole.Forward
 
     def __str__(self):
+        """
+        Get the protobuf debug string of this Program.
+
+        Returns:
+            (str): The protobuf debug string.
+
+        Raises:
+            ValueError: If any of required fields is not set.
+        """
         return self.to_string(True)
 
     def to_string(self, throw_on_error, with_details=False):
         """
         To debug string.
+
         Args:
-            throw_on_error(bool): raise exception when self is not initialized
-                when throw_on_error is True
-            with_details(bool): more details about variables and parameters
-                (e.g. trainable, optimize_attr, ...) will be printed when with_details is True
+            throw_on_error(bool): raise Value error when any of required fields
+                is not set.
 
-        Returns(str): The debug string.
+            with_details(bool): True if more details about variables and
+                parameters, e.g., :code:`trainable`, :code:`optimize_attr`, need
+                to print.
+
+        Returns
+            (str): The debug string.
+
+        Raises:
+            ValueError: If any of required fields is not set and throw_on_error is
+                True.
 
         """
         assert isinstance(throw_on_error, bool) and isinstance(with_details,
@@ -1135,25 +1367,89 @@ class Program(object):
         return res_str
 
     def get_desc(self):
+        """
+        Get the C++ side of `ProgramDesc` object pointer. The C++ object is
+        exposed by :code:`pybind`.
+
+        Notes: This is a very low level API. Users should not use this API
+        directly.
+        """
         return self.desc
 
     def clone(self, for_test=False):
-        """Clone the Program object
-        Args:
-           for_test(bool): indicate whether clone for test.
+        """
+        Create a new, duplicated program.
+
 
-        Set for_test to False when we want to clone the program for training.
-        Set for_test to True when we want to clone the program for testing.
+        Some operators, e.g., :code:`batch_norm`, behave differently between
+        training and testing. They have an attribute, :code:`is_test`, to
+        control this behaviour. This method will change the :code:`is_test`
+        attribute of them to :code:`True` when :code:`for_test=True`.
+
+        * Set for_test to False when we want to clone the program for training.
+        * Set for_test to True when we want to clone the program for testing.
+
+        Notes: This API DOES NOT prune any operator. Use
+        :code:`clone(for_test=True)` before backward and optimization please.
 
         Args:
-            for_test(bool): Some operators, such as batch_norm and drop_out ops,
-                behave differently in training and testing. If for_test is True,
-                the is_test attributes in these operators will be set to True for
-                testing purposes, otherwise, they remain unchanged.
+            for_test(bool): True if change the :code:`is_test` attribute of
+                operators to :code:`True`.
 
         Returns:
-            Program: The cloned Program object.
-
+            Program: The new, duplicated Program object.
+
+        Examples:
+
+            1. To clone a test program, the sample code is:
+
+            >>> import paddle.fluid as fluid
+            >>> train_program = fluid.Program()
+            >>> startup_program = fluid.Program()
+            >>> with fluid.program_guard(train_program, startup_program):
+            >>>     img = fluid.layers.data(name='image', shape=[784])
+            >>>     hidden = fluid.layers.fc(input=img, size=200, act='relu')
+            >>>     hidden = fluid.layers.dropout(hidden, dropout_prob=0.5)
+            >>>     loss = fluid.layers.cross_entropy(
+            >>>                 input=fluid.layers.fc(hidden, size=10, act='softmax'),
+            >>>                 label=fluid.layers.data(name='label', shape=[1], dtype='int64'))
+            >>>
+            >>> test_program = train_program.clone(for_test=True)
+            >>>
+            >>> sgd = fluid.optimizer.SGD(learning_rate=1e-3)
+            >>> with fluid.program_guard(train_program, startup_program):
+            >>>     sgd.minimize(loss)
+
+            2. The :code:`clone` method can be avoid if you create program for
+            training and program for testing individually.
+
+            >>> import paddle.fluid as fluid
+            >>>
+            >>> def network(is_test):
+            >>>     img = fluid.layers.data(name='image', shape=[784])
+            >>>     hidden = fluid.layers.fc(input=img, size=200, act='relu')
+            >>>     hidden = fluid.layers.dropout(hidden, dropout_prob=0.5, is_test=is_test)
+            >>>     loss = fluid.layers.cross_entropy(
+            >>>                 input=fluid.layers.fc(hidden, size=10, act='softmax'),
+            >>>                 label=fluid.layers.data(name='label', shape=[1], dtype='int64'))
+            >>>     return loss
+            >>>
+            >>> train_program = fluid.Program()
+            >>> startup_program = fluid.Program()
+            >>> test_program = fluid.Program()
+            >>>
+            >>> with fluid.program_guard(train_program, startup_program):
+            >>>     with fluid.unique_name.guard():
+            >>>         loss = network(is_test=False)
+            >>>         sgd = fluid.optimizer.SGD(learning_rate=1e-3)
+            >>>         sgd.minimize(loss)
+            >>>
+            >>> # the test startup program is not used.
+            >>> with fluid.program_guard(test_program, fluid.Program()):
+            >>>     with fluid.unique_name.guard():
+            >>>         loss = network(is_test=True)
+
+            The two code snippets above will generate same programs.
         """
         if for_test:
             p = self.inference_optimize()
@@ -1168,6 +1464,21 @@ class Program(object):
         return p
 
     def prune(self, targets):
+        """
+        Prune operators and variables which are not needed to generate
+        :code:`targets`.
+
+        Notes: This is a very low level API. Users should not use this API
+        directly. This API is in flux and not stable.
+
+        Args:
+            targets(list|Variable|Operator): A list of variables or operators
+                need to be pruned
+
+        Returns:
+            Program:  A new, pruned program.
+
+        """
         if not isinstance(targets, list):
             targets = [targets]
         targets_idx = []
@@ -1202,6 +1513,17 @@ class Program(object):
         return res
 
     def inference_optimize(self):
+        """
+        This method will create a new program and change the :code:`is_test`
+        attribute of operators to :code:`True`. All the :code:`Parameter`
+        information will be lost.
+
+        Notes: This API is a very low level API. Use
+        :code:`Program.clone(for_test=True)` instead.
+
+        Returns:
+            Program: The new program.
+        """
         # this is an alternative implement before
         # core.inference_optimize being fixed.
         res = Program()
@@ -1218,6 +1540,18 @@ class Program(object):
 
     @staticmethod
     def parse_from_string(binary_str):
+        """
+        Deserialize a program desc from protobuf binary string.
+
+        Notes: All information about parameters will be lost after serialization
+        and deserialization.
+
+        Args:
+            binary_str(str): The binary prootbuf string.
+
+        Returns:
+            Program: A deserialized program desc.
+        """
         p = Program()
         p.desc = core.ProgramDesc(binary_str)
         p.blocks = [Block(p, i) for i in xrange(p.desc.num_blocks())]
@@ -1226,10 +1560,19 @@ class Program(object):
 
     @property
     def random_seed(self):
+        """
+        The default random seed for random operators in Program. Zero means get
+        the random seed from random device.
+
+        Notes: It must be set before the operators have been added.
+        """
         return self._seed
 
     @property
     def num_blocks(self):
+        """
+        The number of blocks in this program.
+        """
         return self.desc.num_blocks()
 
     @random_seed.setter
@@ -1242,15 +1585,40 @@ class Program(object):
         return str(self)
 
     def global_block(self):
+        """
+        Get the first block of this program.
+        """
         return self.blocks[0]
 
     def block(self, index):
+        """
+        Get the :code:`index` block of this program
+        Args:
+            index(int): The index of block to get
+
+        Returns:
+            Block: The :code:`index` block
+        """
         return self.blocks[index]
 
     def current_block(self):
+        """
+        Get the current block. The :code:`current` block is the block to append
+        operators.
+        """
         return self.blocks[self.current_block_idx]
 
     def create_block(self, parent_idx=None):
+        """
+        Create a new block with the :code:`parent_idx` and change the current block
+        to new block.
+
+        Args:
+            parent_idx(int): The parent block index.
+
+        Returns:
+            Block: The new block.
+        """
         new_block_idx = len(self.blocks)
         parent = self.current_block() if parent_idx is None else self.block(
             parent_idx)
@@ -1260,9 +1628,24 @@ class Program(object):
         return self.current_block()
 
     def rollback(self):
+        """
+        Exit a code block, i.e., roll back to the parent block.
+        Returns:
+            None
+        """
         self.current_block_idx = self.current_block().parent_idx
 
     def sync_with_cpp(self):
+        """
+        Synchronize Python instance to its binding C++ object instance.
+        If the program is modified in C++ space, this method should be invoked.
+
+        Notes: This is a very low level API. Users should not invoke it
+        directly.
+
+        Returns:
+            None
+        """
         for block_idx in range(len(self.blocks), self.desc.num_blocks()):
             self.blocks.append(Block(self, block_idx))
         for block in self.blocks:
@@ -1272,6 +1655,9 @@ class Program(object):
         """
         Copy the information of parameters from other program.
 
+        Notes: This is a very low level API. Users should not invoke it
+        directly.
+
         Args:
             other(Program): Other program
 
@@ -1291,6 +1677,9 @@ class Program(object):
         """
         Copy the information of data variables from other program.
 
+        Notes: This is a very low level API. Users should not invoke it
+        directly.
+
         Args:
             other(Program): Other program
 
@@ -1309,12 +1698,41 @@ class Program(object):
                 self.global_block().var(var.name).is_data = True
 
     def list_vars(self):
+        """
+        Get all variables from this Program. A iterable object is returned.
+
+        Returns:
+            iterable: The generator will yield every variable in this program.
+        """
         for each_block in self.blocks:
             for each_var in each_block.vars.itervalues():
                 yield each_var
 
 
 class Parameter(Variable):
+    """
+    Parameter is derived from Variable. A parameter is a persistable 
+    Variable, and will be updated by optimizers after each iteration.
+    The training of a neural network is essentially the updating of 
+    its parameters.
+
+    Relative to a general Variable, a Parameter has several its own
+    member variables:
+
+    Args:
+        trainable(bool): True if the parameter need to be updated after
+            iterations.
+        optimize_attr(map): Parameter attributes related with optimizing.
+            Currently, it only contains 'learning_rate'.
+            Default: {'learning_rate': 1.0}
+        regularizer(WeightDecayRegularizer): The Regularizer which will
+            be applied on the parameter. Default: None
+        gradient_clip_attr(BaseGradientClipAttr): The gradint clip strategy
+            which will be applied on the parameter. Default: None
+        do_model_average(bool): True if the model average strategy will
+            be applied on this parameter.
+    """
+
     def __init__(self, block, shape, dtype, **kwargs):
         if shape is None or dtype is None:
             raise ValueError("Parameter must set shape and dtype")
@@ -1377,8 +1795,15 @@ _startup_program_ = Program()
 
 def default_startup_program():
     """
-    Get default startup program. In startup program, Paddle will initialize
-    parameters, initialize nccl handle, etc.
+    Get default/global startup program.
+
+    The layer function in :code:`fluid.layers` will create parameters, readers,
+    NCCL handles as global variables. The :code:`startup_program` will
+    initialize them by the operators in startup program. The layer function will
+    append these initialization operators into startup program.
+
+    This method will return the :code:`default` or the :code:`current` startup
+    program. Users can use :code:`fluid.program_guard` to switch program.
 
     Returns:
         Program: startup program
@@ -1388,7 +1813,15 @@ def default_startup_program():
 
 def default_main_program():
     """
-    Get default main program. The main program is used for training or testing.
+    Get default/global main program. The main program is used for training or
+    testing.
+
+    All layer function in :code:`fluid.layers` will append operators and
+    variables to the :code:`default_main_program`.
+
+    The :code:`default_main_program` is the default program in a lot of APIs.
+    For example, the :code:`Executor.run()` will execute the
+    :code:`default_main_program` when the program is not specified.
 
     Returns:
         Program: main program
@@ -1430,20 +1863,34 @@ def switch_startup_program(program):
 @contextlib.contextmanager
 def program_guard(main_program, startup_program=None):
     """
-    Switch program with `with` statement
+    Change the global main program and startup program with `with` statement.
+    Layer functions in the Python `with` block will append operators and
+    variables to the new main programs.
 
     Examples:
-        >>> with program_guard(Program()):
-        >>>   data = fluid.layers.data(...)
-        >>>   hidden = fluid.layers.fc(...)
+
+        >>> import paddle.fluid as fluid
+        >>> main_program = fluid.Program()
+        >>> startup_program = fluid.Program()
+        >>> with fluid.program_guard(main_program, startup_program):
+        >>>     data = fluid.layers.data(...)
+        >>>     hidden = fluid.layers.fc(...)
+
+    Notes: The temporary :code:`Program` can be used if the user does not need
+    to construct either of startup program or main program.
+
+    Examples:
+
+        >>> import paddle.fluid as fluid
+        >>> main_program = fluid.Program()
+        >>> # does not care about startup program. Just pass a temporary value.
+        >>> with fluid.program_guard(main_program, fluid.Program()):
+        >>>     data = ...
 
     Args:
-        main_program(Program): New main program inside `with` statement
+        main_program(Program): New main program inside `with` statement.
         startup_program(Program): New startup program inside `with` statement.
             None means do not change startup program.
-
-    Returns:
-        None
     """
     if not isinstance(main_program, Program):
         raise TypeError("main_program should be Program")
@@ -1460,7 +1907,8 @@ def program_guard(main_program, startup_program=None):
 
 def get_var(name, program=None):
     """
-    Get a variable by name from the global block of a program
+    Get a variable by name from the global block of a program.
+    
     Args:
         name(str): name of the variable
         program(Program|None): program object.
diff --git a/python/paddle/fluid/lod_tensor.py b/python/paddle/fluid/lod_tensor.py
index 61be39c25912604f842ef8a9a6ec5f0d1cf70257..c417ab393fca88d476d2f1fe83d12f99271d6883 100644
--- a/python/paddle/fluid/lod_tensor.py
+++ b/python/paddle/fluid/lod_tensor.py
@@ -19,33 +19,41 @@ __all__ = ['create_lod_tensor', 'create_random_int_lodtensor']
 
 
 def create_lod_tensor(data, lod, place):
-    """Create a lod tensor from a numpy array, a list, or an existing lod tensor.
+    """
+    Create a lod tensor from a numpy array, a list, or an existing lod tensor.
 
     Create a lod tensor by doing the following:
+
     1. Check that the length-based input lod is valid.
+
     2. Convert the length-based lod to a offset-based LoD.
-    3. Copy the data from a numpy array, a list or a existing lod tensor to 
+
+    3. Copy the data from a numpy array, a list or a existing lod tensor to
        CPU or GPU device (based on input place).
+
     4. Set the level of detail (LoD) using the offset-based LoD.
     
-    Use example:
-    Suppose we want LoDTensor to hold data for sequences of word, where each word is
-    represented by an integer. If we want to create a LoDTensor to represent two 
-    sentences, one of 2 words, and one of 3 words. 
+    Examples:
 
-    Then 'data' can be a numpy array of integers with shape (5, 1).
-    'lod' will be [[2, 3]], indicating the length(# of words) in each sentence.
-    This length-based input lod [[2, 3]] will be converted to offset-based lod [[0, 2, 5]]
-    inside the function call.
+        Suppose we want LoDTensor to hold data for sequences of word, where each
+        word is represented by an integer. If we want to create a LoDTensor to
+        represent two  sentences, one of 2 words, and one of 3 words.
 
-    Please refer to 
-    github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/lod_tensor.md
-    for more details regarding LoD.
+        Then :code:`data` can be a numpy array of integers with shape (5, 1).
+        :code:`lod` will be [[2, 3]], indicating the length(# of words) in each
+        sentence. This length-based input lod [[2, 3]] will be converted to
+        offset-based lod [[0, 2, 5]] inside the function call.
+
+    Please reference :ref:`api_guide_low_level_lod_tensor` for more details
+    regarding LoD.
 
     Args:
-        data: a numpy array or a LoDTensor or a list holding the data to be copied.
-        lod: a list of lists indicating the length-based LoD info specified by the user. 
-        place: CPU or GPU place indicating where the data in the new LoDTensor will be stored.
+        data(numpy.ndarray|list|LoDTensor): a numpy array or a LoDTensor or a
+            list holding the data to be  copied.
+        lod(list): a list of lists indicating the length-based LoD info
+            specified by the user.
+        place(Place): CPU or GPU place indicating where the data in the new
+            LoDTensor will be stored.
 
     Returns:
         A fluid LoDTensor object with tensor data and lod info.
@@ -77,31 +85,38 @@ def create_lod_tensor(data, lod, place):
 
 
 def create_random_int_lodtensor(lod, base_shape, place, low, high):
-    """Create a LoDTensor containing random integers.
+    """
+    Create a LoDTensor containing random integers.
 
-    This function is frequently used in the book examples. So we revised it based on 
-    the new create_lod_tensor API and put it here in the lod_tensor module to simplify 
-    the code. 
+    This function is frequently used in the book examples. So we revised it
+    based on the new create_lod_tensor API and put it here in the lod_tensor
+    module to simplify the code.
 
     The function does the following:
-    1. Calculate the overall shape of the LoDTensor based on the length-based 'lod' input 
-    and the shape of the basic element in 'base_shape'.
+
+    1. Calculate the overall shape of the LoDTensor based on the length-based
+       :code:`lod` input and the shape of the basic element in
+       :code:`base_shape`.
+
     2. Create a numpy array of this shape.
+
     3. Create the LoDTensor using create_lod_tensor API.
 
-    Suppose we want LoDTensor to hold data for sequences of word, where each word is
-    represented by an integer. If we want to create a LoDTensor to represent two 
-    sentences, one of 2 words, and one of 3 words. Then 'base_shape' is [1], input 
-    length-based 'lod' is [[2, 3]]. Then the overall shape of the LoDTensor would be 
-    [5, 1], holding 5 words for two sentences. 
+    Suppose we want LoDTensor to hold data for sequences of word, where each
+    word is represented by an integer. If we want to create a LoDTensor to
+    represent two sentences, one of 2 words, and one of 3 words. Then
+    'base_shape' is [1], input length-based 'lod' is [[2, 3]]. Then the overall
+    shape of the LoDTensor would be [5, 1], holding 5 words for two sentences.
 
     Args:
-        data: a numpy array or a LoDTensor holding the data to be copied.
-        lod: a list of lists indicating the length-based LoD info specified by the user.
-        base_shape: the shape of the basic element to be held by the LoDTensor. 
-        place: CPU or GPU place indicating where the data in the new LoDTensor will be stored.
-        low: the lower bound of the random integers.
-        high: the upper bound of the random integers.
+        lod(list): a list of lists indicating the length-based LoD info
+            specified by the user.
+        base_shape(list): the shape of the basic element to be held by the
+            LoDTensor.
+        place(Place): CPU or GPU place indicating where the data in the new
+            LoDTensor will be stored.
+        low(int): the lower bound of the random integers.
+        high(int): the upper bound of the random integers.
 
     Returns:
         A fluid LoDTensor object with tensor data and lod info. 
diff --git a/python/paddle/fluid/metrics.py b/python/paddle/fluid/metrics.py
index bb9c6fdc60089fc2b43573a6421a6f9781d2d4a8..572475b483ff0341a97a91b6c5309fcf337dacbe 100644
--- a/python/paddle/fluid/metrics.py
+++ b/python/paddle/fluid/metrics.py
@@ -325,14 +325,14 @@ class Auc(MetricBase):
     """
 
     def __init__(self, name, curve='ROC', num_thresholds=200):
-        super(MetricBase, self).__init__(name, curve, num_thresholds)
+        super(Auc, self).__init__(name=name)
         self._curve = curve
         self._num_thresholds = num_thresholds
         self._epsilon = 1e-6
-        self.tp_list = np.ndarray((num_thresholds, ))
-        self.fn_list = np.ndarray((num_thresholds, ))
-        self.tn_list = np.ndarray((num_thresholds, ))
-        self.fp_list = np.ndarray((num_thresholds, ))
+        self.tp_list = np.zeros((num_thresholds, ))
+        self.fn_list = np.zeros((num_thresholds, ))
+        self.tn_list = np.zeros((num_thresholds, ))
+        self.fp_list = np.zeros((num_thresholds, ))
 
     def update(self, labels, predictions, axis=1):
         if not _is_numpy_(labels):
@@ -350,12 +350,12 @@ class Auc(MetricBase):
             tp, fn, tn, fp = 0, 0, 0, 0
             for i, lbl in enumerate(labels):
                 if lbl:
-                    if predictions[i, 0] >= thresh:
+                    if predictions[i, 1] >= thresh:
                         tp += 1
                     else:
                         fn += 1
                 else:
-                    if predictions[i, 0] >= thresh:
+                    if predictions[i, 1] >= thresh:
                         fp += 1
                     else:
                         tn += 1
diff --git a/python/paddle/fluid/nets.py b/python/paddle/fluid/nets.py
index bbedf6fde0872fd32d81c103bf5fe61449b7f57b..9b3f2aebee73e56ee820dc8ff4c9cfabd1456aaa 100644
--- a/python/paddle/fluid/nets.py
+++ b/python/paddle/fluid/nets.py
@@ -26,16 +26,87 @@ def simple_img_conv_pool(input,
                          filter_size,
                          pool_size,
                          pool_stride,
-                         act,
-                         param_attr=None,
+                         pool_padding=0,
                          pool_type='max',
+                         global_pooling=False,
+                         conv_stride=1,
+                         conv_padding=0,
+                         conv_dilation=1,
+                         conv_groups=1,
+                         param_attr=None,
+                         bias_attr=None,
+                         act=None,
                          use_cudnn=True,
                          use_mkldnn=False):
+    """
+    The simple_img_conv_pool is composed with one Convolution2d and one Pool2d.
+
+    Args:
+        input (Variable): The input image with [N, C, H, W] format.
+        num_filters(int): The number of filter. It is as same as the output
+            feature channel.
+        filter_size (int|list|tuple): The filter size. If filter_size is a list or
+            tuple, it must contain two integers, (filter_size_H, filter_size_W). Otherwise,
+            the filter_size_H = filter_size_W = filter_size.
+        pool_size (int|list|tuple): The pooling size of Pool2d layer. If pool_size
+            is a list or tuple, it must contain two integers, (pool_size_H, pool_size_W).
+            Otherwise, the pool_size_H = pool_size_W = pool_size.
+        pool_stride (int|list|tuple): The pooling stride of Pool2d layer. If pool_stride
+            is a list or tuple, it must contain two integers, (pooling_stride_H, pooling_stride_W).
+            Otherwise, the pooling_stride_H = pooling_stride_W = pool_stride.
+        pool_padding (int|list|tuple): The padding of Pool2d layer. If pool_padding is a list or
+            tuple, it must contain two integers, (pool_padding_H, pool_padding_W).
+            Otherwise, the pool_padding_H = pool_padding_W = pool_padding. Default 0.
+        pool_type (str): Pooling type can be :math:`max` for max-pooling and :math:`avg` for
+            average-pooling. Default :math:`max`.
+        global_pooling (bool): Whether to use the global pooling. If global_pooling = true,
+            pool_size and pool_padding while be ignored. Default False
+        conv_stride (int|list|tuple): The stride size of the Conv2d Layer. If stride is a
+            list or tuple, it must contain two integers, (conv_stride_H, conv_stride_W). Otherwise,
+            the conv_stride_H = conv_stride_W = conv_stride. Default: conv_stride = 1.
+        conv_padding (int|list|tuple): The padding size of the Conv2d Layer. If padding is
+            a list or  tuple, it must contain two integers, (conv_padding_H, conv_padding_W).
+            Otherwise, the conv_padding_H = conv_padding_W = conv_padding. Default: conv_padding = 0.
+        conv_dilation (int|list|tuple): The dilation size of the Conv2d Layer. If dilation is
+            a list or tuple, it must contain two integers, (conv_dilation_H, conv_dilation_W).
+            Otherwise, the conv_dilation_H = conv_dilation_W = conv_dilation. Default: conv_dilation = 1.
+        conv_groups (int): The groups number of the Conv2d Layer. According to grouped
+            convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
+            the first half of the filters is only connected to the first half
+            of the input channels, while the second half of the filters is only
+            connected to the second half of the input channels. Default: groups=1
+        param_attr (ParamAttr): The parameters to the Conv2d Layer. Default: None
+        bias_attr (ParamAttr): Bias parameter for the Conv2d layer. Default: None
+        act (str): Activation type for Conv2d. Default: None
+        use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn
+            library is installed. Default: True
+        use_mkldnn (bool): Use mkldnn kernels or not, it is valid only when compiled
+            with mkldnn library. Default: False
+
+    Return:
+        Variable: The result of input after Convolution2d and Pool2d.
+
+    Examples:
+        .. code-block:: python
+
+            img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
+            conv_pool = fluid.nets.simple_img_conv_pool(input=img,
+                                                        filter_size=5,
+                                                        num_filters=20,
+                                                        pool_size=2,
+                                                        pool_stride=2,
+                                                        act="relu")
+    """
     conv_out = layers.conv2d(
         input=input,
         num_filters=num_filters,
         filter_size=filter_size,
+        stride=conv_stride,
+        padding=conv_padding,
+        dilation=conv_dilation,
+        groups=conv_groups,
         param_attr=param_attr,
+        bias_attr=bias_attr,
         act=act,
         use_cudnn=use_cudnn,
         use_mkldnn=use_mkldnn)
@@ -45,6 +116,8 @@ def simple_img_conv_pool(input,
         pool_size=pool_size,
         pool_type=pool_type,
         pool_stride=pool_stride,
+        pool_padding=pool_padding,
+        global_pooling=global_pooling,
         use_cudnn=use_cudnn,
         use_mkldnn=use_mkldnn)
     return pool_out
@@ -60,11 +133,65 @@ def img_conv_group(input,
                    conv_with_batchnorm=False,
                    conv_batchnorm_drop_rate=0.0,
                    pool_stride=1,
-                   pool_type=None,
+                   pool_type="max",
                    use_cudnn=True,
                    use_mkldnn=False):
     """
-    Image Convolution Group, Used for vgg net.
+    The Image Convolution Group is composed of Convolution2d, BatchNorm, DropOut,
+    and Pool2d. According to the input arguments, img_conv_group will do serials of
+    computation for Input using Convolution2d, BatchNorm, DropOut, and pass the last
+    result to Pool2d.
+
+    Args:
+        input (Variable): The input image with [N, C, H, W] format.
+        conv_num_filter(list|tuple): Indicates the numbers of filter of this group.
+        pool_size (int|list|tuple): The pooling size of Pool2d Layer. If pool_size
+            is a list or tuple, it must contain two integers, (pool_size_H, pool_size_W).
+            Otherwise, the pool_size_H = pool_size_W = pool_size.
+        conv_padding (int|list|tuple): The padding size of the Conv2d Layer. If padding is
+            a list or tuple, its length must be equal to the length of conv_num_filter.
+            Otherwise the conv_padding of all Conv2d Layers are the same. Default 1.
+        conv_filter_size (int|list|tuple): The filter size. If filter_size is a list or
+            tuple, its length must be equal to the length of conv_num_filter.
+            Otherwise the conv_filter_size of all Conv2d Layers are the same. Default 3.
+        conv_act (str): Activation type for Conv2d Layer that is not followed by BatchNorm.
+            Default: None.
+        param_attr (ParamAttr): The parameters to the Conv2d Layer. Default: None
+        conv_with_batchnorm (bool|list): Indicates whether to use BatchNorm after Conv2d Layer.
+            If conv_with_batchnorm is a list, its length must be equal to the length of
+            conv_num_filter. Otherwise, conv_with_batchnorm indicates whether all the
+            Conv2d Layer follows a BatchNorm. Default False.
+        conv_batchnorm_drop_rate (float|list): Indicates the drop_rate of Dropout Layer
+            after BatchNorm. If conv_batchnorm_drop_rate is a list, its length must be
+            equal to the length of conv_num_filter. Otherwise, drop_rate of all Dropout
+            Layers is conv_batchnorm_drop_rate. Default 0.0.
+        pool_stride (int|list|tuple): The pooling stride of Pool2d layer. If pool_stride
+            is a list or tuple, it must contain two integers, (pooling_stride_H,
+            pooling_stride_W). Otherwise, the pooling_stride_H = pooling_stride_W = pool_stride.
+            Default 1.
+        pool_type (str): Pooling type can be :math:`max` for max-pooling and :math:`avg` for
+            average-pooling. Default :math:`max`.
+        use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn
+            library is installed. Default: True
+        use_mkldnn (bool): Use mkldnn kernels or not, it is valid only when compiled
+            with mkldnn library. Default: False
+
+    Return:
+        Variable: The final result after serial computation using Convolution2d,
+            BatchNorm, DropOut, and Pool2d.
+
+    Examples:
+        .. code-block:: python
+
+            img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
+            conv_pool = fluid.nets.img_conv_group(input=img,
+                                                  num_channels=3,
+                                                  conv_padding=1,
+                                                  conv_num_filter=[3, 3],
+                                                  conv_filter_size=3,
+                                                  conv_act="relu",
+                                                  pool_size=2,
+                                                  pool_stride=2)
     """
     tmp = input
     assert isinstance(conv_num_filter, list) or \
@@ -74,6 +201,7 @@ def img_conv_group(input,
         if not hasattr(obj, '__len__'):
             return [obj] * len(conv_num_filter)
         else:
+            assert len(obj) == len(conv_num_filter)
             return obj
 
     conv_padding = __extend_list__(conv_padding)
@@ -119,6 +247,39 @@ def sequence_conv_pool(input,
                        param_attr=None,
                        act="sigmoid",
                        pool_type="max"):
+    """
+    The sequence_conv_pool is composed with Sequence Convolution and Pooling.
+
+    Args:
+        input (Variable): The input of sequence_conv, which supports variable-time
+            length input sequence. The underlying of input is a matrix with shape
+            (T, N), where T is the total time steps in this mini-batch and N is
+            the input_hidden_size
+        num_filters(int): The number of filter.
+        filter_size (int): The filter size.
+        param_attr (ParamAttr): The parameters to the Sequence_conv Layer. Default: None.
+        act (str): Activation type for Sequence_conv Layer. Default: "sigmoid".
+        pool_type (str): Pooling type can be :math:`max` for max-pooling, :math:`average` for
+            average-pooling, :math:`sum` for sum-pooling, :math:`sqrt` for sqrt-pooling.
+            Default :math:`max`.
+
+    Return:
+        Variable: The final result after Sequence Convolution and Pooling.
+
+    Examples:
+        .. code-block:: python
+
+            input_dim = len(word_dict)
+            emb_dim = 128
+            hid_dim = 512
+            data = fluid.layers.data( ame="words", shape=[1], dtype="int64", lod_level=1)
+            emb = fluid.layers.embedding(input=data, size=[input_dim, emb_dim], is_sparse=True)
+            seq_conv = fluid.nets.sequence_conv_pool(input=emb,
+                                                     num_filters=hid_dim,
+                                                     filter_size=3,
+                                                     act="tanh",
+                                                     pool_type="sqrt")
+    """
     conv_out = layers.sequence_conv(
         input=input,
         num_filters=num_filters,
@@ -132,9 +293,9 @@ def sequence_conv_pool(input,
 
 def glu(input, dim=-1):
     """
-    The gated linear unit composed by split, sigmoid activation and elementwise
-    multiplication. Specifically, Split the input into two equal sized parts
-    :math:`a` and :math:`b` along the given dimension and then compute as
+    The Gated Linear Units(GLU) composed by split, sigmoid activation and element-wise
+    multiplication. Specifically, Split the input into two equal sized parts,
+    :math:`a` and :math:`b`, along the given dimension and then compute as
     following:
 
         .. math::
@@ -147,16 +308,16 @@ def glu(input, dim=-1):
     Args:
         input (Variable): The input variable which is a Tensor or LoDTensor.
         dim (int): The dimension along which to split. If :math:`dim < 0`, the
-            dimension to split along is :math:`rank(input) + dim`.
+            dimension to split along is :math:`rank(input) + dim`. Default -1.
 
     Returns:
-        Variable: The Tensor variable with half the size of input.
+        Variable: Variable with half the size of input.
 
     Examples:
         .. code-block:: python
 
-            # x is a Tensor variable with shape [3, 6, 9]
-            fluid.nets.glu(input=x, dim=1)  # shape of output: [3, 3, 9]
+            data = fluid.layers.data(name="words", shape=[3, 6, 9], dtype="float32")
+            output = fluid.nets.glu(input=data, dim=1)  # shape of output: [3, 3, 9]
     """
 
     a, b = layers.split(input, num_or_sections=2, dim=dim)
@@ -189,40 +350,48 @@ def scaled_dot_product_attention(queries,
     <https://arxiv.org/pdf/1706.03762.pdf>`_.
 
     Args:
-
         queries (Variable): The input variable which should be a 3-D Tensor.
         keys (Variable): The input variable which should be a 3-D Tensor.
         values (Variable): The input variable which should be a 3-D Tensor.
         num_heads (int): Head number to compute the scaled dot product
-                         attention. Default value is 1.
+            attention. Default: 1.
         dropout_rate (float): The dropout rate to drop the attention weight.
-                              Default value is 0.
+            Default: 0.0.
 
     Returns:
-
-        Variable: A 3-D Tensor computed by multi-head scaled dot product \
-                  attention.
+        Variable: A 3-D Tensor computed by multi-head scaled dot product\
+            attention.
 
     Raises:
-
         ValueError: If input queries, keys, values are not 3-D Tensors.
 
-    NOTE:
+    NOTES:
         1. When num_heads > 1, three linear projections are learned respectively
-        to map input queries, keys and values into queries', keys' and values'.
-        queries', keys' and values' have the same shapes with queries, keys
-        and values.
-
-        1. When num_heads == 1, scaled_dot_product_attention has no learnable
-        parameters.
+           to map input queries, keys and values into queries', keys' and values'.
+           queries', keys' and values' have the same shapes with queries, keys
+           and values.
+        2. When num_heads == 1, scaled_dot_product_attention has no learnable
+           parameters.
 
     Examples:
         .. code-block:: python
 
-            # Suppose q, k, v are Tensors with the following shape:
-            # q: [3, 5, 9], k: [3, 6, 9], v: [3, 6, 10]
-
-            contexts = fluid.nets.scaled_dot_product_attention(q, k, v)
+            queries = fluid.layers.data(name="queries",
+                                        shape=[3, 5, 9],
+                                        dtype="float32",
+                                        append_batch_size=False)
+            queries.stop_gradient = False
+            keys = fluid.layers.data(name="keys",
+                                     shape=[3, 6, 9],
+                                     dtype="float32",
+                                     append_batch_size=False)
+            keys.stop_gradient = False
+            values = fluid.layers.data(name="values",
+                                       shape=[3, 6, 10],
+                                       dtype="float32",
+                                       append_batch_size=False)
+            values.stop_gradient = False
+            contexts = fluid.nets.scaled_dot_product_attention(queries, keys, values)
             contexts.shape  # [3, 5, 10]
     """
     if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3):
diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py
index 0fdc9a035292b3390cece6c5821a60b1b281e54d..25cc1355d5a53e44b7f45c1f7d80673abcf567ec 100644
--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
@@ -27,6 +27,40 @@ BuildStrategy = core.ParallelExecutor.BuildStrategy
 
 
 class ParallelExecutor(object):
+    """
+    ParallelExecutor can run program in parallel.
+
+    Args:
+        use_cuda (bool): Whether to use CUDA or not.
+        loss_name (str): The loss name must set in training. Default None.
+        main_program (Program): The program that need to run, if not provided,
+            then default_main_program will be used. Default None.
+        share_vars_from(ParallelExecutor): If provied, it will share variables
+            from the specified ParallelExecutor. Default None.
+        num_trainers(int): If greater than 1, NCCL will be initialized with
+            multiple rank of nodes, each node should have same number of GPUs.
+            Distributed training will be enabled then. Default 1.
+        trainer_id(int: Must use together with num_trainers. trainer_id is the
+            "rank" of current node starts from 0. Default 0.
+
+    Returns:
+        ParallelExecutor: The initialized ParallelExecutor object.
+
+    Raises:
+        TypeError: If share_vars_from is provided, but not ParallelExecutor object.
+
+    Examples:
+        .. code-block:: python
+
+          train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=loss.name)
+          test_exe = fluid.ParallelExecutor(use_cuda=True,
+                                            main_program=test_program,
+                                            share_vars_from=train_exe)
+
+          train_loss, = train_exe.run([loss.name], feed=feed_dict)
+          test_loss, = test_exe.run([loss.name], feed=feed_dict)
+    """
+
     def __init__(self,
                  use_cuda,
                  loss_name=None,
@@ -37,42 +71,6 @@ class ParallelExecutor(object):
                  num_trainers=1,
                  trainer_id=0,
                  **kwargs):
-        """
-        ParallelExecutor can run program in parallel.
-
-        Args:
-            use_cuda(bool): Whether to use CUDA or not.
-            loss_name(str, default None): The loss name must set in training.
-            main_program(Program, default None): The program that need to run,
-                if not provided, then default_main_program will be used.
-            share_vars_from(ParallelExecutor, default None): If provied,
-                it will share variables from the specified ParallelExecutor.
-            num_trainers(int, default 1): If greater than 1, NCCL will be
-                initialized with multpile rank of nodes, each node should have
-                same number of GPUs. Distributed training will be enabled then.
-            trainer_id(int, default 0): Must use together with num_trainers.
-                trainer_id is the "rank" of current node starts from 0.
-
-        Returns:
-            A ParallelExecutor object.
-
-        Raises:
-            TypeError: If share_vars_from is provided, but not ParallelExecutor
-                object.
-
-        Examples:
-            .. code-block:: python
-
-              train_exe = fluid.ParallelExecutor(
-                  use_cuda=True, loss_name=loss.name)
-              test_exe = fluid.ParallelExecutor(
-                  use_cuda=True,
-                  main_program=test_program,
-                  share_vars_from=train_exe)
-
-              train_loss, = train_exe.run([loss.name], feed=feed_dict)
-              test_loss, = test_exe.run([loss.name], feed=feed_dict)
-        """
         if len(kwargs) != 0:
             err_msg = ""
             for key in kwargs:
@@ -131,10 +129,16 @@ class ParallelExecutor(object):
         main = main_program
         main = main if main else framework.default_main_program()
         scope = executor.global_scope()
+        # FIXME(Yancey1989): it's a temporary approach to determinate the distribute
+        # train program, call self.bcast_param() at the end of each mini-batch.
+        self.is_dist = True if "recv" in [
+            op.type for op in main.global_block().ops
+        ] else False
 
         if share_vars_from and not isinstance(share_vars_from,
                                               ParallelExecutor):
             raise TypeError("share_vars_from must be ParallelExecutor.")
+
         local_scopes = share_vars_from.executor.local_scopes(
         ) if share_vars_from else []
 
@@ -166,12 +170,14 @@ class ParallelExecutor(object):
         element in the list will be copied to each device directly.
 
         For example, if the feed is a dict:
+
         >>> exe = ParallelExecutor()
         >>> # the image will be splitted into devices. If there is two devices
         >>> # each device will process an image with shape (24, 1, 28, 28)
         >>> exe.run(feed={'image': numpy.random.random(size=(48, 1, 28, 28))})
 
         For example, if the feed is a list:
+
         >>> exe = ParallelExecutor()
         >>> # each device will process each element in the list.
         >>> # the 1st device will process an image with shape (48, 1, 28, 28)
@@ -182,18 +188,40 @@ class ParallelExecutor(object):
         >>>               {"image": numpy.random.random(size=(32, 1, 28, 28))},
         >>>              ])
 
-
         Args:
             fetch_list(list): The fetched variable names
             feed(list|dict|None): The feed variables. If the feed is a dict,
                 tensors in that dict will be splitted into each devices. If
                 the feed is a list, each element of the list will be copied
-                to each device.
+                to each device. Default None.
             feed_dict: Alias for feed parameter, for backward compatibility.
-                This parameter is deprecated.
+                This parameter has been deprecated. Default None.
+
+        Returns:
+            List: The fetched result list.
+
+        Raises:
+            ValueError: If the feed is a list, but its length is not equal the
+                length of active places, or its element's is not dict.
+
+        NOTES:
+            1. If the feed's type is dict, the number of data that feeds to
+               ParallelExecutor must be bigger than active places. Otherwise,
+               it will throw exception from C++ side. Special attention should be
+               paid to check whether the last batch of the dataset is bigger
+               than active places.
+            2. If active places are more than one, the fetch results for each
+               variable is a list, and each element of this list is the variable of
+               respective active place.
 
-        Returns: fetched result list.
+        Examples:
+            .. code-block:: python
 
+                pe = fluid.ParallelExecutor(use_cuda=use_cuda,
+                                            loss_name=avg_cost.name,
+                                            main_program=fluid.default_main_program())
+                loss = pe.run(feed=feeder.feed(cur_batch),
+                              fetch_list=[avg_cost.name]))
         """
         if feed is None and feed_dict is not None:
             feed = feed_dict
@@ -238,9 +266,17 @@ class ParallelExecutor(object):
         fetch_var_name = '@FETCHED_VAR_NAME@'
         self.executor.run(fetch_list, fetch_var_name)
         arr = self.scope.find_var(fetch_var_name).get_lod_tensor_array()
+
+        if self.is_dist:
+            self.bcast_params()
+
         return [arr[i] for i in range(len(arr))]
 
     def bcast_params(self):
+        """
+        Broadcast the parameters to other devices. It is used during
+        distributed training.
+        """
         self.executor.bcast_params(set(self.persistable_vars))
 
     @property
diff --git a/python/paddle/fluid/param_attr.py b/python/paddle/fluid/param_attr.py
index 1c6970441bccdc1c1221503256c30c83502bd123..0a42b9fca8dba7a11b414990be6c04c93158864f 100644
--- a/python/paddle/fluid/param_attr.py
+++ b/python/paddle/fluid/param_attr.py
@@ -22,6 +22,35 @@ __all__ = [
 
 
 class ParamAttr(object):
+    """
+    Parameter attributes object. To fine-tuning network training process, user
+    can set parameter's attributes to control training details. Such as learning rate,
+    regularization, trainable, do_model_average and the method to initialize param.
+
+
+    Args:
+        name(str): The parameter's name. Default None.
+        initializer(Initializer): The method to initial this parameter. Default None.
+        learning_rate(float): The parameter's learning rate. The learning rate when
+            optimize is :math:`global\_lr * parameter\_lr * scheduler\_factor`.
+            Default 1.0.
+        regularizer(WeightDecayRegularizer): Regularization factor. Default None.
+        trainable(bool): Whether this parameter is trainable. Default True.
+        gradient_clip(BaseGradientClipAttr): The method to clip this parameter's
+            gradient. Default None.
+        do_model_average(bool): Whether this parameter should do model average.
+            Default False.
+
+    Examples:
+        .. code-block:: python
+
+            w_param_attrs = fluid.ParamAttr(name="fc_weight",
+                                            learning_rate=0.5,
+                                            regularizer=fluid.L2Decay(1.0),
+                                            trainable=True)
+            y_predict = fluid.layers.fc(input=x, size=10, param_attr=w_param_attrs)
+    """
+
     def __init__(self,
                  name=None,
                  initializer=None,
@@ -29,7 +58,7 @@ class ParamAttr(object):
                  regularizer=None,
                  trainable=True,
                  gradient_clip=None,
-                 do_model_average=None):
+                 do_model_average=False):
         self.name = name
         self.initializer = initializer
         self.learning_rate = learning_rate
@@ -39,6 +68,16 @@ class ParamAttr(object):
         self.model_average = do_model_average
 
     def set_default_initializer(self, initializer):
+        """
+        Set the default initializer, the initializer should be Constant,
+        Uniform, Normal, Xavier, MSRA.
+
+        Args:
+            initializer(Initializer): the initializer to set.
+
+        Returns:
+            None
+        """
         if initializer is None:
             if self.initializer is None:
                 raise ValueError("ParamAttr.initializer is not set")
@@ -50,13 +89,45 @@ class ParamAttr(object):
         self.initializer = initializer
 
     def set_default_param_initializer(self):
+        """
+        Set the default initializer for the parameter with Xavier.
+
+        Args:
+            None.
+
+        Returns:
+            None.
+        """
         self.set_default_initializer(Xavier())
 
     def set_default_bias_initializer(self):
+        """
+        Set the default initializer for the bias with Constant(0.0).
+
+        Args:
+            None.
+
+        Returns:
+            None.
+        """
         self.set_default_initializer(Constant(0.0))
 
     @staticmethod
     def to_attr(arg):
+        """
+        Create ParamAttr[s].
+
+        Args:
+            arg: Arguments to initialize ParamAttr[s]. arg's type can be
+                str, Initializer, float, WeightDecayRegularizer, BaseGradientClipAttr,
+                bool, ParamAttr, or a list of above type.
+
+        Returns:
+            ParamAttr[s]: ParamAttr[s] initialized with arg.
+
+        Raises:
+            arg can not initialize a ParamAttr.
+        """
         if arg is None:
             return ParamAttr()
         elif isinstance(arg, list) or isinstance(arg, tuple):
@@ -75,6 +146,15 @@ class ParamAttr(object):
             raise TypeError("{0} cast to ParamAttr".format(type(arg)))
 
     def to_kwargs(self, with_initializer=False):
+        """
+        Returns the attributes of this parameter.
+
+        Args:
+            with_initializer(bool): Whether to add initializer attr.
+
+        Returns:
+            Parameter attributes(map): The attributes of this parameter.
+        """
         kwargs = {
             'name': self.name,
             'optimize_attr': {
@@ -92,9 +172,27 @@ class ParamAttr(object):
 
 class WeightNormParamAttr(ParamAttr):
     """
-    Used for weight normalization. Any field in ParamAttr can also be set here.
-    Besides, an extra field dim can be set to indicate the dimension except
-    which to normalize.
+    Used for weight Norm. Weight Norm is a reparameterization of the weight vectors
+    in a neural network that decouples the length of those weight vectors from
+    their direction. Weight Norm has been implemented as discussed in this
+    paper: `Weight Normalization: A Simple Reparameterization to Accelerate
+    Training of Deep Neural Networks
+    <https://arxiv.org/pdf/1602.07868.pdf>`_.
+
+    Args:
+        dim(list): The parameter's name. Default None.
+        kwargs: Any field in ParamAttr. Default None.
+
+    Examples:
+        .. code-block:: python
+
+            data = fluid.layers.data(name="data", shape=[3, 32, 32], dtype="float32")
+            fc = fluid.layers.fc(input=data,
+                                 size=1000,
+                                 param_attr=WeightNormParamAttr(
+                                      dim=None,
+                                      name='weight_norm_param'))
+
     """
     # List to record the parameters reparameterized by weight normalization.
     # If these parameters are treated as Variable rather than Parameter,
diff --git a/python/paddle/fluid/recordio_writer.py b/python/paddle/fluid/recordio_writer.py
index 8d48e9abef0fb9861284c6302b30efb0e3994989..bd57772713057f12b876942de58ee43527e94834 100644
--- a/python/paddle/fluid/recordio_writer.py
+++ b/python/paddle/fluid/recordio_writer.py
@@ -36,6 +36,45 @@ def convert_reader_to_recordio_file(
         compressor=core.RecordIOWriter.Compressor.Snappy,
         max_num_records=1000,
         feed_order=None):
+    """
+    Convert a Python Reader to a recordio file.
+
+    Please see :ref:`api_guide_python_reader` and :ref:`api_guide_reader_op` for
+    details.
+
+    Examples:
+
+        >>> import paddle.fluid as fluid
+        >>> import paddle.dataset.mnist as mnist
+        >>> import paddle
+        >>>
+        >>> tmp_program = fluid.Program()
+        >>> with fluid.program_guard(tmp_program):
+        >>>     img = fluid.layers.data(name='img', shape=[784])
+        >>>     label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+        >>> feeder = fluid.DataFeeder(feed_list=[img, label], place=fluid.CPUPlace())
+        >>> # mnist.recordio will be generated in current directory
+        >>> fluid.recordio_writer.convert_reader_to_recordio_file(
+        >>>                     filename="mnist.recordio",
+        >>>                     reader_creator=paddle.batch(mnist.train(), batch_size=32),
+        >>>                     feeder=feeder)
+
+    Args:
+        filename(str): The recordio filename.
+        reader_creator(callable): The Python Reader Creator. See
+            :ref:`api_guide_python_reader`.
+        feeder(DataFeeder): The DataFeeder instance. Used to convert
+            :code:`reader_creator` to :code: `lod_tensor`
+        compressor: Must in fluid.core.RecordIOWriter.Compressor.Snappy or
+            fluid.core.RecordIOWriter.Compressor.NoCompress. Use :code:`Snappy`
+            by default.
+        max_num_records(int): Maximum number of records in one chuck. Each record
+            is each return value from reader function
+        feed_order(list): The order of variable names that the reader returns
+
+    Returns:
+        int: the number of record that saved.
+    """
     if feed_order is None:
         feed_order = feeder.feed_names
     counter = 0
@@ -58,6 +97,17 @@ def convert_reader_to_recordio_files(
         compressor=core.RecordIOWriter.Compressor.Snappy,
         max_num_records=1000,
         feed_order=None):
+    """
+    convert a python reader to many recordio files.
+
+    This API is basically same as :code:`convert_reader_to_recordio_file`,
+    instead of it will create many recordio files. Each file contains at
+    most :code:`batch_per_file` records.
+
+    Please reference
+    :ref:`api_fluid_recordio_writer_convert_reader_to_recordio_file` for more
+    details.
+    """
     if feed_order is None:
         feed_order = feeder.feed_names
     f_name, f_ext = os.path.splitext(filename)
diff --git a/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py b/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py
index 87c11e7880e73b911f21dda77c1cc2b4850b3591..b04f25ef874cc6204211a4f5f5991a0ec8c473dd 100644
--- a/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py
+++ b/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py
@@ -15,6 +15,7 @@
 import unittest
 import numpy as np
 from op_test import OpTest
+import paddle.fluid.core as core
 
 
 def bilinear_interp_np(input, out_h, out_w, out_size):
@@ -45,9 +46,9 @@ def bilinear_interp_np(input, out_h, out_w, out_size):
 
             out[:, :, i, j] = h2lambda*(w2lambda*input[:, :, h, w] +
                                         w1lambda*input[:, :, h, w+wid]) + \
-                              h1lambda*(w2lambda*input[:, :, h+hid, w] +
-                                        w1lambda*input[:, :, h+hid, w+wid])
-    return out.astype("float32")
+                h1lambda*(w2lambda*input[:, :, h+hid, w] +
+                          w1lambda*input[:, :, h+hid, w+wid])
+    return out.astype(input.dtype)
 
 
 class TestBilinearInterpOp(OpTest):
@@ -122,5 +123,44 @@ class TestCase6(TestBilinearInterpOp):
         self.out_size = np.array([65, 129]).astype("int32")
 
 
+class TestBilinearInterpOpUint8(OpTest):
+    def setUp(self):
+        self.out_size = None
+        self.init_test_case()
+        self.op_type = "bilinear_interp"
+        input_np = np.random.randint(
+            low=0, high=256, size=self.input_shape).astype("uint8")
+        output_np = bilinear_interp_np(input_np, self.out_h, self.out_w,
+                                       self.out_size)
+        self.inputs = {'X': input_np}
+        if self.out_size is not None:
+            self.inputs['OutSize'] = self.out_size
+        self.attrs = {'out_h': self.out_h, 'out_w': self.out_w}
+        self.outputs = {'Out': output_np}
+
+    def test_check_output(self):
+        self.check_output_with_place(place=core.CPUPlace(), atol=1)
+
+    def init_test_case(self):
+        self.input_shape = [1, 3, 9, 6]
+        self.out_h = 10
+        self.out_w = 9
+
+
+class TestCase1Uint8(TestBilinearInterpOpUint8):
+    def init_test_case(self):
+        self.input_shape = [2, 3, 128, 64]
+        self.out_h = 120
+        self.out_w = 50
+
+
+class TestCase2Uint8(TestBilinearInterpOpUint8):
+    def init_test_case(self):
+        self.input_shape = [4, 1, 7, 8]
+        self.out_h = 5
+        self.out_w = 13
+        self.out_size = np.array([6, 15]).astype("int32")
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py
index b4cb019aea347fcd2207829fe5b5b065159f7882..ad3872ab0d6ff82db12b436b8b729eead6a3e16d 100644
--- a/python/paddle/fluid/trainer.py
+++ b/python/paddle/fluid/trainer.py
@@ -33,23 +33,59 @@ __all__ = [
 
 
 class BeginEpochEvent(object):
+    """
+    The begin of a training epoch.
+
+    Args:
+        epoch_id(int): The current epoch ID.
+    """
+
     def __init__(self, epoch_id):
         self.epoch = epoch_id
 
 
 class EndEpochEvent(object):
+    """
+    The end of a training epoch.
+
+    Args:
+        epoch_id(int): The current epoch ID.
+    """
+
     def __init__(self, epoch_id):
         self.epoch = epoch_id
 
 
 class BeginStepEvent(object):
+    """
+    The begin of a training epoch.
+
+    Args:
+        epoch_id(int): The current epoch ID.
+        step_id(int): The current step ID.
+    """
+
     def __init__(self, epoch_id, step_id):
         self.epoch = epoch_id
         self.step = step_id
         self.fetch_metrics = True
+        """
+        If fetch_metrics is true, the metrics will be fetched at the 
+        EndStepEvent. Default is True.
+        """
 
 
 class EndStepEvent(object):
+    """
+    The end of a training step.
+
+    Args:
+        epoch_id(int): The current epoch ID.
+        step_id(int): The current step ID.
+        metrics(list): A list of fetched tensor. The order of this list is same
+            as the :code:`train_func` returns.
+    """
+
     def __init__(self, epoch_id, step_id, metrics):
         self.epoch = epoch_id
         self.step = step_id
@@ -57,6 +93,27 @@ class EndStepEvent(object):
 
 
 class CheckpointConfig(object):
+    """
+    Parameter object for :code:`fluid.io.save_checkpoint` and
+    :code:`fluid.Trainer`. Used to configuration how to save checkpoint.
+
+    Args:
+        checkpoint_dir(str): Directory path to save check point. Default is the
+            current directory.
+
+        max_num_checkpoints(int): The max number of local check points.
+        epoch_interval(int): Every number of epoch to save check point.
+        step_interval(int): Every number of step to save check point.
+
+    Examples:
+        >>> config = fluid.CheckpointConfig("./checkpoints")
+        >>> trainer = fluid.Trainer(train_func=train_program,
+        >>>                         place=place,
+        >>>                         optimizer_func=optimizer_func,
+        >>>                         checkpoint_config=config)
+        >>> trainer.train(...)
+    """
+
     def __init__(self,
                  checkpoint_dir=None,
                  max_num_checkpoints=3,
@@ -106,11 +163,62 @@ def check_and_get_place(place):
 
 class Trainer(object):
     """
+    A trainer wraps MultiGPU/MultiNode training loops and can be used to train a
+    simple neural network easily.
+
+    This API takes a :code:`train_func`. A :code:`train_func` is a function that
+    return loss as it first return value. The reset value can be fetched by
+    EndStepEvent.metrics
+
+    This API also takes a :code:`optimizer_func` that will return an optimizer
+    instance.
+
+    For example, to train a MLP for MNIST dataset, the sample program is
+
+    >>> import paddle.fluid as fluid
+    >>>
+    >>> def mlp(image, layer_sizes=[200, 100], activation="relu", num_classes=10):
+    >>>     hidden = image
+    >>>     for layer_size in layer_sizes:
+    >>>         hidden = fluid.layers.fc(input=hidden, size=layer_size, act=activation)
+    >>>     return fluid.layers.fc(input=hidden, size=num_classes, act="softmax")
+    >>>
+    >>> def train_mnist_mlp():
+    >>>     img = fluid.layers.data(name='image', shape=[784])
+    >>>     label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+    >>>     prediction = mlp(img)
+    >>>     return fluid.layers.mean(fluid.layers.cross_entropy(prediction, label))
+    >>>
+    >>> def optimizer():
+    >>>     return fluid.optimizer.Adam()
+    >>>
+    >>> trainer = Trainer(train_func=train_mnist_mlp,
+    >>>                   optimizer_func=optimizer,
+    >>>                   place=fluid.CUDAPlace(0),
+    >>>                   parallel=True)
+    >>>
+    >>> def train_callback(event):
+    >>>     if isinstance(event, fluid.EndStepEvent):
+    >>>         print "Epoch ID", event.epoch, "Step ID",\
+    >>>             event.step, "AvgLoss", event.metrics[0]
+    >>>     elif isinstance(event, fluid.EndEpochEvent):
+    >>>         trainer.save_params("./model_{0}".format(event.epoch))
+    >>>
+    >>> trainer.train(num_epochs=100, event_handler=train_callback)
+
+    For more example, please see :ref:`api_guide_high_level_api`.
+
 
     Args:
-        train_func(callable): A function which will return loss. The loss must be a scalar.
+        train_func(callable): A function which will return loss. The loss must be
+            a scalar tensor.
         optimizer_func(callable): A function that returns an Optimizer object.
-        place: The device place of this trainer.
+        place(CUDAPlace|CPUPlace): The device place of this trainer. If
+            :code:`parallel=True,` all CUDA Places will be used if :code:`place`
+            is a :code:`CUDAPlace`.
+        parallel(bool): True if use multiple devices.
+        checkpoint_config(CheckpointConfig): Configuration about how to save
+            checkpoints.
     """
 
     def __init__(self,
@@ -122,9 +230,6 @@ class Trainer(object):
                  checkpoint_config=None):
         self.__stop = False
         self.parallel = parallel
-        # 1. we need to generate a framework.Program by calling
-        # program_func. Reference: fluid.program_guard in
-        # test_word2vec.py
 
         # config for checkpoint
         # only chief worker will save variables
@@ -138,6 +243,10 @@ class Trainer(object):
 
         self.scope = core.Scope()
 
+        # 1. we need to generate a framework.Program by calling
+        # program_func. Reference: fluid.program_guard in
+        # test_word2vec.py
+
         self.startup_program = framework.Program()
         self.train_program = framework.Program()
 
@@ -280,17 +389,18 @@ class Trainer(object):
 
     def train(self, num_epochs, event_handler, reader=None, feed_order=None):
         """
-        Train the model.
+        Start the train loop to train the model.
 
         Args:
-            num_epochs: The number of epoch. An epoch will process all data in reader
-            event_handler: The event handler. A function with type (ev:Event)->void
-            reader:
-            feed_order: Feeding order of reader. None will following the defining
+            num_epochs(int): The number of epoch. An epoch will process all data in reader
+            event_handler(callable): The event handler. A function with type (ev:Event)->void
+            reader(callable): A reader creator object. See also
+                :ref:`api_guide_python_reader` .
+            feed_order(list): Feeding order of reader. None will following the defining
                 order in program
 
         Returns:
-
+            None
         """
         training_role = os.getenv("PADDLE_TRAINING_ROLE", "")
         if training_role == "PSERVER":
@@ -310,16 +420,24 @@ class Trainer(object):
         Test the model on given test data
 
         Args:
-            reader: The reader that yields test data.
-            feed_order: Feeding order of reader. None will following the defining
-                order in program
+            reader(callable): The reader that yields test data.
+            feed_order(list): Feeding order of reader. None will following the
+                defining order in program
         """
 
         return self._test_by_executor(reader, feed_order,
                                       self.train_func_outputs)
 
     def save_params(self, param_path):
-        # reference: save_persistables in io.py
+        """
+        Save all parameters into :code:`param_path`.
+
+        Args:
+            param_path(str): The path to save parameters.
+
+        Returns:
+            None
+        """
         with self._prog_and_scope_guard():
             exe = executor.Executor(self.place)
             io.save_persistables(exe, dirname=param_path)
diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py
index 5bbeeeaed666651300fdd2e29abed6a6062de44f..a3f0a4ffe28f145c33b10117f69f933f4b93be29 100644
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -12,14 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-Transpile the program to distributed data-parallelism programs.
-The main_program will be transformed to use a remote parameter server
-to do parameter optimization. And the optimization graph will be put
-into a parameter server program.
-
-Use different methods to split trainable variables to different
-parameter servers.
-
 Steps to transpile trainer:
 1. split variable to multiple blocks, aligned by product(dim[1:]) (width).
 2. rename splited grad variables to add trainer_id suffix ".trainer_%d".
@@ -117,129 +109,41 @@ def slice_variable(var_list, slice_count, min_block_size=8192):
     return blocks
 
 
-class DistributeTranspiler:
-    def _has_distributed_lookup_table(self):
-        # process lookup_table_op
-        # 1. check all lookup_table_op is distributed
-        # 2. check all lookup_table_op share the same table.
-        distributed_lookup_table_ops = []
-        # support only one distributed_lookup_table now
-        self.table_name = None
-        for op in self.origin_program.global_block().ops:
-            if op.type == LOOKUP_TABLE_TYPE:
-                if op.attrs['is_distributed'] is True:
-                    if self.table_name is None:
-                        self.table_name = op.input("W")[0]
-                    if self.table_name != op.input("W")[0]:
-                        raise RuntimeError("all distributed lookup_table_ops"
-                                           " should have only one table")
-                    distributed_lookup_table_ops.append(op)
-                else:
-                    if self.table_name is not None:
-                        assert op.input("W")[0] != self.table_name
-
-        return len(distributed_lookup_table_ops) > 0
-
-    def _update_dist_lookup_table_vars(self, param_list, grad_list,
-                                       params_grads):
-        # TODO(wuyi): put find a way to put dist lookup table stuff all together.
-        # update self.table_param_grad and self.trainer_side_table_grad_list
-        program = self.origin_program
-        if self.has_distributed_lookup_table:
-            param_list = [
-                param for param in param_list if param.name != self.table_name
-            ]
-            grad_list = [
-                grad for grad in grad_list
-                if grad.name != grad_var_name(self.table_name)
-            ]
-            self.table_param_grad = [
-                param_grad for param_grad in params_grads
-                if param_grad[0].name == self.table_name
-            ][0]
-            table_grad_var = self.table_param_grad[1]
-            if self.sync_mode:
-                self.trainer_side_table_grad_list = [
-                    program.global_block().create_var(
-                        name="%s.trainer_%d.pserver_%d" %
-                        (table_grad_var.name, self.trainer_id, index),
-                        type=table_grad_var.type,
-                        shape=table_grad_var.shape,
-                        dtype=table_grad_var.dtype)
-                    for index in range(len(self.pserver_endpoints))
-                ]
-            else:
-                self.trainer_side_table_grad_list = [
-                    program.global_block().create_var(
-                        name="%s.pserver_%d" % (table_grad_var.name, index),
-                        type=table_grad_var.type,
-                        shape=table_grad_var.shape,
-                        dtype=table_grad_var.dtype)
-                    for index in range(len(self.pserver_endpoints))
-                ]
-        return param_list, grad_list
-
-    def _init_splited_vars(self, slice_var_up):
-        # update these mappings for further transpile:
-        # 1. param_var_mapping: param var name -> [splited params vars]
-        # 2. grad_var_mapping: grad var name -> [splited grads vars]
-        # 3. grad_param_mapping: grad.blockx -> param.blockx
-        # 4. param_grad_ep_mapping: ep -> {"params": [], "grads": []}
-
-        param_list = []
-        grad_list = []
-        param_grad_set = set()
-        for p, g in self.params_grads:
-            # skip parameter marked not trainable
-            if type(p) == Parameter and p.trainable == False:
-                continue
-            if p.name not in param_grad_set:
-                param_list.append(p)
-                param_grad_set.add(p.name)
-            if g.name not in param_grad_set:
-                grad_list.append(g)
-                param_grad_set.add(g.name)
-
-        param_list, grad_list = self._update_dist_lookup_table_vars(
-            param_list, grad_list, self.params_grads)
-
-        if slice_var_up:
-            # when we slice var up into blocks, we will slice the var according to
-            # pserver services' count. A pserver may have two or more listening ports.
-            grad_blocks = slice_variable(grad_list, len(self.pserver_endpoints))
-            param_blocks = slice_variable(param_list,
-                                          len(self.pserver_endpoints))
-        else:
-            # when we do NOT slice var up into blocks, we will always slice params
-            # grads into one block.
-            grad_blocks = slice_variable(grad_list, 1)
-            param_blocks = slice_variable(param_list, 1)
-        assert (len(grad_blocks) == len(param_blocks))
-
-        # origin_varname -> [splited_var]
-        self.param_var_mapping = self._create_vars_from_blocklist(
-            self.origin_program, param_blocks)
-        self.grad_var_mapping = self._create_vars_from_blocklist(
-            self.origin_program,
-            grad_blocks,
-            add_trainer_suffix=self.trainer_num > 1)
-        self.grad_param_mapping = dict()
-        for g, p in zip(grad_blocks, param_blocks):
-            g_name, g_bid, _ = g.split(":")
-            p_name, p_bid, _ = p.split(":")
-            self.grad_param_mapping[self.grad_var_mapping[g_name][int(g_bid)]] =  \
-                    self.param_var_mapping[p_name][int(p_bid)]
-
-        # create mapping of endpoint -> split var to create pserver side program
-        self.param_grad_ep_mapping = dict()
-        [
-            self.param_grad_ep_mapping.update({
-                ep: {
-                    "params": [],
-                    "grads": []
-                }
-            }) for ep in self.pserver_endpoints
-        ]
+class DistributeTranspiler(object):
+    """
+    **DistributeTranspiler**
+
+    Convert the fluid program to distributed data-parallelism programs.
+
+    The main_program will be transformed to use a remote parameter server
+    to do parameter optimization. And the optimization graph will be put
+    into a parameter server program.
+
+    Examples:
+        .. code-block:: python
+
+           # Define your model before these codes.
+           port = os.getenv("PADDLE_PSERVER_PORT", "6174")
+           pserver_ips = os.getenv("PADDLE_PSERVER_IPS", "")
+           eplist = []
+           for ip in pserver_ips.split(","):
+                eplist.append(':'.join([ip, port]))
+           pserver_endpoints = ",".join(eplist)
+           trainers = int(os.getenv("PADDLE_TRAINERS"))
+           current_endpoint = os.getenv("PADDLE_CURRENT_IP", "") + ":" + port
+           trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
+           role = os.getenv("PADDLE_TRAINING_ROLE")
+
+           t = distribute_transpiler.DistributeTranspiler()
+           t.transpile(
+                trainer_id, pservers=pserver_endpoints, trainers=trainers)
+           if role == "PSERVER":
+                pserver_program = t.get_pserver_program(current_endpoint)
+                pserver_startup_program = t.get_startup_program(current_endpoint,
+                                                                pserver_program)
+           elif role == "TRAINER":
+                trainer_program = t.get_trainer_program()
+    """
 
     def transpile(self,
                   trainer_id,
@@ -250,20 +154,20 @@ class DistributeTranspiler:
                   split_method=RoundRobin,
                   sync_mode=True):
         """
-        :param trainer_id: one unique id for each trainer in a job.
-        :type trainer_id: int
-        :param program: program to transpile, default is default_main_program
-        :type program: Program
-        :param pservers: parameter server endpoints like "m1:6174,m2:6174"
-        :type pservers: string
-        :param trainers: total number of workers/trainers in the job
-        :type trainers: int
-        :param split_method: A function to determin how to split variables
-            to different servers equally.
-        :type split_method: function
-        :type sync_mode: boolean default True
-        :param sync_mode: if sync_mode is set True, it means that dist transpiler
-        will transpile the program into sync_mode pserver and trainer program.
+        Run the transpiler.
+
+        Args:
+            trainer_id (int): id for current trainer worker, if you have
+                n workers, the id may range from 0 ~ n-1
+            program (Program|None): program to transpile,
+                default is fluid.default_main_program().
+            pservers (str): comma separated ip:port string for the pserver
+                list.
+            trainers (int): number of trainers in the distributed job.
+            slice_var_up (bool): Do Tensor slice for pservers, default is True.
+            split_method (PSDispatcher): RoundRobin or HashName can be used
+                try to choose the best method to balance loads for pservers.
+            sync_mode (bool): Do sync training or not, default is True.
         """
         assert (split_method.__bases__[0] == PSDispatcher)
         if program is None:
@@ -390,6 +294,12 @@ class DistributeTranspiler:
             self._split_table_grad_and_add_send_vars(program, pserver_endpoints)
 
     def get_trainer_program(self):
+        """
+        Get transpiled trainer side program.
+
+        Returns:
+            Program: trainer side program.
+        """
         # remove optimize ops and add a send op to main_program
         delete_ops(self.origin_program.global_block(), self.optimize_ops)
         # FIXME(typhoonzero): serialize once will fix error occurs when clone.
@@ -398,12 +308,19 @@ class DistributeTranspiler:
 
     def get_pserver_program(self, endpoint):
         """
-        Get pserver side program using the endpoint.
-        TODO(panyx0718): Revisit this assumption. what if #blocks > #pservers.
-        NOTE: assume blocks of the same variable is not distributed
-        on the same pserver, only change param/grad varnames for
-        trainers to fetch.
+        Get parameter server side program.
+        
+        Args:
+            endpoint (str): current parameter server endpoint.
+        
+        Returns:
+            Program: the program for current parameter server to run.
         """
+        # TODO(panyx0718): Revisit this assumption. what if #blocks > #pservers.
+        # NOTE: assume blocks of the same variable is not distributed
+        # on the same pserver, only change param/grad varnames for
+        # trainers to fetch.
+
         # step1
         pserver_program = Program()
         # step2: Create vars to receive vars at parameter servers.
@@ -585,6 +502,14 @@ class DistributeTranspiler:
         Get startup program for current parameter server.
         Modify operator input variables if there are variables that
         were split to several blocks.
+
+        Args:
+            endpoint (str): current pserver endpoint.
+            pserver_program (Program): call get_pserver_program first and
+                pass the result here.
+        
+        Returns:
+            Program: parameter server side startup program.
         """
         s_prog = Program()
         orig_s_prog = default_startup_program()
@@ -636,6 +561,129 @@ class DistributeTranspiler:
 
     # ====================== private transpiler functions =====================
 
+    def _has_distributed_lookup_table(self):
+        # process lookup_table_op
+        # 1. check all lookup_table_op is distributed
+        # 2. check all lookup_table_op share the same table.
+        distributed_lookup_table_ops = []
+        # support only one distributed_lookup_table now
+        self.table_name = None
+        for op in self.origin_program.global_block().ops:
+            if op.type == LOOKUP_TABLE_TYPE:
+                if op.attrs['is_distributed'] is True:
+                    if self.table_name is None:
+                        self.table_name = op.input("W")[0]
+                    if self.table_name != op.input("W")[0]:
+                        raise RuntimeError("all distributed lookup_table_ops"
+                                           " should have only one table")
+                    distributed_lookup_table_ops.append(op)
+                else:
+                    if self.table_name is not None:
+                        assert op.input("W")[0] != self.table_name
+
+        return len(distributed_lookup_table_ops) > 0
+
+    def _update_dist_lookup_table_vars(self, param_list, grad_list,
+                                       params_grads):
+        # TODO(wuyi): put find a way to put dist lookup table stuff all together.
+        # update self.table_param_grad and self.trainer_side_table_grad_list
+        program = self.origin_program
+        if self.has_distributed_lookup_table:
+            param_list = [
+                param for param in param_list if param.name != self.table_name
+            ]
+            grad_list = [
+                grad for grad in grad_list
+                if grad.name != grad_var_name(self.table_name)
+            ]
+            self.table_param_grad = [
+                param_grad for param_grad in params_grads
+                if param_grad[0].name == self.table_name
+            ][0]
+            table_grad_var = self.table_param_grad[1]
+            if self.sync_mode:
+                self.trainer_side_table_grad_list = [
+                    program.global_block().create_var(
+                        name="%s.trainer_%d.pserver_%d" %
+                        (table_grad_var.name, self.trainer_id, index),
+                        type=table_grad_var.type,
+                        shape=table_grad_var.shape,
+                        dtype=table_grad_var.dtype)
+                    for index in range(len(self.pserver_endpoints))
+                ]
+            else:
+                self.trainer_side_table_grad_list = [
+                    program.global_block().create_var(
+                        name="%s.pserver_%d" % (table_grad_var.name, index),
+                        type=table_grad_var.type,
+                        shape=table_grad_var.shape,
+                        dtype=table_grad_var.dtype)
+                    for index in range(len(self.pserver_endpoints))
+                ]
+        return param_list, grad_list
+
+    def _init_splited_vars(self, slice_var_up):
+        # update these mappings for further transpile:
+        # 1. param_var_mapping: param var name -> [splited params vars]
+        # 2. grad_var_mapping: grad var name -> [splited grads vars]
+        # 3. grad_param_mapping: grad.blockx -> param.blockx
+        # 4. param_grad_ep_mapping: ep -> {"params": [], "grads": []}
+
+        param_list = []
+        grad_list = []
+        param_grad_set = set()
+        for p, g in self.params_grads:
+            # skip parameter marked not trainable
+            if type(p) == Parameter and p.trainable == False:
+                continue
+            if p.name not in param_grad_set:
+                param_list.append(p)
+                param_grad_set.add(p.name)
+            if g.name not in param_grad_set:
+                grad_list.append(g)
+                param_grad_set.add(g.name)
+
+        param_list, grad_list = self._update_dist_lookup_table_vars(
+            param_list, grad_list, self.params_grads)
+
+        if slice_var_up:
+            # when we slice var up into blocks, we will slice the var according to
+            # pserver services' count. A pserver may have two or more listening ports.
+            grad_blocks = slice_variable(grad_list, len(self.pserver_endpoints))
+            param_blocks = slice_variable(param_list,
+                                          len(self.pserver_endpoints))
+        else:
+            # when we do NOT slice var up into blocks, we will always slice params
+            # grads into one block.
+            grad_blocks = slice_variable(grad_list, 1)
+            param_blocks = slice_variable(param_list, 1)
+        assert (len(grad_blocks) == len(param_blocks))
+
+        # origin_varname -> [splited_var]
+        self.param_var_mapping = self._create_vars_from_blocklist(
+            self.origin_program, param_blocks)
+        self.grad_var_mapping = self._create_vars_from_blocklist(
+            self.origin_program,
+            grad_blocks,
+            add_trainer_suffix=self.trainer_num > 1)
+        self.grad_param_mapping = dict()
+        for g, p in zip(grad_blocks, param_blocks):
+            g_name, g_bid, _ = g.split(":")
+            p_name, p_bid, _ = p.split(":")
+            self.grad_param_mapping[self.grad_var_mapping[g_name][int(g_bid)]] =  \
+                    self.param_var_mapping[p_name][int(p_bid)]
+
+        # create mapping of endpoint -> split var to create pserver side program
+        self.param_grad_ep_mapping = dict()
+        [
+            self.param_grad_ep_mapping.update({
+                ep: {
+                    "params": [],
+                    "grads": []
+                }
+            }) for ep in self.pserver_endpoints
+        ]
+
     # transpiler function for dis lookup_table
     def _replace_lookup_table_op_with_prefetch(self, program,
                                                pserver_endpoints):
diff --git a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
index 8bfb554845d9b128f000d6c90cf626416a198eef..999ef43ca0feacbddff5f9db59589ce7097fe77e 100644
--- a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
+++ b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
@@ -383,6 +383,16 @@ def memory_optimize(input_program, skip_opt_set=None, print_log=False, level=0):
 
 
 def release_memory(input_program, skip_opt_set=None):
+    """
+    Modify the input program and insert :code:`delete_op` to early drop not used
+    variables. The modification will be performed inplace.
+
+    Notes: This is an experimental API and could be removed in next few
+    releases. Users should not use this API.
+
+    Args:
+        input_program(Program): The program will be inserted :code:`delete_op`.
+    """
     cfgs = _get_cfgs(input_program)
     for cfg in cfgs:
         cfg.release_memory(skip_opt_set=skip_opt_set)
diff --git a/python/paddle/fluid/transpiler/ps_dispatcher.py b/python/paddle/fluid/transpiler/ps_dispatcher.py
index d6a68677527deb09ace0e3a23cbc093d6d7b4349..dcffadd531719431f27feb464ed58a65c04770ee 100644
--- a/python/paddle/fluid/transpiler/ps_dispatcher.py
+++ b/python/paddle/fluid/transpiler/ps_dispatcher.py
@@ -33,15 +33,21 @@ class PSDispatcher(object):
 
     def dispatch(self, varlist):
         """
-        :param varlist: a list of Variables
-        :return: a map of pserver endpoint -> varname 
+        Args:
+            varlist(list): a list of Variables
+        Returns:
+            a map of pserver endpoint -> varname
         """
         AssertionError("Interface has not been implemented.")
 
 
 class HashName(PSDispatcher):
     """
-      Hash variable names to several endpoints
+    Hash variable names to several endpoints using python
+    "hash()" function.
+
+    Args:
+        pserver_endpoints (list): list of endpoint(ip:port).
     """
 
     def __init__(self, pserver_endpoints):
@@ -61,7 +67,11 @@ class HashName(PSDispatcher):
 
 class RoundRobin(PSDispatcher):
     """
-    Distribute variables to serveral endpoints.
+    Distribute variables to serveral endpoints using
+    RondRobin<https://en.wikipedia.org/wiki/Round-robin_scheduling> method.
+
+    Args:
+        pserver_endpoints (list): list of endpoint(ip:port).
     """
 
     def __init__(self, pserver_endpoints):
diff --git a/python/paddle/fluid/unique_name.py b/python/paddle/fluid/unique_name.py
index 33c53113ae7e8ed9aeada31f2aed6990b6fea110..776619cd36722e338a9fdd5e13bceeaf3724de2c 100644
--- a/python/paddle/fluid/unique_name.py
+++ b/python/paddle/fluid/unique_name.py
@@ -16,7 +16,7 @@ import collections
 import contextlib
 import sys
 
-__all__ = ['generate', 'switch', 'guard', 'UniqueNameGenerator']
+__all__ = ['generate', 'switch', 'guard']
 
 
 class UniqueNameGenerator(object):
diff --git a/tools/check_ctest_hung.py b/tools/check_ctest_hung.py
new file mode 100644
index 0000000000000000000000000000000000000000..7de76c381b29a1ff8dcf2167f0e861dc261aa47b
--- /dev/null
+++ b/tools/check_ctest_hung.py
@@ -0,0 +1,53 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+import re
+
+
+def escape(input):
+    o = input.replace("\n", "")
+    o = o.replace("\r", "")
+    return o
+
+
+def main():
+    usage = """Usage:
+1. Download the Paddle_PR_CI_*.log from TeamCity
+2. run: python check_ctest_hung.py Paddle_PR_CI_*.log
+3. If there is hung ctest, the result likes:
+Diff:  set(['test_parallel_executor_crf'])
+    """
+    if len(sys.argv) < 2:
+        print(usage)
+        exit(0)
+
+    logfile = sys.argv[1]
+    started = set()
+    passed = set()
+    with open(logfile, "r") as fn:
+        for l in fn.readlines():
+            if l.find("Test ") != -1 and \
+                l.find("Passed") != -1:
+                m = re.search("Test\s+#[0-9]*\:\s([a-z0-9_]+)", escape(l))
+                passed.add(m.group(1))
+            if l.find("Start ") != -1:
+                start_parts = escape(l).split(" ")
+                m = re.search("Start\s+[0-9]+\:\s([a-z0-9_]+)", escape(l))
+                started.add(m.group(1))
+    print "Diff: ", started - passed
+
+
+if __name__ == "__main__":
+    main()
diff --git a/.clang_format.hook b/tools/codestyle/clang_format.hook
similarity index 100%
rename from .clang_format.hook
rename to tools/codestyle/clang_format.hook
diff --git a/.copyright.hook b/tools/codestyle/copyright.hook
similarity index 100%
rename from .copyright.hook
rename to tools/codestyle/copyright.hook
diff --git a/tools/codestyle/docstring_checker.py b/tools/codestyle/docstring_checker.py
index 54a690462699651d3e14f9b24383df01a9740336..8d4b24a0cf6b743b72dca58fd885f927560964bf 100644
--- a/tools/codestyle/docstring_checker.py
+++ b/tools/codestyle/docstring_checker.py
@@ -291,6 +291,8 @@ class DocstringChecker(BaseChecker):
             True if successful otherwise False.
         """
 
+        if node.name.startswith("__") or node.name.startswith("_"):
+            return True
         find = False
         for t in node.body:
             if not isinstance(t, astroid.Return):
@@ -316,6 +318,8 @@ class DocstringChecker(BaseChecker):
         Returns:
             True if successful otherwise False.
         """
+        if node.name.startswith("__") or node.name.startswith("_"):
+            return True
         args = []
         for arg in node.args.get_children():
             if (not isinstance(arg, astroid.AssignName)) \