Merge branch 'develop' of github.com:PaddlePaddle/Paddle into fix_pserver_sub_blocks

e02cbf35 · Yancey1989 · 5c7d6a55 · 6e1c48d1 · e02cbf35 · e02cbf35
63 changed file
--- a/benchmark/fluid/Dockerfile
+++ b/benchmark/fluid/Dockerfile
 FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04
+
+# Use UBUNTU_MIRROR can speed up apt-get speed.
+# ARG UBUNTU_MIRROR
+# RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'
+
 RUN apt-get update && apt-get install -y python python-pip iputils-ping libgtk2.0-dev wget vim net-tools iftop python-opencv
 RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.7 /usr/lib/libcudnn.so && ln -s /usr/lib/x86_64-linux-gnu/libnccl.so.2 /usr/lib/libnccl.so
-RUN pip install -U pip
-RUN pip install -U kubernetes paddlepaddle

 # IMPORTANT:
 # Add "ENV http_proxy=http://ip:port" if your download is slow, and don't forget to unset it at runtime.
+# exmaple: unset http_proxy && unset https_proxy && python fluid_benchmark.py ...
+
+RUN pip install -U pip
+RUN pip install -U kubernetes paddlepaddle

 RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()\npaddle.dataset.flowers.fetch()" | python'
 RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.mnist.train()\npaddle.dataset.mnist.test()\npaddle.dataset.imdb.fetch()" | python'
@@ -14,9 +21,11 @@ RUN pip uninstall -y paddlepaddle && mkdir /workspace

 ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
 ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root
+RUN chmod +x /usr/bin/paddle_k8s

 ADD *.whl /
-RUN pip install /*.whl && rm -f /*.whl && chmod +x /usr/bin/paddle_k8s
+RUN pip install /*.whl && rm -f /*.whl 

 ENV LD_LIBRARY_PATH=/usr/local/lib
-ADD fluid_benchmark.py recordio_converter.py models/ /workspace/
+ADD fluid_benchmark.py recordio_converter.py args.py recordio_converter.py run.sh run_fluid_benchmark.sh /workspace/
+ADD models/ /workspace/models/
--- a/benchmark/fluid/fluid_benchmark.py
+++ b/benchmark/fluid/fluid_benchmark.py
@@ -97,7 +97,7 @@ def dist_transpile(trainer_id, args):
        return train_program, fluid.default_startup_program()
    else:
        raise ValueError(
-            'TRAINING_ROLE environment variable must be either TRAINER or PSERVER'
+            'PADDLE_TRAINING_ROLE environment variable must be either TRAINER or PSERVER'
        )


@@ -264,8 +264,6 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
                    break
            else:
                loss, = exe.run([avg_loss.name], feed=feeder.feed(data))
-            if args.update_method == "pserver":
-                exe.bcast_params()
            if args.use_reader_op:
                num_samples += args.batch_size * args.gpus
            else:
@@ -301,9 +299,18 @@ def print_train_time(start_time, end_time, num_samples):
          (num_samples, train_elapsed, examples_per_sec))


+def print_paddle_envs():
+    print('----------- Configuration envs -----------')
+    for k in os.environ:
+        if "PADDLE_" in k:
+            print "ENV %s:%s" % (k, os.environ[k])
+    print('------------------------------------------------')
+
+
 def main():
    args = parse_args()
    print_arguments(args)
+    print_paddle_envs()

    # the unique trainer id, starting from 0, needed by trainer
    # only

--- a/benchmark/fluid/kube_gen_job.py
+++ b/benchmark/fluid/kube_gen_job.py
@@ -17,6 +17,7 @@ import copy
 import argparse
 import random
 import os
+import copy
 from kube_templates import pserver, trainer, envs


@@ -108,10 +109,9 @@ def gen_job():
    tn_container["ports"][0]["containerPort"] = spreadport

    envs.append({"name": "PADDLE_JOB_NAME", "value": args.jobname})
-    envs.append({"name": "TRAINERS", "value": str(args.trainers)})
-    envs.append({"name": "PSERVERS", "value": str(args.pservers)})
+    envs.append({"name": "PADDLE_TRAINERS", "value": str(args.trainers)})
+    envs.append({"name": "PADDLE_PSERVERS", "value": str(args.pservers)})
    envs.append({"name": "ENTRY", "value": args.entry})
-    envs.append({"name": "PADDLE_INIT_PORT", "value": str(args.port)})
    envs.append({"name": "PADDLE_PSERVER_PORT", "value": str(args.port)})
    # NOTE: these directories below are cluster specific, please modify
    # this settings before you run on your own cluster.
@@ -166,17 +166,23 @@ def gen_job():
    tn["spec"]["template"]["spec"]["volumes"] = volumes
    tn_container["volumeMounts"] = volumeMounts

-    ps_container["env"] = envs
-    ps_container["env"].append({"name": "TRAINING_ROLE", "value": "PSERVER"})
+    ps_container["env"] = copy.deepcopy(envs)
+    ps_container["env"].append({
+        "name": "PADDLE_TRAINING_ROLE",
+        "value": "PSERVER"
+    })
    tn_container["env"] = envs
    if args.disttype == "pserver":
        tn_container["env"].append({
-            "name": "TRAINING_ROLE",
+            "name": "PADDLE_TRAINING_ROLE",
            "value": "TRAINER"
        })
    elif args.disttype == "nccl2" or args.disttype == "local":
        # NCCL2 have no training role, set to plain WORKER
-        tn_container["env"].append({"name": "TRAINING_ROLE", "value": "WORKER"})
+        tn_container["env"].append({
+            "name": "PADDLE_TRAINING_ROLE",
+            "value": "WORKER"
+        })

    os.mkdir(args.jobname)
    if args.disttype == "pserver":

--- a/doc/fluid/api/gen_doc.sh
+++ b/doc/fluid/api/gen_doc.sh
 #!/bin/bash
 python gen_doc.py layers --submodules control_flow device io nn ops tensor detection learning_rate_scheduler metric > layers.rst

-for module in data_feeder clip metrics executor initializer io nets optimizer param_attr profiler regularizer
+for module in data_feeder clip metrics executor initializer io nets optimizer param_attr profiler regularizer transpiler
 do
  python gen_doc.py ${module} > ${module}.rst
 done
--- a/doc/fluid/api/transpiler.rst
+++ b/doc/fluid/api/transpiler.rst
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+
+==========
+transpiler
+==========
+
+DistributeTranspiler
+--------------------
+
+..  autoclass:: paddle.fluid.transpiler.DistributeTranspiler
+    :members:
+    :noindex:
+
+InferenceTranspiler
+-------------------
+
+..  autoclass:: paddle.fluid.transpiler.InferenceTranspiler
+    :members:
+    :noindex:
+
+memory_optimize
+---------------
+
+..  autofunction:: paddle.fluid.transpiler.memory_optimize
+    :noindex:
+
+release_memory
+--------------
+
+..  autofunction:: paddle.fluid.transpiler.release_memory
+    :noindex:
+
+HashName
+--------
+
+..  autoclass:: paddle.fluid.transpiler.HashName
+    :members:
+    :noindex:
+
+RoundRobin
+----------
+
+..  autoclass:: paddle.fluid.transpiler.RoundRobin
+    :members:
+    :noindex:
--- a/doc/fluid/howto/cluster/fluid_cluster_train_cn.md
+++ b/doc/fluid/howto/cluster/fluid_cluster_train_cn.md
@@ -168,13 +168,13 @@ cd /paddle/python/paddle/fluid/tests/book

 第二步，启动Parameter Server：
 ```bash
-PADDLE_INIT_PORT=6174 PADDLE_INIT_PSERVERS=192.168.1.2 TRAINERS=2 POD_IP=192.168.1.2 PADDLE_INIT_TRAINER_ID=1 TRAINING_ROLE=PSERVER python test_fit_a_line.py
+PADDLE_PSERVER_PORT=6174 PADDLE_PSERVER_IPS=192.168.1.2 PADDLE_TRAINERS=2 PADDLE_CURRENT_IP=192.168.1.2 PADDLE_TRAINER_ID=1 PADDLE_TRAINING_ROLE=PSERVER python test_fit_a_line.py
 ```
 执行命令后请等待出现提示： ```Server listening on 192.168.1.2:6174 ```, 表示Paramter Server已经正常启动。

 第三步，启动Trainer：
 ```bash
-PADDLE_INIT_PORT=6174 PADDLE_INIT_PSERVERS=192.168.1.3 TRAINERS=2 POD_IP=192.168.1.3 PADDLE_INIT_TRAINER_ID=1 TRAINING_ROLE=TRAINER python test_fit_a_line.py
+PADDLE_PSERVER_PORT=6174 PADDLE_PSERVER_IPS=192.168.1.3 PADDLE_TRAINERS=2 PADDLE_CURRENT_IPP=192.168.1.3 PADDLE_TRAINER_ID=1 PADDLE_TRAINING_ROLE=TRAINER python test_fit_a_line.py
 ```
 由于我们定义的Trainer的数量是2个，因此需要在另外一个计算节点上再启动一个Trainer。


--- a/doc/fluid/howto/cluster/fluid_recordio.md
+++ b/doc/fluid/howto/cluster/fluid_recordio.md
@@ -114,8 +114,8 @@ def gen_train_list(file_pattern, trainers, trainer_id):
           ret_list.append(f)
   return ret_list

-trainers = int(os.getenv("TRAINERS"))
-trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID"))
+trainers = int(os.getenv("PADDLE_TRAINERS"))
+trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
 data_file = fluid.layers.io.open_files(
    filenames=gen_train_list("./mnist-[0-9]*.recordio", 2, 0),
    thread_num=1,

--- a/doc/fluid/howto/inference/build_and_install_lib_cn.rst
+++ b/doc/fluid/howto/inference/build_and_install_lib_cn.rst
@@ -13,6 +13,7 @@ cpu_noavx_openblas       `fluid.tgz <https://guest:@paddleci.ngrok.io/repository
 cuda7.5_cudnn5_avx_mkl   `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/fluid.tgz>`_
 cuda8.0_cudnn5_avx_mkl   `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/fluid.tgz>`_
 cuda8.0_cudnn7_avx_mkl   `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/fluid.tgz>`_
+cuda9.0_cudnn7_avx_mkl   `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda90cudnn7avxMkl/.lastSuccessful/fluid.tgz>`_
 ======================   ========================================

 从源码编译

--- a/paddle/contrib/inference/demo/simple_on_word2vec.cc
+++ b/paddle/contrib/inference/demo/simple_on_word2vec.cc
@@ -40,10 +40,9 @@ void Main(bool use_gpu) {
    //# 2. Prepare input.
    int64_t data[4] = {1, 2, 3, 4};

-    PaddleBuf buf{.data = data, .length = sizeof(data)};
    PaddleTensor tensor{.name = "",
                        .shape = std::vector<int>({4, 1}),
-                        .data = buf,
+                        .data = PaddleBuf(data, sizeof(data)),
                        .dtype = PaddleDType::INT64};

    // For simplicity, we set all the slots with the same data.
@@ -55,14 +54,12 @@ void Main(bool use_gpu) {

    //# 4. Get output.
    ASSERT_EQ(outputs.size(), 1UL);
-    LOG(INFO) << "output buffer size: " << outputs.front().data.length;
-    const size_t num_elements = outputs.front().data.length / sizeof(float);
+    LOG(INFO) << "output buffer size: " << outputs.front().data.length();
+    const size_t num_elements = outputs.front().data.length() / sizeof(float);
    // The outputs' buffers are in CPU memory.
    for (size_t i = 0; i < std::min(5UL, num_elements); i++) {
-      LOG(INFO) << static_cast<float*>(outputs.front().data.data)[i];
+      LOG(INFO) << static_cast<float*>(outputs.front().data.data())[i];
    }
-    // TODO(Superjomn): this is should be free automatically
-    free(outputs[0].data.data);
  }
 }

@@ -86,10 +83,9 @@ void MainThreads(int num_threads, bool use_gpu) {
      for (int batch_id = 0; batch_id < num_batches; ++batch_id) {
        // 2. Dummy Input Data
        int64_t data[4] = {1, 2, 3, 4};
-        PaddleBuf buf{.data = data, .length = sizeof(data)};
        PaddleTensor tensor{.name = "",
                            .shape = std::vector<int>({4, 1}),
-                            .data = buf,
+                            .data = PaddleBuf(data, sizeof(data)),
                            .dtype = PaddleDType::INT64};
        std::vector<PaddleTensor> inputs(4, tensor);
        std::vector<PaddleTensor> outputs;
@@ -99,13 +95,13 @@ void MainThreads(int num_threads, bool use_gpu) {
        // 4. Get output.
        ASSERT_EQ(outputs.size(), 1UL);
        LOG(INFO) << "TID: " << tid << ", "
-                  << "output buffer size: " << outputs.front().data.length;
-        const size_t num_elements = outputs.front().data.length / sizeof(float);
+                  << "output buffer size: " << outputs.front().data.length();
+        const size_t num_elements =
+            outputs.front().data.length() / sizeof(float);
        // The outputs' buffers are in CPU memory.
        for (size_t i = 0; i < std::min(5UL, num_elements); i++) {
-          LOG(INFO) << static_cast<float*>(outputs.front().data.data)[i];
+          LOG(INFO) << static_cast<float*>(outputs.front().data.data())[i];
        }
-        free(outputs[0].data.data);
      }
    });
  }

--- a/paddle/contrib/inference/paddle_inference_api.cc
+++ b/paddle/contrib/inference/paddle_inference_api.cc
@@ -13,3 +13,53 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/contrib/inference/paddle_inference_api.h"
+
+namespace paddle {
+
+PaddleBuf::PaddleBuf(PaddleBuf&& other)
+    : data_(other.data_),
+      length_(other.length_),
+      memory_owned_(other.memory_owned_) {
+  other.memory_owned_ = false;
+  other.data_ = nullptr;
+  other.length_ = 0;
+}
+
+PaddleBuf::PaddleBuf(const PaddleBuf& other) { *this = other; }
+
+PaddleBuf& PaddleBuf::operator=(const PaddleBuf& other) {
+  // only the buffer with external memory can be copied
+  assert(!other.memory_owned_);
+  data_ = other.data_;
+  length_ = other.length_;
+  memory_owned_ = other.memory_owned_;
+  return *this;
+}
+
+void PaddleBuf::Resize(size_t length) {
+  // Only the owned memory can be reset, the external memory can't be changed.
+  if (length_ == length) return;
+  assert(memory_owned_);
+  Free();
+  data_ = new char[length];
+  length_ = length;
+  memory_owned_ = true;
+}
+
+void PaddleBuf::Reset(void* data, size_t length) {
+  Free();
+  memory_owned_ = false;
+  data_ = data;
+  length_ = length;
+}
+
+void PaddleBuf::Free() {
+  if (memory_owned_ && data_) {
+    assert(length_ > 0);
+    delete static_cast<char*>(data_);
+    data_ = nullptr;
+    length_ = 0;
+  }
+}
+
+}  // namespace paddle
\ No newline at end of file
--- a/paddle/contrib/inference/paddle_inference_api.h
+++ b/paddle/contrib/inference/paddle_inference_api.h
@@ -21,6 +21,7 @@ limitations under the License. */

 #pragma once

+#include <cassert>
 #include <memory>
 #include <string>
 #include <vector>
@@ -32,12 +33,38 @@ enum PaddleDType {
  INT64,
 };

-struct PaddleBuf {
-  void* data;     // pointer to the data memory.
-  size_t length;  // number of memory bytes.
+class PaddleBuf {
+ public:
+  PaddleBuf() = default;
+  PaddleBuf(PaddleBuf&& other);
+  // Copy only available when memory is managed externally.
+  explicit PaddleBuf(const PaddleBuf&);
+  PaddleBuf& operator=(const PaddleBuf&);
+  // Do not own the memory.
+  PaddleBuf(void* data, size_t length)
+      : data_(data), length_(length), memory_owned_{false} {}
+  // Own memory.
+  PaddleBuf(size_t length)
+      : data_(new char[length]), length_(length), memory_owned_(true) {}
+  // Resize to `length` bytes.
+  void Resize(size_t length);
+  // Reset to external memory.
+  void Reset(void* data, size_t length);
+  bool empty() const { return length_ == 0; }
+  void* data() const { return data_; }
+  size_t length() const { return length_; }
+
+  ~PaddleBuf() { Free(); }
+
+ private:
+  void Free();
+  void* data_{nullptr};  // pointer to the data memory.
+  size_t length_{0};     // number of memory bytes.
+  bool memory_owned_{true};
 };

 struct PaddleTensor {
+  PaddleTensor() = default;
  std::string name;  // variable name.
  std::vector<int> shape;
  // TODO(Superjomn) for LoD support, add a vector<vector<int>> field if needed.
@@ -67,8 +94,9 @@ class PaddlePredictor {

  // Predict an record.
  // The caller should be responsible for allocating and releasing the memory of
-  // `inputs`. `inputs` should be alive until Run returns. caller should be
-  // responsible for releasing the memory of `output_data`.
+  // `inputs`. `inputs` should be available until Run returns. Caller should be
+  // responsible for the output tensor's buffer, either allocated or passed from
+  // outside.
  virtual bool Run(const std::vector<PaddleTensor>& inputs,
                   std::vector<PaddleTensor>* output_data) = 0;


--- a/paddle/contrib/inference/paddle_inference_api_anakin_engine.cc
+++ b/paddle/contrib/inference/paddle_inference_api_anakin_engine.cc
@@ -48,7 +48,7 @@ bool PaddleInferenceAnakinPredictor::Run(
    auto d_tensor_in_p = executor_.get_in(input.name);
    float *d_data_p = d_tensor_in_p->mutable_data();
    if (cudaMemcpy(d_data_p,
-                   static_cast<float *>(input.data.data),
+                   static_cast<float *>(input.data.data()),
                   d_tensor_in_p->valid_size() * sizeof(float),
                   cudaMemcpyHostToDevice) != 0) {
      LOG(ERROR) << "copy data from CPU to GPU error";
@@ -65,8 +65,11 @@ bool PaddleInferenceAnakinPredictor::Run(
  for (auto &output : *output_data) {
    auto *tensor = executor_.get_out(output.name);
    output.shape = tensor->shape();
+    if (output.data.length() < tensor->valid_size() * sizeof(float)) {
+      output.data.Resize(tensor->valid_size() * sizeof(float));
+    }
    // Copy data from GPU -> CPU
-    if (cudaMemcpy(output.data.data,
+    if (cudaMemcpy(output.data.data(),
                   tensor->mutable_data(),
                   tensor->valid_size() * sizeof(float),
                   cudaMemcpyDeviceToHost) != 0) {

--- a/paddle/contrib/inference/paddle_inference_api_anakin_engine_tester.cc
+++ b/paddle/contrib/inference/paddle_inference_api_anakin_engine_tester.cc
@@ -37,28 +37,26 @@ TEST(inference, anakin) {

  float data[1 * 3 * 224 * 224] = {1.0f};

-  PaddleBuf buf{.data = data, .length = sizeof(data)};
  PaddleTensor tensor{.name = "input_0",
                      .shape = std::vector<int>({1, 3, 224, 224}),
-                      .data = buf,
+                      .data = PaddleBuf(data, sizeof(data)),
                      .dtype = PaddleDType::FLOAT32};

  // For simplicity, we set all the slots with the same data.
-  std::vector<PaddleTensor> paddle_tensor_feeds(1, tensor);
+  std::vector<PaddleTensor> paddle_tensor_feeds;
+  paddle_tensor_feeds.emplace_back(std::move(tensor));

-  float data_out[1000];
-
-  PaddleBuf buf_out{.data = data_out, .length = sizeof(data)};
  PaddleTensor tensor_out{.name = "prob_out",
                          .shape = std::vector<int>({1000, 1}),
-                          .data = buf_out,
+                          .data = PaddleBuf(),
                          .dtype = PaddleDType::FLOAT32};

-  std::vector<PaddleTensor> outputs(1, tensor_out);
+  std::vector<PaddleTensor> outputs;
+  outputs.emplace_back(std::move(tensor_out));

  ASSERT_TRUE(predictor->Run(paddle_tensor_feeds, &outputs));

-  float* data_o = static_cast<float*>(outputs[0].data.data);
+  float* data_o = static_cast<float*>(outputs[0].data.data());
  for (size_t j = 0; j < 1000; ++j) {
    LOG(INFO) << "output[" << j << "]: " << data_o[j];
  }

--- a/paddle/contrib/inference/paddle_inference_api_impl.cc
+++ b/paddle/contrib/inference/paddle_inference_api_impl.cc
@@ -178,8 +178,8 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,

    // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
    std::memcpy(static_cast<void *>(input_ptr),
-                inputs[i].data.data,
-                inputs[i].data.length);
+                inputs[i].data.data(),
+                inputs[i].data.length());
    feeds->push_back(input);
  }
  return true;
@@ -241,10 +241,11 @@ bool NativePaddlePredictor::GetFetch(
    }

    outputs->at(i).shape = shape;
-    outputs->at(i).data.length = sizeof(float) * data.size();
-    outputs->at(i).data.data = malloc(outputs->at(i).data.length);
-    std::memcpy(
-        outputs->at(i).data.data, data.data(), outputs->at(i).data.length);
+    auto &buffer = outputs->at(i).data;
+    if (buffer.empty() || buffer.length() < sizeof(float) * data.size()) {
+      buffer.Resize(sizeof(float) * data.size());
+    }
+    std::memcpy(buffer.data(), data.data(), buffer.length());
    outputs->at(i).dtype = PaddleDType::FLOAT32;
    // TODO(panyx0718): support other types? fill tensor name? avoid a copy.
  }

--- a/paddle/contrib/inference/test_paddle_inference_api_impl.cc
+++ b/paddle/contrib/inference/test_paddle_inference_api_impl.cc
@@ -27,13 +27,12 @@ namespace paddle {

 PaddleTensor LodTensorToPaddleTensor(framework::LoDTensor* t) {
  PaddleTensor pt;
-  pt.data.data = t->data<void>();

  if (t->type() == typeid(int64_t)) {
-    pt.data.length = t->numel() * sizeof(int64_t);
+    pt.data.Reset(t->data<void>(), t->numel() * sizeof(int64_t));
    pt.dtype = PaddleDType::INT64;
  } else if (t->type() == typeid(float)) {
-    pt.data.length = t->numel() * sizeof(float);
+    pt.data.Reset(t->data<void>(), t->numel() * sizeof(float));
    pt.dtype = PaddleDType::FLOAT32;
  } else {
    LOG(FATAL) << "unsupported type.";
@@ -79,8 +78,8 @@ void MainWord2Vec(bool use_gpu) {
  std::vector<PaddleTensor> outputs;
  ASSERT_TRUE(predictor->Run(paddle_tensor_feeds, &outputs));
  ASSERT_EQ(outputs.size(), 1UL);
-  size_t len = outputs[0].data.length;
-  float* data = static_cast<float*>(outputs[0].data.data);
+  size_t len = outputs[0].data.length();
+  float* data = static_cast<float*>(outputs[0].data.data());
  for (size_t j = 0; j < len / sizeof(float); ++j) {
    ASSERT_LT(data[j], 1.0);
    ASSERT_GT(data[j], -1.0);
@@ -103,8 +102,6 @@ void MainWord2Vec(bool use_gpu) {
    EXPECT_LT(lod_data[i] - data[i], 1e-3);
    EXPECT_GT(lod_data[i] - data[i], -1e-3);
  }
-
-  free(outputs[0].data.data);
 }

 void MainImageClassification(bool use_gpu) {
@@ -143,13 +140,12 @@ void MainImageClassification(bool use_gpu) {
  std::vector<PaddleTensor> outputs;
  ASSERT_TRUE(predictor->Run(paddle_tensor_feeds, &outputs));
  ASSERT_EQ(outputs.size(), 1UL);
-  size_t len = outputs[0].data.length;
-  float* data = static_cast<float*>(outputs[0].data.data);
+  size_t len = outputs[0].data.length();
+  float* data = static_cast<float*>(outputs[0].data.data());
  float* lod_data = output1.data<float>();
  for (size_t j = 0; j < len / sizeof(float); ++j) {
    EXPECT_NEAR(lod_data[j], data[j], 1e-3);
  }
-  free(data);
 }

 void MainThreadsWord2Vec(bool use_gpu) {
@@ -192,8 +188,8 @@ void MainThreadsWord2Vec(bool use_gpu) {

      // check outputs range
      ASSERT_EQ(local_outputs.size(), 1UL);
-      const size_t len = local_outputs[0].data.length;
-      float* data = static_cast<float*>(local_outputs[0].data.data);
+      const size_t len = local_outputs[0].data.length();
+      float* data = static_cast<float*>(local_outputs[0].data.data());
      for (size_t j = 0; j < len / sizeof(float); ++j) {
        ASSERT_LT(data[j], 1.0);
        ASSERT_GT(data[j], -1.0);
@@ -205,7 +201,6 @@ void MainThreadsWord2Vec(bool use_gpu) {
      for (int i = 0; i < refs[tid].numel(); ++i) {
        EXPECT_NEAR(ref_data[i], data[i], 1e-3);
      }
-      free(data);
    });
  }
  for (int i = 0; i < num_jobs; ++i) {
@@ -251,14 +246,13 @@ void MainThreadsImageClassification(bool use_gpu) {

      // check outputs correctness
      ASSERT_EQ(local_outputs.size(), 1UL);
-      const size_t len = local_outputs[0].data.length;
-      float* data = static_cast<float*>(local_outputs[0].data.data);
+      const size_t len = local_outputs[0].data.length();
+      float* data = static_cast<float*>(local_outputs[0].data.data());
      float* ref_data = refs[tid].data<float>();
      EXPECT_EQ(refs[tid].numel(), len / sizeof(float));
      for (int i = 0; i < refs[tid].numel(); ++i) {
        EXPECT_NEAR(ref_data[i], data[i], 1e-3);
      }
-      free(data);
    });
  }
  for (int i = 0; i < num_jobs; ++i) {

--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -321,7 +321,8 @@ std::vector<std::shared_ptr<ExecutorPrepareContext>> Executor::Prepare(
 }

 void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
-                                  bool create_local_scope, bool create_vars) {
+                                  bool create_local_scope, bool create_vars,
+                                  bool keep_kids) {
  Scope* local_scope = scope;
  if (create_vars) {
    if (create_local_scope) {
@@ -344,12 +345,20 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
    }
  }
  platform::DeviceContextPool::Instance().Get(place_)->Wait();
-  if (create_vars && create_local_scope) {
+  if (local_scope != scope) {
    scope->DeleteScope(local_scope);
  } else {
-    // Delete the local scopes created in operators.
-    scope->DropKids();
+    if (!keep_kids) {
+      // By default, we should delete all kid scopes after run executor because
+      // some operators may create local scope when running, such as while_op.
+      // But when while_op also create a local executor to run it's sub block,
+      // the sub scopes it created should not be dropped immediately, because
+      // while_grad_op will use some variables created during while_op run, so
+      // we need to keep the kids and wait for the outer executor to drop them.
+      scope->DropKids();
+    }
  }
+
  if (FLAGS_benchmark) {
    VLOG(2) << "-------------------------------------------------------";
    VLOG(2) << "Memory used after deleting local scope: "

--- a/paddle/fluid/framework/executor.h
+++ b/paddle/fluid/framework/executor.h
@@ -78,7 +78,7 @@ class Executor {

  void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
                          bool create_local_scope = true,
-                          bool create_vars = true);
+                          bool create_vars = true, bool keep_kids = false);

  void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
                          std::map<std::string, const LoDTensor*>* feed_targets,

--- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc
@@ -27,7 +27,7 @@ void TensorRTSubGraphPass::Run(DataFlowGraph *graph) {
  SubGraphFuse(graph, node_inside_subgraph_teller_);
 }

-}  // analysis
-}  // inference
+}  // namespace analysis
+}  // namespace inference

-}  // paddle
+}  // namespace paddle
--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
@@ -143,7 +143,7 @@ $$out = \\frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$
 __attribute__((unused)) constexpr char TanhShrinkDoc[] = R"DOC(
 TanhShrink Activation Operator.

-$$out = x - \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$
+$$out = x - \\frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$

 )DOC";

@@ -385,7 +385,7 @@ class STanhOpMaker : public framework::OpProtoAndCheckerMaker {
    AddComment(R"DOC(
 STanh Activation Operator.

-$$out = b * \frac{e^{a * x} - e^{-a * x}}{e^{a * x} + e^{-a * x}}$$
+$$out = b * \\frac{e^{a * x} - e^{-a * x}}{e^{a * x} + e^{-a * x}}$$

 )DOC");
  }

--- a/paddle/fluid/operators/batch_norm_mkldnn_op.cc
+++ b/paddle/fluid/operators/batch_norm_mkldnn_op.cc
@@ -21,8 +21,6 @@ namespace operators {

 using batch_norm_bwd = mkldnn::batch_normalization_backward;
 using batch_norm_fwd = mkldnn::batch_normalization_forward;
-using framework::DataLayout;
-using framework::Tensor;
 using mkldnn::memory;
 using mkldnn::primitive;
 using mkldnn::reorder;
@@ -31,18 +29,6 @@ using paddle::platform::MKLDNNDeviceContext;
 using paddle::platform::MKLDNNMemDesc;
 using platform::to_void_cast;

-template <typename T>
-using EigenArrayMap =
-    Eigen::Map<Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
-template <typename T>
-using ConstEigenArrayMap =
-    Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
-template <typename T>
-using EigenVectorArrayMap = Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1>>;
-template <typename T>
-using ConstEigenVectorArrayMap =
-    Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, 1>>;
-
 namespace {
 template <typename T>
 struct bn_type_traits {

--- a/paddle/fluid/operators/batch_norm_op.cc
+++ b/paddle/fluid/operators/batch_norm_op.cc
@@ -22,22 +22,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-using DataLayout = framework::DataLayout;
-
-template <typename T>
-using EigenArrayMap =
-    Eigen::Map<Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
-template <typename T>
-using ConstEigenArrayMap =
-    Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
-template <typename T>
-using EigenVectorArrayMap = Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1>>;
-template <typename T>
-using ConstEigenVectorArrayMap =
-    Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, 1>>;
-
 class BatchNormOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

--- a/paddle/fluid/operators/batch_norm_op.h
+++ b/paddle/fluid/operators/batch_norm_op.h
@@ -19,6 +19,22 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+using DataLayout = framework::DataLayout;
+
+template <typename T>
+using EigenArrayMap =
+    Eigen::Map<Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
+template <typename T>
+using ConstEigenArrayMap =
+    Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
+template <typename T>
+using EigenVectorArrayMap = Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1>>;
+template <typename T>
+using ConstEigenVectorArrayMap =
+    Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, 1>>;
+
 template <typename DeviceContext, typename T>
 class BatchNormKernel : public framework::OpKernel<T> {
 public:

--- a/paddle/fluid/operators/bilinear_interp_op.cc
+++ b/paddle/fluid/operators/bilinear_interp_op.cc
@@ -110,6 +110,7 @@ REGISTER_OPERATOR(bilinear_interp, ops::BilinearInterpOp,
                  ops::BilinearInterpOpMaker,
                  paddle::framework::DefaultGradOpDescMaker<true>);
 REGISTER_OPERATOR(bilinear_interp_grad, ops::BilinearInterpOpGrad);
-REGISTER_OP_CPU_KERNEL(bilinear_interp, ops::BilinearInterpKernel<float>);
+REGISTER_OP_CPU_KERNEL(bilinear_interp, ops::BilinearInterpKernel<float>,
+                       ops::BilinearInterpKernel<uint8_t>);
 REGISTER_OP_CPU_KERNEL(bilinear_interp_grad,
                       ops::BilinearInterpGradKernel<float>);
--- a/paddle/fluid/operators/bilinear_interp_op.h
+++ b/paddle/fluid/operators/bilinear_interp_op.h
@@ -46,8 +46,10 @@ class BilinearInterpKernel : public framework::OpKernel<T> {
    int in_chw = channels * in_hw;
    int out_chw = channels * out_hw;

-    T ratio_h = (out_h > 1) ? static_cast<T>(in_h - 1) / (out_h - 1) : 0.f;
-    T ratio_w = (out_w > 1) ? static_cast<T>(in_w - 1) / (out_w - 1) : 0.f;
+    float ratio_h =
+        (out_h > 1) ? static_cast<float>(in_h - 1) / (out_h - 1) : 0.f;
+    float ratio_w =
+        (out_w > 1) ? static_cast<float>(in_w - 1) / (out_w - 1) : 0.f;

    if (in_h == out_h && in_w == out_w) {
      memcpy(output, input, input_t->numel() * sizeof(T));
@@ -56,24 +58,24 @@ class BilinearInterpKernel : public framework::OpKernel<T> {
        for (int i = 0; i < out_h; ++i) {     // loop for images
          int h = ratio_h * i;
          int hid = (h < in_h - 1) ? 1 : 0;
-          T h1lambda = ratio_h * i - h;
-          T h2lambda = 1 - h1lambda;
+          float h1lambda = ratio_h * i - h;
+          float h2lambda = 1.f - h1lambda;

          for (int j = 0; j < out_w; ++j) {
            int w = ratio_w * j;
            int wid = (w < in_w - 1) ? 1 : 0;
-            T w1lambda = ratio_w * j - w;
-            T w2lambda = 1 - w1lambda;
+            float w1lambda = ratio_w * j - w;
+            float w2lambda = 1.f - w1lambda;
            // calculate four position for bilinear interpolation
            const T* in_pos = &input[k * in_chw + h * in_w + w];
            T* out_pos = &output[k * out_chw + i * out_w + j];

            for (int c = 0; c < channels; ++c) {  // loop for channels
              // bilinear interpolation
-              out_pos[0] =
+              out_pos[0] = static_cast<T>(
                  h2lambda * (w2lambda * in_pos[0] + w1lambda * in_pos[wid]) +
                  h1lambda * (w2lambda * in_pos[hid * in_w] +
-                              w1lambda * in_pos[hid * in_w + wid]);
+                              w1lambda * in_pos[hid * in_w + wid]));
              in_pos += in_hw;
              out_pos += out_hw;
            }
@@ -117,8 +119,10 @@ class BilinearInterpGradKernel : public framework::OpKernel<T> {
    int in_chw = channels * in_hw;
    int out_chw = channels * out_hw;

-    T ratio_h = (out_h > 1) ? static_cast<T>(in_h - 1) / (out_h - 1) : 0.f;
-    T ratio_w = (out_w > 1) ? static_cast<T>(in_w - 1) / (out_w - 1) : 0.f;
+    float ratio_h =
+        (out_h > 1) ? static_cast<float>(in_h - 1) / (out_h - 1) : 0.f;
+    float ratio_w =
+        (out_w > 1) ? static_cast<float>(in_w - 1) / (out_w - 1) : 0.f;

    if (in_h == out_h && in_w == out_w) {
      memcpy(d_input, d_output, d_input_t->numel() * sizeof(T));
@@ -127,22 +131,24 @@ class BilinearInterpGradKernel : public framework::OpKernel<T> {
        for (int i = 0; i < out_h; ++i) {     // loop for images
          int h = ratio_h * i;
          int hid = (h < in_h - 1) ? 1 : 0;
-          T h1lambda = ratio_h * i - h;
-          T h2lambda = 1 - h1lambda;
+          float h1lambda = ratio_h * i - h;
+          float h2lambda = 1 - h1lambda;

          for (int j = 0; j < out_w; ++j) {
            int w = ratio_w * j;
            int wid = (w < in_w - 1) ? 1 : 0;
-            T w1lambda = ratio_w * j - w;
-            T w2lambda = 1 - w1lambda;
+            float w1lambda = ratio_w * j - w;
+            float w2lambda = 1 - w1lambda;
            T* in_pos = &d_input[k * in_chw + h * in_w + w];
            const T* out_pos = &d_output[k * out_chw + i * out_w + j];

            for (int c = 0; c < channels; ++c) {  // loop for channels
-              in_pos[0] += h2lambda * w2lambda * out_pos[0];
-              in_pos[wid] += h2lambda * w1lambda * out_pos[0];
-              in_pos[hid * in_w] += h1lambda * w2lambda * out_pos[0];
-              in_pos[hid * in_w + wid] += h1lambda * w1lambda * out_pos[0];
+              in_pos[0] += static_cast<T>(h2lambda * w2lambda * out_pos[0]);
+              in_pos[wid] += static_cast<T>(h2lambda * w1lambda * out_pos[0]);
+              in_pos[hid * in_w] +=
+                  static_cast<T>(h1lambda * w2lambda * out_pos[0]);
+              in_pos[hid * in_w + wid] +=
+                  static_cast<T>(h1lambda * w1lambda * out_pos[0]);
              in_pos += in_hw;
              out_pos += out_hw;
            }

--- a/paddle/fluid/operators/logical_op.cc
+++ b/paddle/fluid/operators/logical_op.cc
@@ -146,6 +146,6 @@ REGISTER_UNARY_LOGICAL_OP(logical_not, "$$Out = !X$$");
 REGISTER_UNARY_LOGICAL_KERNEL(logical_not, CPU,
                              paddle::operators::LogicalNotFunctor);
 REGISTER_BINARY_LOGICAL_OP(logical_xor,
-                           "$$Out = (X || Y) \\, \\&\\& \\, !(X \\&\\& Y)$$");
+                           "$$Out = (X || Y) \\&\\& !(X \\&\\& Y)$$");
 REGISTER_BINARY_LOGICAL_KERNEL(logical_xor, CPU,
                               paddle::operators::LogicalXorFunctor);
--- a/paddle/fluid/operators/math/concat.cu
+++ b/paddle/fluid/operators/math/concat.cu
@@ -209,7 +209,7 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> {

    outputs_cols[0] = 0;
    for (int i = 0; i < o_num; ++i) {
-      int t_col = outputs->at(i)->numel() / out_row;
+      int t_col = ref_inputs.at(i)->numel() / out_row;
      if (sameShape) {
        if (t_col != out0_col) sameShape = false;
      }

--- a/paddle/fluid/operators/math/math_function.cc
+++ b/paddle/fluid/operators/math/math_function.cc
@@ -30,6 +30,7 @@ template struct SetConstant<platform::CPUDeviceContext, double>;
 template struct SetConstant<platform::CPUDeviceContext, int>;
 template struct SetConstant<platform::CPUDeviceContext, int64_t>;
 template struct SetConstant<platform::CPUDeviceContext, bool>;
+template struct SetConstant<platform::CPUDeviceContext, uint8_t>;

 #define DEFINE_CPU_TRANS(RANK)                                             \
  template struct Transpose<platform::CPUDeviceContext, platform::float16, \

--- a/paddle/fluid/operators/tensorrt_engine_op.cc
+++ b/paddle/fluid/operators/tensorrt_engine_op.cc
@@ -14,11 +14,14 @@

 #ifdef PADDLE_WITH_CUDA

-#include "paddle/fluid/operators/tensorrt_engine_op.h"
+#include <string>
+#include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/fluid/inference/tensorrt/engine.h"
 #include "paddle/fluid/inference/utils/singleton.h"
+#include "paddle/fluid/operators/tensorrt_engine_op.h"

 namespace paddle {
 namespace operators {

--- a/paddle/fluid/operators/tensorrt_engine_op.h
+++ b/paddle/fluid/operators/tensorrt_engine_op.h
@@ -16,10 +16,12 @@

 #ifdef PADDLE_WITH_CUDA

+#include <string>
+#include <vector>
+
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/inference/analysis/helper.h"
 #include "paddle/fluid/inference/tensorrt/engine.h"
-#include "paddle/fluid/inference/tensorrt/engine.h"

 namespace paddle {
 namespace operators {

--- a/paddle/fluid/operators/tensorrt_engine_op_test.cc
+++ b/paddle/fluid/operators/tensorrt_engine_op_test.cc
@@ -179,7 +179,6 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
                        const std::string& z_name, bool x_created,
                        const shape_t& x_shape, const shape_t& y_shape,
                        const shape_t& z_shape) {
-
    LOG(INFO) << "create fc op";
    auto* fc = block_desc.AppendOp();
    fc->SetType("mul");

--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -159,6 +159,11 @@ PYBIND11_PLUGIN(core) {
             new (&instance) LoDTensor(new_offset_lod);
           })
      .def("__init__", [](LoDTensor &instance) { new (&instance) LoDTensor(); })
+      // We implement offset based LOD in C++ while we use length based with
+      // Python API. So we changed set_lod to set_recursive_sequence_lengths to
+      // avoid misuse.
+      // The discussion is here:
+      // https://github.com/PaddlePaddle/Paddle/issues/10855
      .def("set_lod",
           [](LoDTensor &self, const std::vector<std::vector<size_t>> &lod) {
             // the input lod is offset-based level-of-detail info
@@ -199,6 +204,7 @@ PYBIND11_PLUGIN(core) {
             std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
             return new_lod;
           })
+      // Set above comments of set_lod.
      .def("recursive_sequence_lengths",
           [](LoDTensor &self) -> std::vector<std::vector<size_t>> {
             // output the length-based lod info

--- a/paddle/fluid/pybind/tensor_py.h
+++ b/paddle/fluid/pybind/tensor_py.h
@@ -97,7 +97,7 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
 inline pybind11::buffer_info CastToPyBuffer(const framework::Tensor &tensor) {
  auto buffer_info =
      details::CastToPyBufferImpl<true, 0, float, int, double, int64_t, bool,
-                                  platform::float16>()(tensor);
+                                  uint8_t, platform::float16>()(tensor);
  return buffer_info;
 }


--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -44,7 +44,7 @@ import metrics
 import transpiler
 from param_attr import ParamAttr, WeightNormParamAttr
 from data_feeder import DataFeeder
-from core import LoDTensor, CPUPlace, CUDAPlace, CUDAPinnedPlace
+from core import LoDTensor, CPUPlace, CUDAPlace, CUDAPinnedPlace, Scope
 from transpiler import DistributeTranspiler, InferenceTranspiler, \
    memory_optimize, release_memory
 from concurrency import (Go, make_channel, channel_send, channel_recv,
@@ -83,6 +83,7 @@ __all__ = framework.__all__ + executor.__all__ + concurrency.__all__ + \
              'profiler',
              'unique_name',
              'recordio_writer',
+              'Scope',
          ]



--- a/python/paddle/fluid/average.py
+++ b/python/paddle/fluid/average.py
@@ -36,6 +36,25 @@ def _is_number_or_matrix_(var):


 class WeightedAverage(object):
+    """
+    Calculate weighted average.
+
+    The average calculating is accomplished via Python totally. 
+    They do not change Paddle's Program, nor do anything to
+    modify NN model's configuration. They are completely 
+    wrappers of Python functions.
+
+    Examples:
+        .. code-block:: python
+            avg = fluid.average.WeightedAverage()
+            avg.add(value=2.0, weight=1)
+            avg.add(value=4.0, weight=2)
+            avg.eval()
+
+            # The result is 3.333333333.
+            # For (2.0 * 1 + 4.0 * 2) / (1 + 2) = 3.333333333
+    """
+
    def __init__(self):
        warnings.warn(
            "The %s is deprecated, please use fluid.metrics.Accuracy instead." %

--- a/python/paddle/fluid/backward.py
+++ b/python/paddle/fluid/backward.py
@@ -147,7 +147,7 @@ def _addup_repetitive_outputs_(op_descs):
            else:
                if len(renamed_vars[var_name]) == 1:
                    new_name = var_name + "@RENAME@" + \
-                               str(var_rename_count[var_name])
+                        str(var_rename_count[var_name])
                    var_rename_count[var_name] += 1
                    # rename original var_name
                    renamed_vars[var_name][0] = new_name
@@ -155,7 +155,7 @@ def _addup_repetitive_outputs_(op_descs):
                    _rename_arg_(pending_sum_ops, var_name, new_name)

                new_name = var_name + "@RENAME@" + \
-                           str(var_rename_count[var_name])
+                    str(var_rename_count[var_name])
                var_rename_count[var_name] += 1
                op_desc.rename_output(var_name, new_name)
                renamed_vars[var_name].append(new_name)
@@ -435,18 +435,65 @@ def _get_stop_gradients_(program):
 def append_backward(loss, parameter_list=None, no_grad_set=None,
                    callbacks=None):
    """
-    Append backward part to main_program
+    Append backward part to main_program.

-    Args:
-        loss(Variable): The variable generated by cost function.
-        parameter_list(list[string]): Parameters that need to be updated by
-            optimizer. If None, it means all parameters need to be updated.
-        no_grad_set(set): Variables that have no gradients in Block 0.
-            All variables with `step_gradient=True` from all blocks will be
-            automatically added.
+    A complete neural network training is made up of forward and backward 
+    propagation. However, when we configure a network, we only need to 
+    specify its forwrd part. The backward part is generated automatically 
+    according to the forward part by this function.

-    Return:
-        (list[(Variable,Variable)]): list of (parameter, gradient) pair.
+    In most cases, users do not need to invoke this function manually. It 
+    will be automatically invoked by the optimizer's `minimize` function.
+
+    Args:
+        loss(Variable): The loss variable of the network.
+        parameter_list(list[string]|None): Names of parameters that need 
+                                           to be updated by optimizers. 
+                                           If it is None, all parameters 
+                                           will be updated.
+                                           Default: None
+        no_grad_set(set|None): Variables in the Block 0 whose gradients 
+                               should be ignored. All variables with 
+                               `step_gradient=True` from all blocks will 
+                               be automatically added into this set.
+                               Default: None
+        callbacks(list[callable object]|None): The callbacks are used for 
+                                               doing some custom jobs during 
+                                               backward part building. All 
+                                               callable objects in it will 
+                                               be invoked once each time a 
+                                               new gradient operator is added 
+                                               into the program. The callable 
+                                               object must has two input 
+                                               parameters: 'block' and 'context'. 
+                                               The 'block' is the block which 
+                                               the new gradient operator will 
+                                               be added to. The 'context' is a 
+                                               map, whose keys are gradient 
+                                               variable names and values are 
+                                               corresponding original variables.
+                                               In addition to this, the 'context' 
+                                               has another special key-value pair: 
+                                               the key is string '__current_op_desc__' 
+                                               and the value is the op_desc of the 
+                                               gradient operator who has just 
+                                               triggered the callable object. 
+
+    Returns:
+        list[(Variable,Variable)]: Pairs of parameter and its 
+        corresponding gradients. The key is the parameter and the 
+        value is gradient variable.
+
+    Raises:
+        AssertionError: If `loss` is not an instance of Variable.
+
+    Examples:
+        .. code-block:: python
+
+            # network configuration code
+            # ...
+            avg_loss = fluid.layers.mean(loss)
+            param_grad_list = fluid.backward.append_backward(loss=avg_loss)
    """
    assert isinstance(loss, framework.Variable)


--- a/python/paddle/fluid/data_feeder.py
+++ b/python/paddle/fluid/data_feeder.py
@@ -29,6 +29,13 @@ class DataToLoDTensorConverter(object):
        self.place = place
        self.lod_level = lod_level
        self.shape = shape
+        negtive_count = 0
+        for s in self.shape:
+            if s < 0:
+                negtive_count += 1
+            if negtive_count > 1:
+                self.shape = None
+                break
        if dtype == core.VarDesc.VarType.FP32:
            self.dtype = 'float32'
        elif dtype == core.VarDesc.VarType.INT64:
@@ -61,7 +68,9 @@ class DataToLoDTensorConverter(object):
                self._feed_impl_(each_data, lod[1:], lod_level - 1)

    def done(self):
-        arr = numpy.array(self.data, dtype=self.dtype).reshape(self.shape)
+        arr = numpy.array(self.data, dtype=self.dtype)
+        if self.shape:
+            arr = arr.reshape(self.shape)
        t = core.LoDTensor()
        t.set(arr, self.place)
        if self.lod_level > 0:
@@ -70,6 +79,61 @@ class DataToLoDTensorConverter(object):


 class DataFeeder(object):
+    """
+    DataFeeder converts the data that returned by a reader into a data
+    structure that can feed into Executor and ParallelExecutor. The reader
+    usually returns a list of mini-batch data entries. Each data entry in
+    the list is one sample. Each sample is a list or a tuple with one
+    feature or multiple features.
+
+    The simple usage shows below:
+
+    ..  code-block:: python
+
+        place = fluid.CPUPlace()
+        img = fluid.layers.data(name='image', shape=[1, 28, 28])
+        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+        feeder = fluid.DataFeeder([img, label], fluid.CPUPlace())
+        result = feeder.feed([([0] * 784, [9]), ([1] * 784, [1])])
+
+
+    If you want to feed data into GPU side separately in advance when you
+    use multi-GPU to train a model, you can use `decorate_reader` function.
+
+    ..  code-block:: python
+
+        place=fluid.CUDAPlace(0)
+        feeder = fluid.DataFeeder(place=place, feed_list=[data, label])
+        reader = feeder.decorate_reader(
+            paddle.batch(flowers.train(), batch_size=16))
+
+    Args:
+        feed_list(list): The Variables or Variables'name that will
+            feed into model.
+        place(Place): place indicates feed data into CPU or GPU, if you want to
+            feed data into GPU, please using `fluid.CUDAPlace(i)` (`i` represents
+            the GPU id), or if you want to feed data into CPU, please using
+            `fluid.CPUPlace()`.
+        program(Program): The Program that will feed data into, if program
+            is None, it will use default_main_program(). Default None.
+
+    Raises:
+        ValueError: If some Variable is not in this Program.
+
+    Examples:
+        .. code-block:: python
+
+            # ...
+            place = fluid.CPUPlace()
+            feed_list = [
+                main_program.global_block().var(var_name) for var_name in feed_vars_name
+            ] # feed_vars_name is a list of variables' name.
+            feeder = fluid.DataFeeder(feed_list, place)
+            for data in reader():
+                outs = exe.run(program=main_program,
+                               feed=feeder.feed(data))
+    """
+
    def __init__(self, feed_list, place, program=None):
        self.feed_dtypes = []
        self.feed_names = []
@@ -99,6 +163,16 @@ class DataFeeder(object):
        self.place = place

    def feed(self, iterable):
+        """
+        According to feed_list and iterable, converters the input into
+        a data structure that can feed into Executor and ParallelExecutor.
+
+        Args:
+            iterable(list|tuple): the input data.
+
+        Returns:
+            dict: the result of conversion.
+        """
        converter = []
        for lod_level, shape, dtype in six.zip(
                self.feed_lod_level, self.feed_shapes, self.feed_dtypes):
@@ -121,6 +195,20 @@ class DataFeeder(object):
        return ret_dict

    def feed_parallel(self, iterable, num_places=None):
+        """
+        Takes multiple mini-batches. Each mini-batch will be feed on each
+        device in advance.
+
+        Args:
+            iterable(list|tuple): the input data.
+            num_places(int): the number of devices. Default None.
+
+        Returns:
+            dict: the result of conversion.
+
+        Notes:
+            The number of devices and number of mini-batches must be same.
+        """
        if isinstance(self.place, core.CUDAPlace):
            places = [
                core.CUDAPlace(i)
@@ -159,6 +247,24 @@ class DataFeeder(object):
                        multi_devices,
                        num_places=None,
                        drop_last=True):
+        """
+        Converter the input data into a data that returned by reader into
+        multiple mini-batches. Each mini-batch will be feed on each device.
+
+        Args:
+            reader(fun): the input data.
+            multi_devices(bool): the number of places. Default None.
+            num_places(int): the number of places. Default None.
+            drop_last(bool): the number of places. Default None.
+
+        Returns:
+            dict: the result of conversion.
+
+        Raises:
+            ValueError: If drop_last is False and the data batch which cannot
+            fit for devices.
+        """
+
        def __reader_creator__():
            if not multi_devices:
                for item in reader():

--- a/python/paddle/fluid/executor.py
+++ b/python/paddle/fluid/executor.py
@@ -25,6 +25,13 @@ g_scope = core.Scope()


 def global_scope():
+    """
+    Get the global/default scope instance. There are a lot of APIs use
+    :code:`global_scope` as its default value, e.g., :code:`Executor.run`
+
+    Returns:
+        Scope: The global/default scope instance.
+    """
    return g_scope


@@ -37,6 +44,19 @@ def switch_scope(scope):

 @contextlib.contextmanager
 def scope_guard(scope):
+    """
+    Change the global/default scope instance by Python `with` statement. All
+    variable in runtime will assigned to the new scope.
+
+    Examples:
+        >>> import paddle.fluid as fluid
+        >>> new_scope = fluid.Scope()
+        >>> with fluid.scope_guard(new_scope):
+        >>>     ...
+
+    Args:
+        scope: The new global/default scope.
+    """
    ex = switch_scope(scope)
    yield
    switch_scope(ex)
@@ -135,14 +155,18 @@ def has_fetch_operators(block, fetch_targets, fetch_holder_name):

 def fetch_var(name, scope=None, return_numpy=True):
    """
-    Fetch the value of the variable with the given name from the given scope
+    Fetch the value of the variable with the given name from the
+    given scope.
+
    Args:
        name(str): name of the variable. Typically, only persistable variables
            can be found in the scope used for running the program.
        scope(core.Scope|None): scope object. It should be the scope where
            you pass to Executor.run() when running your program.
-            If None, global_scope() will be used.
-        return_numpy(bool): whether convert the tensor to numpy.ndarray
+            If None, global_scope() will be used. Default None.
+        return_numpy(bool): whether convert the tensor to numpy.ndarray.
+            Default True.
+
    Returns:
       LodTensor|numpy.ndarray
    """

--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -30,8 +30,6 @@ __all__ = [
    'default_startup_program',
    'default_main_program',
    'program_guard',
-    'switch_startup_program',
-    'switch_main_program',
    'get_var',
 ]

@@ -43,7 +41,8 @@ ZERO_VAR_SUFFIX = core.kZeroVarSuffix()

 def grad_var_name(var_name):
    """
-    return gradient name for a certain var name
+    Returns:
+        str: gradient name for a certain var name
    """
    return var_name + GRAD_VAR_SUFFIX

@@ -51,10 +50,12 @@ def grad_var_name(var_name):
 def convert_np_dtype_to_dtype_(np_dtype):
    """
    Convert the data type in numpy to the data type in Paddle
+
    Args:
-        np_dtype(np.dtype): the data type in numpy
+        np_dtype(np.dtype): the data type in numpy.

-    Returns(core.VarDesc.VarType): the data type in Paddle
+    Returns:
+        core.VarDesc.VarType: the data type in Paddle.

    """
    dtype = np.dtype(np_dtype)
@@ -120,37 +121,53 @@ def _debug_string_(proto, throw_on_error=True):

 class Variable(object):
    """
-    Python variable. Every input and output of an operator is a variable. Every
-    variable belongs to a block. The variable has a name and two variables in
-    different blocks could have the same name.
+    In Fluid, every input and output of an operator is a variable. In most 
+    cases, variables are used for holding different kinds of data or training 
+    labels. A variable belongs to a block. All variable has its own name and 
+    two variables in different blocks could have the same name.

-    There are many kinds of variables. Please reference the framework.proto for
-    details.
+    There are many kinds of variables. Each kind of them has its own attributes 
+    and usages. Please reference the framework.proto for details. 

-    Notes: The constructor of Variable should not be invoked directly. Please
-    use `Block.create_var` to create a variable.
-
-    >>> cur_program = Program()
-    >>> cur_block = cur_program.current_block()
-    >>> new_variable = cur_block.create_var(
-    >>>                    name="X", shape=[-1, 23, 48], dtype='float32')
+    Most of a Variable's member variables can be setted to be None. It mean 
+    it is not available or will be specified later.

    Args:
-        block(Block): The associated block. It will be passed by
-            `Block.create_var` automatically.
+        block(Block): The block that the variable belongs to.
        type(core.VarDesc.VarType): Variable type. Please reference the
            framework.proto for details.
-        shape(tuple|list|None): The shape of variable. -1 means the batch size.
+        name(str|None): The name of the variable. If setted None, it will be
+            generated automatically. Default: None
+        shape(tuple|list|None): The shape of the variable. -1 means the batch size.
            Some kinds of variable do not contain shape, just set it to None.
-        dtype(np.dtype|core.VarDesc.VarType|str): The data type of variable.
-        lod_level(int): The level of lod tensor. 0 means it is not a time
+            Default: None
+        dtype(np.dtype|core.VarDesc.VarType|str|None): The data type of variable.
+            Default: None
+        lod_level (int|None): The level of lod tensor. 0 means it is not a time
            series data.
-        capacity(int): The capacity of Channel variable. Ignored
-            for other types.
-        persistable(bool): True if the variable should be saved as check point.
-            Defaults to False.
-        stop_gradient(bool): True if the variable will stop to calculate
-            gradients when backward. Defaults to False.
+            Default: None
+        capacity (int|None): The capacity of Channel variable. Ignored for other
+            types. Default: None
+        persistable (bool|None): True if the variable is persistable. A persistable
+            variable will not be deleted after an iteration ending. Defaults: None.
+        error_clip (BaseErrorClipAttr|None): The error clip attributes of the
+            corresponding gradient variable. Default: None
+        stop_gradient (bool): True if the variable will stop to calculate its
+            gradients when backward. Default: False.
+        is_data (bool): True if the variable is an input data. Default: False
+
+    Notes:
+        The constructor of Variable should not be invoked directly. Please
+        use `Block.create_var` to create a variable.
+
+    Examples:
+        .. code-block:: python
+
+            cur_program = Program()
+            cur_block = cur_program.current_block()
+            new_variable = cur_block.create_var(name="X",
+                                                shape=[-1, 23, 48],
+                                                dtype='float32')
    """

    def __init__(self,
@@ -253,13 +270,14 @@ class Variable(object):
        Get debug string.

        Args:
-            throw_on_error(bool): True if raise an exception when self is not
-                intialized.
+            throw_on_error(bool): True if raise an exception when self is
+                not initialized.
            with_details(bool): more details about variables and parameters
-                (e.g. trainable, optimize_attr, ...) will be printed when with_details is True
-
-        Returns(str): The debug string.
+                (e.g. trainable, optimize_attr, ...) will be printed when
+                with_details is True. Default False;

+        Returns:
+            str: The debug string.
        """
        assert isinstance(throw_on_error, bool) and isinstance(with_details,
                                                               bool)
@@ -276,6 +294,15 @@ class Variable(object):
    __repr__ = __str__

    def set_desc(self, input):
+        """
+        Set the variable description.
+
+        Args:
+            input(core.VarDesc): The new VarDesc.
+
+        Returns:
+            None
+        """
        self.desc = input

    @property
@@ -312,6 +339,15 @@ class Variable(object):
        return self.desc.type()

    def set_error_clip(self, error_clip):
+        """
+        Set the error_clip.
+
+        Args:
+            error_clip(BaseErrorClipAttr) : The new error_clip.
+
+        Returns:
+            None
+        """
        self.error_clip = error_clip


@@ -319,8 +355,8 @@ def get_all_op_protos():
    """
    Get all registered op proto from PaddlePaddle C++ end.

-    Returns(list): list of OpProto
-
+    Returns:
+       list: list of OpProto.
    """
    protostrs = core.get_all_op_protos()
    ret_values = []
@@ -373,9 +409,45 @@ class OpProtoHolder(object):

 class Operator(object):
    """
-    Python Operator class. The operator represents the build in instructions in a
-    Block. Users can use the build in instructions to describe their neural
-    network.
+    In Fluid, all the operation are represented by Operator, and Operator
+    is regarded as a build in an instruction of a Block. Users can use the
+    build in instructions to describe their neural network.
+
+    Args:
+        block(Block): The block has the current operator.
+        desc(core.OpDesc): The protobuf description of Operator.
+        type(str): The type of operator. Default None.
+        inputs(dict): The input of this Operator. it is a dictionary, for every
+            element, key is the input parameter name, and value is a list of
+            variables. Default None.
+        outputs(dict): The output of this Operator. it is a dictionary, for
+            every element, key is the input parameter name, and value is a list
+            of variables. Default None.
+        attrs(dict): The attributes of this Operator. it is a dictionary, for
+            every element, key is attribute name, and value is the attribute value.
+            The attribute type should be as same as the type registered in C++ side.
+            Default None.
+
+    Returns:
+        Operator: The initialized Operator.
+
+    Raises:
+        ValueError: If the passed input, output and attrs doesn't match the
+            initializing Operator's that registered in C++ side.
+
+    Notes:
+        The constructor of operator should not be invoked directly. Use
+        Block.append_op or Block.prepend_op instead.
+
+    Examples:
+        .. code-block:: python
+
+            cur_program = Program()
+            cur_block = cur_program.current_block()
+            # var1 += var2 + var3
+            cur_block.append_op(type="sum",
+                                inputs={"X": [var1, var2, var3]},
+                                outputs={"Out": [var1]})
    """
    OP_WITHOUT_KERNEL_SET = {
        'feed', 'fetch', 'save', 'load', 'recurrent', 'go',
@@ -392,31 +464,7 @@ class Operator(object):
                 inputs=None,
                 outputs=None,
                 attrs=None):
-        """
-        Constructor.

-        Notes: The constructor of operator should not be invoked directly. Use
-        Block.append_op or Block.prepend_op instead.
-
-        >>> cur_program = Program()
-        >>> cur_block = cur_program.current_block()
-        >>> # var1 += var2 + var3
-        >>> cur_block.append_op(type="sum",
-        >>>                     inputs={"X": [var1, var2, var3]},
-        >>>                     outputs={"Out": [var1]})
-
-        Args:
-            block(Block): The block has the current operator.
-            desc(core.OpDesc): The protobuf description.
-            type(str): The type of operator.
-            inputs(dict): The input dictionary. Key is the input parameter name.
-                Value is a list of variables.
-            outputs(dict): The output dictionary which has the same format with
-                           inputs.
-            attrs(dict): The attributes dictionary. Key is attribute name. Value
-                is the attribute value. The attribute type should be as same as
-                the type registered in C++
-        """
        self.block = block
        self.desc = desc
        self.attrs = attrs
@@ -529,12 +577,14 @@ class Operator(object):

    def to_string(self, throw_on_error):
        """
-        To debug string.
+        Get debug string.
+
        Args:
-            throw_on_error(bool): raise exception when self is not initialized
-                when throw_on_error is True
+            throw_on_error(bool): Whether to raise exception if self is not
+                initialized.

-        Returns(str): The debug string.
+        Returns:
+            str: The debug string.

        """
        protostr = self.desc.serialize_to_string()
@@ -552,29 +602,45 @@ class Operator(object):

    def input(self, name):
        """
-        Get input arguments by the input parameter name
-        Args:
-            name(str): The input parameter name
+        Get the input arguments according to the input parameter name.

-        Returns(list): return the list of argument names associated with the
-            specific parameter name.
+        Args:
+            name(str): The input parameter name.

+        Returns:
+            list: return the list of argument names that associated with \
+                the specific parameter name.
        """
        return self.desc.input(name)

    def rename_input(self, old_name, new_name):
+        """
+        Rename the `old_name` to `new_name`.
+
+        Args:
+            old_name(str): The old name of the Operator's input.
+            new_name(str): The new name of the Operator's input.
+
+        Returns:
+            None
+        """
        self.desc.rename_input(old_name, new_name)

    def rename_output(self, old_name, new_name):
+        """
+        Rename the `old_name` to `new_name`.
+
+        Args:
+            old_name(str): The old name of the Operator's output.
+            new_name(str): The new name of the Operator's output.
+
+        Returns:
+            None
+        """
        self.desc.rename_output(old_name, new_name)

    @property
    def input_names(self):
-        """
-        Get all input parameter names
-        Returns(list): return a list of input parameter names
-
-        """
        return self.desc.input_names()

    @property
@@ -587,33 +653,23 @@ class Operator(object):

    def output(self, name):
        """
-        Get output arguments by the output parameter name
-        Args:
-            name(str): The output parameter name
+        Get output arguments by the output parameter name.

-        Returns(list): return the list of argument names associated with the
-            specific parameter name.
+        Args:
+            name(str): The output parameter name.

+        Returns:
+            list: return the list of argument names associated with \
+                the specific parameter name.
        """
        return self.desc.output(name)

    @property
    def output_names(self):
-        """
-        Get all output parameter names
-        Returns(list): return a list of output parameter names
-
-        """
        return self.desc.output_names()

    @property
    def idx(self):
-        """
-        Return the array index of current operator.
-        Returns(int): The array index in block.ops array
-        Raises:
-            ValueError: when the operator is not found.
-        """
        for i, op in enumerate(self.block.ops):
            if op == self:
                return i
@@ -622,27 +678,40 @@ class Operator(object):

    def has_attr(self, name):
        """
-        operator has the attribute with name or not.
+        Whether this Operator has the attribute with name or not.
+
        Args:
-            name(str): the attribute name
+            name(str): the attribute name.

-        Returns(bool): True if has this attribute.
+        Returns:
+            bool: True if has this attribute.

        """
        return self.desc.has_attr(name)

    def attr_type(self, name):
        """
-        Get the type of attribute by attribute name
-        Args:
-            name(str): the attribute name
+        Get the type of attribute by attribute's name.

-        Returns(core.AttrType): the attribute type
+        Args:
+            name(str): the attribute name.

+        Returns:
+            core.AttrType: the attribute type.
        """
        return self.desc.attr_type(name)

    def set_attr(self, name, val):
+        """
+        Set the value of attribute by attribute's name.
+
+        Args:
+            name(str): the attribute name.
+            val(bool|int|str|float|list): the value of the attribute.
+
+        Raises:
+            ValueError: If the type of value doesn't match with desc.attr_type(name).
+        """
        self.attrs[name] = val
        if isinstance(val, Block):
            self.desc.set_block_attr(name, val.desc)
@@ -654,40 +723,39 @@ class Operator(object):

    @property
    def attr_names(self):
-        """
-        Get all attribute names
-        Returns(list): The list of attribute name
-
-        """
        return self.desc.attr_names()

    def attr(self, name):
        """
-        Get attribute by name
+        Get the attribute by name.
+
        Args:
-            name(str): the attribute name
+            name(str): the attribute name.

-        Returns(bool|int|str|float|list): The attribute value. The return value
+        Returns:
+            bool|int|str|float|list: The attribute value. The return value
            can be any valid attribute type.
-
        """
        return self.desc.attr(name)

    def block_attr(self, name):
        """
-        Get the block attribute by name
-        Args:
-            name(str): the attribute name
+        Get the block attribute by name.

-        Returns(int): the block index
+        Args:
+            name(str): the attribute name.

+        Returns:
+            int: the block index.
        """
        return self.desc.block_attr(name)

    def all_attrs(self):
        """
-        Get the attribute dict
-        Returns(dict): The Operator's attribute dict
+        Get the attribute dict.
+
+        Returns:
+            dict: The Operator's attribute dict.
        """
        attr_names = self.attr_names
        attr_map = {}
@@ -700,6 +768,35 @@ class Operator(object):


 class Block(object):
+    """
+    In Fluid, a Program is consistence of multi-Block, and Block stores
+    VarDesc and OpDesc. In a specific Block, a VarDesc have a unique name.
+    One block could have some child blocks, and child block's name scopes
+    should inherit the parent's so that OpDesc in child block can reference
+    a VarDesc that is stored in the parent block.
+    Please reference the framework.proto for details.
+
+    Args:
+        program(Program): The Program that the Block belongs to.
+        idx(int): The block's id in the Program.
+
+    Notes:
+        The constructor of Block should not be invoked directly. Please
+        use `Program.create_block()` to create a block.
+
+    Examples:
+        .. code-block:: python
+
+            cur_program = Program()
+            cur_block = cur_program.current_block()
+            var = cur_block.create_var(name="X",
+                                       shape=[-1, 23, 48],
+                                       dtype='float32')
+            cur_block.append_op(type="abs",
+                                inputs={"X": [var]},
+                                outputs={"Out": [var]})
+    """
+
    def __init__(self, program, idx):
        self.desc = program.desc.block(idx)
        self.vars = collections.OrderedDict()  # var_name --> var
@@ -712,15 +809,17 @@ class Block(object):

    def to_string(self, throw_on_error, with_details=False):
        """
-        To debug string.
+        Get debug string.
+
        Args:
            throw_on_error(bool): raise exception when self is not initialized
-                when throw_on_error is True
+                when throw_on_error is True.
            with_details(bool): more details about variables and parameters
-                (e.g. trainable, optimize_attr, ...) will be printed when with_details is True
-
-        Returns(str): The debug string.
+                (e.g. trainable, optimize_attr, ...) will be printed when
+                with_details is True. Default False.

+        Returns:
+            str: The debug string.
        """
        assert isinstance(throw_on_error, bool) and isinstance(with_details,
                                                               bool)
@@ -752,6 +851,15 @@ class Block(object):
        return self.desc.get_forward_block_idx()

    def set_forward_block_idx(self, idx):
+        """
+        Set the forward block Idx.
+
+        Args:
+            idx(int): the block index.
+
+        Returns:
+            None
+        """
        self.desc.set_forward_block_idx(idx)

    @property
@@ -759,6 +867,19 @@ class Block(object):
        return self.desc.id

    def var(self, name):
+        """
+        Get a Variable by name from this block.
+
+        Args:
+            name(str): the Variable's name.
+
+        Raises:
+            ValueError: The If input's type is not str, or this block
+                doesn't have a Variable with the giving name.
+
+        Returns:
+            Variable: the Variable with the giving name.
+        """
        if not isinstance(name, basestring):
            raise TypeError(
                "var require string as parameter, but get %s instead." %
@@ -769,6 +890,19 @@ class Block(object):
        return v

    def var_recursive(self, name):
+        """
+        Get a Variable by name from this block recursively.
+
+        Args:
+            name(str): the Variable's name.
+
+        Raises:
+            ValueError: this block and this parent block doesn't
+                have a Variable with the giving name.
+
+        Returns:
+            Variable: the Variable with the giving name.
+        """
        frontier = list()
        visited = set()

@@ -815,6 +949,18 @@ class Block(object):
    def rename_var(self, name, new_name):
        """
        Rename variable in vars and ops' inputs and outputs
+
+        Args:
+            name(str): the name that need to be renamed.
+            new_name(str): the name that need to rename to.
+
+        Raises:
+            ValueError: If this block doesn't have this the giving name,
+                or the type of the var with the giving name is not Parameter
+                or Variable.
+
+        Returns:
+            Variable: the Variable with the giving name.
        """
        if not self.has_var(name):
            raise ValueError("var %s is not in current block" % name)
@@ -878,12 +1024,27 @@ class Block(object):
        return param

    def append_op(self, *args, **kwargs):
+        """
+        Appends a new Operator according to the giving arguments.
+
+        Returns:
+            Operator: the append Operator.
+        """
        op_desc = self.desc.append_op()
        op = Operator(block=self, desc=op_desc, *args, **kwargs)
        self.ops.append(op)
        return op

    def insert_op(self, index, *args, **kwargs):
+        """
+        Insert a Operator according to the giving arguments.
+
+        Args:
+            index(int): the place that the operator to insert.
+
+        Returns:
+            Operator: the insert Operator.
+        """
        self.sync_with_cpp()
        op_desc = self.desc.insert_op(index)
        op = Operator(block=self, desc=op_desc, *args, **kwargs)
@@ -891,11 +1052,30 @@ class Block(object):
        return op

    def remove_op(self, index):
+        """
+        Remove the specific position operator.
+
+        Args:
+            index(int): the position that the operator to insert.
+
+        Returns:
+            None
+        """
        self.sync_with_cpp()
        self.desc.remove_op(index, index + 1)
        del self.ops[index]

    def slice_ops(self, start, end):
+        """
+        Return the Operator between start and end.
+
+        Args:
+            start(int): the start position.
+            end(int): the end position.
+
+        Returns:
+            list: the Operators between start and end.
+        """
        return self.ops[start:end]

    def prepend_op(self, *args, **kwargs):
@@ -906,9 +1086,8 @@ class Block(object):

    def sync_with_cpp(self):
        """
-        Sync from the desc on the c++ end.
-
-        This method is used to synchronize the c++ desc instance generated by backward.
+        Sync from the desc on the c++ end. This method is used to synchronize
+        the c++ desc instance generated by backward.
        """
        # sync variables from cpp
        for var in self.desc.all_vars():
@@ -973,9 +1152,14 @@ class Block(object):

    def copy_param_info_from(self, other):
        """
-        Copy the information of parameters from the other block
+        Copy the information of parameters from the other block.
+
        Args:
-            other(Block): the other block
+            other(Block): the other block.
+
+        Raises:
+            ValueError: If type of input is not Block, or the `other` and this
+                block is not in the same topology.

        Returns:
            None
@@ -1007,11 +1191,12 @@ class Block(object):
    def clone_variable(self, var):
        """
        Clone a variable into current block.
+
        Args:
            var: the variable to be cloned.

        Returns:
-            The new  variable cloned from 'var' in current block.
+            Variable: the new  variable cloned from 'var' in current block.
        """
        assert isinstance(var, Variable)
        ret_var = None
@@ -1051,23 +1236,18 @@ class Program(object):
    Notes: we have default_startup_program and default_main_program
    by default, a pair of them will shared the parameters.
    The default_startup_program only run once to initialize parameters,
-    default_main_program run in every minibatch and adjust the weights.
-
-    Args:
-        None
+    default_main_program run in every mini batch and adjust the weights.

    Returns:
-        Python Program
+        A empty program.

    Examples:
-       .. code-block:: python
-
-         main_program = Program()
-         startup_program = Program()
-         with fluid.program_guard(main_program=main_program, startup_program=startup_program):
-            fluid.layers.data(name="x", shape=[-1, 784], dtype='float32')
-            fluid.layers.data(name="y", shape=[-1, 1], dtype='int32')
-            fluid.layers.fc(name="fc", shape=[10], dtype='float32', act="relu")
+        >>> main_program = fluid.Program()
+        >>> startup_program = fluid.Program()
+        >>> with fluid.program_guard(main_program=main_program, startup_program=startup_program):
+        >>>     fluid.layers.data(name="x", shape=[-1, 784], dtype='float32')
+        >>>     fluid.layers.data(name="y", shape=[-1, 1], dtype='int32')
+        >>>     fluid.layers.fc(name="fc", shape=[10], dtype='float32', act="relu")

    """

@@ -1081,6 +1261,19 @@ class Program(object):

    @property
    def op_role(self):
+        """
+        The operator role. In a enum {Forward, Backward, Optimize}.
+
+        Notes: this is a low level API. It is used only for ParallelExecutor to
+        duplicate or schedule operator to devices.
+
+        For example, the forward operator should be executed on every device.
+        The backward operator should be executed on every device and the
+        parameter gradient of backward (use :code:`op_role_var` to get this
+        variable) operator should be merged to one device. The optimization
+        operators should be executed on only one device and broadcast the
+        optimization result, i.e., the new parameter, to every other device.
+        """
        return self._current_role

    @op_role.setter
@@ -1089,6 +1282,13 @@ class Program(object):

    @property
    def op_role_var(self):
+        """
+        The auxiliary variables for :code:`op_role` property.
+
+        See Also: :code:`Program.op_role`'s documentation for details.
+
+        Notes: This is a very low-level API. Users should not use it directly.
+        """
        return self._op_role_var

    @op_role_var.setter
@@ -1097,6 +1297,21 @@ class Program(object):

    @contextlib.contextmanager
    def optimized_guard(self, var):
+        """
+        A with guard to set :code:`Optimization` :code:`OpRole` and
+        :code:`OpRoleVar` automatically.
+
+        Notes: This is a very low level API. Users should not use it directly.
+
+        Args:
+            var(Variable|str): The variable (name) to be optimized.
+
+        Examples:
+
+            >>> p, g = backward(...)
+            >>> with program.optimized_guard(p):
+            >>>     p = p - 0.001 * g
+        """
        OpRole = core.op_proto_and_checker_maker.OpRole
        self._current_role = OpRole.Optimize
        self._op_role_var = [var.name if isinstance(var, Variable) else var]
@@ -1105,18 +1320,35 @@ class Program(object):
        self._current_role = OpRole.Forward

    def __str__(self):
+        """
+        Get the protobuf debug string of this Program.
+
+        Returns:
+            (str): The protobuf debug string.
+
+        Raises:
+            ValueError: If any of required fields is not set.
+        """
        return self.to_string(True)

    def to_string(self, throw_on_error, with_details=False):
        """
        To debug string.
+
        Args:
-            throw_on_error(bool): raise exception when self is not initialized
-                when throw_on_error is True
-            with_details(bool): more details about variables and parameters
-                (e.g. trainable, optimize_attr, ...) will be printed when with_details is True
+            throw_on_error(bool): raise Value error when any of required fields
+                is not set.

-        Returns(str): The debug string.
+            with_details(bool): True if more details about variables and
+                parameters, e.g., :code:`trainable`, :code:`optimize_attr`, need
+                to print.
+
+        Returns
+            (str): The debug string.
+
+        Raises:
+            ValueError: If any of required fields is not set and throw_on_error is
+                True.

        """
        assert isinstance(throw_on_error, bool) and isinstance(with_details,
@@ -1132,25 +1364,89 @@ class Program(object):
        return res_str

    def get_desc(self):
+        """
+        Get the C++ side of `ProgramDesc` object pointer. The C++ object is
+        exposed by :code:`pybind`.
+
+        Notes: This is a very low level API. Users should not use this API
+        directly.
+        """
        return self.desc

    def clone(self, for_test=False):
-        """Clone the Program object
-        Args:
-           for_test(bool): indicate whether clone for test.
+        """
+        Create a new, duplicated program.
+

-        Set for_test to False when we want to clone the program for training.
-        Set for_test to True when we want to clone the program for testing.
+        Some operators, e.g., :code:`batch_norm`, behave differently between
+        training and testing. They have an attribute, :code:`is_test`, to
+        control this behaviour. This method will change the :code:`is_test`
+        attribute of them to :code:`True` when :code:`for_test=True`.
+
+        * Set for_test to False when we want to clone the program for training.
+        * Set for_test to True when we want to clone the program for testing.
+
+        Notes: This API DOES NOT prune any operator. Use
+        :code:`clone(for_test=True)` before backward and optimization please.

        Args:
-            for_test(bool): Some operators, such as batch_norm and drop_out ops,
-                behave differently in training and testing. If for_test is True,
-                the is_test attributes in these operators will be set to True for
-                testing purposes, otherwise, they remain unchanged.
+            for_test(bool): True if change the :code:`is_test` attribute of
+                operators to :code:`True`.

        Returns:
-            Program: The cloned Program object.
-
+            Program: The new, duplicated Program object.
+
+        Examples:
+
+            1. To clone a test program, the sample code is:
+
+            >>> import paddle.fluid as fluid
+            >>> train_program = fluid.Program()
+            >>> startup_program = fluid.Program()
+            >>> with fluid.program_guard(train_program, startup_program):
+            >>>     img = fluid.layers.data(name='image', shape=[784])
+            >>>     hidden = fluid.layers.fc(input=img, size=200, act='relu')
+            >>>     hidden = fluid.layers.dropout(hidden, dropout_prob=0.5)
+            >>>     loss = fluid.layers.cross_entropy(
+            >>>                 input=fluid.layers.fc(hidden, size=10, act='softmax'),
+            >>>                 label=fluid.layers.data(name='label', shape=[1], dtype='int64'))
+            >>>
+            >>> test_program = train_program.clone(for_test=True)
+            >>>
+            >>> sgd = fluid.optimizer.SGD(learning_rate=1e-3)
+            >>> with fluid.program_guard(train_program, startup_program):
+            >>>     sgd.minimize(loss)
+
+            2. The :code:`clone` method can be avoid if you create program for
+            training and program for testing individually.
+
+            >>> import paddle.fluid as fluid
+            >>>
+            >>> def network(is_test):
+            >>>     img = fluid.layers.data(name='image', shape=[784])
+            >>>     hidden = fluid.layers.fc(input=img, size=200, act='relu')
+            >>>     hidden = fluid.layers.dropout(hidden, dropout_prob=0.5, is_test=is_test)
+            >>>     loss = fluid.layers.cross_entropy(
+            >>>                 input=fluid.layers.fc(hidden, size=10, act='softmax'),
+            >>>                 label=fluid.layers.data(name='label', shape=[1], dtype='int64'))
+            >>>     return loss
+            >>>
+            >>> train_program = fluid.Program()
+            >>> startup_program = fluid.Program()
+            >>> test_program = fluid.Program()
+            >>>
+            >>> with fluid.program_guard(train_program, startup_program):
+            >>>     with fluid.unique_name.guard():
+            >>>         loss = network(is_test=False)
+            >>>         sgd = fluid.optimizer.SGD(learning_rate=1e-3)
+            >>>         sgd.minimize(loss)
+            >>>
+            >>> # the test startup program is not used.
+            >>> with fluid.program_guard(test_program, fluid.Program()):
+            >>>     with fluid.unique_name.guard():
+            >>>         loss = network(is_test=True)
+
+            The two code snippets above will generate same programs.
        """
        if for_test:
            p = self.inference_optimize()
@@ -1165,6 +1461,21 @@ class Program(object):
        return p

    def prune(self, targets):
+        """
+        Prune operators and variables which are not needed to generate
+        :code:`targets`.
+
+        Notes: This is a very low level API. Users should not use this API
+        directly. This API is in flux and not stable.
+
+        Args:
+            targets(list|Variable|Operator): A list of variables or operators
+                need to be pruned
+
+        Returns:
+            Program:  A new, pruned program.
+
+        """
        if not isinstance(targets, list):
            targets = [targets]
        targets_idx = []
@@ -1199,6 +1510,17 @@ class Program(object):
        return res

    def inference_optimize(self):
+        """
+        This method will create a new program and change the :code:`is_test`
+        attribute of operators to :code:`True`. All the :code:`Parameter`
+        information will be lost.
+
+        Notes: This API is a very low level API. Use
+        :code:`Program.clone(for_test=True)` instead.
+
+        Returns:
+            Program: The new program.
+        """
        # this is an alternative implement before
        # core.inference_optimize being fixed.
        res = Program()
@@ -1215,6 +1537,18 @@ class Program(object):

    @staticmethod
    def parse_from_string(binary_str):
+        """
+        Deserialize a program desc from protobuf binary string.
+
+        Notes: All information about parameters will be lost after serialization
+        and deserialization.
+
+        Args:
+            binary_str(str): The binary prootbuf string.
+
+        Returns:
+            Program: A deserialized program desc.
+        """
        p = Program()
        p.desc = core.ProgramDesc(binary_str)
        p.blocks = [Block(p, i) for i in xrange(p.desc.num_blocks())]
@@ -1223,10 +1557,19 @@ class Program(object):

    @property
    def random_seed(self):
+        """
+        The default random seed for random operators in Program. Zero means get
+        the random seed from random device.
+
+        Notes: It must be set before the operators have been added.
+        """
        return self._seed

    @property
    def num_blocks(self):
+        """
+        The number of blocks in this program.
+        """
        return self.desc.num_blocks()

    @random_seed.setter
@@ -1239,15 +1582,40 @@ class Program(object):
        return str(self)

    def global_block(self):
+        """
+        Get the first block of this program.
+        """
        return self.blocks[0]

    def block(self, index):
+        """
+        Get the :code:`index` block of this program
+        Args:
+            index(int): The index of block to get
+
+        Returns:
+            Block: The :code:`index` block
+        """
        return self.blocks[index]

    def current_block(self):
+        """
+        Get the current block. The :code:`current` block is the block to append
+        operators.
+        """
        return self.blocks[self.current_block_idx]

    def create_block(self, parent_idx=None):
+        """
+        Create a new block with the :code:`parent_idx` and change the current block
+        to new block.
+
+        Args:
+            parent_idx(int): The parent block index.
+
+        Returns:
+            Block: The new block.
+        """
        new_block_idx = len(self.blocks)
        parent = self.current_block() if parent_idx is None else self.block(
            parent_idx)
@@ -1257,9 +1625,24 @@ class Program(object):
        return self.current_block()

    def rollback(self):
+        """
+        Exit a code block, i.e., roll back to the parent block.
+        Returns:
+            None
+        """
        self.current_block_idx = self.current_block().parent_idx

    def sync_with_cpp(self):
+        """
+        Synchronize Python instance to its binding C++ object instance.
+        If the program is modified in C++ space, this method should be invoked.
+
+        Notes: This is a very low level API. Users should not invoke it
+        directly.
+
+        Returns:
+            None
+        """
        for block_idx in range(len(self.blocks), self.desc.num_blocks()):
            self.blocks.append(Block(self, block_idx))
        for block in self.blocks:
@@ -1269,6 +1652,9 @@ class Program(object):
        """
        Copy the information of parameters from other program.

+        Notes: This is a very low level API. Users should not invoke it
+        directly.
+
        Args:
            other(Program): Other program

@@ -1288,6 +1674,9 @@ class Program(object):
        """
        Copy the information of data variables from other program.

+        Notes: This is a very low level API. Users should not invoke it
+        directly.
+
        Args:
            other(Program): Other program

@@ -1306,12 +1695,41 @@ class Program(object):
                self.global_block().var(var.name).is_data = True

    def list_vars(self):
+        """
+        Get all variables from this Program. A iterable object is returned.
+
+        Returns:
+            iterable: The generator will yield every variable in this program.
+        """
        for each_block in self.blocks:
            for each_var in each_block.vars.itervalues():
                yield each_var


 class Parameter(Variable):
+    """
+    Parameter is derived from Variable. A parameter is a persistable 
+    Variable, and will be updated by optimizers after each iteration.
+    The training of a neural network is essentially the updating of 
+    its parameters.
+
+    Relative to a general Variable, a Parameter has several its own
+    member variables:
+
+    Args:
+        trainable(bool): True if the parameter need to be updated after
+            iterations.
+        optimize_attr(map): Parameter attributes related with optimizing.
+            Currently, it only contains 'learning_rate'.
+            Default: {'learning_rate': 1.0}
+        regularizer(WeightDecayRegularizer): The Regularizer which will
+            be applied on the parameter. Default: None
+        gradient_clip_attr(BaseGradientClipAttr): The gradint clip strategy
+            which will be applied on the parameter. Default: None
+        do_model_average(bool): True if the model average strategy will
+            be applied on this parameter.
+    """
+
    def __init__(self, block, shape, dtype, **kwargs):
        if shape is None or dtype is None:
            raise ValueError("Parameter must set shape and dtype")
@@ -1374,8 +1792,15 @@ _startup_program_ = Program()

 def default_startup_program():
    """
-    Get default startup program. In startup program, Paddle will initialize
-    parameters, initialize nccl handle, etc.
+    Get default/global startup program.
+
+    The layer function in :code:`fluid.layers` will create parameters, readers,
+    NCCL handles as global variables. The :code:`startup_program` will
+    initialize them by the operators in startup program. The layer function will
+    append these initialization operators into startup program.
+
+    This method will return the :code:`default` or the :code:`current` startup
+    program. Users can use :code:`fluid.program_guard` to switch program.

    Returns:
        Program: startup program
@@ -1385,7 +1810,15 @@ def default_startup_program():

 def default_main_program():
    """
-    Get default main program. The main program is used for training or testing.
+    Get default/global main program. The main program is used for training or
+    testing.
+
+    All layer function in :code:`fluid.layers` will append operators and
+    variables to the :code:`default_main_program`.
+
+    The :code:`default_main_program` is the default program in a lot of APIs.
+    For example, the :code:`Executor.run()` will execute the
+    :code:`default_main_program` when the program is not specified.

    Returns:
        Program: main program
@@ -1427,20 +1860,34 @@ def switch_startup_program(program):
 @contextlib.contextmanager
 def program_guard(main_program, startup_program=None):
    """
-    Switch program with `with` statement
+    Change the global main program and startup program with `with` statement.
+    Layer functions in the Python `with` block will append operators and
+    variables to the new main programs.

    Examples:
-        >>> with program_guard(Program()):
-        >>>   data = fluid.layers.data(...)
-        >>>   hidden = fluid.layers.fc(...)
+
+        >>> import paddle.fluid as fluid
+        >>> main_program = fluid.Program()
+        >>> startup_program = fluid.Program()
+        >>> with fluid.program_guard(main_program, startup_program):
+        >>>     data = fluid.layers.data(...)
+        >>>     hidden = fluid.layers.fc(...)
+
+    Notes: The temporary :code:`Program` can be used if the user does not need
+    to construct either of startup program or main program.
+
+    Examples:
+
+        >>> import paddle.fluid as fluid
+        >>> main_program = fluid.Program()
+        >>> # does not care about startup program. Just pass a temporary value.
+        >>> with fluid.program_guard(main_program, fluid.Program()):
+        >>>     data = ...

    Args:
-        main_program(Program): New main program inside `with` statement
+        main_program(Program): New main program inside `with` statement.
        startup_program(Program): New startup program inside `with` statement.
            None means do not change startup program.
-
-    Returns:
-        None
    """
    if not isinstance(main_program, Program):
        raise TypeError("main_program should be Program")
@@ -1457,7 +1904,8 @@ def program_guard(main_program, startup_program=None):

 def get_var(name, program=None):
    """
-    Get a variable by name from the global block of a program
+    Get a variable by name from the global block of a program.
+    
    Args:
        name(str): name of the variable
        program(Program|None): program object.

--- a/python/paddle/fluid/io.py
+++ b/python/paddle/fluid/io.py
@@ -30,20 +30,42 @@ __all__ = [


 def is_parameter(var):
-    """Check whether the variable is a Parameter.
-
-    This function checks whether the input variable is a Parameter.
+    """
+    Check whether the given variable is an instance of Parameter.

    Args:
-        var : The input variable.
+        var(Variable): The variable to be checked.

    Returns:
-        boolean result whether the variable is a Parameter.
+        bool: True if the given `var` is an instance of Parameter,
+        False if not.
+
+    Examples:
+        .. code-block:: python
+
+            param = fluid.default_main_program().global_block().var('fc.w')
+            res = fluid.io.is_parameter(param)
    """
    return isinstance(var, Parameter)


 def is_persistable(var):
+    """
+    Check whether the given variable is persistable.
+
+    Args:
+        var(Variable): The variable to be checked.
+
+    Returns:
+        bool: True if the given `var` is persistable
+        False if not.
+
+    Examples:
+        .. code-block:: python
+
+            param = fluid.default_main_program().global_block().var('fc.w')
+            res = fluid.io.is_persistable(param)
+    """
    if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \
            var.desc.type() == core.VarDesc.VarType.FETCH_LIST:
        return False
@@ -68,20 +90,69 @@ def save_vars(executor,
              predicate=None,
              filename=None):
    """
-    Save variables to directory by executor.
+    Save variables to the given directory by executor.
+
+    There are two ways to specify variables to be saved: The first way, list 
+    variables in a list and assign it to the `vars`. The second way, assign the 
+    `main_program` with an existing program, then all variables in the program 
+    will be saved. The first way has a higher priority. In other words, if `vars` 
+    are assigned, the `main_program` and the `predicate` will be ignored.

-    :param executor: executor that save variable
-    :param dirname: directory path
-    :param main_program: program. If vars is None, then filter all variables in this
-    program which fit `predicate`. Default default_main_program.
-    :param predicate: The Predicate describes a callable that returns a variable
-    as a bool. If it returns true, the corresponding input variable will be saved.
-    :param vars: variables need to be saved. If vars is specified, program & predicate
-    will be ignored
-    :param filename: The name of a single file that all vars are saved to.
-        If it is None, save variables to separate files.
+    The `dirname` are used to specify the folder where to save variables. 
+    If you prefer to save variables in separate files in the folder `dirname`, 
+    set `filename` None; if you prefer to save all variables in a single file, 
+    use `filename` to specify it.

-    :return: None
+    Args:
+        executor(Executor): The executor to run for saving variables.
+        dirname(str): The directory path.
+        main_program(Program|None): The program whose variables will be saved. 
+                                    If it is None, the default main program will 
+                                    be used automatically.
+                                    Default: None
+        vars(list[Variable]|None): The list that contains all variables to save. 
+                                   It has a higher priority than the `main_program`.
+                                   Default: None
+        predicate(function|None): If it is not None, only variables in the 
+                                  `main_program` that makes predicate(variable)==True 
+                                  will be saved. It only works when we are using the 
+                                  `main_program` to specify variables (In other words 
+                                  `vars` is None).
+                                  Default: None
+        filename(str|None): The file which to save all variables. If you prefer to save 
+                            variables separately, set it to None.
+                            Default: None
+
+    Returns:
+        None
+
+    Raises:
+        TypeError: If `main_program` is not an instance of Program nor None.
+
+    Examples:
+        .. code-block:: python
+
+            exe = fluid.Executor(fluid.CPUPlace())
+            param_path = "./my_paddle_model"
+
+            # The first usage: using `main_program` to specify variables
+            def name_has_fc(var):
+                res = "fc" in var.name
+                return res
+
+            prog = fluid.default_main_program()
+            fluid.io.save_vars(executor=exe, dirname=path, main_program=prog,
+                               vars=None)
+            # All variables in `main_program` whose name includes "fc" will be saved.
+            # And variables are going to be saved separately.
+
+
+            # The second usage: using `vars` to specify variables
+            var_list = [var_a, var_b, var_c]
+            fluid.io.save_vars(executor=exe, dirname=path, vars=var_list, 
+                               filename="vars_file")
+            # var_a, var_b and var_c will be saved. And they are going to be
+            # saved in the same file named 'var_file' in the path "./my_paddle_model".
    """
    if vars is None:
        if main_program is None:
@@ -129,7 +200,42 @@ def save_vars(executor,

 def save_params(executor, dirname, main_program=None, filename=None):
    """
-    Save all parameters to directory with executor.
+    This function filters out all parameters from the give `main_program`
+    and then save them to the folder `dirname` or the file `filename`.
+
+    Use the `dirname` to specify the saving folder. If you would like to 
+    save parameters in separate files, set `filename` None; if you would 
+    like to save all parameters in a single file, use `filename` to specify 
+    the file name.
+
+    NOTICE: Some variables are not Parameter while they are necessary for 
+    training. So you can NOT save and continue your training just by 
+    `save_params()` and `load_params()`. Please use `save_persistables()` 
+    and `load_persistables()` instead.
+
+    Args:
+        executor(Executor): The executor to run for saving parameters.
+        dirname(str): The saving directory path.
+        main_program(Program|None): The program whose parameters will be
+                                    saved. If it is None, the default
+                                    main program will be used automatically.
+                                    Default: None
+        filename(str|None): The file to save all parameters. If you prefer 
+                            to save parameters in differnet files, set it 
+                            to None.
+                            Default: None
+
+    Returns:
+        None
+
+    Examples:
+        .. code-block:: python
+
+            exe = fluid.Executor(fluid.CPUPlace())
+            param_path = "./my_paddle_model"
+            prog = fluid.default_main_program()
+            fluid.io.save_params(executor=exe, dirname=param_path, 
+                                 main_program=None)
    """
    save_vars(
        executor,
@@ -142,7 +248,37 @@ def save_params(executor, dirname, main_program=None, filename=None):

 def save_persistables(executor, dirname, main_program=None, filename=None):
    """
-    Save all persistables to directory with executor.
+    This function filters out all variables with `persistable==True` from the 
+    give `main_program` and then saves these variables to the folder `dirname` 
+    or file `filename`.
+
+    The `dirname` is used to specify the folder where persistable variables 
+    are going to be saved. If you would like to save variables in separate 
+    files, set `filename` None; if you would like to save all variables in a 
+    single file, use `filename` to specify the file name.
+
+    Args:
+        executor(Executor): The executor to run for saving persistable variables.
+        dirname(str): The directory path.
+        main_program(Program|None): The program whose persistbale variables will 
+                                    be saved. If it is None, the default main 
+                                    program will be used automatically.
+                                    Default: None
+        filename(str|None): The file to saved all variables. If you prefer to 
+                            save variables in differnet files, set it to None.
+                            Default: None
+
+    Returns:
+        None
+
+    Examples:
+        .. code-block:: python
+
+            exe = fluid.Executor(fluid.CPUPlace())
+            param_path = "./my_paddle_model"
+            prog = fluid.default_main_program()
+            fluid.io.save_persistables(executor=exe, dirname=param_path, 
+                                       main_program=None)
    """
    save_vars(
        executor,
@@ -160,20 +296,69 @@ def load_vars(executor,
              predicate=None,
              filename=None):
    """
-    Load variables from directory by executor.
+    Load variables from the given directory by executor.
+
+    There are two ways to specify variables to be loaded: The first way, list 
+    variables in a list and assign it to the `vars`. The second way, assign the 
+    `main_program` with an existing program, then all variables in the program 
+    will be loaded. The first way has a higher priority. In other words if `vars` 
+    are assigned, the `main_program` and the `predicate` will be ignored.
+
+    The `dirname` are used to specify the folder where to load variables. 
+    If variables were saved in separate files in the folder `dirname`, 
+    set `filename` None; if all variables were saved in a single file, 
+    use `filename` to specify it.

-    :param executor: executor that load variable
-    :param dirname: directory path
-    :param main_program: program. If vars is None, then filter all variables in this
-    program which fit `predicate`. Default default_main_program().
-    :param predicate: The Predicate describes a callable that returns a variable
-    as a bool. If it returns true, the corresponding input variable will be loaded.
-    :param vars: variables need to be loaded. If vars is specified, program &
-    predicate will be ignored
-    :param filename: The name of the single file that all vars are loaded from.
-        If it is None, load variables from separate files.
+    Args:
+        executor(Executor): The executor to run for loading variables.
+        dirname(str): The directory path.
+        main_program(Program|None): The program whose variables will be loaded. 
+                                    If it is None, the default main program will 
+                                    be used automatically.
+                                    Default: None
+        vars(list[Variable]|None): The list that contains all variables to load. 
+                                   It has a higher priority than the `main_program`.
+                                   Default: None
+        predicate(function|None): If it is not None, only variables in the 
+                                  `main_program` that makes predicate(variable)==True 
+                                  will be loaded. It only works when we are using the 
+                                  `main_program` to specify variables (In other words 
+                                  `vars` is None).
+                                  Default: None
+        filename(str|None): The file which saved all required variables. If variables 
+                            were saved in differnet files, set it to None.
+                            Default: None
+
+    Returns:
+        None
+
+    Raises:
+        TypeError: If `main_program` is not an instance of Program nor None.
+
+    Examples:
+        .. code-block:: python
+
+            exe = fluid.Executor(fluid.CPUPlace())
+            param_path = "./my_paddle_model"
+
+            # The first usage: using `main_program` to specify variables
+            def name_has_fc(var):
+                res = "fc" in var.name
+                return res

-    :return: None
+            prog = fluid.default_main_program()
+            fluid.io.load_vars(executor=exe, dirname=path, main_program=prog,
+                               vars=None)
+            # All variables in `main_program` whose name includes "fc" will be loaded.
+            # And all the variables are supposed to have been saved in differnet files.
+
+
+            # The second usage: using `vars` to specify variables
+            var_list = [var_a, var_b, var_c]
+            fluid.io.load_vars(executor=exe, dirname=path, vars=var_list, 
+                               filename="vars_file")
+            # var_a, var_b and var_c will be loaded. And they are supposed to haven 
+            # been saved in the same file named 'var_file' in the path "./my_paddle_model".
    """
    if vars is None:
        if main_program is None:
@@ -221,7 +406,42 @@ def load_vars(executor,

 def load_params(executor, dirname, main_program=None, filename=None):
    """
-    load all parameters from directory by executor.
+    This function filters out all parameters from the give `main_program`
+    and then trys to load these parameters from the folder `dirname` or
+    the file `filename`.
+
+    Use the `dirname` to specify the folder where parameters were saved. If 
+    parameters were saved in separate files in the folder `dirname`, set 
+    `filename` None; if all parameters were saved in a single file, use 
+    `filename` to specify the file name.
+
+    NOTICE: Some variables are not Parameter while they are necessary for 
+    training. So you can NOT save and continue your training just by 
+    `save_params()` and `load_params()`. Please use `save_persistables()` 
+    and `load_persistables()` instead. 
+
+    Args:
+        executor(Executor): The executor to run for loading parameters.
+        dirname(str): The directory path.
+        main_program(Program|None): The program whose parameters will be
+                                    loaded. If it is None, the default
+                                    main program will be used automatically.
+                                    Default: None
+        filename(str|None): The file which saved all parameters. If parameters 
+                            were saved in differnet files, set it to None.
+                            Default: None
+
+    Returns:
+        None
+
+    Examples:
+        .. code-block:: python
+
+            exe = fluid.Executor(fluid.CPUPlace())
+            param_path = "./my_paddle_model"
+            prog = fluid.default_main_program()
+            fluid.io.load_params(executor=exe, dirname=param_path, 
+                                main_program=None)
    """
    load_vars(
        executor,
@@ -233,7 +453,37 @@ def load_params(executor, dirname, main_program=None, filename=None):

 def load_persistables(executor, dirname, main_program=None, filename=None):
    """
-    load all persistables from directory by executor.
+    This function filters out all variables with `persistable==True` from the 
+    give `main_program` and then trys to load these variables from the folder 
+    `dirname` or the file `filename`.
+
+    Use the `dirname` to specify the folder where persistable variables were 
+    saved. If variables were saved in separate files, set `filename` None; 
+    if all variables were saved in a single file, use `filename` to specify 
+    the file name.
+
+    Args:
+        executor(Executor): The executor to run for loading persistable variables.
+        dirname(str): The directory path.
+        main_program(Program|None): The program whose persistbale variables will 
+                                    be loaded. If it is None, the default main 
+                                    program will be used automatically.
+                                    Default: None
+        filename(str|None): The file which saved all variables. If variables were 
+                            saved in differnet files, set it to None.
+                            Default: None
+
+    Returns:
+        None
+
+    Examples:
+        .. code-block:: python
+
+            exe = fluid.Executor(fluid.CPUPlace())
+            param_path = "./my_paddle_model"
+            prog = fluid.default_main_program()
+            fluid.io.load_persistables(executor=exe, dirname=param_path, 
+                                       main_program=None)
    """
    load_vars(
        executor,
@@ -306,22 +556,48 @@ def save_inference_model(dirname,
                         model_filename=None,
                         params_filename=None):
    """
-    Build a model especially for inference,
-    and save it to directory by the executor.
+    Prune the given `main_program` to build a new program especially for inference,
+    and then save it and all related parameters to given `dirname` by the `executor`.
+
+    Args:
+        dirname(str): The directory path to save the inference model.
+        feeded_var_names(list[str]): Names of variables that need to be feeded data 
+                                     during inference.
+        target_vars(list[Variable]): Variables from which we can get inference 
+                                     results.
+        executor(Executor): The executor that saves the inference model.
+        main_program(Program|None): The original program, which will be pruned to 
+                                    build the inference model. If is setted None, 
+                                    the default main program will be used.
+                                    Default: None.
+        model_filename(str|None): The name of file to save the inference program 
+                                  itself. If is setted None, a default filename 
+                                  `__model__` will be used.
+        params_filename(str|None): The name of file to save all related parameters. 
+                                   If it is setted None, parameters will be saved 
+                                   in separate files .

-    :param dirname: directory path
-    :param feeded_var_names: Names of variables that need to be feeded data during inference
-    :param target_vars: Variables from which we can get inference results.
-    :param executor: executor that save inference model
-    :param main_program: original program, which will be pruned to build the inference model.
-            Default default_main_program().
-    :param model_filename: The name of file to save inference program.
-        If not specified, default filename `__model__` will be used.
-    :param params_filename: The name of file to save parameters.
-        It is used for the case that all parameters are saved in a single binary file.
-        If not specified, parameters are considered saved in separate files.
+    Returns:
+        None
+
+    Raises:
+        ValueError: If `feed_var_names` is not a list of basestring.
+        ValueError: If `target_vars` is not a list of Variable.
+
+    Examples:
+        .. code-block:: python
+
+            exe = fluid.Executor(fluid.CPUPlace())
+            path = "./infer_model"
+            fluid.io.save_inference_model(dirname=path, feeded_var_names=['img'],
+                         target_vars=[predict_var], executor=exe)
+
+            # In this exsample, the function will prune the default main program 
+            # to make it suitable for infering the `predict_var`. The pruned 
+            # inference program is going to be saved in the "./infer_model/__model__" 
+            # and parameters are going to be saved in separate files under folder
+            # "./infer_model". 

-    :return: None
    """
    if isinstance(feeded_var_names, basestring):
        feeded_var_names = [feeded_var_names]
@@ -382,18 +658,49 @@ def load_inference_model(dirname,
    """
    Load inference model from a directory

-    :param dirname: directory path
-    :param executor: executor that load inference model
-    :param model_filename: The name of file to load inference program.
-        If not specified, default filename `__model__` will be used.
-    :param params_filename: The name of file to load parameters.
-        It is used for the case that all parameters are saved in a single binary file.
-        If not specified, parameters are considered saved in separate files.
+    Args:
+        dirname(str): The directory path
+        executor(Executor): The executor to run for loading inference model.
+        model_filename(str|None): The name of file to load inference program.
+                                  If it is None, the default filename 
+                                  '__model__' will be used.
+                                  Default: None
+        params_filename(str|None): The name of file to load all parameters.
+                                   It is only used for the case that all 
+                                   parameters were saved in a single binary 
+                                   file. If parameters were saved in separate 
+                                   files, set it as 'None'.
+
+    Returns:
+        tuple: The return of this function is a tuple with three elements:
+        (program, feed_target_names, fetch_targets). The `program` is a 
+        Program, it's the program for inference. The `feed_target_names` is 
+        a list of str, it contains Names of variables that need to feed 
+        data in the inference program. The `fetch_targets` is a list of 
+        Variable. It contains variables from which we can get inference 
+        results.
+
+    Raises:
+        ValueError: If `dirname` is not a existing directory.
+
+    Examples:
+        .. code-block:: python
+
+            exe = fluid.Executor(fluid.CPUPlace())
+            path = "./infer_model"
+            [inference_program, feed_target_names, fetch_targets] = 
+                fluid.io.load_inference_model(dirname=path, executor=exe)
+            results = exe.run(inference_program,
+                          feed={feed_target_names[0]: tensor_img},
+                          fetch_list=fetch_targets)
+
+            # In this exsample, the inference program was saved in the 
+            # "./infer_model/__model__" and parameters were saved in 
+            # separate files in ""./infer_model". 
+            # After getting inference program, feed target names and 
+            # fetch targets, we can use an Executor to run the inference 
+            # program to get the inference result.

-    :return: [program, feed_target_names, fetch_targets]
-             program: program especially for inference.
-             feed_target_names: Names of variables that need to feed data
-             fetch_targets: Variables from which we can get inference results.
    """
    if not os.path.isdir(dirname):
        raise ValueError("There is no directory named '%s'", dirname)
@@ -424,12 +731,25 @@ def load_inference_model(dirname,

 def get_parameter_value(para, executor):
    """
-    Get the LoDTensor for the parameter
+    Get the LoDTensor value of the given parameter.
+
+    Args:
+        para(Parameter): The parameter to get value from.
+        executor(Executor): The executor to run for retrieving the value.
+
+    Returns:
+        numpy.array: The given parameter's values.
+
+    Raises:
+        AssertionError: If the `para` is not an instance of Parameter.

-    :param executor: executor for retrieving the value
-    :param para: the given parameter
+    Examples:
+        .. code-block:: python
+
+            exe = fluid.Executor(fluid.CPUPlace())
+            param = fluid.default_main_program().global_block().var('fc.w')
+            p = fluid.io.get_parameter_value(param, exe)

-    :return: the LoDTensor for the parameter
    """
    assert is_parameter(para)

@@ -441,14 +761,30 @@ def get_parameter_value(para, executor):

 def get_parameter_value_by_name(name, executor, program=None):
    """
-    Get the LoDTensor for paramter with the given name
+    Get the LoDTensor value of a certain parameter by its name.
+
+    Args:
+        name(str): The parameter's name.
+        executor(Executor): The executor to run for retrieving the value.
+        program(Program | None): The program where to find the parameter.
+                               If it's set to be None, the function will
+                               try to find the parameter in the default
+                               main program.

-    :param executor: executor for retrieving the value
-    :param name: the name of the parameter
-    :param program: the program where the variable is found
-            Default default_main_program().
+    Returns:
+        numpy.array: The parameter's values.

-    :return: the LoDTensor for the variable
+    Raises:
+        TypeError: If given `name` is not an instance of basestring.
+        TypeError: If the parameter with the given name doesn't exist.
+        AssertionError: If there is a varibale named `name` in the
+                        given program but it is not a Parameter.
+
+    Examples:
+        .. code-block:: python
+
+            exe = fluid.Executor(fluid.CPUPlace())
+            p = fluid.io.get_parameter_value('fc.w', exe)
    """
    if program is None:
        program = default_main_program()
@@ -470,16 +806,58 @@ def save_checkpoint(executor,
                    main_program=None,
                    max_num_checkpoints=3):
    """
-    Save Checkpoint will save persistable LodTensor variables from main_program in checkpoint directory,
-    the directory named by serial number from 0 to (n -1), save_checkpoint use LRU strategy
-    to keep numbers of checkpoint directory,  the numbers of checkpoint directory are max_num_checkpoints at most,
-    The interval between two saved checkpoints must greater than save_interval_secs.
+    This function filters out all checkpoint variables from the give
+    main_program and then saves these variables to the `checkpoint_dir` 
+    directory.
+
+    In the training precess, we generally save a checkpoint in each
+    iteration. So there might be a lot of checkpoints in the 
+    `checkpoint_dir`. To avoid them taking too much disk space, the 
+    `max_num_checkpoints` are introduced to limit the total number of 
+    checkpoints. If the number of existing checkpints is greater than 
+    the `max_num_checkpoints`, oldest ones will be scroll deleted.
+
+    A variable is a checkpoint variable and will be saved if it meets
+    all following conditions:
+        1. It's persistable.
+        2. It's type is not FEED_MINIBATCH nor FETCH_LIST nor RAW.
+        3. It's name contains no "@GRAD" nor ".trainer_" nor ".block".

-    :param executor executor for save the value
-    :param checkpoint_dir the checkpoint directory 
-    :param trainer_id currect trainer id, if id is equal to 0, the trainer is chief
-    :param main_program   will save all variables in program 
-    :param max_num_checkpoints will keep numbers of checkpoint serials not bigger than max_num_checkpoints
+    Args:
+        executor(Executor): The executor to run for save checkpoint.
+        checkpoint_dir(str): The folder where to save checkpoints.
+        trainer_id(int): currect trainer id, if id is equal to 0, the trainer 
+            is chief.
+        trainer_args(dict|None): Current training arguments. Such as 'epoch_id' 
+            and 'step_id'.
+            Defaut: None
+        main_program(Program|None): The program whose checkpoint variables will
+            be saved. If it is None, the default main program will be used.
+        max_num_checkpoints(int): The max number of total number of existing 
+            checkpoints.
+            Default: 3
+
+    Returns:
+        None
+
+    Raises:
+        ValueError: If `checkpoint_dir` is None.
+        AssertionError: If `trainer_args` is not a dict.
+
+    Examples:
+        .. code-block:: python
+
+            exe = fluid.Executor(fluid.CPUPlace())
+            path = "./checkpoints"
+            prog = fluid.default_main_program()
+            trainer_args = {"epoch_id": 200,
+                            "step_id": 20} # just an example
+            fluid.io.save_checkpoint(executor=exe,
+                                     checkpoint_dir=path,
+                                     trainer_id=0,
+                                     trainer_args=trainer_args,
+                                     main_program=prog,
+                                     max_num_checkpoints=3)
    """
    if checkpoint_dir is None:
        raise ValueError("'checkpoint_dir' should not be None")
@@ -503,13 +881,50 @@ def save_checkpoint(executor,

 def load_checkpoint(executor, checkpoint_dir, serial, main_program):
    """
-    Load checkpoint from a directory by executor,
-    it will find  the most recent saved checkpoint file and load it auto.
+    This function filters out all checkpoint variables from the give
+    main_program and then try to load these variables from the
+    `checkpoint_dir` directory.
+
+    In the training precess, we generally save a checkpoint in each
+    iteration. So there are more than one checkpoint in the 
+    `checkpoint_dir` (each checkpoint has its own sub folder), use 
+    `serial` to specify which serial of checkpoint you would like to
+    load.
+
+    A variable is a checkpoint variable and will be loaded if it meets
+    all following conditions:
+        1. It's persistable.
+        2. It's type is not FEED_MINIBATCH nor FETCH_LIST nor RAW.
+        3. It's name contains no "@GRAD" nor ".trainer_" nor ".block".
+
+    Args:
+        executor(Executor): The executor to run for loading checkpoint.
+        checkpoint_dir(str): The folder where all checkpoints are.
+        serial(int): The serial of checkpoint you would like to load.
+        main_program(Program): The program whose checkpoint variables will
+                               be loaded.

-    :param executor executor for load the value
-    :param checkpoint_dir  the checkpoint directory 
-    :param serial the serial folder in checkpoint directory will be load
-    :param main_program  will load all variables in program 
+    Returns:
+        None
+
+    Raises:
+        ValueError: If `checkpoint_dir` is None.
+        ValueError: If `serial` is None or `serial` is less than 0.
+        ValueError: If `main_program` is None.
+
+    Examples:
+        .. code-block:: python
+
+            exe = fluid.Executor(fluid.CPUPlace())
+            path = "./checkpoints"
+            prog = fluid.default_main_program()
+            fluid.io.load_checkpoint(executor=exe, checkpoint_dir=path,
+                    serial=9, main_program=prog)
+
+            # In this example, `load_checkpoint` function
+            # will first filters out all checkpoint variables in the default
+            # main program, and then try to load these variables form the
+            # folder "./checkpoints/checkpoint_9/__model__".
    """

    if checkpoint_dir is None:
@@ -528,10 +943,10 @@ def load_checkpoint(executor, checkpoint_dir, serial, main_program):
 def clean_checkpoint(checkpoint_dir, delete_dir=False):
    """
    clean the checkpoint dir, when the train exits normally, the trainer will call clean_checkpoint to delete checkpoint directory saved before.
-    delete_dir only works when the directory is empty, otherwise, OSError is raised.  
+    delete_dir only works when the directory is empty, otherwise, OSError is raised.

-    :param checkpoint_dir
-    :param delete_dir
+    : param checkpoint_dir
+    : param delete_dir
    """

    if checkpoint_dir is None:
@@ -547,13 +962,40 @@ def load_persist_vars_without_grad(executor,
                                   program,
                                   has_model_dir=False):
    """
-    load_persist_vars_without_grad will load variables from a directory by an executor,
-    the variable named end with "@GRAD" will not be loaded.
+    This function filters out all checkpoint variables from the give
+    program and then trys to load these variables from the given directory.
+
+    A variable is a checkpoint variable if it meets all following
+    conditions:
+        1. It's persistable.
+        2. It's type is not FEED_MINIBATCH nor FETCH_LIST nor RAW.
+        3. It's name contains no "@GRAD" nor ".trainer_" nor ".block".

-    :param executor  executor for load the value
-    :param dirname the checkpoint directory 
-    :param program   will load all variables in program 
-    :param has_model_dir if has_model_dir is True, will load variables from  sub directory named __model__
+    Args:
+        executor(Executor): The executor to run for loading variables.
+        dirname(str): The directory path.
+        program(Program): The program whose checkpoint variables will
+                          be loaded.
+        has_model_dir(bool): if True, the function loads variables
+                             from a sub directory named '__model__'.
+                             Default: False
+
+    Returns:
+        None
+
+    Examples:
+        .. code-block:: python
+
+            exe = fluid.Executor(fluid.CPUPlace())
+            param_path = "./my_paddle_model"
+            prog = fluid.default_main_program()
+            fluid.io.load_persist_vars_without_grad(executor=exe,
+                    dirname=param_path, program=prog, has_model_dir=True)
+
+            # In this example, `load_persist_vars_without_grad` function
+            # will first filters out all checkpoint variables in the default
+            # main program, and then trys to load these variables form the
+            # folder "./my_paddle_model/__model__".
    """

    if has_model_dir:
@@ -569,12 +1011,38 @@ def load_persist_vars_without_grad(executor,

 def save_persist_vars_without_grad(executor, dirname, program):
    """
-    save_persist_vars_without_grad  will save variables to a directory by an executor,
-    the variable named end with "@GRAD" will not be saved.
+    This function filters out all checkpoint variables from the give
+    program and then save these variables to a sub-folder '__model__' of 
+    the given directory.
+
+    A variable is a checkpoint variable if it meets all following
+    conditions:
+        1. It's persistable.
+        2. It's type is not FEED_MINIBATCH nor FETCH_LIST nor RAW.
+        3. It's name contains no "@GRAD" nor ".trainer_" nor ".block".
+
+    Args:
+        executor(Executor): The executor to run for saving variables.
+        dirname(str): The directory path.
+        program(Program): The program whose checkpoint variables will
+                          be saved.
+
+    Returns:
+        None
+
+    Examples:
+        .. code-block:: python
+
+            exe = fluid.Executor(fluid.CPUPlace())
+            param_path = "./my_paddle_model"
+            prog = fluid.default_main_program()
+            fluid.io.save_persist_vars_without_grad(executor=exe,
+                    dirname=param_path, program=prog)

-    :param executor  executor for load the value
-    :param dirname the checkpoint directory 
-    :param program   will load all variables in program
+            # In this example, `save_persist_vars_without_grad` function
+            # will first filters out all checkpoint variables in the default
+            # main program, and then saves these variables to the folder 
+            # "./my_paddle_model/__model__".
    """
    cur_dir = _get_model_dir(dirname)
    save_vars(
@@ -620,7 +1088,7 @@ def _is_checkpoint_var(var):
    the checkpoint will not save or load all the variables.
    var type is FEED_MINIBATCH/FETCH_LIST/RAW or var name ends with @GRAD are discarded.

-    :param var
+    : param var
    """
    if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \
            var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \
@@ -701,7 +1169,7 @@ def _write_success(dirname):
    """
    write an empty file named "_SUCCESS" in checkpoint dir, indicate this checkpoint is correct.

-    :param dirname
+    : param dirname
    """
    success_file = os.path.join(dirname, SUCCESS_MARK_FILENAME)
    with open(success_file, 'a') as f:
@@ -713,7 +1181,7 @@ def get_latest_checkpoint_serial(checkpoint_dir):
    """
    get the latest file in checkpoint directory, the _SUCCESS file must exist in the directory

-    :param checkpoint_dir
+    : param checkpoint_dir
    """
    if not checkpoint_dir:
        return -1

--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -185,12 +185,14 @@ def Print(input,
    Returns:
        Variable: Output tensor, same data with input tensor.

+
    Examples:
+
        .. code-block:: python

-        value = some_layer(...)
-        Print(value, summarize=10,
-              message="The content of some_layer: ")
+           value = some_layer(...)
+           Print(value, summarize=10,
+               message="The content of some_layer: ")
    '''
    helper = LayerHelper('print', **locals())
    out = helper.create_tmp_variable(dtype=helper.input_dtype())
@@ -1201,6 +1203,31 @@ class ConditionalBlockGuard(BlockGuard):


 class ConditionalBlock(object):
+    '''
+    **ConditionalBlock**
+
+    ConditionalBlock is an operator that bind a block to a specific condition,
+    if the condition matches, the corresponding block will be executed.
+
+    Args:
+        inputs (Variable): bool conditions.
+        is_scalar_condition (bool): whether the branch is controled by a scalar.
+        name(str): name of this ConditionalBlock.
+
+    Examples:
+        .. code-block:: python
+
+             cond = layers.less_than(x=label, y=limit)
+             true_image, false_image = layers.split_lod_tensor(
+                 input=image, mask=cond)
+             true_cond = layers.ConditionalBlock([true_image])
+
+             with true_cond.block():
+                 ...
+             with false_cond.block():
+                 ...
+    '''
+
    def __init__(self, inputs, is_scalar_condition=False, name=None):
        for each_input in inputs:
            if not isinstance(each_input, Variable):

--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -2678,18 +2678,35 @@ def sequence_expand(x, y, ref_level=-1, name=None):

 def beam_search(pre_ids, ids, scores, beam_size, end_id, level=0):
    '''
+    **beam search**
+
    This function implements the beam search algorithm.

+    Beam search is a classical algorithm for selecting candidate words
+    in a machine translation task.
+
+    Refer to `Beam search <https://en.wikipedia.org/wiki/Beam_search>`_
+    for more details.
+
    Args:
-        pre_ids (Variable): ${pre_ids_comment}
-        ids (Variable): ${ids_comment}
-        scores (Variable): ${scores_comment}
-        beam_size (int): ${beam_size_comment}
-        end_id (int): ${end_id_comment}
-        level (int): ${level_comment}
+        pre_ids (Variable): ids in previous step.
+        ids (Variable): a LoDTensor of shape of [None,k]
+        scores (Variable): a LoDTensor that has the same shape and LoD with `ids`
+        beam_size (int): beam size for beam search
+        end_id (int): the token id which indicates the end of a sequence
+        level (int): the level of LoDTensor

    Returns:
-        tuple: a tuple of beam_search output variables: selected_ids, selected_scores
+        tuple: a tuple of beam_search output variables: `selected_ids`, `selected_scores`
+
+    Examples:
+        .. code-block:: python
+
+             # current_score is a Tensor of shape (num_batch_size, embed_size), which
+             # consists score of each candidate word.
+             topk_scores, topk_indices = pd.topk(current_score, k=50)
+             selected_ids, selected_scores = pd.beam_search(
+                 pre_ids, topk_indices, topk_scores, beam_size, end_id=10, level=0)
    '''
    helper = LayerHelper('beam_search', **locals())
    score_type = scores.dtype

--- a/python/paddle/fluid/lod_tensor.py
+++ b/python/paddle/fluid/lod_tensor.py
@@ -19,33 +19,41 @@ __all__ = ['create_lod_tensor', 'create_random_int_lodtensor']


 def create_lod_tensor(data, lod, place):
-    """Create a lod tensor from a numpy array, a list, or an existing lod tensor.
+    """
+    Create a lod tensor from a numpy array, a list, or an existing lod tensor.

    Create a lod tensor by doing the following:
+
    1. Check that the length-based input lod is valid.
+
    2. Convert the length-based lod to a offset-based LoD.
-    3. Copy the data from a numpy array, a list or a existing lod tensor to 
+
+    3. Copy the data from a numpy array, a list or a existing lod tensor to
       CPU or GPU device (based on input place).
+
    4. Set the level of detail (LoD) using the offset-based LoD.
    
-    Use example:
-    Suppose we want LoDTensor to hold data for sequences of word, where each word is
-    represented by an integer. If we want to create a LoDTensor to represent two 
-    sentences, one of 2 words, and one of 3 words. 
+    Examples:

-    Then 'data' can be a numpy array of integers with shape (5, 1).
-    'lod' will be [[2, 3]], indicating the length(# of words) in each sentence.
-    This length-based input lod [[2, 3]] will be converted to offset-based lod [[0, 2, 5]]
-    inside the function call.
+        Suppose we want LoDTensor to hold data for sequences of word, where each
+        word is represented by an integer. If we want to create a LoDTensor to
+        represent two  sentences, one of 2 words, and one of 3 words.

-    Please refer to 
-    github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/lod_tensor.md
-    for more details regarding LoD.
+        Then :code:`data` can be a numpy array of integers with shape (5, 1).
+        :code:`lod` will be [[2, 3]], indicating the length(# of words) in each
+        sentence. This length-based input lod [[2, 3]] will be converted to
+        offset-based lod [[0, 2, 5]] inside the function call.
+
+    Please reference :ref:`api_guide_low_level_lod_tensor` for more details
+    regarding LoD.

    Args:
-        data: a numpy array or a LoDTensor or a list holding the data to be copied.
-        lod: a list of lists indicating the length-based LoD info specified by the user. 
-        place: CPU or GPU place indicating where the data in the new LoDTensor will be stored.
+        data(numpy.ndarray|list|LoDTensor): a numpy array or a LoDTensor or a
+            list holding the data to be  copied.
+        lod(list): a list of lists indicating the length-based LoD info
+            specified by the user.
+        place(Place): CPU or GPU place indicating where the data in the new
+            LoDTensor will be stored.

    Returns:
        A fluid LoDTensor object with tensor data and lod info.
@@ -77,31 +85,38 @@ def create_lod_tensor(data, lod, place):


 def create_random_int_lodtensor(lod, base_shape, place, low, high):
-    """Create a LoDTensor containing random integers.
+    """
+    Create a LoDTensor containing random integers.

-    This function is frequently used in the book examples. So we revised it based on 
-    the new create_lod_tensor API and put it here in the lod_tensor module to simplify 
-    the code. 
+    This function is frequently used in the book examples. So we revised it
+    based on the new create_lod_tensor API and put it here in the lod_tensor
+    module to simplify the code.

    The function does the following:
-    1. Calculate the overall shape of the LoDTensor based on the length-based 'lod' input 
-    and the shape of the basic element in 'base_shape'.
+
+    1. Calculate the overall shape of the LoDTensor based on the length-based
+       :code:`lod` input and the shape of the basic element in
+       :code:`base_shape`.
+
    2. Create a numpy array of this shape.
+
    3. Create the LoDTensor using create_lod_tensor API.

-    Suppose we want LoDTensor to hold data for sequences of word, where each word is
-    represented by an integer. If we want to create a LoDTensor to represent two 
-    sentences, one of 2 words, and one of 3 words. Then 'base_shape' is [1], input 
-    length-based 'lod' is [[2, 3]]. Then the overall shape of the LoDTensor would be 
-    [5, 1], holding 5 words for two sentences. 
+    Suppose we want LoDTensor to hold data for sequences of word, where each
+    word is represented by an integer. If we want to create a LoDTensor to
+    represent two sentences, one of 2 words, and one of 3 words. Then
+    'base_shape' is [1], input length-based 'lod' is [[2, 3]]. Then the overall
+    shape of the LoDTensor would be [5, 1], holding 5 words for two sentences.

    Args:
-        data: a numpy array or a LoDTensor holding the data to be copied.
-        lod: a list of lists indicating the length-based LoD info specified by the user.
-        base_shape: the shape of the basic element to be held by the LoDTensor. 
-        place: CPU or GPU place indicating where the data in the new LoDTensor will be stored.
-        low: the lower bound of the random integers.
-        high: the upper bound of the random integers.
+        lod(list): a list of lists indicating the length-based LoD info
+            specified by the user.
+        base_shape(list): the shape of the basic element to be held by the
+            LoDTensor.
+        place(Place): CPU or GPU place indicating where the data in the new
+            LoDTensor will be stored.
+        low(int): the lower bound of the random integers.
+        high(int): the upper bound of the random integers.

    Returns:
        A fluid LoDTensor object with tensor data and lod info. 

--- a/python/paddle/fluid/metrics.py
+++ b/python/paddle/fluid/metrics.py
@@ -325,14 +325,14 @@ class Auc(MetricBase):
    """

    def __init__(self, name, curve='ROC', num_thresholds=200):
-        super(MetricBase, self).__init__(name, curve, num_thresholds)
+        super(Auc, self).__init__(name=name)
        self._curve = curve
        self._num_thresholds = num_thresholds
        self._epsilon = 1e-6
-        self.tp_list = np.ndarray((num_thresholds, ))
-        self.fn_list = np.ndarray((num_thresholds, ))
-        self.tn_list = np.ndarray((num_thresholds, ))
-        self.fp_list = np.ndarray((num_thresholds, ))
+        self.tp_list = np.zeros((num_thresholds, ))
+        self.fn_list = np.zeros((num_thresholds, ))
+        self.tn_list = np.zeros((num_thresholds, ))
+        self.fp_list = np.zeros((num_thresholds, ))

    def update(self, labels, predictions, axis=1):
        if not _is_numpy_(labels):
@@ -350,12 +350,12 @@ class Auc(MetricBase):
            tp, fn, tn, fp = 0, 0, 0, 0
            for i, lbl in enumerate(labels):
                if lbl:
-                    if predictions[i, 0] >= thresh:
+                    if predictions[i, 1] >= thresh:
                        tp += 1
                    else:
                        fn += 1
                else:
-                    if predictions[i, 0] >= thresh:
+                    if predictions[i, 1] >= thresh:
                        fp += 1
                    else:
                        tn += 1

--- a/python/paddle/fluid/nets.py
+++ b/python/paddle/fluid/nets.py
@@ -26,16 +26,87 @@ def simple_img_conv_pool(input,
                         filter_size,
                         pool_size,
                         pool_stride,
-                         act,
-                         param_attr=None,
+                         pool_padding=0,
                         pool_type='max',
+                         global_pooling=False,
+                         conv_stride=1,
+                         conv_padding=0,
+                         conv_dilation=1,
+                         conv_groups=1,
+                         param_attr=None,
+                         bias_attr=None,
+                         act=None,
                         use_cudnn=True,
                         use_mkldnn=False):
+    """
+    The simple_img_conv_pool is composed with one Convolution2d and one Pool2d.
+
+    Args:
+        input (Variable): The input image with [N, C, H, W] format.
+        num_filters(int): The number of filter. It is as same as the output
+            feature channel.
+        filter_size (int|list|tuple): The filter size. If filter_size is a list or
+            tuple, it must contain two integers, (filter_size_H, filter_size_W). Otherwise,
+            the filter_size_H = filter_size_W = filter_size.
+        pool_size (int|list|tuple): The pooling size of Pool2d layer. If pool_size
+            is a list or tuple, it must contain two integers, (pool_size_H, pool_size_W).
+            Otherwise, the pool_size_H = pool_size_W = pool_size.
+        pool_stride (int|list|tuple): The pooling stride of Pool2d layer. If pool_stride
+            is a list or tuple, it must contain two integers, (pooling_stride_H, pooling_stride_W).
+            Otherwise, the pooling_stride_H = pooling_stride_W = pool_stride.
+        pool_padding (int|list|tuple): The padding of Pool2d layer. If pool_padding is a list or
+            tuple, it must contain two integers, (pool_padding_H, pool_padding_W).
+            Otherwise, the pool_padding_H = pool_padding_W = pool_padding. Default 0.
+        pool_type (str): Pooling type can be :math:`max` for max-pooling and :math:`avg` for
+            average-pooling. Default :math:`max`.
+        global_pooling (bool): Whether to use the global pooling. If global_pooling = true,
+            pool_size and pool_padding while be ignored. Default False
+        conv_stride (int|list|tuple): The stride size of the Conv2d Layer. If stride is a
+            list or tuple, it must contain two integers, (conv_stride_H, conv_stride_W). Otherwise,
+            the conv_stride_H = conv_stride_W = conv_stride. Default: conv_stride = 1.
+        conv_padding (int|list|tuple): The padding size of the Conv2d Layer. If padding is
+            a list or  tuple, it must contain two integers, (conv_padding_H, conv_padding_W).
+            Otherwise, the conv_padding_H = conv_padding_W = conv_padding. Default: conv_padding = 0.
+        conv_dilation (int|list|tuple): The dilation size of the Conv2d Layer. If dilation is
+            a list or tuple, it must contain two integers, (conv_dilation_H, conv_dilation_W).
+            Otherwise, the conv_dilation_H = conv_dilation_W = conv_dilation. Default: conv_dilation = 1.
+        conv_groups (int): The groups number of the Conv2d Layer. According to grouped
+            convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
+            the first half of the filters is only connected to the first half
+            of the input channels, while the second half of the filters is only
+            connected to the second half of the input channels. Default: groups=1
+        param_attr (ParamAttr): The parameters to the Conv2d Layer. Default: None
+        bias_attr (ParamAttr): Bias parameter for the Conv2d layer. Default: None
+        act (str): Activation type for Conv2d. Default: None
+        use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn
+            library is installed. Default: True
+        use_mkldnn (bool): Use mkldnn kernels or not, it is valid only when compiled
+            with mkldnn library. Default: False
+
+    Return:
+        Variable: The result of input after Convolution2d and Pool2d.
+
+    Examples:
+        .. code-block:: python
+
+            img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
+            conv_pool = fluid.nets.simple_img_conv_pool(input=img,
+                                                        filter_size=5,
+                                                        num_filters=20,
+                                                        pool_size=2,
+                                                        pool_stride=2,
+                                                        act="relu")
+    """
    conv_out = layers.conv2d(
        input=input,
        num_filters=num_filters,
        filter_size=filter_size,
+        stride=conv_stride,
+        padding=conv_padding,
+        dilation=conv_dilation,
+        groups=conv_groups,
        param_attr=param_attr,
+        bias_attr=bias_attr,
        act=act,
        use_cudnn=use_cudnn,
        use_mkldnn=use_mkldnn)
@@ -45,6 +116,8 @@ def simple_img_conv_pool(input,
        pool_size=pool_size,
        pool_type=pool_type,
        pool_stride=pool_stride,
+        pool_padding=pool_padding,
+        global_pooling=global_pooling,
        use_cudnn=use_cudnn,
        use_mkldnn=use_mkldnn)
    return pool_out
@@ -60,11 +133,65 @@ def img_conv_group(input,
                   conv_with_batchnorm=False,
                   conv_batchnorm_drop_rate=0.0,
                   pool_stride=1,
-                   pool_type=None,
+                   pool_type="max",
                   use_cudnn=True,
                   use_mkldnn=False):
    """
-    Image Convolution Group, Used for vgg net.
+    The Image Convolution Group is composed of Convolution2d, BatchNorm, DropOut,
+    and Pool2d. According to the input arguments, img_conv_group will do serials of
+    computation for Input using Convolution2d, BatchNorm, DropOut, and pass the last
+    result to Pool2d.
+
+    Args:
+        input (Variable): The input image with [N, C, H, W] format.
+        conv_num_filter(list|tuple): Indicates the numbers of filter of this group.
+        pool_size (int|list|tuple): The pooling size of Pool2d Layer. If pool_size
+            is a list or tuple, it must contain two integers, (pool_size_H, pool_size_W).
+            Otherwise, the pool_size_H = pool_size_W = pool_size.
+        conv_padding (int|list|tuple): The padding size of the Conv2d Layer. If padding is
+            a list or tuple, its length must be equal to the length of conv_num_filter.
+            Otherwise the conv_padding of all Conv2d Layers are the same. Default 1.
+        conv_filter_size (int|list|tuple): The filter size. If filter_size is a list or
+            tuple, its length must be equal to the length of conv_num_filter.
+            Otherwise the conv_filter_size of all Conv2d Layers are the same. Default 3.
+        conv_act (str): Activation type for Conv2d Layer that is not followed by BatchNorm.
+            Default: None.
+        param_attr (ParamAttr): The parameters to the Conv2d Layer. Default: None
+        conv_with_batchnorm (bool|list): Indicates whether to use BatchNorm after Conv2d Layer.
+            If conv_with_batchnorm is a list, its length must be equal to the length of
+            conv_num_filter. Otherwise, conv_with_batchnorm indicates whether all the
+            Conv2d Layer follows a BatchNorm. Default False.
+        conv_batchnorm_drop_rate (float|list): Indicates the drop_rate of Dropout Layer
+            after BatchNorm. If conv_batchnorm_drop_rate is a list, its length must be
+            equal to the length of conv_num_filter. Otherwise, drop_rate of all Dropout
+            Layers is conv_batchnorm_drop_rate. Default 0.0.
+        pool_stride (int|list|tuple): The pooling stride of Pool2d layer. If pool_stride
+            is a list or tuple, it must contain two integers, (pooling_stride_H,
+            pooling_stride_W). Otherwise, the pooling_stride_H = pooling_stride_W = pool_stride.
+            Default 1.
+        pool_type (str): Pooling type can be :math:`max` for max-pooling and :math:`avg` for
+            average-pooling. Default :math:`max`.
+        use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn
+            library is installed. Default: True
+        use_mkldnn (bool): Use mkldnn kernels or not, it is valid only when compiled
+            with mkldnn library. Default: False
+
+    Return:
+        Variable: The final result after serial computation using Convolution2d,
+            BatchNorm, DropOut, and Pool2d.
+
+    Examples:
+        .. code-block:: python
+
+            img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
+            conv_pool = fluid.nets.img_conv_group(input=img,
+                                                  num_channels=3,
+                                                  conv_padding=1,
+                                                  conv_num_filter=[3, 3],
+                                                  conv_filter_size=3,
+                                                  conv_act="relu",
+                                                  pool_size=2,
+                                                  pool_stride=2)
    """
    tmp = input
    assert isinstance(conv_num_filter, list) or \
@@ -74,6 +201,7 @@ def img_conv_group(input,
        if not hasattr(obj, '__len__'):
            return [obj] * len(conv_num_filter)
        else:
+            assert len(obj) == len(conv_num_filter)
            return obj

    conv_padding = __extend_list__(conv_padding)
@@ -119,6 +247,39 @@ def sequence_conv_pool(input,
                       param_attr=None,
                       act="sigmoid",
                       pool_type="max"):
+    """
+    The sequence_conv_pool is composed with Sequence Convolution and Pooling.
+
+    Args:
+        input (Variable): The input of sequence_conv, which supports variable-time
+            length input sequence. The underlying of input is a matrix with shape
+            (T, N), where T is the total time steps in this mini-batch and N is
+            the input_hidden_size
+        num_filters(int): The number of filter.
+        filter_size (int): The filter size.
+        param_attr (ParamAttr): The parameters to the Sequence_conv Layer. Default: None.
+        act (str): Activation type for Sequence_conv Layer. Default: "sigmoid".
+        pool_type (str): Pooling type can be :math:`max` for max-pooling, :math:`average` for
+            average-pooling, :math:`sum` for sum-pooling, :math:`sqrt` for sqrt-pooling.
+            Default :math:`max`.
+
+    Return:
+        Variable: The final result after Sequence Convolution and Pooling.
+
+    Examples:
+        .. code-block:: python
+
+            input_dim = len(word_dict)
+            emb_dim = 128
+            hid_dim = 512
+            data = fluid.layers.data( ame="words", shape=[1], dtype="int64", lod_level=1)
+            emb = fluid.layers.embedding(input=data, size=[input_dim, emb_dim], is_sparse=True)
+            seq_conv = fluid.nets.sequence_conv_pool(input=emb,
+                                                     num_filters=hid_dim,
+                                                     filter_size=3,
+                                                     act="tanh",
+                                                     pool_type="sqrt")
+    """
    conv_out = layers.sequence_conv(
        input=input,
        num_filters=num_filters,
@@ -132,9 +293,9 @@ def sequence_conv_pool(input,

 def glu(input, dim=-1):
    """
-    The gated linear unit composed by split, sigmoid activation and elementwise
-    multiplication. Specifically, Split the input into two equal sized parts
-    :math:`a` and :math:`b` along the given dimension and then compute as
+    The Gated Linear Units(GLU) composed by split, sigmoid activation and element-wise
+    multiplication. Specifically, Split the input into two equal sized parts,
+    :math:`a` and :math:`b`, along the given dimension and then compute as
    following:

        .. math::
@@ -147,16 +308,16 @@ def glu(input, dim=-1):
    Args:
        input (Variable): The input variable which is a Tensor or LoDTensor.
        dim (int): The dimension along which to split. If :math:`dim < 0`, the
-            dimension to split along is :math:`rank(input) + dim`.
+            dimension to split along is :math:`rank(input) + dim`. Default -1.

    Returns:
-        Variable: The Tensor variable with half the size of input.
+        Variable: Variable with half the size of input.

    Examples:
        .. code-block:: python

-            # x is a Tensor variable with shape [3, 6, 9]
-            fluid.nets.glu(input=x, dim=1)  # shape of output: [3, 3, 9]
+            data = fluid.layers.data(name="words", shape=[3, 6, 9], dtype="float32")
+            output = fluid.nets.glu(input=data, dim=1)  # shape of output: [3, 3, 9]
    """

    a, b = layers.split(input, num_or_sections=2, dim=dim)
@@ -189,40 +350,48 @@ def scaled_dot_product_attention(queries,
    <https://arxiv.org/pdf/1706.03762.pdf>`_.

    Args:
-
        queries (Variable): The input variable which should be a 3-D Tensor.
        keys (Variable): The input variable which should be a 3-D Tensor.
        values (Variable): The input variable which should be a 3-D Tensor.
        num_heads (int): Head number to compute the scaled dot product
-                         attention. Default value is 1.
+            attention. Default: 1.
        dropout_rate (float): The dropout rate to drop the attention weight.
-                              Default value is 0.
+            Default: 0.0.

    Returns:
-
-        Variable: A 3-D Tensor computed by multi-head scaled dot product \
-                  attention.
+        Variable: A 3-D Tensor computed by multi-head scaled dot product\
+            attention.

    Raises:
-
        ValueError: If input queries, keys, values are not 3-D Tensors.

-    NOTE:
+    NOTES:
        1. When num_heads > 1, three linear projections are learned respectively
-        to map input queries, keys and values into queries', keys' and values'.
-        queries', keys' and values' have the same shapes with queries, keys
-        and values.
-
-        1. When num_heads == 1, scaled_dot_product_attention has no learnable
-        parameters.
+           to map input queries, keys and values into queries', keys' and values'.
+           queries', keys' and values' have the same shapes with queries, keys
+           and values.
+        2. When num_heads == 1, scaled_dot_product_attention has no learnable
+           parameters.

    Examples:
        .. code-block:: python

-            # Suppose q, k, v are Tensors with the following shape:
-            # q: [3, 5, 9], k: [3, 6, 9], v: [3, 6, 10]
-
-            contexts = fluid.nets.scaled_dot_product_attention(q, k, v)
+            queries = fluid.layers.data(name="queries",
+                                        shape=[3, 5, 9],
+                                        dtype="float32",
+                                        append_batch_size=False)
+            queries.stop_gradient = False
+            keys = fluid.layers.data(name="keys",
+                                     shape=[3, 6, 9],
+                                     dtype="float32",
+                                     append_batch_size=False)
+            keys.stop_gradient = False
+            values = fluid.layers.data(name="values",
+                                       shape=[3, 6, 10],
+                                       dtype="float32",
+                                       append_batch_size=False)
+            values.stop_gradient = False
+            contexts = fluid.nets.scaled_dot_product_attention(queries, keys, values)
            contexts.shape  # [3, 5, 10]
    """
    if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3):

--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
@@ -27,6 +27,40 @@ BuildStrategy = core.ParallelExecutor.BuildStrategy


 class ParallelExecutor(object):
+    """
+    ParallelExecutor can run program in parallel.
+
+    Args:
+        use_cuda (bool): Whether to use CUDA or not.
+        loss_name (str): The loss name must set in training. Default None.
+        main_program (Program): The program that need to run, if not provided,
+            then default_main_program will be used. Default None.
+        share_vars_from(ParallelExecutor): If provied, it will share variables
+            from the specified ParallelExecutor. Default None.
+        num_trainers(int): If greater than 1, NCCL will be initialized with
+            multiple rank of nodes, each node should have same number of GPUs.
+            Distributed training will be enabled then. Default 1.
+        trainer_id(int: Must use together with num_trainers. trainer_id is the
+            "rank" of current node starts from 0. Default 0.
+
+    Returns:
+        ParallelExecutor: The initialized ParallelExecutor object.
+
+    Raises:
+        TypeError: If share_vars_from is provided, but not ParallelExecutor object.
+
+    Examples:
+        .. code-block:: python
+
+          train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=loss.name)
+          test_exe = fluid.ParallelExecutor(use_cuda=True,
+                                            main_program=test_program,
+                                            share_vars_from=train_exe)
+
+          train_loss, = train_exe.run([loss.name], feed=feed_dict)
+          test_loss, = test_exe.run([loss.name], feed=feed_dict)
+    """
+
    def __init__(self,
                 use_cuda,
                 loss_name=None,
@@ -37,42 +71,6 @@ class ParallelExecutor(object):
                 num_trainers=1,
                 trainer_id=0,
                 **kwargs):
-        """
-        ParallelExecutor can run program in parallel.
-
-        Args:
-            use_cuda(bool): Whether to use CUDA or not.
-            loss_name(str, default None): The loss name must set in training.
-            main_program(Program, default None): The program that need to run,
-                if not provided, then default_main_program will be used.
-            share_vars_from(ParallelExecutor, default None): If provied,
-                it will share variables from the specified ParallelExecutor.
-            num_trainers(int, default 1): If greater than 1, NCCL will be
-                initialized with multpile rank of nodes, each node should have
-                same number of GPUs. Distributed training will be enabled then.
-            trainer_id(int, default 0): Must use together with num_trainers.
-                trainer_id is the "rank" of current node starts from 0.
-
-        Returns:
-            A ParallelExecutor object.
-
-        Raises:
-            TypeError: If share_vars_from is provided, but not ParallelExecutor
-                object.
-
-        Examples:
-            .. code-block:: python
-
-              train_exe = fluid.ParallelExecutor(
-                  use_cuda=True, loss_name=loss.name)
-              test_exe = fluid.ParallelExecutor(
-                  use_cuda=True,
-                  main_program=test_program,
-                  share_vars_from=train_exe)
-
-              train_loss, = train_exe.run([loss.name], feed=feed_dict)
-              test_loss, = test_exe.run([loss.name], feed=feed_dict)
-        """
        if len(kwargs) != 0:
            err_msg = ""
            for key in kwargs:
@@ -131,10 +129,16 @@ class ParallelExecutor(object):
        main = main_program
        main = main if main else framework.default_main_program()
        scope = executor.global_scope()
+        # FIXME(Yancey1989): it's a temporary approach to determinate the distribute
+        # train program, call self.bcast_param() at the end of each mini-batch.
+        self.is_dist = True if "recv" in [
+            op.type for op in main.global_block().ops
+        ] else False

        if share_vars_from and not isinstance(share_vars_from,
                                              ParallelExecutor):
            raise TypeError("share_vars_from must be ParallelExecutor.")
+
        local_scopes = share_vars_from.executor.local_scopes(
        ) if share_vars_from else []

@@ -166,12 +170,14 @@ class ParallelExecutor(object):
        element in the list will be copied to each device directly.

        For example, if the feed is a dict:
+
        >>> exe = ParallelExecutor()
        >>> # the image will be splitted into devices. If there is two devices
        >>> # each device will process an image with shape (24, 1, 28, 28)
        >>> exe.run(feed={'image': numpy.random.random(size=(48, 1, 28, 28))})

        For example, if the feed is a list:
+
        >>> exe = ParallelExecutor()
        >>> # each device will process each element in the list.
        >>> # the 1st device will process an image with shape (48, 1, 28, 28)
@@ -182,18 +188,40 @@ class ParallelExecutor(object):
        >>>               {"image": numpy.random.random(size=(32, 1, 28, 28))},
        >>>              ])

-
        Args:
            fetch_list(list): The fetched variable names
            feed(list|dict|None): The feed variables. If the feed is a dict,
                tensors in that dict will be splitted into each devices. If
                the feed is a list, each element of the list will be copied
-                to each device.
+                to each device. Default None.
            feed_dict: Alias for feed parameter, for backward compatibility.
-                This parameter is deprecated.
+                This parameter has been deprecated. Default None.
+
+        Returns:
+            List: The fetched result list.
+
+        Raises:
+            ValueError: If the feed is a list, but its length is not equal the
+                length of active places, or its element's is not dict.
+
+        NOTES:
+            1. If the feed's type is dict, the number of data that feeds to
+               ParallelExecutor must be bigger than active places. Otherwise,
+               it will throw exception from C++ side. Special attention should be
+               paid to check whether the last batch of the dataset is bigger
+               than active places.
+            2. If active places are more than one, the fetch results for each
+               variable is a list, and each element of this list is the variable of
+               respective active place.

-        Returns: fetched result list.
+        Examples:
+            .. code-block:: python

+                pe = fluid.ParallelExecutor(use_cuda=use_cuda,
+                                            loss_name=avg_cost.name,
+                                            main_program=fluid.default_main_program())
+                loss = pe.run(feed=feeder.feed(cur_batch),
+                              fetch_list=[avg_cost.name]))
        """
        if feed is None and feed_dict is not None:
            feed = feed_dict
@@ -238,9 +266,17 @@ class ParallelExecutor(object):
        fetch_var_name = '@FETCHED_VAR_NAME@'
        self.executor.run(fetch_list, fetch_var_name)
        arr = self.scope.find_var(fetch_var_name).get_lod_tensor_array()
+
+        if self.is_dist:
+            self.bcast_params()
+
        return [arr[i] for i in range(len(arr))]

    def bcast_params(self):
+        """
+        Broadcast the parameters to other devices. It is used during
+        distributed training.
+        """
        self.executor.bcast_params(set(self.persistable_vars))

    @property

--- a/python/paddle/fluid/param_attr.py
+++ b/python/paddle/fluid/param_attr.py
@@ -22,6 +22,35 @@ __all__ = [


 class ParamAttr(object):
+    """
+    Parameter attributes object. To fine-tuning network training process, user
+    can set parameter's attributes to control training details. Such as learning rate,
+    regularization, trainable, do_model_average and the method to initialize param.
+
+
+    Args:
+        name(str): The parameter's name. Default None.
+        initializer(Initializer): The method to initial this parameter. Default None.
+        learning_rate(float): The parameter's learning rate. The learning rate when
+            optimize is :math:`global\_lr * parameter\_lr * scheduler\_factor`.
+            Default 1.0.
+        regularizer(WeightDecayRegularizer): Regularization factor. Default None.
+        trainable(bool): Whether this parameter is trainable. Default True.
+        gradient_clip(BaseGradientClipAttr): The method to clip this parameter's
+            gradient. Default None.
+        do_model_average(bool): Whether this parameter should do model average.
+            Default False.
+
+    Examples:
+        .. code-block:: python
+
+            w_param_attrs = fluid.ParamAttr(name="fc_weight",
+                                            learning_rate=0.5,
+                                            regularizer=fluid.L2Decay(1.0),
+                                            trainable=True)
+            y_predict = fluid.layers.fc(input=x, size=10, param_attr=w_param_attrs)
+    """
+
    def __init__(self,
                 name=None,
                 initializer=None,
@@ -29,7 +58,7 @@ class ParamAttr(object):
                 regularizer=None,
                 trainable=True,
                 gradient_clip=None,
-                 do_model_average=None):
+                 do_model_average=False):
        self.name = name
        self.initializer = initializer
        self.learning_rate = learning_rate
@@ -39,6 +68,16 @@ class ParamAttr(object):
        self.model_average = do_model_average

    def set_default_initializer(self, initializer):
+        """
+        Set the default initializer, the initializer should be Constant,
+        Uniform, Normal, Xavier, MSRA.
+
+        Args:
+            initializer(Initializer): the initializer to set.
+
+        Returns:
+            None
+        """
        if initializer is None:
            if self.initializer is None:
                raise ValueError("ParamAttr.initializer is not set")
@@ -50,13 +89,45 @@ class ParamAttr(object):
        self.initializer = initializer

    def set_default_param_initializer(self):
+        """
+        Set the default initializer for the parameter with Xavier.
+
+        Args:
+            None.
+
+        Returns:
+            None.
+        """
        self.set_default_initializer(Xavier())

    def set_default_bias_initializer(self):
+        """
+        Set the default initializer for the bias with Constant(0.0).
+
+        Args:
+            None.
+
+        Returns:
+            None.
+        """
        self.set_default_initializer(Constant(0.0))

    @staticmethod
    def to_attr(arg):
+        """
+        Create ParamAttr[s].
+
+        Args:
+            arg: Arguments to initialize ParamAttr[s]. arg's type can be
+                str, Initializer, float, WeightDecayRegularizer, BaseGradientClipAttr,
+                bool, ParamAttr, or a list of above type.
+
+        Returns:
+            ParamAttr[s]: ParamAttr[s] initialized with arg.
+
+        Raises:
+            arg can not initialize a ParamAttr.
+        """
        if arg is None:
            return ParamAttr()
        elif isinstance(arg, list) or isinstance(arg, tuple):
@@ -75,6 +146,15 @@ class ParamAttr(object):
            raise TypeError("{0} cast to ParamAttr".format(type(arg)))

    def to_kwargs(self, with_initializer=False):
+        """
+        Returns the attributes of this parameter.
+
+        Args:
+            with_initializer(bool): Whether to add initializer attr.
+
+        Returns:
+            Parameter attributes(map): The attributes of this parameter.
+        """
        kwargs = {
            'name': self.name,
            'optimize_attr': {
@@ -92,9 +172,27 @@ class ParamAttr(object):

 class WeightNormParamAttr(ParamAttr):
    """
-    Used for weight normalization. Any field in ParamAttr can also be set here.
-    Besides, an extra field dim can be set to indicate the dimension except
-    which to normalize.
+    Used for weight Norm. Weight Norm is a reparameterization of the weight vectors
+    in a neural network that decouples the length of those weight vectors from
+    their direction. Weight Norm has been implemented as discussed in this
+    paper: `Weight Normalization: A Simple Reparameterization to Accelerate
+    Training of Deep Neural Networks
+    <https://arxiv.org/pdf/1602.07868.pdf>`_.
+
+    Args:
+        dim(list): The parameter's name. Default None.
+        kwargs: Any field in ParamAttr. Default None.
+
+    Examples:
+        .. code-block:: python
+
+            data = fluid.layers.data(name="data", shape=[3, 32, 32], dtype="float32")
+            fc = fluid.layers.fc(input=data,
+                                 size=1000,
+                                 param_attr=WeightNormParamAttr(
+                                      dim=None,
+                                      name='weight_norm_param'))
+
    """
    # List to record the parameters reparameterized by weight normalization.
    # If these parameters are treated as Variable rather than Parameter,

--- a/python/paddle/fluid/recordio_writer.py
+++ b/python/paddle/fluid/recordio_writer.py
@@ -36,6 +36,45 @@ def convert_reader_to_recordio_file(
        compressor=core.RecordIOWriter.Compressor.Snappy,
        max_num_records=1000,
        feed_order=None):
+    """
+    Convert a Python Reader to a recordio file.
+
+    Please see :ref:`api_guide_python_reader` and :ref:`api_guide_reader_op` for
+    details.
+
+    Examples:
+
+        >>> import paddle.fluid as fluid
+        >>> import paddle.dataset.mnist as mnist
+        >>> import paddle
+        >>>
+        >>> tmp_program = fluid.Program()
+        >>> with fluid.program_guard(tmp_program):
+        >>>     img = fluid.layers.data(name='img', shape=[784])
+        >>>     label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+        >>> feeder = fluid.DataFeeder(feed_list=[img, label], place=fluid.CPUPlace())
+        >>> # mnist.recordio will be generated in current directory
+        >>> fluid.recordio_writer.convert_reader_to_recordio_file(
+        >>>                     filename="mnist.recordio",
+        >>>                     reader_creator=paddle.batch(mnist.train(), batch_size=32),
+        >>>                     feeder=feeder)
+
+    Args:
+        filename(str): The recordio filename.
+        reader_creator(callable): The Python Reader Creator. See
+            :ref:`api_guide_python_reader`.
+        feeder(DataFeeder): The DataFeeder instance. Used to convert
+            :code:`reader_creator` to :code: `lod_tensor`
+        compressor: Must in fluid.core.RecordIOWriter.Compressor.Snappy or
+            fluid.core.RecordIOWriter.Compressor.NoCompress. Use :code:`Snappy`
+            by default.
+        max_num_records(int): Maximum number of records in one chuck. Each record
+            is each return value from reader function
+        feed_order(list): The order of variable names that the reader returns
+
+    Returns:
+        int: the number of record that saved.
+    """
    if feed_order is None:
        feed_order = feeder.feed_names
    counter = 0
@@ -58,6 +97,17 @@ def convert_reader_to_recordio_files(
        compressor=core.RecordIOWriter.Compressor.Snappy,
        max_num_records=1000,
        feed_order=None):
+    """
+    convert a python reader to many recordio files.
+
+    This API is basically same as :code:`convert_reader_to_recordio_file`,
+    instead of it will create many recordio files. Each file contains at
+    most :code:`batch_per_file` records.
+
+    Please reference
+    :ref:`api_fluid_recordio_writer_convert_reader_to_recordio_file` for more
+    details.
+    """
    if feed_order is None:
        feed_order = feeder.feed_names
    f_name, f_ext = os.path.splitext(filename)

--- a/python/paddle/fluid/tests/book/notest_understand_sentiment.py
+++ b/python/paddle/fluid/tests/book/notest_understand_sentiment.py
@@ -194,16 +194,16 @@ def train(word_dict,
    if is_local:
        train_loop(fluid.default_main_program())
    else:
-        port = os.getenv("PADDLE_INIT_PORT", "6174")
-        pserver_ips = os.getenv("PADDLE_INIT_PSERVERS")  # ip,ip...
+        port = os.getenv("PADDLE_PSERVER_PORT", "6174")
+        pserver_ips = os.getenv("PADDLE_PSERVER_IPS")  # ip,ip...
        eplist = []
        for ip in pserver_ips.split(","):
            eplist.append(':'.join([ip, port]))
        pserver_endpoints = ",".join(eplist)  # ip:port,ip:port...
-        trainers = int(os.getenv("TRAINERS"))
+        trainers = int(os.getenv("PADDLE_TRAINERS"))
        current_endpoint = os.getenv("POD_IP") + ":" + port
-        trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID"))
-        training_role = os.getenv("TRAINING_ROLE", "TRAINER")
+        trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
+        training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")
        t = fluid.DistributeTranspiler()
        t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
        if training_role == "PSERVER":

--- a/python/paddle/fluid/tests/book/test_fit_a_line.py
+++ b/python/paddle/fluid/tests/book/test_fit_a_line.py
@@ -69,16 +69,16 @@ def train(use_cuda, save_dirname, is_local):
    if is_local:
        train_loop(fluid.default_main_program())
    else:
-        port = os.getenv("PADDLE_INIT_PORT", "6174")
-        pserver_ips = os.getenv("PADDLE_INIT_PSERVERS")  # ip,ip...
+        port = os.getenv("PADDLE_PSERVER_PORT", "6174")
+        pserver_ips = os.getenv("PADDLE_PSERVER_IPS")  # ip,ip...
        eplist = []
        for ip in pserver_ips.split(","):
            eplist.append(':'.join([ip, port]))
        pserver_endpoints = ",".join(eplist)  # ip:port,ip:port...
-        trainers = int(os.getenv("TRAINERS"))
+        trainers = int(os.getenv("PADDLE_TRAINERS"))
        current_endpoint = os.getenv("POD_IP") + ":" + port
-        trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID"))
-        training_role = os.getenv("TRAINING_ROLE", "TRAINER")
+        trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
+        training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")
        t = fluid.DistributeTranspiler()
        t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
        if training_role == "PSERVER":

--- a/python/paddle/fluid/tests/book/test_image_classification.py
+++ b/python/paddle/fluid/tests/book/test_image_classification.py
@@ -178,16 +178,16 @@ def train(net_type, use_cuda, save_dirname, is_local):
    if is_local:
        train_loop(fluid.default_main_program())
    else:
-        port = os.getenv("PADDLE_INIT_PORT", "6174")
-        pserver_ips = os.getenv("PADDLE_INIT_PSERVERS")  # ip,ip...
+        port = os.getenv("PADDLE_PSERVER_PORT", "6174")
+        pserver_ips = os.getenv("PADDLE_PSERVER_IPS")  # ip,ip...
        eplist = []
        for ip in pserver_ips.split(","):
            eplist.append(':'.join([ip, port]))
        pserver_endpoints = ",".join(eplist)  # ip:port,ip:port...
-        trainers = int(os.getenv("TRAINERS"))
+        trainers = int(os.getenv("PADDLE_TRAINERS"))
        current_endpoint = os.getenv("POD_IP") + ":" + port
-        trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID"))
-        training_role = os.getenv("TRAINING_ROLE", "TRAINER")
+        trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
+        training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")
        t = fluid.DistributeTranspiler()
        t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
        if training_role == "PSERVER":

--- a/python/paddle/fluid/tests/book/test_label_semantic_roles.py
+++ b/python/paddle/fluid/tests/book/test_label_semantic_roles.py
@@ -209,16 +209,16 @@ def train(use_cuda, save_dirname=None, is_local=True):
    if is_local:
        train_loop(fluid.default_main_program())
    else:
-        port = os.getenv("PADDLE_INIT_PORT", "6174")
-        pserver_ips = os.getenv("PADDLE_INIT_PSERVERS")  # ip,ip...
+        port = os.getenv("PADDLE_PSERVER_PORT", "6174")
+        pserver_ips = os.getenv("PADDLE_PSERVER_IPS")  # ip,ip...
        eplist = []
        for ip in pserver_ips.split(","):
            eplist.append(':'.join([ip, port]))
        pserver_endpoints = ",".join(eplist)  # ip:port,ip:port...
-        trainers = int(os.getenv("TRAINERS"))
+        trainers = int(os.getenv("PADDLE_TRAINERS"))
        current_endpoint = os.getenv("POD_IP") + ":" + port
-        trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID"))
-        training_role = os.getenv("TRAINING_ROLE", "TRAINER")
+        trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
+        training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")
        t = fluid.DistributeTranspiler()
        t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
        if training_role == "PSERVER":

--- a/python/paddle/fluid/tests/book/test_machine_translation.py
+++ b/python/paddle/fluid/tests/book/test_machine_translation.py
@@ -200,16 +200,16 @@ def train_main(use_cuda, is_sparse, is_local=True):
    if is_local:
        train_loop(framework.default_main_program())
    else:
-        port = os.getenv("PADDLE_INIT_PORT", "6174")
-        pserver_ips = os.getenv("PADDLE_INIT_PSERVERS")  # ip,ip...
+        port = os.getenv("PADDLE_PSERVER_PORT", "6174")
+        pserver_ips = os.getenv("PADDLE_PSERVER_IPS")  # ip,ip...
        eplist = []
        for ip in pserver_ips.split(","):
            eplist.append(':'.join([ip, port]))
        pserver_endpoints = ",".join(eplist)  # ip:port,ip:port...
-        trainers = int(os.getenv("TRAINERS"))
+        trainers = int(os.getenv("PADDLE_TRAINERS"))
        current_endpoint = os.getenv("POD_IP") + ":" + port
-        trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID"))
-        training_role = os.getenv("TRAINING_ROLE", "TRAINER")
+        trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
+        training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")
        t = fluid.DistributeTranspiler()
        t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
        if training_role == "PSERVER":

--- a/python/paddle/fluid/tests/book/test_recognize_digits.py
+++ b/python/paddle/fluid/tests/book/test_recognize_digits.py
@@ -151,16 +151,16 @@ def train(nn_type,
    if is_local:
        train_loop(fluid.default_main_program())
    else:
-        port = os.getenv("PADDLE_INIT_PORT", "6174")
-        pserver_ips = os.getenv("PADDLE_INIT_PSERVERS")  # ip,ip...
+        port = os.getenv("PADDLE_PSERVER_PORT", "6174")
+        pserver_ips = os.getenv("PADDLE_PSERVER_IPS")  # ip,ip...
        eplist = []
        for ip in pserver_ips.split(","):
            eplist.append(':'.join([ip, port]))
        pserver_endpoints = ",".join(eplist)  # ip:port,ip:port...
-        trainers = int(os.getenv("TRAINERS"))
+        trainers = int(os.getenv("PADDLE_TRAINERS"))
        current_endpoint = os.getenv("POD_IP") + ":" + port
-        trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID"))
-        training_role = os.getenv("TRAINING_ROLE", "TRAINER")
+        trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
+        training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")
        t = fluid.DistributeTranspiler()
        t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
        if training_role == "PSERVER":

--- a/python/paddle/fluid/tests/book/test_recommender_system.py
+++ b/python/paddle/fluid/tests/book/test_recommender_system.py
@@ -220,16 +220,16 @@ def train(use_cuda, save_dirname, is_local=True):
    if is_local:
        train_loop(fluid.default_main_program())
    else:
-        port = os.getenv("PADDLE_INIT_PORT", "6174")
-        pserver_ips = os.getenv("PADDLE_INIT_PSERVERS")  # ip,ip...
+        port = os.getenv("PADDLE_PSERVER_PORT", "6174")
+        pserver_ips = os.getenv("PADDLE_PSERVER_IPS")  # ip,ip...
        eplist = []
        for ip in pserver_ips.split(","):
            eplist.append(':'.join([ip, port]))
        pserver_endpoints = ",".join(eplist)  # ip:port,ip:port...
-        trainers = int(os.getenv("TRAINERS"))
+        trainers = int(os.getenv("PADDLE_TRAINERS"))
        current_endpoint = os.getenv("POD_IP") + ":" + port
-        trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID"))
-        training_role = os.getenv("TRAINING_ROLE", "TRAINER")
+        trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
+        training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")
        t = fluid.DistributeTranspiler()
        t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
        if training_role == "PSERVER":

--- a/python/paddle/fluid/tests/book/test_word2vec.py
+++ b/python/paddle/fluid/tests/book/test_word2vec.py
@@ -125,16 +125,16 @@ def train(use_cuda, is_sparse, is_parallel, save_dirname, is_local=True):
    if is_local:
        train_loop(fluid.default_main_program())
    else:
-        port = os.getenv("PADDLE_INIT_PORT", "6174")
-        pserver_ips = os.getenv("PADDLE_INIT_PSERVERS")  # ip,ip...
+        port = os.getenv("PADDLE_PSERVER_PORT", "6174")
+        pserver_ips = os.getenv("PADDLE_PSERVER_IPS")  # ip,ip...
        eplist = []
        for ip in pserver_ips.split(","):
            eplist.append(':'.join([ip, port]))
        pserver_endpoints = ",".join(eplist)  # ip:port,ip:port...
-        trainers = int(os.getenv("TRAINERS"))
+        trainers = int(os.getenv("PADDLE_TRAINERS"))
        current_endpoint = os.getenv("POD_IP") + ":" + port
-        trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID"))
-        training_role = os.getenv("TRAINING_ROLE", "TRAINER")
+        trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
+        training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")
        t = fluid.DistributeTranspiler()
        t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
        if training_role == "PSERVER":

--- a/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py
+++ b/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py
@@ -15,6 +15,7 @@
 import unittest
 import numpy as np
 from op_test import OpTest
+import paddle.fluid.core as core


 def bilinear_interp_np(input, out_h, out_w, out_size):
@@ -45,9 +46,9 @@ def bilinear_interp_np(input, out_h, out_w, out_size):

            out[:, :, i, j] = h2lambda*(w2lambda*input[:, :, h, w] +
                                        w1lambda*input[:, :, h, w+wid]) + \
-                              h1lambda*(w2lambda*input[:, :, h+hid, w] +
-                                        w1lambda*input[:, :, h+hid, w+wid])
-    return out.astype("float32")
+                h1lambda*(w2lambda*input[:, :, h+hid, w] +
+                          w1lambda*input[:, :, h+hid, w+wid])
+    return out.astype(input.dtype)


 class TestBilinearInterpOp(OpTest):
@@ -122,5 +123,44 @@ class TestCase6(TestBilinearInterpOp):
        self.out_size = np.array([65, 129]).astype("int32")


+class TestBilinearInterpOpUint8(OpTest):
+    def setUp(self):
+        self.out_size = None
+        self.init_test_case()
+        self.op_type = "bilinear_interp"
+        input_np = np.random.randint(
+            low=0, high=256, size=self.input_shape).astype("uint8")
+        output_np = bilinear_interp_np(input_np, self.out_h, self.out_w,
+                                       self.out_size)
+        self.inputs = {'X': input_np}
+        if self.out_size is not None:
+            self.inputs['OutSize'] = self.out_size
+        self.attrs = {'out_h': self.out_h, 'out_w': self.out_w}
+        self.outputs = {'Out': output_np}
+
+    def test_check_output(self):
+        self.check_output_with_place(place=core.CPUPlace(), atol=1)
+
+    def init_test_case(self):
+        self.input_shape = [1, 3, 9, 6]
+        self.out_h = 10
+        self.out_w = 9
+
+
+class TestCase1Uint8(TestBilinearInterpOpUint8):
+    def init_test_case(self):
+        self.input_shape = [2, 3, 128, 64]
+        self.out_h = 120
+        self.out_w = 50
+
+
+class TestCase2Uint8(TestBilinearInterpOpUint8):
+    def init_test_case(self):
+        self.input_shape = [4, 1, 7, 8]
+        self.out_h = 5
+        self.out_w = 13
+        self.out_size = np.array([6, 15]).astype("int32")
+
+
 if __name__ == "__main__":
    unittest.main()
--- a/python/paddle/fluid/trainer.py
+++ b/python/paddle/fluid/trainer.py
@@ -33,23 +33,59 @@ __all__ = [


 class BeginEpochEvent(object):
+    """
+    The begin of a training epoch.
+
+    Args:
+        epoch_id(int): The current epoch ID.
+    """
+
    def __init__(self, epoch_id):
        self.epoch = epoch_id


 class EndEpochEvent(object):
+    """
+    The end of a training epoch.
+
+    Args:
+        epoch_id(int): The current epoch ID.
+    """
+
    def __init__(self, epoch_id):
        self.epoch = epoch_id


 class BeginStepEvent(object):
+    """
+    The begin of a training epoch.
+
+    Args:
+        epoch_id(int): The current epoch ID.
+        step_id(int): The current step ID.
+    """
+
    def __init__(self, epoch_id, step_id):
        self.epoch = epoch_id
        self.step = step_id
        self.fetch_metrics = True
+        """
+        If fetch_metrics is true, the metrics will be fetched at the 
+        EndStepEvent. Default is True.
+        """


 class EndStepEvent(object):
+    """
+    The end of a training step.
+
+    Args:
+        epoch_id(int): The current epoch ID.
+        step_id(int): The current step ID.
+        metrics(list): A list of fetched tensor. The order of this list is same
+            as the :code:`train_func` returns.
+    """
+
    def __init__(self, epoch_id, step_id, metrics):
        self.epoch = epoch_id
        self.step = step_id
@@ -57,6 +93,27 @@ class EndStepEvent(object):


 class CheckpointConfig(object):
+    """
+    Parameter object for :code:`fluid.io.save_checkpoint` and
+    :code:`fluid.Trainer`. Used to configuration how to save checkpoint.
+
+    Args:
+        checkpoint_dir(str): Directory path to save check point. Default is the
+            current directory.
+
+        max_num_checkpoints(int): The max number of local check points.
+        epoch_interval(int): Every number of epoch to save check point.
+        step_interval(int): Every number of step to save check point.
+
+    Examples:
+        >>> config = fluid.CheckpointConfig("./checkpoints")
+        >>> trainer = fluid.Trainer(train_func=train_program,
+        >>>                         place=place,
+        >>>                         optimizer_func=optimizer_func,
+        >>>                         checkpoint_config=config)
+        >>> trainer.train(...)
+    """
+
    def __init__(self,
                 checkpoint_dir=None,
                 max_num_checkpoints=3,
@@ -113,11 +170,62 @@ def check_and_get_place(place):

 class Trainer(object):
    """
+    A trainer wraps MultiGPU/MultiNode training loops and can be used to train a
+    simple neural network easily.
+
+    This API takes a :code:`train_func`. A :code:`train_func` is a function that
+    return loss as it first return value. The reset value can be fetched by
+    EndStepEvent.metrics
+
+    This API also takes a :code:`optimizer_func` that will return an optimizer
+    instance.
+
+    For example, to train a MLP for MNIST dataset, the sample program is
+
+    >>> import paddle.fluid as fluid
+    >>>
+    >>> def mlp(image, layer_sizes=[200, 100], activation="relu", num_classes=10):
+    >>>     hidden = image
+    >>>     for layer_size in layer_sizes:
+    >>>         hidden = fluid.layers.fc(input=hidden, size=layer_size, act=activation)
+    >>>     return fluid.layers.fc(input=hidden, size=num_classes, act="softmax")
+    >>>
+    >>> def train_mnist_mlp():
+    >>>     img = fluid.layers.data(name='image', shape=[784])
+    >>>     label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+    >>>     prediction = mlp(img)
+    >>>     return fluid.layers.mean(fluid.layers.cross_entropy(prediction, label))
+    >>>
+    >>> def optimizer():
+    >>>     return fluid.optimizer.Adam()
+    >>>
+    >>> trainer = Trainer(train_func=train_mnist_mlp,
+    >>>                   optimizer_func=optimizer,
+    >>>                   place=fluid.CUDAPlace(0),
+    >>>                   parallel=True)
+    >>>
+    >>> def train_callback(event):
+    >>>     if isinstance(event, fluid.EndStepEvent):
+    >>>         print "Epoch ID", event.epoch, "Step ID",\
+    >>>             event.step, "AvgLoss", event.metrics[0]
+    >>>     elif isinstance(event, fluid.EndEpochEvent):
+    >>>         trainer.save_params("./model_{0}".format(event.epoch))
+    >>>
+    >>> trainer.train(num_epochs=100, event_handler=train_callback)
+
+    For more example, please see :ref:`api_guide_high_level_api`.
+

    Args:
-        train_func(callable): A function which will return loss. The loss must be a scalar.
+        train_func(callable): A function which will return loss. The loss must be
+            a scalar tensor.
        optimizer_func(callable): A function that returns an Optimizer object.
-        place: The device place of this trainer.
+        place(CUDAPlace|CPUPlace): The device place of this trainer. If
+            :code:`parallel=True,` all CUDA Places will be used if :code:`place`
+            is a :code:`CUDAPlace`.
+        parallel(bool): True if use multiple devices.
+        checkpoint_config(CheckpointConfig): Configuration about how to save
+            checkpoints.
    """

    def __init__(self,
@@ -129,9 +237,6 @@ class Trainer(object):
                 checkpoint_config=None):
        self.__stop = False
        self.parallel = parallel
-        # 1. we need to generate a framework.Program by calling
-        # program_func. Reference: fluid.program_guard in
-        # test_word2vec.py

        # config for checkpoint
        # only chief worker will save variables
@@ -145,6 +250,10 @@ class Trainer(object):

        self.scope = core.Scope()

+        # 1. we need to generate a framework.Program by calling
+        # program_func. Reference: fluid.program_guard in
+        # test_word2vec.py
+
        self.startup_program = framework.Program()
        self.train_program = framework.Program()

@@ -277,17 +386,18 @@ class Trainer(object):

    def train(self, num_epochs, event_handler, reader=None, feed_order=None):
        """
-        Train the model.
+        Start the train loop to train the model.

        Args:
-            num_epochs: The number of epoch. An epoch will process all data in reader
-            event_handler: The event handler. A function with type (ev:Event)->void
-            reader:
-            feed_order: Feeding order of reader. None will following the defining
+            num_epochs(int): The number of epoch. An epoch will process all data in reader
+            event_handler(callable): The event handler. A function with type (ev:Event)->void
+            reader(callable): A reader creator object. See also
+                :ref:`api_guide_python_reader` .
+            feed_order(list): Feeding order of reader. None will following the defining
                order in program

        Returns:
-
+            None
        """
        training_role = os.getenv("PADDLE_TRAINING_ROLE", "")
        if training_role == "PSERVER":
@@ -307,16 +417,24 @@ class Trainer(object):
        Test the model on given test data

        Args:
-            reader: The reader that yields test data.
-            feed_order: Feeding order of reader. None will following the defining
-                order in program
+            reader(callable): The reader that yields test data.
+            feed_order(list): Feeding order of reader. None will following the
+                defining order in program
        """

        return self._test_by_executor(reader, feed_order,
                                      self.train_func_outputs)

    def save_params(self, param_path):
-        # reference: save_persistables in io.py
+        """
+        Save all parameters into :code:`param_path`.
+
+        Args:
+            param_path(str): The path to save parameters.
+
+        Returns:
+            None
+        """
        with self._prog_and_scope_guard():
            exe = executor.Executor(self.place)
            io.save_persistables(exe, dirname=param_path)

--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -12,14 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-Transpile the program to distributed data-parallelism programs.
-The main_program will be transformed to use a remote parameter server
-to do parameter optimization. And the optimization graph will be put
-into a parameter server program.
-
-Use different methods to split trainable variables to different
-parameter servers.
-
 Steps to transpile trainer:
 1. split variable to multiple blocks, aligned by product(dim[1:]) (width).
 2. rename splited grad variables to add trainer_id suffix ".trainer_%d".
@@ -117,129 +109,41 @@ def slice_variable(var_list, slice_count, min_block_size=8192):
    return blocks


-class DistributeTranspiler:
-    def _has_distributed_lookup_table(self):
-        # process lookup_table_op
-        # 1. check all lookup_table_op is distributed
-        # 2. check all lookup_table_op share the same table.
-        distributed_lookup_table_ops = []
-        # support only one distributed_lookup_table now
-        self.table_name = None
-        for op in self.origin_program.global_block().ops:
-            if op.type == LOOKUP_TABLE_TYPE:
-                if op.attrs['is_distributed'] is True:
-                    if self.table_name is None:
-                        self.table_name = op.input("W")[0]
-                    if self.table_name != op.input("W")[0]:
-                        raise RuntimeError("all distributed lookup_table_ops"
-                                           " should have only one table")
-                    distributed_lookup_table_ops.append(op)
-                else:
-                    if self.table_name is not None:
-                        assert op.input("W")[0] != self.table_name
-
-        return len(distributed_lookup_table_ops) > 0
-
-    def _update_dist_lookup_table_vars(self, param_list, grad_list,
-                                       params_grads):
-        # TODO(wuyi): put find a way to put dist lookup table stuff all together.
-        # update self.table_param_grad and self.trainer_side_table_grad_list
-        program = self.origin_program
-        if self.has_distributed_lookup_table:
-            param_list = [
-                param for param in param_list if param.name != self.table_name
-            ]
-            grad_list = [
-                grad for grad in grad_list
-                if grad.name != grad_var_name(self.table_name)
-            ]
-            self.table_param_grad = [
-                param_grad for param_grad in params_grads
-                if param_grad[0].name == self.table_name
-            ][0]
-            table_grad_var = self.table_param_grad[1]
-            if self.sync_mode:
-                self.trainer_side_table_grad_list = [
-                    program.global_block().create_var(
-                        name="%s.trainer_%d.pserver_%d" %
-                        (table_grad_var.name, self.trainer_id, index),
-                        type=table_grad_var.type,
-                        shape=table_grad_var.shape,
-                        dtype=table_grad_var.dtype)
-                    for index in range(len(self.pserver_endpoints))
-                ]
-            else:
-                self.trainer_side_table_grad_list = [
-                    program.global_block().create_var(
-                        name="%s.pserver_%d" % (table_grad_var.name, index),
-                        type=table_grad_var.type,
-                        shape=table_grad_var.shape,
-                        dtype=table_grad_var.dtype)
-                    for index in range(len(self.pserver_endpoints))
-                ]
-        return param_list, grad_list
-
-    def _init_splited_vars(self, slice_var_up):
-        # update these mappings for further transpile:
-        # 1. param_var_mapping: param var name -> [splited params vars]
-        # 2. grad_var_mapping: grad var name -> [splited grads vars]
-        # 3. grad_param_mapping: grad.blockx -> param.blockx
-        # 4. param_grad_ep_mapping: ep -> {"params": [], "grads": []}
-
-        param_list = []
-        grad_list = []
-        param_grad_set = set()
-        for p, g in self.params_grads:
-            # skip parameter marked not trainable
-            if type(p) == Parameter and p.trainable == False:
-                continue
-            if p.name not in param_grad_set:
-                param_list.append(p)
-                param_grad_set.add(p.name)
-            if g.name not in param_grad_set:
-                grad_list.append(g)
-                param_grad_set.add(g.name)
-
-        param_list, grad_list = self._update_dist_lookup_table_vars(
-            param_list, grad_list, self.params_grads)
-
-        if slice_var_up:
-            # when we slice var up into blocks, we will slice the var according to
-            # pserver services' count. A pserver may have two or more listening ports.
-            grad_blocks = slice_variable(grad_list, len(self.pserver_endpoints))
-            param_blocks = slice_variable(param_list,
-                                          len(self.pserver_endpoints))
-        else:
-            # when we do NOT slice var up into blocks, we will always slice params
-            # grads into one block.
-            grad_blocks = slice_variable(grad_list, 1)
-            param_blocks = slice_variable(param_list, 1)
-        assert (len(grad_blocks) == len(param_blocks))
-
-        # origin_varname -> [splited_var]
-        self.param_var_mapping = self._create_vars_from_blocklist(
-            self.origin_program, param_blocks)
-        self.grad_var_mapping = self._create_vars_from_blocklist(
-            self.origin_program,
-            grad_blocks,
-            add_trainer_suffix=self.trainer_num > 1)
-        self.grad_param_mapping = dict()
-        for g, p in zip(grad_blocks, param_blocks):
-            g_name, g_bid, _ = g.split(":")
-            p_name, p_bid, _ = p.split(":")
-            self.grad_param_mapping[self.grad_var_mapping[g_name][int(g_bid)]] =  \
-                    self.param_var_mapping[p_name][int(p_bid)]
-
-        # create mapping of endpoint -> split var to create pserver side program
-        self.param_grad_ep_mapping = dict()
-        [
-            self.param_grad_ep_mapping.update({
-                ep: {
-                    "params": [],
-                    "grads": []
-                }
-            }) for ep in self.pserver_endpoints
-        ]
+class DistributeTranspiler(object):
+    """
+    **DistributeTranspiler**
+
+    Convert the fluid program to distributed data-parallelism programs.
+
+    The main_program will be transformed to use a remote parameter server
+    to do parameter optimization. And the optimization graph will be put
+    into a parameter server program.
+
+    Examples:
+        .. code-block:: python
+
+           # Define your model before these codes.
+           port = os.getenv("PADDLE_PSERVER_PORT", "6174")
+           pserver_ips = os.getenv("PADDLE_PSERVER_IPS", "")
+           eplist = []
+           for ip in pserver_ips.split(","):
+                eplist.append(':'.join([ip, port]))
+           pserver_endpoints = ",".join(eplist)
+           trainers = int(os.getenv("PADDLE_TRAINERS"))
+           current_endpoint = os.getenv("PADDLE_CURRENT_IP", "") + ":" + port
+           trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
+           role = os.getenv("PADDLE_TRAINING_ROLE")
+
+           t = distribute_transpiler.DistributeTranspiler()
+           t.transpile(
+                trainer_id, pservers=pserver_endpoints, trainers=trainers)
+           if role == "PSERVER":
+                pserver_program = t.get_pserver_program(current_endpoint)
+                pserver_startup_program = t.get_startup_program(current_endpoint,
+                                                                pserver_program)
+           elif role == "TRAINER":
+                trainer_program = t.get_trainer_program()
+    """

    def transpile(self,
                  trainer_id,
@@ -250,15 +154,20 @@ class DistributeTranspiler:
                  split_method=RoundRobin,
                  sync_mode=True):
        """
+        Run the transpiler.
+
        Args:
-            trainer_id(int): one unique id for each trainer in a job.
-            program(Program): program to transpile, default is default_main_program
-            pservers(string): parameter server endpoints like "m1:6174,m2:6174"
-            trainers(int): total number of workers/trainers in the job
-            split_method(PSDispatcher): A function to determin how to split variables
-                to different servers equally.
-            sync_mode(boolean): if sync_mode is set True, it means that dist transpiler
-                will transpile the program into sync_mode pserver and trainer program.
+            trainer_id (int): id for current trainer worker, if you have
+                n workers, the id may range from 0 ~ n-1
+            program (Program|None): program to transpile,
+                default is fluid.default_main_program().
+            pservers (str): comma separated ip:port string for the pserver
+                list.
+            trainers (int): number of trainers in the distributed job.
+            slice_var_up (bool): Do Tensor slice for pservers, default is True.
+            split_method (PSDispatcher): RoundRobin or HashName can be used
+                try to choose the best method to balance loads for pservers.
+            sync_mode (bool): Do sync training or not, default is True.
        """
        assert (split_method.__bases__[0] == PSDispatcher)
        if program is None:
@@ -385,6 +294,12 @@ class DistributeTranspiler:
            self._split_table_grad_and_add_send_vars(program, pserver_endpoints)

    def get_trainer_program(self):
+        """
+        Get transpiled trainer side program.
+
+        Returns:
+            Program: trainer side program.
+        """
        # remove optimize ops and add a send op to main_program
        delete_ops(self.origin_program.global_block(), self.optimize_ops)
        # FIXME(typhoonzero): serialize once will fix error occurs when clone.
@@ -393,17 +308,19 @@ class DistributeTranspiler:

    def get_pserver_program(self, endpoint):
        """
-        Get pserver side program using the endpoint.
-        TODO(panyx0718): Revisit this assumption. what if #blocks > #pservers.
-        NOTE: assume blocks of the same variable is not distributed
-        on the same pserver, only change param/grad varnames for
-        trainers to fetch.
+        Get parameter server side program.
+        
        Args:
-          endpoint(string): the endpoint for the current pserver instance.
-
-        Returns(Program): the pserver program
-
+            endpoint (str): current parameter server endpoint.
+        
+        Returns:
+            Program: the program for current parameter server to run.
        """
+        # TODO(panyx0718): Revisit this assumption. what if #blocks > #pservers.
+        # NOTE: assume blocks of the same variable is not distributed
+        # on the same pserver, only change param/grad varnames for
+        # trainers to fetch.
+
        # step1
        pserver_program = Program()
        # step2: Create vars to receive vars at parameter servers.
@@ -481,7 +398,7 @@ class DistributeTranspiler:

        def __clone_lr_op_sub_block__(op, program, new_block, skip_sub_blks):
            if not op.has_attr('sub_block'):
-                return -1
+                return

            origin_block_desc = op.attr('sub_block')
            origin_block = self.origin_program.block(origin_block_desc.id)
@@ -587,11 +504,14 @@ class DistributeTranspiler:
        Get startup program for current parameter server.
        Modify operator input variables if there are variables that
        were split to several blocks.
-        Args:
-          endpoint(string): the endpoint for the current pserver instance.
-          pserver_program(Program): the program for pserver to execute.

-        Returns(Program): the startup program for pserver
+        Args:
+            endpoint (str): current pserver endpoint.
+            pserver_program (Program): call get_pserver_program first and
+                pass the result here.
+        
+        Returns:
+            Program: parameter server side startup program.
        """
        s_prog = Program()
        orig_s_prog = default_startup_program()
@@ -643,6 +563,129 @@ class DistributeTranspiler:

    # ====================== private transpiler functions =====================

+    def _has_distributed_lookup_table(self):
+        # process lookup_table_op
+        # 1. check all lookup_table_op is distributed
+        # 2. check all lookup_table_op share the same table.
+        distributed_lookup_table_ops = []
+        # support only one distributed_lookup_table now
+        self.table_name = None
+        for op in self.origin_program.global_block().ops:
+            if op.type == LOOKUP_TABLE_TYPE:
+                if op.attrs['is_distributed'] is True:
+                    if self.table_name is None:
+                        self.table_name = op.input("W")[0]
+                    if self.table_name != op.input("W")[0]:
+                        raise RuntimeError("all distributed lookup_table_ops"
+                                           " should have only one table")
+                    distributed_lookup_table_ops.append(op)
+                else:
+                    if self.table_name is not None:
+                        assert op.input("W")[0] != self.table_name
+
+        return len(distributed_lookup_table_ops) > 0
+
+    def _update_dist_lookup_table_vars(self, param_list, grad_list,
+                                       params_grads):
+        # TODO(wuyi): put find a way to put dist lookup table stuff all together.
+        # update self.table_param_grad and self.trainer_side_table_grad_list
+        program = self.origin_program
+        if self.has_distributed_lookup_table:
+            param_list = [
+                param for param in param_list if param.name != self.table_name
+            ]
+            grad_list = [
+                grad for grad in grad_list
+                if grad.name != grad_var_name(self.table_name)
+            ]
+            self.table_param_grad = [
+                param_grad for param_grad in params_grads
+                if param_grad[0].name == self.table_name
+            ][0]
+            table_grad_var = self.table_param_grad[1]
+            if self.sync_mode:
+                self.trainer_side_table_grad_list = [
+                    program.global_block().create_var(
+                        name="%s.trainer_%d.pserver_%d" %
+                        (table_grad_var.name, self.trainer_id, index),
+                        type=table_grad_var.type,
+                        shape=table_grad_var.shape,
+                        dtype=table_grad_var.dtype)
+                    for index in range(len(self.pserver_endpoints))
+                ]
+            else:
+                self.trainer_side_table_grad_list = [
+                    program.global_block().create_var(
+                        name="%s.pserver_%d" % (table_grad_var.name, index),
+                        type=table_grad_var.type,
+                        shape=table_grad_var.shape,
+                        dtype=table_grad_var.dtype)
+                    for index in range(len(self.pserver_endpoints))
+                ]
+        return param_list, grad_list
+
+    def _init_splited_vars(self, slice_var_up):
+        # update these mappings for further transpile:
+        # 1. param_var_mapping: param var name -> [splited params vars]
+        # 2. grad_var_mapping: grad var name -> [splited grads vars]
+        # 3. grad_param_mapping: grad.blockx -> param.blockx
+        # 4. param_grad_ep_mapping: ep -> {"params": [], "grads": []}
+
+        param_list = []
+        grad_list = []
+        param_grad_set = set()
+        for p, g in self.params_grads:
+            # skip parameter marked not trainable
+            if type(p) == Parameter and p.trainable == False:
+                continue
+            if p.name not in param_grad_set:
+                param_list.append(p)
+                param_grad_set.add(p.name)
+            if g.name not in param_grad_set:
+                grad_list.append(g)
+                param_grad_set.add(g.name)
+
+        param_list, grad_list = self._update_dist_lookup_table_vars(
+            param_list, grad_list, self.params_grads)
+
+        if slice_var_up:
+            # when we slice var up into blocks, we will slice the var according to
+            # pserver services' count. A pserver may have two or more listening ports.
+            grad_blocks = slice_variable(grad_list, len(self.pserver_endpoints))
+            param_blocks = slice_variable(param_list,
+                                          len(self.pserver_endpoints))
+        else:
+            # when we do NOT slice var up into blocks, we will always slice params
+            # grads into one block.
+            grad_blocks = slice_variable(grad_list, 1)
+            param_blocks = slice_variable(param_list, 1)
+        assert (len(grad_blocks) == len(param_blocks))
+
+        # origin_varname -> [splited_var]
+        self.param_var_mapping = self._create_vars_from_blocklist(
+            self.origin_program, param_blocks)
+        self.grad_var_mapping = self._create_vars_from_blocklist(
+            self.origin_program,
+            grad_blocks,
+            add_trainer_suffix=self.trainer_num > 1)
+        self.grad_param_mapping = dict()
+        for g, p in zip(grad_blocks, param_blocks):
+            g_name, g_bid, _ = g.split(":")
+            p_name, p_bid, _ = p.split(":")
+            self.grad_param_mapping[self.grad_var_mapping[g_name][int(g_bid)]] =  \
+                    self.param_var_mapping[p_name][int(p_bid)]
+
+        # create mapping of endpoint -> split var to create pserver side program
+        self.param_grad_ep_mapping = dict()
+        [
+            self.param_grad_ep_mapping.update({
+                ep: {
+                    "params": [],
+                    "grads": []
+                }
+            }) for ep in self.pserver_endpoints
+        ]
+
    # transpiler function for dis lookup_table
    def _replace_lookup_table_op_with_prefetch(self, program,
                                               pserver_endpoints):

--- a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
+++ b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
@@ -383,6 +383,16 @@ def memory_optimize(input_program, skip_opt_set=None, print_log=False, level=0):


 def release_memory(input_program, skip_opt_set=None):
+    """
+    Modify the input program and insert :code:`delete_op` to early drop not used
+    variables. The modification will be performed inplace.
+
+    Notes: This is an experimental API and could be removed in next few
+    releases. Users should not use this API.
+
+    Args:
+        input_program(Program): The program will be inserted :code:`delete_op`.
+    """
    cfgs = _get_cfgs(input_program)
    for cfg in cfgs:
        cfg.release_memory(skip_opt_set=skip_opt_set)
--- a/python/paddle/fluid/transpiler/ps_dispatcher.py
+++ b/python/paddle/fluid/transpiler/ps_dispatcher.py
@@ -33,15 +33,21 @@ class PSDispatcher(object):

    def dispatch(self, varlist):
        """
-        :param varlist: a list of Variables
-        :return: a map of pserver endpoint -> varname 
+        Args:
+            varlist(list): a list of Variables
+        Returns:
+            a map of pserver endpoint -> varname
        """
        AssertionError("Interface has not been implemented.")


 class HashName(PSDispatcher):
    """
-      Hash variable names to several endpoints
+    Hash variable names to several endpoints using python
+    "hash()" function.
+
+    Args:
+        pserver_endpoints (list): list of endpoint(ip:port).
    """

    def __init__(self, pserver_endpoints):
@@ -61,7 +67,11 @@ class HashName(PSDispatcher):

 class RoundRobin(PSDispatcher):
    """
-    Distribute variables to serveral endpoints.
+    Distribute variables to serveral endpoints using
+    RondRobin<https://en.wikipedia.org/wiki/Round-robin_scheduling> method.
+
+    Args:
+        pserver_endpoints (list): list of endpoint(ip:port).
    """

    def __init__(self, pserver_endpoints):

--- a/python/paddle/fluid/unique_name.py
+++ b/python/paddle/fluid/unique_name.py
@@ -16,7 +16,7 @@ import collections
 import contextlib
 import sys

-__all__ = ['generate', 'switch', 'guard', 'UniqueNameGenerator']
+__all__ = ['generate', 'switch', 'guard']


 class UniqueNameGenerator(object):

--- a/tools/codestyle/docstring_checker.py
+++ b/tools/codestyle/docstring_checker.py
@@ -291,6 +291,8 @@ class DocstringChecker(BaseChecker):
            True if successful otherwise False.
        """

+        if node.name.startswith("__") or node.name.startswith("_"):
+            return True
        find = False
        for t in node.body:
            if not isinstance(t, astroid.Return):
@@ -316,6 +318,8 @@ class DocstringChecker(BaseChecker):
        Returns:
            True if successful otherwise False.
        """
+        if node.name.startswith("__") or node.name.startswith("_"):
+            return True
        args = []
        for arg in node.args.get_children():
            if (not isinstance(arg, astroid.AssignName)) \

--- a/tools/print_signatures.py
+++ b/tools/print_signatures.py
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Print all signature of a python module in alphabet order.
+
+Usage:
+    ./print_signature  "paddle.fluid" > signature.txt
+"""
+import importlib
+import inspect
+import collections
+import sys
+import pydoc
+
+member_dict = collections.OrderedDict()
+
+
+def visit_member(parent_name, member):
+    cur_name = ".".join([parent_name, member.__name__])
+    if inspect.isclass(member):
+        for name, value in inspect.getmembers(member):
+            if hasattr(value, '__name__') and (not name.startswith("_") or
+                                               name == "__init__"):
+                visit_member(cur_name, value)
+    elif callable(member):
+        try:
+            member_dict[cur_name] = inspect.getargspec(member)
+        except TypeError:  # special for PyBind method
+            member_dict[cur_name] = "  ".join([
+                line.strip() for line in pydoc.render_doc(member).split('\n')
+                if "->" in line
+            ])
+
+    else:
+        raise RuntimeError("Unsupported generate signature of member, type {0}".
+                           format(str(type(member))))
+
+
+def visit_all_module(mod):
+    for member_name in (
+            name
+            for name in (mod.__all__ if hasattr(mod, "__all__") else dir(mod))
+            if not name.startswith("_")):
+        instance = getattr(mod, member_name, None)
+        if instance is None:
+            continue
+        if inspect.ismodule(instance):
+            visit_all_module(instance)
+        else:
+            visit_member(mod.__name__, instance)
+
+
+visit_all_module(importlib.import_module(sys.argv[1]))
+
+for name in member_dict:
+    print name, member_dict[name]