Crayon鑫 / Paddle (forked from PaddlePaddle / Paddle)
Commit 28ff4bdd
Authored on June 20, 2018 by sneaxiy

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into python_data_feeding

Parents: 882a9327, 1d7e60fd
Changed files: 48
Showing 48 changed files with 2358 additions and 459 deletions (+2358, -459)
benchmark/fluid/fluid_benchmark.py  +1 -1
benchmark/fluid/kube_gen_job.py  +11 -5
cmake/external/mkldnn.cmake  +2 -1
doc/fluid/howto/cluster/fluid_cluster_train_cn.md  +2 -2
doc/fluid/howto/cluster/fluid_recordio.md  +2 -2
paddle/fluid/operators/activation_op.cc  +2 -2
paddle/fluid/operators/detection_map_op.cc  +6 -6
paddle/fluid/operators/gaussian_random_mkldnn_op.cc  +55 -0
paddle/fluid/operators/gaussian_random_op.cc  +19 -2
paddle/fluid/operators/math/concat.cu  +12 -31
paddle/fluid/operators/parallel_do_op.cc  +1 -1
paddle/fluid/operators/recurrent_op.cc  +2 -1
paddle/fluid/operators/sum_mkldnn_op.cc  +240 -0
paddle/fluid/operators/sum_op.cc  +26 -6
paddle/fluid/operators/while_op.cc  +2 -2
paddle/fluid/platform/mkldnn_helper.h  +6 -0
python/paddle/fluid/average.py  +19 -0
python/paddle/fluid/backward.py  +65 -17
python/paddle/fluid/clip.py  +123 -11
python/paddle/fluid/data_feeder.py  +10 -1
python/paddle/fluid/inferencer.py  +37 -9
python/paddle/fluid/initializer.py  +141 -91
python/paddle/fluid/io.py  +568 -100
python/paddle/fluid/layers/control_flow.py  +30 -3
python/paddle/fluid/layers/detection.py  +43 -2
python/paddle/fluid/layers/nn.py  +196 -77
python/paddle/fluid/layers/tensor.py  +17 -13
python/paddle/fluid/optimizer.py  +295 -19
python/paddle/fluid/profiler.py  +112 -5
python/paddle/fluid/regularizer.py  +43 -3
python/paddle/fluid/tests/book/notest_understand_sentiment.py  +5 -5
python/paddle/fluid/tests/book/test_fit_a_line.py  +5 -5
python/paddle/fluid/tests/book/test_image_classification.py  +5 -5
python/paddle/fluid/tests/book/test_label_semantic_roles.py  +5 -5
python/paddle/fluid/tests/book/test_machine_translation.py  +5 -5
python/paddle/fluid/tests/book/test_recognize_digits.py  +5 -5
python/paddle/fluid/tests/book/test_recommender_system.py  +5 -5
python/paddle/fluid/tests/book/test_word2vec.py  +5 -5
python/paddle/fluid/tests/unittests/test_concat_op.py  +12 -1
python/paddle/fluid/tests/unittests/test_gaussian_random_mkldnn_op.py  +26 -0
python/paddle/fluid/tests/unittests/test_gaussian_random_op.py  +12 -1
python/paddle/fluid/tests/unittests/test_layers.py  +9 -0
python/paddle/fluid/tests/unittests/test_optimizer.py  +66 -0
python/paddle/fluid/tests/unittests/test_sum_mkldnn_op.py  +26 -0
python/paddle/fluid/tests/unittests/test_sum_op.py  +6 -0
python/paddle/fluid/transpiler/distribute_transpiler.py  +4 -2
python/paddle/reader/decorator.py  +2 -2
tools/print_signatures.py  +67 -0
benchmark/fluid/fluid_benchmark.py
@@ -97,7 +97,7 @@ def dist_transpile(trainer_id, args):
         return train_program, fluid.default_startup_program()
     else:
         raise ValueError(
-            'TRAINING_ROLE environment variable must be either TRAINER or PSERVER'
+            'PADDLE_TRAINING_ROLE environment variable must be either TRAINER or PSERVER'
         )
benchmark/fluid/kube_gen_job.py
@@ -108,10 +108,10 @@ def gen_job():
     tn_container["ports"][0]["containerPort"] = spreadport

     envs.append({"name": "PADDLE_JOB_NAME", "value": args.jobname})
-    envs.append({"name": "TRAINERS", "value": str(args.trainers)})
+    envs.append({"name": "PADDLE_TRAINERS", "value": str(args.trainers)})
     envs.append({"name": "PSERVERS", "value": str(args.pservers)})
     envs.append({"name": "ENTRY", "value": args.entry})
-    envs.append({"name": "PADDLE_INIT_PORT", "value": str(args.port)})
+    envs.append({"name": "PADDLE_PSERVER_PORT", "value": str(args.port)})
     envs.append({"name": "PADDLE_PSERVER_PORT", "value": str(args.port)})
     # NOTE: these directories below are cluster specific, please modify
     # this settings before you run on your own cluster.
@@ -167,16 +167,22 @@ def gen_job():
     tn_container["volumeMounts"] = volumeMounts

     ps_container["env"] = envs
-    ps_container["env"].append({"name": "TRAINING_ROLE", "value": "PSERVER"})
+    ps_container["env"].append({
+        "name": "PADDLE_TRAINING_ROLE",
+        "value": "PSERVER"
+    })
     tn_container["env"] = envs
     if args.disttype == "pserver":
         tn_container["env"].append({
-            "name": "TRAINING_ROLE",
+            "name": "PADDLE_TRAINING_ROLE",
             "value": "TRAINER"
         })
     elif args.disttype == "nccl2" or args.disttype == "local":
         # NCCL2 have no training role, set to plain WORKER
-        tn_container["env"].append({"name": "TRAINING_ROLE", "value": "WORKER"})
+        tn_container["env"].append({
+            "name": "PADDLE_TRAINING_ROLE",
+            "value": "WORKER"
+        })

     os.mkdir(args.jobname)
     if args.disttype == "pserver":
cmake/external/mkldnn.cmake
@@ -45,7 +45,8 @@ IF(${CBLAS_PROVIDER} STREQUAL "MKLML")
 ELSE()
   MESSAGE(FATAL_ERROR "Should enable MKLML when build MKLDNN")
 ENDIF()

-SET(MKLDNN_FLAG "-Wno-error=strict-overflow -Wno-error=unused-result -Wno-unused-result")
+SET(MKLDNN_FLAG "-Wno-error=strict-overflow -Wno-error=unused-result")
+SET(MKLDNN_FLAG "${MKLDNN_FLAG} -Wno-unused-result -Wno-unused-value")
 SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} ${MKLDNN_FLAG}")
 SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} ${MKLDNN_FLAG}")

 ExternalProject_Add(
doc/fluid/howto/cluster/fluid_cluster_train_cn.md
@@ -168,13 +168,13 @@ cd /paddle/python/paddle/fluid/tests/book
 Step 2: start the Parameter Server:
 ```bash
-PADDLE_INIT_PORT=6174 PADDLE_INIT_PSERVERS=192.168.1.2 TRAINERS=2 POD_IP=192.168.1.2 PADDLE_INIT_TRAINER_ID=1 TRAINING_ROLE=PSERVER python test_fit_a_line.py
+PADDLE_PSERVER_PORT=6174 PADDLE_PSERVER_IPS=192.168.1.2 PADDLE_TRAINERS=2 PADDLE_CURRENT_IP=192.168.1.2 PADDLE_TRAINER_ID=1 PADDLE_TRAINING_ROLE=PSERVER python test_fit_a_line.py
 ```
 After running the command, wait for the prompt ```Server listening on 192.168.1.2:6174 ```, which indicates the Parameter Server has started successfully.

 Step 3: start the Trainer:
 ```bash
-PADDLE_INIT_PORT=6174 PADDLE_INIT_PSERVERS=192.168.1.3 TRAINERS=2 POD_IP=192.168.1.3 PADDLE_INIT_TRAINER_ID=1 TRAINING_ROLE=TRAINER python test_fit_a_line.py
+PADDLE_PSERVER_PORT=6174 PADDLE_PSERVER_IPS=192.168.1.3 PADDLE_TRAINERS=2 PADDLE_CURRENT_IPP=192.168.1.3 PADDLE_TRAINER_ID=1 PADDLE_TRAINING_ROLE=TRAINER python test_fit_a_line.py
 ```
 Since we configured two Trainers, another Trainer needs to be started on a second compute node.
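For orientation only: the variables renamed above are what a fluid training script reads at startup to decide whether it acts as a trainer or a parameter server (benchmark/fluid/fluid_benchmark.py's dist_transpile does exactly this). The sketch below only parses the environment; the helper name, defaults, and demo values are illustrative and are not part of this commit.

```python
import os

# Demo values; on a real cluster kube_gen_job.py injects these into the pod env.
os.environ.setdefault("PADDLE_TRAINING_ROLE", "TRAINER")
os.environ.setdefault("PADDLE_PSERVER_IPS", "192.168.1.2")
os.environ.setdefault("PADDLE_PSERVER_PORT", "6174")
os.environ.setdefault("PADDLE_TRAINERS", "2")
os.environ.setdefault("PADDLE_TRAINER_ID", "0")
os.environ.setdefault("PADDLE_CURRENT_IP", "192.168.1.3")


def training_role_config():
    """Collect the renamed cluster settings from the environment."""
    role = os.getenv("PADDLE_TRAINING_ROLE")
    if role not in ("TRAINER", "PSERVER"):
        raise ValueError(
            "PADDLE_TRAINING_ROLE environment variable must be either "
            "TRAINER or PSERVER")
    port = os.getenv("PADDLE_PSERVER_PORT", "6174")
    ips = os.getenv("PADDLE_PSERVER_IPS", "")
    endpoints = ",".join("%s:%s" % (ip, port) for ip in ips.split(",") if ip)
    return {
        "role": role,
        "trainer_id": int(os.getenv("PADDLE_TRAINER_ID", "0")),
        "trainers": int(os.getenv("PADDLE_TRAINERS", "1")),
        "current_endpoint": "%s:%s" % (os.getenv("PADDLE_CURRENT_IP", ""), port),
        "pserver_endpoints": endpoints,
    }


print(training_role_config())
```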
doc/fluid/howto/cluster/fluid_recordio.md
@@ -114,8 +114,8 @@ def gen_train_list(file_pattern, trainers, trainer_id):
         ret_list.append(f)
     return ret_list

-trainers = int(os.getenv("TRAINERS"))
-trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID"))
+trainers = int(os.getenv("PADDLE_TRAINERS"))
+trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
 data_file = fluid.layers.io.open_files(
     filenames=gen_train_list("./mnist-[0-9]*.recordio", 2, 0),
     thread_num=1,
paddle/fluid/operators/activation_op.cc
@@ -143,7 +143,7 @@ $$out = \\frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$
 __attribute__((unused)) constexpr char TanhShrinkDoc[] = R"DOC(
 TanhShrink Activation Operator.

-$$out = x - \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$
+$$out = x - \\frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$

 )DOC";
@@ -385,7 +385,7 @@ class STanhOpMaker : public framework::OpProtoAndCheckerMaker {
     AddComment(R"DOC(
 STanh Activation Operator.

-$$out = b * \frac{e^{a * x} - e^{-a * x}}{e^{a * x} + e^{-a * x}}$$
+$$out = b * \\frac{e^{a * x} - e^{-a * x}}{e^{a * x} + e^{-a * x}}$$

 )DOC");
   }
paddle/fluid/operators/detection_map_op.cc
@@ -175,12 +175,12 @@ class DetectionMAPOpMaker : public framework::OpProtoAndCheckerMaker {
(only the unchanged context of this hunk is recoverable from the rendered diff)
     AddComment(R"DOC(
 Detection mAP evaluate operator.
 The general steps are as follows. First, calculate the true positive and
 false positive according to the input of detection and labels, then
 calculate the mAP evaluate value.
 Supporting '11 point' and 'integral' mAP algorithm. Please get more information
 from the following articles:
 https://sanchom.wordpress.com/tag/average-precision/
 https://arxiv.org/abs/1512.02325

 )DOC");
   }
paddle/fluid/operators/gaussian_random_mkldnn_op.cc (new file, mode 100644)

/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <string>
#include "paddle/fluid/operators/mean_op.h"

namespace paddle {
namespace operators {

using framework::DataLayout;
template <typename T>
class GaussianMKLDNNKernel : public paddle::framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    float mean = context.Attr<float>("mean");
    float std = context.Attr<float>("std");
    auto* tensor = context.Output<framework::Tensor>("Out");
    T* data = tensor->mutable_data<T>(context.GetPlace());

    unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
    std::minstd_rand engine;
    if (seed == 0) {
      seed = std::random_device()();
    }
    engine.seed(seed);
    std::normal_distribution<T> dist(mean, std);
    int64_t size = tensor->numel();
    for (int64_t i = 0; i < size; ++i) {
      data[i] = dist(engine);
    }

    // The format of output is set as the mkldnn's format
    // TODO(@mozga-intel) The format of matrix sets inside the another layers.
    tensor->set_layout(DataLayout::kMKLDNN);
    tensor->set_format(mkldnn::memory::format::oihw);
  }
};
}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;

REGISTER_OP_KERNEL(gaussian_random, MKLDNN, ::paddle::platform::CPUPlace,
                   ops::GaussianMKLDNNKernel<float>);
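The new kernel draws every element of Out from N(mean, std), seeding std::minstd_rand from the op's `seed` attribute and falling back to std::random_device when the seed is 0. The NumPy sketch below mirrors only that seeding rule (it uses a different generator, so values will not match the C++ kernel), and the helper name is made up for illustration.

```python
import numpy as np


def gaussian_fill(shape, mean=0.0, std=1.0, seed=0):
    """Fill an array with N(mean, std) samples, mirroring the kernel's seed rule."""
    if seed == 0:
        # seed == 0 means "non-deterministic" in the kernel (std::random_device);
        # here we simply draw a fresh seed instead.
        seed = np.random.randint(1, 2**31 - 1)
    rng = np.random.RandomState(seed)
    return rng.normal(loc=mean, scale=std, size=shape).astype(np.float32)


a = gaussian_fill((2, 3), seed=10)
b = gaussian_fill((2, 3), seed=10)
assert np.array_equal(a, b)  # a fixed, non-zero seed is reproducible
```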
paddle/fluid/operators/gaussian_random_op.cc
@@ -15,6 +15,10 @@ limitations under the License. */
 #include <random>
 #include "paddle/fluid/framework/op_registry.h"
+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle/fluid/platform/mkldnn_helper.h"
+#endif
+
 namespace paddle {
 namespace operators {
@@ -62,9 +66,20 @@ class GaussianRandomOp : public framework::OperatorWithKernel {
 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
+    framework::LibraryType library{framework::LibraryType::kPlain};
+    framework::DataLayout layout{framework::DataLayout::kAnyLayout};
+#ifdef PADDLE_WITH_MKLDNN
+    if (library == framework::LibraryType::kPlain &&
+        platform::CanMKLDNNBeUsed(ctx)) {
+      library = framework::LibraryType::kMKLDNN;
+      layout = framework::DataLayout::kMKLDNN;
+    }
+#endif
    return framework::OpKernelType(
        static_cast<framework::proto::VarType::Type>(ctx.Attr<int>("dtype")),
-        ctx.device_context());
+        ctx.device_context(), layout, library);
  }
 };
@@ -95,7 +110,9 @@ class GaussianRandomOpMaker : public framework::OpProtoAndCheckerMaker {
               "(int, default 5(FP32)) "
               "Output data type.")
         .SetDefault(framework::proto::VarType::FP32);
+    AddAttr<bool>("use_mkldnn",
+                  "(bool, default false) Only used in mkldnn kernel")
+        .SetDefault(false);
     AddComment(R"DOC(
 GaussianRandom Operator.
paddle/fluid/operators/math/concat.cu
@@ -22,43 +22,24 @@ namespace paddle {
 namespace operators {
 namespace math {

-template <typename T>
-__device__ T upper_bound(const T* first, T count, T val) {
-  const T* orig = first;
-  const T* it = nullptr;
-  T step = 0;
-  while (count > 0) {
-    it = first;
-    step = count / 2;
-    it += step;
-    if (!(val < *it)) {
-      first = ++it;
-      count -= step + 1;
-    } else {
-      count = step;
-    }
-  }
-  return first - orig;
-}
-
 template <typename T>
 __global__ void KernelConcat(T** inputs, const int* input_cols, int col_size,
                              const int output_rows, const int output_cols,
                              T* output) {
   int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
-  int segment = upper_bound<int>(input_cols, col_size, tid_x) - 1;
-
-  int curr_offset = input_cols[segment];
-  int curr_segment = segment;
+  int curr_segment = 0;
+  int curr_offset = input_cols[0];
   for (; tid_x < output_cols; tid_x += blockDim.x * gridDim.x) {
-    T curr_col_offset;
-    while ((curr_col_offset = input_cols[curr_segment + 1]) <= tid_x) {
+    int curr_col_offset = input_cols[curr_segment + 1];
+    while (curr_col_offset <= tid_x) {
       curr_offset = curr_col_offset;
       ++curr_segment;
+      curr_col_offset = input_cols[curr_segment + 1];
     }

     int local_col = tid_x - curr_offset;
     int segment_width = curr_col_offset - curr_offset;
+
     T* input_ptr = inputs[curr_segment];
     int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
     for (; tid_y < output_rows; tid_y += blockDim.y * gridDim.y)
@@ -89,14 +70,14 @@ __global__ void KernelConcatGrad(const T* input_data, const int in_row,
                                  const int in_col, const int* out_cols,
                                  int out_cols_size, T** outputs_data) {
   int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
-  int segment = upper_bound<int>(out_cols, out_cols_size, tid_x) - 1;
-  int curr_offset = out_cols[segment];
-  int curr_segment = segment;
+  int curr_segment = 0;
+  int curr_offset = out_cols[0];
   for (; tid_x < in_col; tid_x += blockDim.x * gridDim.x) {
-    T curr_col_offset;
-    while ((curr_col_offset = out_cols[curr_segment + 1]) <= tid_x) {
+    int curr_col_offset = out_cols[curr_segment + 1];
+    while (curr_col_offset <= tid_x) {
       curr_offset = curr_col_offset;
       ++curr_segment;
+      curr_col_offset = out_cols[curr_segment + 1];
     }

     int local_col = tid_x - curr_offset;
@@ -228,7 +209,7 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> {
     outputs_cols[0] = 0;
     for (int i = 0; i < o_num; ++i) {
-      int t_col = outputs->at(i)->numel() / out_row;
+      int t_col = ref_inputs.at(i)->numel() / out_row;
       if (sameShape) {
         if (t_col != out0_col) sameShape = false;
       }
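Both kernels map a global column index to a (segment, local column) pair against a prefix-offset array; the change drops the per-thread binary search (`upper_bound`) and instead advances a running segment cursor as `tid_x` grows. A sequential Python model of that cursor scan, with made-up example offsets, is sketched below.

```python
def locate_columns(col_offsets, num_cols):
    """Map each global column index to (segment, local_col) by scanning the
    prefix-offset array the same way the updated CUDA kernels do.

    col_offsets: e.g. [0, 3, 5, 9] for three inputs of width 3, 2 and 4.
    """
    curr_segment = 0
    curr_offset = col_offsets[0]
    result = []
    for tid_x in range(num_cols):          # a thread's strided loop over columns
        curr_col_offset = col_offsets[curr_segment + 1]
        while curr_col_offset <= tid_x:    # advance to the segment containing tid_x
            curr_offset = curr_col_offset
            curr_segment += 1
            curr_col_offset = col_offsets[curr_segment + 1]
        result.append((curr_segment, tid_x - curr_offset))
    return result


print(locate_columns([0, 3, 5, 9], 9))
# [(0, 0), (0, 1), (0, 2), (1, 0), (1, 1), (2, 0), (2, 1), (2, 2), (2, 3)]
```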
paddle/fluid/operators/parallel_do_op.cc
@@ -295,7 +295,7 @@ class ParallelDoGradOp : public framework::OperatorBase {
         auto sum_op = framework::OpRegistry::CreateOp(
             "sum", {{"X", {s, tmp_name}}}, {{"Out", {s}}},
-            framework::AttributeMap{});
+            framework::AttributeMap{{"use_mkldnn", {false}}});
         VLOG(10) << sum_op->DebugStringEx(sub_scopes[0]);
         sum_op->Run(*sub_scopes[0], places[0]);
         WaitOnPlace(places[0]);
paddle/fluid/operators/recurrent_op.cc
@@ -429,7 +429,8 @@ class RecurrentGradOp : public RecurrentBase {
       auto sum_op = framework::OpRegistry::CreateOp(
           "sum", {{"X", {pg_names[param_id], new_inside_name}}},
-          {{"Out", {pg_names[param_id]}}}, framework::AttributeMap{});
+          {{"Out", {pg_names[param_id]}}},
+          framework::AttributeMap{{"use_mkldnn", {false}}});
       sum_op->Run(cur_scope, place);

       cur_scope.Rename(new_inside_name, inside_grad_name);
paddle/fluid/operators/sum_mkldnn_op.cc (new file, mode 100644)

// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

/*Licensed under the Apache License, Version 2.0(the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "mkldnn.hpp"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/fluid/operators/sum_op.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/mkldnn_helper.h"

namespace paddle {
namespace operators {

using paddle::framework::Tensor;
using paddle::platform::MKLDNNDeviceContext;
using paddle::platform::CPUDeviceContext;
using framework::DataLayout;
using mkldnn::memory;
using mkldnn::primitive;
using mkldnn::stream;
using mkldnn::sum;
using mkldnn::reorder;
using platform::to_void_cast;

template <typename T>
class SumMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
 public:
  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
    PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
                   "It must use CPUPlace.");
    auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
    const auto& mkldnn_engine = dev_ctx.GetEngine();
    auto in_vars = ctx.MultiInputVar("X");

    const int N = in_vars.size();
    auto out_var = ctx.OutputVar("Out");
    bool in_place = out_var == in_vars[0];

    if (out_var->IsType<framework::LoDTensor>()) {
      LoDTensor* output = ctx.Output<LoDTensor>("Out");
      T* output_data = output->mutable_data<T>(ctx.GetPlace());

      std::vector<int> dst_tz = framework::vectorize2int(output->dims());
      auto src_tz = dst_tz;
      memory::format output_format{memory::format::format_undef};
      std::vector<float> scales;
      std::vector<memory::primitive_desc> srcs_mpd;
      std::vector<mkldnn::memory> srcs_mem;

      PADDLE_ENFORCE(in_vars[0]->IsType<LoDTensor>(),
                     "Input[0] must be LoDTensors");
      auto& input0 = in_vars[0]->Get<LoDTensor>();
      PADDLE_ENFORCE(input0.layout() == DataLayout::kMKLDNN &&
                         input0.format() != memory::format::format_undef,
                     "Wrong layout/format for inputs[0]");

      memory::format input_format = input0.format();

      if (src_tz.size() == 1 && (input_format == memory::format::nchw ||
                                 input_format == memory::format::nhwc)) {
        input_format = memory::format::x;
      }
      if (src_tz.size() == 2 && (input_format == memory::format::nchw ||
                                 input_format == memory::format::nhwc)) {
        input_format = memory::format::nc;
      }

      for (int i = in_place ? 1 : 0; i < N; i++) {
        PADDLE_ENFORCE(in_vars[i]->IsType<LoDTensor>(),
                       "all inputs must be all LoDTensors");
        auto& input = in_vars[i]->Get<LoDTensor>();
        PADDLE_ENFORCE(input.layout() == DataLayout::kMKLDNN &&
                           input.format() != memory::format::format_undef,
                       "Wrong layout/format for inputs");

        if (input.numel() == 0) {
          continue;
        }

        const T* input_data = input.data<T>();

        auto src_md =
            memory::desc(src_tz, memory::data_type::f32, input_format);
        auto src_mpd = memory::primitive_desc(src_md, mkldnn_engine);
        auto src_mem = memory(src_mpd, to_void_cast(input_data));
        srcs_mpd.push_back(src_mpd);
        srcs_mem.push_back(src_mem);
        scales.push_back(1.0);
      }

      auto dst_md =
          memory::desc(dst_tz, memory::data_type::f32, memory::format::any);

      auto sum_pd = sum::primitive_desc(dst_md, scales, srcs_mpd);

      std::shared_ptr<memory> dst_mem;
      if (in_place) {
        dst_mem.reset(new memory(sum_pd.dst_primitive_desc()));
      } else {
        dst_mem.reset(new memory(sum_pd.dst_primitive_desc(), output_data));
      }
      std::vector<mkldnn::primitive::at> inputs;
      for (size_t i = 0; i < srcs_mem.size(); ++i) {
        inputs.push_back(srcs_mem[i]);
      }

      auto sum_prim = mkldnn::sum(sum_pd, inputs, *dst_mem);
      output_format = (memory::format)platform::GetMKLDNNFormat(sum_pd);

      primitive reorder_prim;
      std::shared_ptr<memory> target_mem;
      if (in_place) {
        output_format = input_format;
        target_mem.reset(new memory(
            {{{src_tz}, memory::data_type::f32, output_format}, mkldnn_engine},
            output_data));
        reorder_prim = reorder(*dst_mem, *target_mem);
      }

      std::vector<primitive> pipeline;
      pipeline.push_back(sum_prim);
      if (in_place) pipeline.push_back(reorder_prim);
      stream(stream::kind::eager).submit(pipeline).wait();

      output->set_layout(DataLayout::kMKLDNN);
      output->set_format(output_format);
    } else if (out_var->IsType<framework::SelectedRows>()) {
      // TODO(@mozga-intel) Add MKLDNN SelectedRows support
      std::unique_ptr<framework::SelectedRows> in0;
      if (in_place) {
        // If is in_place, we store the input[0] to in0
        auto& in_sel0 = in_vars[0]->Get<SelectedRows>();
        auto& rows = in_sel0.rows();
        in0.reset(new framework::SelectedRows(rows, in_sel0.height()));
        in0->mutable_value()->ShareDataWith(in_sel0.value());
      }

      auto get_selected_row = [&](size_t i) -> const SelectedRows& {
        if (i == 0 && in0) {
          return *in0.get();
        } else {
          return in_vars[i]->Get<SelectedRows>();
        }
      };

      auto* out = ctx.Output<SelectedRows>("Out");
      out->mutable_rows()->clear();
      auto* out_value = out->mutable_value();

      // Runtime InferShape
      size_t first_dim = 0;
      for (int i = 0; i < N; i++) {
        auto& sel_row = get_selected_row(i);
        first_dim += sel_row.rows().size();
      }
      auto in_dim =
          framework::vectorize(get_selected_row(N - 1).value().dims());
      in_dim[0] = static_cast<int64_t>(first_dim);

      out_value->Resize(framework::make_ddim(in_dim));

      // if all the input sparse vars are empty, no need to
      // merge these vars.
      if (first_dim == 0UL) {
        return;
      }
      out_value->mutable_data<T>(ctx.GetPlace());

      math::SelectedRowsAddTo<CPUDeviceContext, T> functor;

      int64_t offset = 0;
      for (int i = 0; i < N; i++) {
        auto& sel_row = get_selected_row(i);
        if (sel_row.rows().size() == 0) {
          continue;
        }
        PADDLE_ENFORCE_EQ(out->height(), sel_row.height());
        functor(ctx.template device_context<CPUDeviceContext>(), sel_row,
                offset, out);
        offset += sel_row.value().numel();
      }
    } else if (out_var->IsType<framework::LoDTensorArray>()) {
      // TODO(@mozga-intel) Add MKLDNN LoDTensorArray support
      auto& out_array = *out_var->GetMutable<framework::LoDTensorArray>();
      for (size_t i = in_place ? 1 : 0; i < in_vars.size(); ++i) {
        PADDLE_ENFORCE(in_vars[i]->IsType<framework::LoDTensorArray>(),
                       "Only support all inputs are TensorArray");
        auto& in_array = in_vars[i]->Get<framework::LoDTensorArray>();

        for (size_t i = 0; i < in_array.size(); ++i) {
          if (in_array[i].numel() != 0) {
            if (i >= out_array.size()) {
              out_array.resize(i + 1);
            }
            if (out_array[i].numel() == 0) {
              framework::TensorCopy(in_array[i], in_array[i].place(),
                                    ctx.device_context(), &out_array[i]);
              out_array[i].set_lod(in_array[i].lod());
            } else {
              PADDLE_ENFORCE(out_array[i].lod() == in_array[i].lod());
              auto in = EigenVector<T>::Flatten(in_array[i]);
              auto result = EigenVector<T>::Flatten(out_array[i]);
              result.device(*ctx.template device_context<MKLDNNDeviceContext>()
                                 .eigen_device()) = result + in;
            }
          }
        }
      }
    } else {
      PADDLE_THROW("Unexpected branch, output variable type is %s",
                   out_var->Type().name());
    }
  }
};

}  // namespace operators
}  // namespace paddle

REGISTER_OP_KERNEL(sum, MKLDNN, ::paddle::platform::CPUPlace,
                   paddle::operators::SumMKLDNNOpKernel<float>);
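For the LoDTensor branch, the kernel builds an mkldnn::sum primitive over every non-empty input with a scale of 1.0, and when the op runs in place it skips input[0] and accumulates the remaining inputs onto it. The NumPy sketch below covers just that arithmetic (the MKLDNN memory formats and the reorder step are omitted, and the helper name is illustrative).

```python
import numpy as np


def sum_like_kernel(inputs, in_place=False):
    """Elementwise sum of equally shaped inputs, mirroring the kernel's rule:
    with in_place=True the first input already holds partial data, so only
    the remaining inputs are added to it; empty inputs are skipped."""
    start = 1 if in_place else 0
    out = inputs[0].copy() if in_place else np.zeros_like(inputs[0])
    for x in inputs[start:]:
        if x.size == 0:
            continue
        out += x  # every source enters with scale 1.0, as in the scales vector
    return out


xs = [np.full((2, 2), v, dtype=np.float32) for v in (1.0, 2.0, 3.0)]
print(sum_like_kernel(xs))            # [[6. 6.] [6. 6.]]
print(sum_like_kernel(xs, in_place=True))  # same result, starting from xs[0]
```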
paddle/fluid/operators/sum_op.cc
@@ -18,6 +18,10 @@ limitations under the License. */
 #include "paddle/fluid/framework/var_type_inference.h"
 #include "paddle/fluid/operators/detail/safe_ref.h"

+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle/fluid/platform/mkldnn_helper.h"
+#endif
+
 namespace paddle {
 namespace operators {
 using framework::Tensor;
@@ -63,6 +67,18 @@ class SumOp : public framework::OperatorWithKernel {
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     auto x_vars = ctx.MultiInputVar("X");
+
+    framework::LibraryType library{framework::LibraryType::kPlain};
+    framework::DataLayout layout{framework::DataLayout::kAnyLayout};
+
+#ifdef PADDLE_WITH_MKLDNN
+    if (library == framework::LibraryType::kPlain &&
+        platform::CanMKLDNNBeUsed(ctx)) {
+      library = framework::LibraryType::kMKLDNN;
+      layout = framework::DataLayout::kMKLDNN;
+    }
+#endif
+
     if (x_vars[0]->IsType<framework::LoDTensor>()) {
       int dtype = -1;
       for (auto& x_var : x_vars) {
@@ -80,26 +96,27 @@ class SumOp : public framework::OperatorWithKernel {
                         "Sum operator should have at least one tensor");

       return framework::OpKernelType(
           static_cast<framework::proto::VarType::Type>(dtype),
-          ctx.device_context());
+          ctx.GetPlace(), layout, library);
     } else if (x_vars[0]->IsType<framework::SelectedRows>()) {
       for (auto& var : x_vars) {
         auto& value = var->Get<framework::SelectedRows>().value();
         if (value.IsInitialized()) {
           return framework::OpKernelType(framework::ToDataType(value.type()),
-                                         ctx.device_context());
+                                         ctx.device_context(), layout, library);
         }
       }
       // if input sparse vars are not initialized, use an default kernel type.
       return framework::OpKernelType(framework::proto::VarType::FP32,
-                                     ctx.device_context());
+                                     ctx.device_context(), layout, library);
     } else if (x_vars[0]->IsType<framework::LoDTensorArray>()) {
       for (auto& x_var : x_vars) {
         auto& array = x_var->Get<framework::LoDTensorArray>();
         for (auto& each : array) {
           if (each.numel() != 0) {
             return framework::OpKernelType(framework::ToDataType(each.type()),
-                                           ctx.device_context());
+                                           ctx.device_context(), layout,
+                                           library);
           }
         }
       }
@@ -116,6 +133,9 @@ class SumOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "(vector<Tensor>) The input tensors of sum operator.")
         .AsDuplicable();
     AddOutput("Out", "(Tensor) The output tensor of sum operator.").Reuse("X");
+    AddAttr<bool>("use_mkldnn",
+                  "(bool, default false) Only used in mkldnn kernel")
+        .SetDefault(false);
     AddComment(R"DOC(
 Sum operator.
@@ -132,7 +152,6 @@ class SumOpVarTypeInference : public framework::VarTypeInference {
                   framework::BlockDesc* block) const override {
     auto& inputs = op_desc.Input("X");
     auto var_type = framework::proto::VarType::SELECTED_ROWS;
-
     for (auto& name : op_desc.Input("X")) {
       VLOG(10) << name << " "
                << block->FindRecursiveOrCreateVar(name).GetType();
@@ -206,6 +225,7 @@ namespace ops = paddle::operators;
 REGISTER_OPERATOR(sum, ops::SumOp, ops::SumOpMaker, ops::SumGradMaker,
                   ops::SumOpVarTypeInference);
+
 REGISTER_OP_CPU_KERNEL(
     sum, ops::SumKernel<paddle::platform::CPUDeviceContext, float>,
     ops::SumKernel<paddle::platform::CPUDeviceContext, double>,
paddle/fluid/operators/while_op.cc
@@ -203,11 +203,11 @@ class WhileGradOp : public framework::OperatorBase {
               ->set_lod(inside_tensor.lod());
         }
       }
-
       auto new_inside_name = cur_scope.Rename(inside_grad_name);
       auto sum_op = framework::OpRegistry::CreateOp(
           "sum", {{"X", {pg_names[param_id], new_inside_name}}},
-          {{"Out", {pg_names[param_id]}}}, framework::AttributeMap{});
+          {{"Out", {pg_names[param_id]}}},
+          framework::AttributeMap{{"use_mkldnn", {false}}});
       sum_op->Run(cur_scope, dev_place);
       cur_scope.Rename(new_inside_name, inside_grad_name);
     }
paddle/fluid/platform/mkldnn_helper.h
@@ -99,5 +99,11 @@ inline mkldnn::memory::format GetMKLDNNFormat(const mkldnn::memory memory) {
       memory.get_primitive_desc().desc().data.format);
 }

+inline mkldnn::memory::format GetMKLDNNFormat(
+    const mkldnn::sum::primitive_desc& memory) {
+  return static_cast<mkldnn::memory::format>(
+      memory.dst_primitive_desc().desc().data.format);
+}
+
 }  // namespace platform
 }  // namespace paddle
python/paddle/fluid/average.py
@@ -36,6 +36,25 @@ def _is_number_or_matrix_(var):
 class WeightedAverage(object):
+    """
+    Calculate weighted average.
+
+    The average calculating is accomplished via Python totally.
+    They do not change Paddle's Program, nor do anything to
+    modify NN model's configuration. They are completely
+    wrappers of Python functions.
+
+    Examples:
+        .. code-block:: python
+
+            avg = fluid.average.WeightedAverage()
+            avg.add(value=2.0, weight=1)
+            avg.add(value=4.0, weight=2)
+            avg.eval()
+
+            # The result is 3.333333333.
+            # For (2.0 * 1 + 4.0 * 2) / (1 + 2) = 3.333333333
+    """
+
     def __init__(self):
         warnings.warn(
             "The %s is deprecated, please use fluid.metrics.Accuracy instead." %
python/paddle/fluid/backward.py
@@ -132,9 +132,9 @@ def _addup_repetitive_outputs_(op_descs):
     for idx, op_desc in enumerate(op_descs):
         for var_name in op_desc.input_arg_names():
             if len(renamed_vars[var_name]) > 1:
                 pending_sum_ops.append(
                     (_create_op_desc_(
                         "sum", {"X": renamed_vars[var_name]},
-                        {"Out": [var_name]}, {}), idx))
+                        {"Out": [var_name]}, {"use_mkldnn": False}), idx))
                 renamed_vars[var_name] = [var_name]
         for var_name in op_desc.output_arg_names():
             if var_name == core.empty_var_name(
@@ -147,7 +147,7 @@ def _addup_repetitive_outputs_(op_descs):
             else:
                 if len(renamed_vars[var_name]) == 1:
                     new_name = var_name + "@RENAME@" + \
                         str(var_rename_count[var_name])
                     var_rename_count[var_name] += 1
                     # rename original var_name
                     renamed_vars[var_name][0] = new_name
@@ -155,14 +155,15 @@ def _addup_repetitive_outputs_(op_descs):
                     _rename_arg_(pending_sum_ops, var_name, new_name)

                 new_name = var_name + "@RENAME@" + \
                     str(var_rename_count[var_name])
                 var_rename_count[var_name] += 1
                 op_desc.rename_output(var_name, new_name)
                 renamed_vars[var_name].append(new_name)
     for var_name, inputs in renamed_vars.iteritems():
         if len(inputs) > 1:
-            pending_sum_ops.append((_create_op_desc_(
-                "sum", {"X": inputs}, {"Out": [var_name]}, {}), len(op_descs)))
+            pending_sum_ops.append(
+                (_create_op_desc_("sum", {"X": inputs}, {"Out": [var_name]},
+                                  {"use_mkldnn": False}), len(op_descs)))
     # sum_op descs are sorted according to their insert position
     for p in reversed(pending_sum_ops):
         op_descs.insert(p[1], p[0])
@@ -434,18 +435,65 @@ def _get_stop_gradients_(program):
 def append_backward(loss, parameter_list=None, no_grad_set=None,
                     callbacks=None):
     """
     Append backward part to main_program.

-    Args:
-        loss(Variable): The variable generated by cost function.
-        parameter_list(list[string]): Parameters that need to be updated by
-            optimizer. If None, it means all parameters need to be updated.
-        no_grad_set(set): Variables that have no gradients in Block 0.
-            All variables with `step_gradient=True` from all blocks will be
-            automatically added.
-
-    Return:
-        (list[(Variable,Variable)]): list of (parameter, gradient) pair.
+    A complete neural network training is made up of forward and backward
+    propagation. However, when we configure a network, we only need to
+    specify its forward part. The backward part is generated automatically
+    according to the forward part by this function.
+
+    In most cases, users do not need to invoke this function manually. It
+    will be automatically invoked by the optimizer's `minimize` function.
+
+    Args:
+        loss(Variable): The loss variable of the network.
+        parameter_list(list[string]|None): Names of parameters that need
+            to be updated by optimizers. If it is None, all parameters
+            will be updated. Default: None
+        no_grad_set(set|None): Variables in the Block 0 whose gradients
+            should be ignored. All variables with `step_gradient=True` from
+            all blocks will be automatically added into this set.
+            Default: None
+        callbacks(list[callable object]|None): The callbacks are used for
+            doing some custom jobs during backward part building. All
+            callable objects in it will be invoked once each time a new
+            gradient operator is added into the program. The callable object
+            must have two input parameters: 'block' and 'context'. The
+            'block' is the block which the new gradient operator will be
+            added to. The 'context' is a map, whose keys are gradient
+            variable names and values are corresponding original variables.
+            In addition to this, the 'context' has another special key-value
+            pair: the key is string '__current_op_desc__' and the value is
+            the op_desc of the gradient operator who has just triggered the
+            callable object.
+
+    Returns:
+        list[(Variable,Variable)]: Pairs of parameter and its corresponding
+        gradients. The key is the parameter and the value is gradient
+        variable.
+
+    Raises:
+        AssertionError: If `loss` is not an instance of Variable.
+
+    Examples:
+        .. code-block:: python
+
+            # network configuration code
+            # ...
+            avg_loss = fluid.layers.mean(loss)
+            param_grad_list = fluid.backward.append_backward(loss=avg_loss)
     """
     assert isinstance(loss, framework.Variable)
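A minimal sketch of how the documented append_backward is typically driven with the 2018-era fluid layer API; the toy regression network and variable names below are illustrative and are not taken from this commit.

```python
import paddle.fluid as fluid

# A toy forward pass.
x = fluid.layers.data(name='x', shape=[13], dtype='float32')
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
y_predict = fluid.layers.fc(input=x, size=1, act=None)
loss = fluid.layers.square_error_cost(input=y_predict, label=y)
avg_loss = fluid.layers.mean(loss)

# Explicitly build the backward part; optimizers normally call this for you
# inside `minimize`.
param_grad_list = fluid.backward.append_backward(loss=avg_loss)
for param, grad in param_grad_list:
    print(param.name, grad.name)
```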
python/paddle/fluid/clip.py
@@ -24,8 +24,6 @@ __all__ = [
     'GradientClipByValue',
     'GradientClipByNorm',
     'GradientClipByGlobalNorm',
-    'append_gradient_clip_ops',
-    'error_clip_callback',
 ]
@@ -38,6 +36,25 @@ class BaseErrorClipAttr(object):
 class ErrorClipByValue(BaseErrorClipAttr):
+    """
+    Clips tensor values to the range [min, max].
+
+    Given a tensor t, this operation clips its value to min and max inplace.
+
+    - Any values less than min are set to min.
+    - Any values greater than max are set to max.
+
+    Args:
+        max (float): The maximum value to clip by.
+        min (float, optional): The minimum value to clip by. if not set by user, \
+        will be set to -max by framework.
+
+    Examples:
+        .. code-block:: python
+
+            var = fluid.framework.Variable(..., error_clip=ErrorClipByValue(max=5.0), ...)
+    """
+
     def __init__(self, max, min=None):
         max = float(max)
         if min is None:
@@ -99,6 +116,31 @@ class NullGradientClipAttr(BaseGradientClipAttr):
 class GradientClipByValue(BaseGradientClipAttr):
+    """
+    Clips gradient values to the range [min, max].
+
+    Given a tensor t, this operation clips its value to min and max inplace.
+
+    - Any values less than min are set to min.
+    - Any values greater than max are set to max.
+
+    Args:
+        max (float): The maximum value to clip by.
+        min (float, optional): The minimum value to clip by. if not set by user, \
+        will be set to -max by framework.
+
+    Examples:
+        .. code-block:: python
+
+            w_param_attrs = ParamAttr(name=None,
+                initializer=UniformInitializer(low=-1.0, high=1.0, seed=0),
+                learning_rate=1.0,
+                regularizer=L1Decay(1.0),
+                trainable=True,
+                clip=GradientClipByValue(-1.0, 1.0))
+            y_predict = fluid.layers.fc(input=x, size=1, param_attr=w_param_attrs)
+    """
+
     def __init__(self, max, min=None):
         max = float(max)
         if min is None:
@@ -120,6 +162,37 @@ class GradientClipByValue(BaseGradientClipAttr):
 class GradientClipByNorm(BaseGradientClipAttr):
+    """
+    Clips tensor values to a maximum L2-norm.
+
+    This operator limits the L2 norm of the input :math:`X` within :math:`max\_norm`.
+    If the L2 norm of :math:`X` is less than or equal to :math:`max\_norm`, :math:`Out`
+    will be the same as :math:`X`. If the L2 norm of :math:`X` is greater than
+    :math:`max\_norm`, :math:`X` will be linearly scaled to make the L2 norm of
+    :math:`Out` equal to :math:`max\_norm`, as shown in the following formula:
+
+    .. math::
+
+        Out = \\frac{max\_norm * X}{norm(X)},
+
+    where :math:`norm(X)` represents the L2 norm of :math:`X`.
+
+    Args:
+        clip_norm (float): The maximum norm value
+
+    Examples:
+        .. code-block:: python
+
+            w_param_attrs = ParamAttr(name=None,
+                initializer=UniformInitializer(low=-1.0, high=1.0, seed=0),
+                learning_rate=1.0,
+                regularizer=L1Decay(1.0),
+                trainable=True,
+                clip=GradientClipByNorm(clip_norm=2.0))
+            y_predict = fluid.layers.fc(input=x, size=1, param_attr=w_param_attrs)
+    """
+
     def __init__(self, clip_norm):
         self.clip_norm = clip_norm
@@ -135,6 +208,44 @@ class GradientClipByNorm(BaseGradientClipAttr):
 class GradientClipByGlobalNorm(BaseGradientClipAttr):
+    """
+    Clips values of multiple tensors by the ratio of the sum of their norms.
+
+    Given a list of tensors t_list, and a clipping ratio clip_norm, this
+    operation returns a list of clipped tensors list_clipped and the global
+    norm (global_norm) of all tensors in t_list.
+
+    To perform the clipping, the values :math:`t\_list[i]` are set to:
+
+    .. math::
+
+        t\_list[i] = t\_list[i] * \\frac{clip\_norm}{\max(global\_norm, clip\_norm)}
+
+    where:
+
+    .. math::
+
+        global\_norm = \sqrt{\sum_{i=0}^{N-1}(l2norm(t\_list[i]))^2}
+
+    If :math:`clip\_norm > global\_norm` then the entries in t_list remain as they are,
+    otherwise they're all shrunk by the global ratio.
+
+    Args:
+        clip_norm (float): The maximum norm value
+        group_name (str, optional): The group name for this clip.
+
+    Examples:
+        .. code-block:: python
+
+            p_g_clip = fluid.backward.append_backward(loss=avg_cost_clip)
+
+            with fluid.program_guard(main_program=prog_clip):
+                fluid.clip.set_gradient_clip(
+                    fluid.clip.GradientClipByGlobalNorm(clip_norm=2.0))
+
+            p_g_clip = fluid.clip.append_gradient_clip_ops(p_g_clip)
+    """
+
     def __init__(self, clip_norm, group_name="default_group"):
         if not isinstance(group_name, basestring):
             raise TypeError("'group_name' must be a basestring.")
@@ -183,15 +294,16 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr):
 def set_gradient_clip(clip, param_list=None, program=None):
     """
     To specify parameters that require gradient clip.

     Args:
         clip(BaseGradientClipAttr): An instance of some derived class of BaseGradientClipAttr,
                 which describes the type and detailed attributes of required gradient clip.
-        param_list(list, None by default): Parameters that require gradient clip.
+        param_list(list(Variable)): Parameters that require gradient clip.
                 It can be a list of parameter or a list of parameter's name.
                 When it's None, all parameters in the program will be included.
-        program(Program, None by default): The program where parameters are.
+        program(Program): The program where parameters are.
                 Will be the default main program when assigned with None.
     """
     if not isinstance(clip, BaseGradientClipAttr):
         raise TypeError(
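Putting the new GradientClipByGlobalNorm docstring's pattern into a self-contained sketch (assuming the 2018-era fluid API; the toy regression network below is illustrative only):

```python
import paddle.fluid as fluid

main_prog = fluid.Program()
startup_prog = fluid.Program()
with fluid.program_guard(main_prog, startup_prog):
    x = fluid.layers.data(name='x', shape=[13], dtype='float32')
    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
    y_predict = fluid.layers.fc(input=x, size=1)
    avg_cost = fluid.layers.mean(
        fluid.layers.square_error_cost(input=y_predict, label=y))

    # Pattern from the new docstring: build the param/grad pairs, register
    # the clip attribute, then rewrite the gradients.
    p_g = fluid.backward.append_backward(loss=avg_cost)
    fluid.clip.set_gradient_clip(
        fluid.clip.GradientClipByGlobalNorm(clip_norm=2.0))
    p_g = fluid.clip.append_gradient_clip_ops(p_g)
```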
python/paddle/fluid/data_feeder.py
@@ -29,6 +29,13 @@ class DataToLoDTensorConverter(object):
         self.place = place
         self.lod_level = lod_level
         self.shape = shape
+        negtive_count = 0
+        for s in self.shape:
+            if s < 0:
+                negtive_count += 1
+            if negtive_count > 1:
+                self.shape = None
+                break
         if dtype == core.VarDesc.VarType.FP32:
             self.dtype = 'float32'
         elif dtype == core.VarDesc.VarType.INT64:
@@ -61,7 +68,9 @@ class DataToLoDTensorConverter(object):
             self._feed_impl_(each_data, lod[1:], lod_level - 1)

     def done(self):
-        arr = numpy.array(self.data, dtype=self.dtype).reshape(self.shape)
+        arr = numpy.array(self.data, dtype=self.dtype)
+        if self.shape:
+            arr = arr.reshape(self.shape)
         t = core.LoDTensor()
         t.set(arr, self.place)
         if self.lod_level > 0:
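The converter change above only reshapes the fed data when at most one dimension is unknown; with two or more negative dimensions it keeps the flat array, since no reshape can be inferred. A small illustrative helper (not fluid code) showing the same rule:

```python
import numpy as np


def to_array(flat_data, shape, dtype='float32'):
    """Mimic the converter's new rule: reshape only when at most one
    dimension is unknown (negative); otherwise keep the flat array."""
    if sum(1 for s in shape if s < 0) > 1:
        shape = None  # several unknown dims: a reshape cannot be inferred
    arr = np.array(flat_data, dtype=dtype)
    return arr.reshape(shape) if shape else arr


print(to_array([1, 2, 3, 4, 5, 6], [-1, 3]).shape)   # (2, 3): one unknown dim is fine
print(to_array([1, 2, 3, 4, 5, 6], [-1, -1]).shape)  # (6,): left flat, as the converter now does
```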
python/paddle/fluid/inferencer.py
@@ -27,13 +27,30 @@ __all__ = ['Inferencer', ]
 class Inferencer(object):
+    """
+    Inferencer High Level API.
+
+    Args:
+        infer_func (Python func): Infer function that will return predict Variable
+        param_path (str): The path where the inference model is saved by fluid.io.save_params
+        place (Place): place to do the inference
+        parallel (bool): use parallel_executor to run the inference, it will use multi CPU/GPU.
+
+    Examples:
+        .. code-block:: python
+
+            def inference_program():
+                x = fluid.layers.data(name='x', shape=[13], dtype='float32')
+                y_predict = fluid.layers.fc(input=x, size=1, act=None)
+                return y_predict
+
+            place = fluid.CPUPlace()
+            inferencer = fluid.Inferencer(
+                infer_func=inference_program, param_path="/tmp/model", place=place)
+
+    """
+
     def __init__(self, infer_func, param_path, place=None, parallel=False):
-        """
-        :param infer_func: a function that will return predict Variable
-        :param param_path: the path where the inference model is saved by fluid.io.save_params
-        :param place: place to do the inference
-        :param parallel: use parallel_executor to run the inference, it will use multi CPU/GPU.
-        """
         self.param_path = param_path
         self.scope = core.Scope()
         self.parallel = parallel
@@ -60,9 +77,20 @@ class Inferencer(object):
     def infer(self, inputs, return_numpy=True):
         """
-        :param inputs: a map of {"input_name": input_var} that will be feed into the inference program
-        to get the predict value
-        :return: the predict value of the inference model
+        Do Inference for Inputs
+
+        Args:
+            inputs (map): a map of {"input_name": input_var} that will be feed into the inference program
+            return_numpy (bool): transform return value into numpy or not
+
+        Returns:
+            Tensor or Numpy: the predict value of the inference model for the inputs
+
+        Examples:
+            .. code-block:: python
+
+                tensor_x = numpy.random.uniform(0, 10, [batch_size, 13]).astype("float32")
+                results = inferencer.infer({'x': tensor_x})
         """
         if not isinstance(inputs, dict):
             raise ValueError(
python/paddle/fluid/initializer.py
浏览文件 @
28ff4bdd
...
@@ -19,26 +19,39 @@ from framework import convert_np_dtype_to_dtype_
 from core import VarDesc

 __all__ = [
-    'Constant', 'Uniform', 'Normal', 'Xavier', 'Bilinear', 'force_init_on_cpu',
-    'init_on_cpu', 'ConstantInitializer', 'UniformInitializer',
-    'NormalInitializer', 'XavierInitializer', 'BilinearInitializer'
+    'Constant', 'Uniform', 'Normal', 'Xavier', 'Bilinear', 'MSRA',
+    'force_init_on_cpu', 'init_on_cpu', 'ConstantInitializer',
+    'UniformInitializer', 'NormalInitializer', 'XavierInitializer',
+    'BilinearInitializer', 'MSRAInitializer'
 ]

 _force_init_on_cpu_ = False


 def force_init_on_cpu():
+    """
+    The flag of whether force to init variables on CPU.
+
+    Examples:
+        .. code-block:: python
+
+            if force_init_on_cpu():
+                pass
+    """
     return _force_init_on_cpu_


 @contextlib.contextmanager
 def init_on_cpu():
     """
-    Switch program with `with` statement
+    Force the variable to be inited on CPU.

     Examples:
-        >>> with init_on_cpu():
-        >>>     step = layers.create_global_var()
+        .. code-block:: python
+
+            with init_on_cpu():
+                step = layers.create_global_var()
     """
     global _force_init_on_cpu_
...
@@ -104,14 +117,18 @@ class Initializer(object):

 class ConstantInitializer(Initializer):
     """Implements the constant initializer

+    Args:
+        value (float): constant value to initialize the variable
+
+    Examples:
+        .. code-block:: python
+
+            fc = fluid.layers.fc(input=x, size=10,
+                param_attr=fluid.initializer.Constant(value=2.0))
     """

     def __init__(self, value=0.0, force_cpu=False):
-        """Constructor for ConstantInitializer
-
-        Args:
-            value: constant value to initialize the variable
-        """
         assert value is not None
         super(ConstantInitializer, self).__init__()
         self._value = value
...
@@ -146,16 +163,20 @@ class ConstantInitializer(Initializer):

 class UniformInitializer(Initializer):
     """Implements the random uniform distribution initializer

+    Args:
+        low (float): lower boundary of the uniform distribution
+        high (float): upper boundary of the uniform distribution
+        seed (int): random seed
+
+    Examples:
+        .. code-block:: python
+
+            fc = fluid.layers.fc(input=x, size=10,
+                param_attr=fluid.initializer.Uniform(low=-0.5, high=0.5))
     """

     def __init__(self, low=-1.0, high=1.0, seed=0):
-        """Constructor for UniformInitializer
-
-        Args:
-            low: lower boundary of the uniform distribution
-            high: upper boundary of the uniform distribution
-            seed: random seed
-        """
         assert low is not None
         assert high is not None
         assert high >= low
...
@@ -196,17 +217,21 @@ class UniformInitializer(Initializer):

 class NormalInitializer(Initializer):
-    """Implements the random Normal(Gaussian) distribution initializer
+    """Implements the Random Normal(Gaussian) distribution initializer

+    Args:
+        loc (float): mean of the normal distribution
+        scale (float): standard deviation of the normal distribution
+        seed (int): random seed
+
+    Examples:
+        .. code-block:: python
+
+            fc = fluid.layers.fc(input=x, size=10,
+                param_attr=fluid.initializer.Normal(loc=0.0, scale=2.0))
     """

     def __init__(self, loc=0.0, scale=1.0, seed=0):
-        """Constructor for NormalInitializer
-
-        Args:
-            loc: mean of the normal distribution
-            scale: standard deviation of the normal distribution
-            seed: random seed
-        """
         assert loc is not None
         assert scale is not None
         assert seed is not None
...
@@ -246,39 +271,49 @@ class NormalInitializer(Initializer):

 class XavierInitializer(Initializer):
-    """Implements the Xavier initializer
+    """
     This class implements the Xavier weight initializer from the paper
-    Understanding the difficulty of training deep feedforward neural
-    networks[1] by Xavier Glorot and Yoshua Bengio.
+    `Understanding the difficulty of training deep feedforward neural
+    networks <http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf>`_
+    by Xavier Glorot and Yoshua Bengio.

     This initializer is designed to keep the scale of the gradients
     approximately same in all the layers. In case of Uniform distribution,
-    the range is [-x, x], where x = sqrt(6 / (fan_in + fan_out)).
+    the range is [-x, x], where
+
+    .. math::
+
+        x = \sqrt{\\frac{6.0}{fan\_in + fan\_out}}
+
     In case of Normal distribution, the mean is 0 and the standard deviation
-    is sqrt(2/ (fan_in + fan_out)).
+    is
+
+    .. math::
+
+        \sqrt{\\frac{2.0}{fan\_in + fan\_out}}

-    References:
-        [1] Understanding the difficulty of training deep feedforward neural
-            networks. International conference on artificial intelligence and
-            statistics.
-            (http://proceedings.mlr.press/v9/glorot10a.html)
+    Args:
+        uniform (bool): whether to use uniform or normal distribution
+        fan_in (float): fan_in for Xavier initialization. If None, it is
+            inferred from the variable.
+        fan_out (float): fan_out for Xavier initialization. If None, it is
+            inferred from the variable.
+        seed (int): random seed
+
+    Note:
+        It is recommended to set fan_in and fan_out to None for most cases.
+
+    Examples:
+        .. code-block:: python
+
+            fc = fluid.layers.fc(
+                input=queries, size=10,
+                param_attr=fluid.initializer.Xavier(uniform=False))
     """

     def __init__(self, uniform=True, fan_in=None, fan_out=None, seed=0):
-        """Constructor for XavierInitializer
-
-        Args:
-            uniform: whether to use uniform or normal distribution
-            fan_in: fan_in for Xavier initialization. If None, it is
-                    inferred from the variable.
-            fan_out: fan_out for Xavier initialization. If None, it is
-                     inferred from the variable.
-            seed: random seed
-
-        Note: It is recommended to set fan_in and fan_out to None for
-              most cases.
-        """
         assert uniform is not None
         assert seed is not None
         super(XavierInitializer, self).__init__()
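The two Xavier formulas above are easy to sanity-check numerically; the uniform and the normal variant target the same standard deviation. A small sketch in plain numpy (the fan_in/fan_out values are arbitrary examples):

import numpy as np

fan_in, fan_out = 64, 32  # example layer dimensions

# Uniform Xavier: samples from [-x, x] with x = sqrt(6 / (fan_in + fan_out))
x = np.sqrt(6.0 / (fan_in + fan_out))
w_uniform = np.random.uniform(-x, x, size=(fan_in, fan_out))

# Normal Xavier: zero mean, std = sqrt(2 / (fan_in + fan_out))
std = np.sqrt(2.0 / (fan_in + fan_out))
w_normal = np.random.normal(0.0, std, size=(fan_in, fan_out))

# x / sqrt(3) equals std, so both empirical deviations land near the same target.
print(x, std, w_uniform.std(), w_normal.std())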
...
@@ -342,30 +377,42 @@ class MSRAInitializer(Initializer):
     """Implements the MSRA initializer a.k.a. Kaiming Initializer

     This class implements the weight initialization from the paper
-    Delving Deep into Rectifiers: Surpassing Human-Level Performance on
-    ImageNet Classification[1] by Kaiming He, Xiangyu Zhang, Shaoqing Ren
-    and Jian Sun. This is a robust initialization method that particularly
-    considers the rectifier nonlinearities. In case of Uniform distribution,
-    the range is [-x, x], where x = sqrt(6 / fan_in). In case of Normal
-    distribution, the mean is 0 and the standard deviation
-    is sqrt(2/ fan_in).
+    `Delving Deep into Rectifiers: Surpassing Human-Level Performance on
+    ImageNet Classification <https://arxiv.org/abs/1502.01852>`_
+    by Kaiming He, Xiangyu Zhang, Shaoqing Ren and Jian Sun. This is a
+    robust initialization method that particularly considers the rectifier
+    nonlinearities. In case of Uniform distribution, the range is [-x, x], where
+
+    .. math::
+
+        x = \sqrt{\\frac{6.0}{fan\_in}}

-    References:
-        [1] Delving Deep into Rectifiers: Surpassing Human-Level Performance
-            on ImageNet Classification
-            (https://arxiv.org/abs/1502.01852)
+    In case of Normal distribution, the mean is 0 and the standard deviation
+    is
+
+    .. math::
+
+        \sqrt{\\frac{2.0}{fan\_in}}
+
+    Args:
+        uniform (bool): whether to use uniform or normal distribution
+        fan_in (float): fan_in for MSRAInitializer. If None, it is\
+            inferred from the variable.
+        seed (int): random seed
+
+    Note:
+        It is recommended to set fan_in to None for most cases.
+
+    Examples:
+        .. code-block:: python
+
+            fc = fluid.layers.fc(
+                input=queries, size=10,
+                param_attr=fluid.initializer.MSRA(uniform=False))
     """

     def __init__(self, uniform=True, fan_in=None, seed=0):
         """Constructor for MSRAInitializer
-
-        Args:
-            uniform: whether to use uniform or normal distribution
-            fan_in: fan_in for MSRAInitializer. If None, it is
-                    inferred from the variable.
-            seed: random seed
-
-        Note: It is recommended to set fan_in to None for most cases.
         """
         assert uniform is not None
         assert seed is not None
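The MSRA bounds follow the same pattern but only look at fan_in; a quick numeric check (values are illustrative):

import numpy as np

fan_in = 128  # example input fan; fan_out does not enter the formula

x = np.sqrt(6.0 / fan_in)      # uniform range [-x, x]
std = np.sqrt(2.0 / fan_in)    # normal standard deviation
w = np.random.uniform(-x, x, size=(fan_in, 256))
print(x, std, w.std())  # uniform std x/sqrt(3) matches sqrt(2/fan_in)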
...
@@ -425,34 +472,37 @@ class MSRAInitializer(Initializer):

 class BilinearInitializer(Initializer):
-    """Implements the bilinear initializer.
+    """
     This initializer can be used in transposed convolution operator to
     act as upsampling. Users can upsample a feature map with shape of
     (B, C, H, W) by any integer factor. The usage is:

-    >>> factor = 2
-    >>> w_attr = ParamAttr(learning_rate=0., regularizer=L2Decay(0.),
-    >>>                    initializer=Bilinear())
-    >>> conv_up = fluid.layers.conv2d_transpose(
-    >>>     input,
-    >>>     num_filters=C,
-    >>>     output_size=None,
-    >>>     filter_size=2 * factor - factor % 2,
-    >>>     padding=ceil((factor - 1) / 2.),
-    >>>     stride=factor,
-    >>>     groups=C,
-    >>>     param_attr=w_attr,
-    >>>     bias_attr=False)
+    Examples:
+
+        .. code-block:: python
+
+            factor = 2
+            w_attr = ParamAttr(learning_rate=0., regularizer=L2Decay(0.),
+                               initializer=Bilinear())
+            conv_up = fluid.layers.conv2d_transpose(
+                input,
+                num_filters=C,
+                output_size=None,
+                filter_size=2 * factor - factor % 2,
+                padding=ceil((factor - 1) / 2.),
+                stride=factor,
+                groups=C,
+                param_attr=w_attr,
+                bias_attr=False)

-    Where, `num_filters=C` and `groups=C` means this is channel-wise tranposed
+    Where, `num_filters=C` and `groups=C` means this is channel-wise transposed
     convolution. The filter shape will be (C, 1, K, K) where K is `filter_size`.
     This initializer will set a (K, K) interpolation kernel for every channel
     of the filter identically. The resulting shape of the output feature map
     will be (B, C, factor * H, factor * W). Note that the learning rate and the
     weight decay are set to 0 in order to keep coefficient values of bilinear
     interpolation unchanged during training.
     """

     def __init__(self):
...
@@ -469,7 +519,7 @@ class BilinearInitializer(Initializer):
         be added.

     Returns:
-        the initialization op
+        Operator: the initialization op

     Raises:
         ValueError: If type of `var` and `block` is not right.
...
python/paddle/fluid/io.py
...
@@ -30,20 +30,42 @@ __all__ = [

 def is_parameter(var):
-    """Check whether the variable is a Parameter.
-
-    This function checks whether the input variable is a Parameter.
+    """
+    Check whether the given variable is an instance of Parameter.

     Args:
-        var : The input variable.
+        var(Variable): The variable to be checked.

     Returns:
-        boolean result whether the variable is a Parameter.
+        bool: True if the given `var` is an instance of Parameter,
+        False if not.
+
+    Examples:
+        .. code-block:: python
+
+            param = fluid.default_main_program().global_block().var('fc.w')
+            res = fluid.io.is_parameter(param)
     """
     return isinstance(var, Parameter)


 def is_persistable(var):
+    """
+    Check whether the given variable is persistable.
+
+    Args:
+        var(Variable): The variable to be checked.
+
+    Returns:
+        bool: True if the given `var` is persistable,
+        False if not.
+
+    Examples:
+        .. code-block:: python
+
+            param = fluid.default_main_program().global_block().var('fc.w')
+            res = fluid.io.is_persistable(param)
+    """
     if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \
             var.desc.type() == core.VarDesc.VarType.FETCH_LIST:
         return False
...
@@ -68,20 +90,69 @@ def save_vars(executor,
               predicate=None,
               filename=None):
     """
-    Save variables to directory by executor.
-
-    :param executor: executor that save variable
-    :param dirname: directory path
-    :param main_program: program. If vars is None, then filter all variables in this
-    program which fit `predicate`. Default default_main_program.
-    :param predicate: The Predicate describes a callable that returns a variable
-    as a bool. If it returns true, the corresponding input variable will be saved.
-    :param vars: variables need to be saved. If vars is specified, program & predicate
-    will be ignored
-    :param filename: The name of a single file that all vars are saved to.
-    If it is None, save variables to separate files.
-
-    :return: None
+    Save variables to the given directory by executor.
+
+    There are two ways to specify variables to be saved: The first way, list
+    variables in a list and assign it to the `vars`. The second way, assign the
+    `main_program` with an existing program, then all variables in the program
+    will be saved. The first way has a higher priority. In other words, if `vars`
+    are assigned, the `main_program` and the `predicate` will be ignored.
+
+    The `dirname` is used to specify the folder where to save variables.
+    If you prefer to save variables in separate files in the folder `dirname`,
+    set `filename` None; if you prefer to save all variables in a single file,
+    use `filename` to specify it.
+
+    Args:
+        executor(Executor): The executor to run for saving variables.
+        dirname(str): The directory path.
+        main_program(Program|None): The program whose variables will be saved.
+            If it is None, the default main program will be used automatically.
+            Default: None
+        vars(list[Variable]|None): The list that contains all variables to save.
+            It has a higher priority than the `main_program`.
+            Default: None
+        predicate(function|None): If it is not None, only variables in the
+            `main_program` that makes predicate(variable)==True will be saved.
+            It only works when we are using the `main_program` to specify
+            variables (in other words, `vars` is None).
+            Default: None
+        filename(str|None): The file which to save all variables. If you prefer
+            to save variables separately, set it to None.
+            Default: None
+
+    Returns:
+        None
+
+    Raises:
+        TypeError: If `main_program` is not an instance of Program nor None.
+
+    Examples:
+        .. code-block:: python
+
+            exe = fluid.Executor(fluid.CPUPlace())
+            param_path = "./my_paddle_model"
+
+            # The first usage: using `main_program` to specify variables
+            def name_has_fc(var):
+                res = "fc" in var.name
+                return res
+
+            prog = fluid.default_main_program()
+            fluid.io.save_vars(executor=exe, dirname=path, main_program=prog,
+                               vars=None)
+            # All variables in `main_program` whose name includes "fc" will be saved.
+            # And variables are going to be saved separately.
+
+            # The second usage: using `vars` to specify variables
+            var_list = [var_a, var_b, var_c]
+            fluid.io.save_vars(executor=exe, dirname=path, vars=var_list,
+                               filename="vars_file")
+            # var_a, var_b and var_c will be saved. And they are going to be
+            # saved in the same file named 'vars_file' in the path "./my_paddle_model".
     """
     if vars is None:
         if main_program is None:
...
@@ -129,7 +200,42 @@ def save_vars(executor,

 def save_params(executor, dirname, main_program=None, filename=None):
     """
-    Save all parameters to directory with executor.
+    This function filters out all parameters from the given `main_program`
+    and then saves them to the folder `dirname` or the file `filename`.
+
+    Use the `dirname` to specify the saving folder. If you would like to
+    save parameters in separate files, set `filename` None; if you would
+    like to save all parameters in a single file, use `filename` to specify
+    the file name.
+
+    NOTICE: Some variables are not Parameter while they are necessary for
+    training. So you can NOT save and continue your training just by
+    `save_params()` and `load_params()`. Please use `save_persistables()`
+    and `load_persistables()` instead.
+
+    Args:
+        executor(Executor): The executor to run for saving parameters.
+        dirname(str): The saving directory path.
+        main_program(Program|None): The program whose parameters will be
+            saved. If it is None, the default main program will be used
+            automatically.
+            Default: None
+        filename(str|None): The file to save all parameters. If you prefer
+            to save parameters in different files, set it to None.
+            Default: None
+
+    Returns:
+        None
+
+    Examples:
+        .. code-block:: python
+
+            exe = fluid.Executor(fluid.CPUPlace())
+            param_path = "./my_paddle_model"
+            prog = fluid.default_main_program()
+            fluid.io.save_params(executor=exe, dirname=param_path,
+                                 main_program=None)
     """
     save_vars(
         executor,
...
@@ -142,7 +248,37 @@ def save_params(executor, dirname, main_program=None, filename=None):

 def save_persistables(executor, dirname, main_program=None, filename=None):
     """
-    Save all persistables to directory with executor.
+    This function filters out all variables with `persistable==True` from the
+    given `main_program` and then saves these variables to the folder `dirname`
+    or file `filename`.
+
+    The `dirname` is used to specify the folder where persistable variables
+    are going to be saved. If you would like to save variables in separate
+    files, set `filename` None; if you would like to save all variables in a
+    single file, use `filename` to specify the file name.
+
+    Args:
+        executor(Executor): The executor to run for saving persistable variables.
+        dirname(str): The directory path.
+        main_program(Program|None): The program whose persistable variables will
+            be saved. If it is None, the default main program will be used
+            automatically.
+            Default: None
+        filename(str|None): The file to save all variables. If you prefer to
+            save variables in different files, set it to None.
+            Default: None
+
+    Returns:
+        None
+
+    Examples:
+        .. code-block:: python
+
+            exe = fluid.Executor(fluid.CPUPlace())
+            param_path = "./my_paddle_model"
+            prog = fluid.default_main_program()
+            fluid.io.save_persistables(executor=exe, dirname=param_path,
+                                       main_program=None)
     """
     save_vars(
         executor,
...
@@ -160,20 +296,69 @@ def load_vars(executor,
               predicate=None,
               filename=None):
     """
-    Load variables from directory by executor.
-
-    :param executor: executor that load variable
-    :param dirname: directory path
-    :param main_program: program. If vars is None, then filter all variables in this
-    program which fit `predicate`. Default default_main_program().
-    :param predicate: The Predicate describes a callable that returns a variable
-    as a bool. If it returns true, the corresponding input variable will be loaded.
-    :param vars: variables need to be loaded. If vars is specified, program &
-    predicate will be ignored
-    :param filename: The name of the single file that all vars are loaded from.
-    If it is None, load variables from separate files.
-
-    :return: None
+    Load variables from the given directory by executor.
+
+    There are two ways to specify variables to be loaded: The first way, list
+    variables in a list and assign it to the `vars`. The second way, assign the
+    `main_program` with an existing program, then all variables in the program
+    will be loaded. The first way has a higher priority. In other words, if `vars`
+    are assigned, the `main_program` and the `predicate` will be ignored.
+
+    The `dirname` is used to specify the folder where to load variables.
+    If variables were saved in separate files in the folder `dirname`,
+    set `filename` None; if all variables were saved in a single file,
+    use `filename` to specify it.
+
+    Args:
+        executor(Executor): The executor to run for loading variables.
+        dirname(str): The directory path.
+        main_program(Program|None): The program whose variables will be loaded.
+            If it is None, the default main program will be used automatically.
+            Default: None
+        vars(list[Variable]|None): The list that contains all variables to load.
+            It has a higher priority than the `main_program`.
+            Default: None
+        predicate(function|None): If it is not None, only variables in the
+            `main_program` that makes predicate(variable)==True will be loaded.
+            It only works when we are using the `main_program` to specify
+            variables (in other words, `vars` is None).
+            Default: None
+        filename(str|None): The file which saved all required variables. If
+            variables were saved in different files, set it to None.
+            Default: None
+
+    Returns:
+        None
+
+    Raises:
+        TypeError: If `main_program` is not an instance of Program nor None.
+
+    Examples:
+        .. code-block:: python
+
+            exe = fluid.Executor(fluid.CPUPlace())
+            param_path = "./my_paddle_model"
+
+            # The first usage: using `main_program` to specify variables
+            def name_has_fc(var):
+                res = "fc" in var.name
+                return res
+
+            prog = fluid.default_main_program()
+            fluid.io.load_vars(executor=exe, dirname=path, main_program=prog,
+                               vars=None)
+            # All variables in `main_program` whose name includes "fc" will be loaded.
+            # And all the variables are supposed to have been saved in different files.
+
+            # The second usage: using `vars` to specify variables
+            var_list = [var_a, var_b, var_c]
+            fluid.io.load_vars(executor=exe, dirname=path, vars=var_list,
+                               filename="vars_file")
+            # var_a, var_b and var_c will be loaded. And they are supposed to have
+            # been saved in the same file named 'vars_file' in the path "./my_paddle_model".
     """
     if vars is None:
         if main_program is None:
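The save_vars and load_vars docstrings above mirror each other; a hedged round-trip sketch, assuming the program has already been run so its variables exist in the scope:

import paddle.fluid as fluid

exe = fluid.Executor(fluid.CPUPlace())
path = "./my_paddle_model"  # same folder for saving and loading
prog = fluid.default_main_program()

def name_has_fc(var):
    return "fc" in var.name

# Save every variable in `prog` whose name contains "fc" into a single file ...
fluid.io.save_vars(executor=exe, dirname=path, main_program=prog,
                   predicate=name_has_fc, filename="fc_vars")

# ... and restore them later with the matching call.
fluid.io.load_vars(executor=exe, dirname=path, main_program=prog,
                   predicate=name_has_fc, filename="fc_vars")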
...
@@ -221,7 +406,42 @@ def load_vars(executor,

 def load_params(executor, dirname, main_program=None, filename=None):
     """
-    load all parameters from directory by executor.
+    This function filters out all parameters from the given `main_program`
+    and then tries to load these parameters from the folder `dirname` or
+    the file `filename`.
+
+    Use the `dirname` to specify the folder where parameters were saved. If
+    parameters were saved in separate files in the folder `dirname`, set
+    `filename` None; if all parameters were saved in a single file, use
+    `filename` to specify the file name.
+
+    NOTICE: Some variables are not Parameter while they are necessary for
+    training. So you can NOT save and continue your training just by
+    `save_params()` and `load_params()`. Please use `save_persistables()`
+    and `load_persistables()` instead.
+
+    Args:
+        executor(Executor): The executor to run for loading parameters.
+        dirname(str): The directory path.
+        main_program(Program|None): The program whose parameters will be
+            loaded. If it is None, the default main program will be used
+            automatically.
+            Default: None
+        filename(str|None): The file which saved all parameters. If parameters
+            were saved in different files, set it to None.
+            Default: None
+
+    Returns:
+        None
+
+    Examples:
+        .. code-block:: python
+
+            exe = fluid.Executor(fluid.CPUPlace())
+            param_path = "./my_paddle_model"
+            prog = fluid.default_main_program()
+            fluid.io.load_params(executor=exe, dirname=param_path,
+                                 main_program=None)
     """
     load_vars(
         executor,
...
@@ -233,7 +453,37 @@ def load_params(executor, dirname, main_program=None, filename=None):

 def load_persistables(executor, dirname, main_program=None, filename=None):
     """
-    load all persistables from directory by executor.
+    This function filters out all variables with `persistable==True` from the
+    given `main_program` and then tries to load these variables from the folder
+    `dirname` or the file `filename`.
+
+    Use the `dirname` to specify the folder where persistable variables were
+    saved. If variables were saved in separate files, set `filename` None;
+    if all variables were saved in a single file, use `filename` to specify
+    the file name.
+
+    Args:
+        executor(Executor): The executor to run for loading persistable variables.
+        dirname(str): The directory path.
+        main_program(Program|None): The program whose persistable variables will
+            be loaded. If it is None, the default main program will be used
+            automatically.
+            Default: None
+        filename(str|None): The file which saved all variables. If variables were
+            saved in different files, set it to None.
+            Default: None
+
+    Returns:
+        None
+
+    Examples:
+        .. code-block:: python
+
+            exe = fluid.Executor(fluid.CPUPlace())
+            param_path = "./my_paddle_model"
+            prog = fluid.default_main_program()
+            fluid.io.load_persistables(executor=exe, dirname=param_path,
+                                       main_program=None)
     """
     load_vars(
         executor,
@@ -306,22 +556,48 @@ def save_inference_model(dirname,
...
@@ -306,22 +556,48 @@ def save_inference_model(dirname,
model_filename
=
None
,
model_filename
=
None
,
params_filename
=
None
):
params_filename
=
None
):
"""
"""
Build a model especially for inference,
Prune the given `main_program` to build a new program especially for inference,
and save it to directory by the executor.
and then save it and all related parameters to given `dirname` by the `executor`.
Args:
dirname(str): The directory path to save the inference model.
feeded_var_names(list[str]): Names of variables that need to be feeded data
during inference.
target_vars(list[Variable]): Variables from which we can get inference
results.
executor(Executor): The executor that saves the inference model.
main_program(Program|None): The original program, which will be pruned to
build the inference model. If is setted None,
the default main program will be used.
Default: None.
model_filename(str|None): The name of file to save the inference program
itself. If is setted None, a default filename
`__model__` will be used.
params_filename(str|None): The name of file to save all related parameters.
If it is setted None, parameters will be saved
in separate files .
:param dirname: directory path
Returns:
:param feeded_var_names: Names of variables that need to be feeded data during inference
None
:param target_vars: Variables from which we can get inference results.
:param executor: executor that save inference model
Raises:
:param main_program: original program, which will be pruned to build the inference model.
ValueError: If `feed_var_names` is not a list of basestring.
Default default_main_program().
ValueError: If `target_vars` is not a list of Variable.
:param model_filename: The name of file to save inference program.
If not specified, default filename `__model__` will be used.
Examples:
:param params_filename: The name of file to save parameters.
.. code-block:: python
It is used for the case that all parameters are saved in a single binary file.
If not specified, parameters are considered saved in separate files.
exe = fluid.Executor(fluid.CPUPlace())
path = "./infer_model"
fluid.io.save_inference_model(dirname=path, feeded_var_names=['img'],
target_vars=[predict_var], executor=exe)
# In this exsample, the function will prune the default main program
# to make it suitable for infering the `predict_var`. The pruned
# inference program is going to be saved in the "./infer_model/__model__"
# and parameters are going to be saved in separate files under folder
# "./infer_model".
:return: None
"""
"""
if
isinstance
(
feeded_var_names
,
basestring
):
if
isinstance
(
feeded_var_names
,
basestring
):
feeded_var_names
=
[
feeded_var_names
]
feeded_var_names
=
[
feeded_var_names
]
...
@@ -382,18 +658,49 @@ def load_inference_model(dirname,
...
@@ -382,18 +658,49 @@ def load_inference_model(dirname,
"""
"""
Load inference model from a directory
Load inference model from a directory
:param dirname: directory path
Args:
:param executor: executor that load inference model
dirname(str): The directory path
:param model_filename: The name of file to load inference program.
executor(Executor): The executor to run for loading inference model.
If not specified, default filename `__model__` will be used.
model_filename(str|None): The name of file to load inference program.
:param params_filename: The name of file to load parameters.
If it is None, the default filename
It is used for the case that all parameters are saved in a single binary file.
'__model__' will be used.
If not specified, parameters are considered saved in separate files.
Default: None
params_filename(str|None): The name of file to load all parameters.
It is only used for the case that all
parameters were saved in a single binary
file. If parameters were saved in separate
files, set it as 'None'.
Returns:
tuple: The return of this function is a tuple with three elements:
(program, feed_target_names, fetch_targets). The `program` is a
Program, it's the program for inference. The `feed_target_names` is
a list of str, it contains Names of variables that need to feed
data in the inference program. The `fetch_targets` is a list of
Variable. It contains variables from which we can get inference
results.
Raises:
ValueError: If `dirname` is not a existing directory.
Examples:
.. code-block:: python
exe = fluid.Executor(fluid.CPUPlace())
path = "./infer_model"
[inference_program, feed_target_names, fetch_targets] =
fluid.io.load_inference_model(dirname=path, executor=exe)
results = exe.run(inference_program,
feed={feed_target_names[0]: tensor_img},
fetch_list=fetch_targets)
# In this exsample, the inference program was saved in the
# "./infer_model/__model__" and parameters were saved in
# separate files in ""./infer_model".
# After getting inference program, feed target names and
# fetch targets, we can use an Executor to run the inference
# program to get the inference result.
:return: [program, feed_target_names, fetch_targets]
program: program especially for inference.
feed_target_names: Names of variables that need to feed data
fetch_targets: Variables from which we can get inference results.
"""
"""
if
not
os
.
path
.
isdir
(
dirname
):
if
not
os
.
path
.
isdir
(
dirname
):
raise
ValueError
(
"There is no directory named '%s'"
,
dirname
)
raise
ValueError
(
"There is no directory named '%s'"
,
dirname
)
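Put together, the two examples above describe one save/load round trip for inference; a sketch assuming `predict_var` and `tensor_img` come from an already built and trained network:

import paddle.fluid as fluid

exe = fluid.Executor(fluid.CPUPlace())
path = "./infer_model"

# Prune the default main program down to what `predict_var` needs and save it.
fluid.io.save_inference_model(dirname=path, feeded_var_names=['img'],
                              target_vars=[predict_var], executor=exe)

# Later, possibly in another process: load the pruned program back and run it.
[inference_program, feed_target_names, fetch_targets] = \
    fluid.io.load_inference_model(dirname=path, executor=exe)
results = exe.run(inference_program,
                  feed={feed_target_names[0]: tensor_img},
                  fetch_list=fetch_targets)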
...
@@ -424,12 +731,25 @@ def load_inference_model(dirname,

 def get_parameter_value(para, executor):
     """
-    Get the LoDTensor for the parameter
-
-    :param executor: executor for retrieving the value
-    :param para: the given parameter
-
-    :return: the LoDTensor for the parameter
+    Get the LoDTensor value of the given parameter.
+
+    Args:
+        para(Parameter): The parameter to get value from.
+        executor(Executor): The executor to run for retrieving the value.
+
+    Returns:
+        numpy.array: The given parameter's values.
+
+    Raises:
+        AssertionError: If the `para` is not an instance of Parameter.
+
+    Examples:
+        .. code-block:: python
+
+            exe = fluid.Executor(fluid.CPUPlace())
+            param = fluid.default_main_program().global_block().var('fc.w')
+            p = fluid.io.get_parameter_value(param, exe)
     """
     assert is_parameter(para)
...
@@ -441,14 +761,30 @@ def get_parameter_value(para, executor):

 def get_parameter_value_by_name(name, executor, program=None):
     """
-    Get the LoDTensor for paramter with the given name
-
-    :param executor: executor for retrieving the value
-    :param name: the name of the parameter
-    :param program: the program where the variable is found
-    Default default_main_program().
-
-    :return: the LoDTensor for the variable
+    Get the LoDTensor value of a certain parameter by its name.
+
+    Args:
+        name(str): The parameter's name.
+        executor(Executor): The executor to run for retrieving the value.
+        program(Program|None): The program where to find the parameter.
+            If it's set to be None, the function will try to find the
+            parameter in the default main program.
+
+    Returns:
+        numpy.array: The parameter's values.
+
+    Raises:
+        TypeError: If given `name` is not an instance of basestring.
+        TypeError: If the parameter with the given name doesn't exist.
+        AssertionError: If there is a variable named `name` in the
+            given program but it is not a Parameter.
+
+    Examples:
+        .. code-block:: python
+
+            exe = fluid.Executor(fluid.CPUPlace())
+            p = fluid.io.get_parameter_value('fc.w', exe)
     """
     if program is None:
         program = default_main_program()
...
@@ -470,16 +806,58 @@ def save_checkpoint(executor,
                     main_program=None,
                     max_num_checkpoints=3):
     """
-    Save Checkpoint will save persistable LodTensor variables from main_program in checkpoint directory,
-    the directory named by serial number from 0 to (n - 1). save_checkpoint uses an LRU strategy
-    to keep the number of checkpoint directories at max_num_checkpoints at most.
-    The interval between two saved checkpoints must be greater than save_interval_secs.
-
-    :param executor executor for save the value
-    :param checkpoint_dir the checkpoint directory
-    :param trainer_id current trainer id, if id is equal to 0, the trainer is chief
-    :param main_program will save all variables in program
-    :param max_num_checkpoints will keep the number of checkpoint serials not bigger than max_num_checkpoints
+    This function filters out all checkpoint variables from the given
+    main_program and then saves these variables to the `checkpoint_dir`
+    directory.
+
+    In the training process, we generally save a checkpoint in each
+    iteration. So there might be a lot of checkpoints in the
+    `checkpoint_dir`. To avoid them taking too much disk space, the
+    `max_num_checkpoints` is introduced to limit the total number of
+    checkpoints. If the number of existing checkpoints is greater than
+    `max_num_checkpoints`, the oldest ones will be deleted.
+
+    A variable is a checkpoint variable and will be saved if it meets
+    all following conditions:
+        1. It's persistable.
+        2. Its type is not FEED_MINIBATCH nor FETCH_LIST nor RAW.
+        3. Its name contains no "@GRAD" nor ".trainer_" nor ".block".
+
+    Args:
+        executor(Executor): The executor to run for saving the checkpoint.
+        checkpoint_dir(str): The folder where to save checkpoints.
+        trainer_id(int): current trainer id; if id is equal to 0, the trainer
+            is chief.
+        trainer_args(dict|None): Current training arguments, such as 'epoch_id'
+            and 'step_id'.
+            Default: None
+        main_program(Program|None): The program whose checkpoint variables will
+            be saved. If it is None, the default main program will be used.
+        max_num_checkpoints(int): The max number of existing checkpoints.
+            Default: 3
+
+    Returns:
+        None
+
+    Raises:
+        ValueError: If `checkpoint_dir` is None.
+        AssertionError: If `trainer_args` is not a dict.
+
+    Examples:
+        .. code-block:: python
+
+            exe = fluid.Executor(fluid.CPUPlace())
+            path = "./checkpoints"
+            prog = fluid.default_main_program()
+            trainer_args = {"epoch_id": 200,
+                            "step_id": 20} # just an example
+            fluid.io.save_checkpoint(executor=exe,
+                                     checkpoint_dir=path,
+                                     trainer_id=0,
+                                     trainer_args=trainer_args,
+                                     main_program=prog,
+                                     max_num_checkpoints=3)
     """
     if checkpoint_dir is None:
         raise ValueError("'checkpoint_dir' should not be None")
...
@@ -503,13 +881,50 @@ def save_checkpoint(executor,

 def load_checkpoint(executor, checkpoint_dir, serial, main_program):
     """
-    Load checkpoint from a directory by executor,
-    it will find the most recent saved checkpoint file and load it automatically.
-
-    :param executor executor for load the value
-    :param checkpoint_dir the checkpoint directory
-    :param serial the serial folder in checkpoint directory to be loaded
-    :param main_program will load all variables in program
+    This function filters out all checkpoint variables from the given
+    main_program and then tries to load these variables from the
+    `checkpoint_dir` directory.
+
+    In the training process, we generally save a checkpoint in each
+    iteration. So there are more than one checkpoint in the
+    `checkpoint_dir` (each checkpoint has its own sub folder); use
+    `serial` to specify which serial of checkpoint you would like to
+    load.
+
+    A variable is a checkpoint variable and will be loaded if it meets
+    all following conditions:
+        1. It's persistable.
+        2. Its type is not FEED_MINIBATCH nor FETCH_LIST nor RAW.
+        3. Its name contains no "@GRAD" nor ".trainer_" nor ".block".
+
+    Args:
+        executor(Executor): The executor to run for loading the checkpoint.
+        checkpoint_dir(str): The folder where all checkpoints are.
+        serial(int): The serial of checkpoint you would like to load.
+        main_program(Program): The program whose checkpoint variables will
+            be loaded.
+
+    Returns:
+        None
+
+    Raises:
+        ValueError: If `checkpoint_dir` is None.
+        ValueError: If `serial` is None or `serial` is less than 0.
+        ValueError: If `main_program` is None.
+
+    Examples:
+        .. code-block:: python
+
+            exe = fluid.Executor(fluid.CPUPlace())
+            path = "./checkpoints"
+            prog = fluid.default_main_program()
+            fluid.io.load_checkpoint(executor=exe, checkpoint_dir=path,
+                                     serial=9, main_program=prog)
+
+            # In this example, the `load_checkpoint` function
+            # will first filter out all checkpoint variables in the default
+            # main program, and then try to load these variables from the
+            # folder "./checkpoints/checkpoint_9/__model__".
     """
     if checkpoint_dir is None:
...
@@ -528,10 +943,10 @@ def load_checkpoint(executor, checkpoint_dir, serial, main_program):

 def clean_checkpoint(checkpoint_dir, delete_dir=False):
     """
     clean the checkpoint dir; when the training exits normally, the trainer will call
     clean_checkpoint to delete the checkpoint directory saved before.
     delete_dir only works when the directory is empty, otherwise, OSError is raised.

     :param checkpoint_dir
     :param delete_dir
     """
     if checkpoint_dir is None:
...
@@ -547,13 +962,40 @@ def load_persist_vars_without_grad(executor,
                                    program,
                                    has_model_dir=False):
     """
-    load_persist_vars_without_grad will load variables from a directory by an executor;
-    variables whose names end with "@GRAD" will not be loaded.
-
-    :param executor executor for load the value
-    :param dirname the checkpoint directory
-    :param program will load all variables in program
-    :param has_model_dir if has_model_dir is True, will load variables from sub directory named __model__
+    This function filters out all checkpoint variables from the given
+    program and then tries to load these variables from the given directory.
+
+    A variable is a checkpoint variable if it meets all following
+    conditions:
+        1. It's persistable.
+        2. Its type is not FEED_MINIBATCH nor FETCH_LIST nor RAW.
+        3. Its name contains no "@GRAD" nor ".trainer_" nor ".block".
+
+    Args:
+        executor(Executor): The executor to run for loading variables.
+        dirname(str): The directory path.
+        program(Program): The program whose checkpoint variables will
+            be loaded.
+        has_model_dir(bool): if True, the function loads variables
+            from a sub directory named '__model__'.
+            Default: False
+
+    Returns:
+        None
+
+    Examples:
+        .. code-block:: python
+
+            exe = fluid.Executor(fluid.CPUPlace())
+            param_path = "./my_paddle_model"
+            prog = fluid.default_main_program()
+            fluid.io.load_persist_vars_without_grad(executor=exe,
+                    dirname=param_path, program=prog, has_model_dir=True)
+
+            # In this example, the `load_persist_vars_without_grad` function
+            # will first filter out all checkpoint variables in the default
+            # main program, and then try to load these variables from the
+            # folder "./my_paddle_model/__model__".
     """
     if has_model_dir:
...
@@ -569,12 +1011,38 @@ def load_persist_vars_without_grad(executor,

 def save_persist_vars_without_grad(executor, dirname, program):
     """
-    save_persist_vars_without_grad will save variables to a directory by an executor;
-    variables whose names end with "@GRAD" will not be saved.
-
-    :param executor executor for save the value
-    :param dirname the checkpoint directory
-    :param program will save all variables in program
+    This function filters out all checkpoint variables from the given
+    program and then saves these variables to a sub-folder '__model__' of
+    the given directory.
+
+    A variable is a checkpoint variable if it meets all following
+    conditions:
+        1. It's persistable.
+        2. Its type is not FEED_MINIBATCH nor FETCH_LIST nor RAW.
+        3. Its name contains no "@GRAD" nor ".trainer_" nor ".block".
+
+    Args:
+        executor(Executor): The executor to run for saving variables.
+        dirname(str): The directory path.
+        program(Program): The program whose checkpoint variables will
+            be saved.
+
+    Returns:
+        None
+
+    Examples:
+        .. code-block:: python
+
+            exe = fluid.Executor(fluid.CPUPlace())
+            param_path = "./my_paddle_model"
+            prog = fluid.default_main_program()
+            fluid.io.save_persist_vars_without_grad(executor=exe,
+                    dirname=param_path, program=prog)
+
+            # In this example, the `save_persist_vars_without_grad` function
+            # will first filter out all checkpoint variables in the default
+            # main program, and then save these variables to the folder
+            # "./my_paddle_model/__model__".
     """
     cur_dir = _get_model_dir(dirname)
     save_vars(
...
@@ -620,7 +1088,7 @@ def _is_checkpoint_var(var):
     the checkpoint will not save or load all the variables.
     var type is FEED_MINIBATCH/FETCH_LIST/RAW or var name ends with @GRAD are discarded.

     :param var
     """
     if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \
             var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \
...
@@ -701,7 +1169,7 @@ def _write_success(dirname):
     """
     write an empty file named "_SUCCESS" in the checkpoint dir, indicating this checkpoint is correct.

     :param dirname
     """
     success_file = os.path.join(dirname, SUCCESS_MARK_FILENAME)
     with open(success_file, 'a') as f:
...
@@ -713,7 +1181,7 @@ def get_latest_checkpoint_serial(checkpoint_dir):
     """
     get the latest file in the checkpoint directory; the _SUCCESS file must exist in the directory

     :param checkpoint_dir
     """
     if not checkpoint_dir:
         return -1
...
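The three checkpoint-variable conditions repeated in the docstrings above boil down to a small predicate. A standalone sketch of that rule (illustrative only; the real _is_checkpoint_var compares core.VarDesc.VarType enums rather than strings):

from collections import namedtuple

def is_checkpoint_var(var):
    # 1. Only persistable variables carry model state worth checkpointing.
    if not var.persistable:
        return False
    # 2. Feed/fetch/raw variables are bookkeeping, not model state.
    if var.type in ("FEED_MINIBATCH", "FETCH_LIST", "RAW"):
        return False
    # 3. Gradients and per-trainer/per-block temporaries are excluded by name.
    return not any(tag in var.name for tag in ("@GRAD", ".trainer_", ".block"))

Var = namedtuple("Var", "name type persistable")
print(is_checkpoint_var(Var("fc_0.w_0", "LOD_TENSOR", True)))        # True
print(is_checkpoint_var(Var("fc_0.w_0@GRAD", "LOD_TENSOR", True)))   # False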
python/paddle/fluid/layers/control_flow.py
...
@@ -185,12 +185,14 @@ def Print(input,
     Returns:
         Variable: Output tensor, same data with input tensor.

     Examples:
         .. code-block:: python

             value = some_layer(...)
             Print(value, summarize=10,
                   message="The content of some_layer: ")
     '''
     helper = LayerHelper('print', **locals())
     out = helper.create_tmp_variable(dtype=helper.input_dtype())
...
@@ -1201,6 +1203,31 @@ class ConditionalBlockGuard(BlockGuard):

 class ConditionalBlock(object):
+    '''
+    **ConditionalBlock**
+
+    ConditionalBlock is an operator that binds a block to a specific condition;
+    if the condition matches, the corresponding block will be executed.
+
+    Args:
+        inputs (Variable): bool conditions.
+        is_scalar_condition (bool): whether the branch is controlled by a scalar.
+        name(str): name of this ConditionalBlock.
+
+    Examples:
+        .. code-block:: python
+
+            cond = layers.less_than(x=label, y=limit)
+            true_image, false_image = layers.split_lod_tensor(
+                input=image, mask=cond)
+            true_cond = layers.ConditionalBlock([true_image])
+
+            with true_cond.block():
+                ...
+            with false_cond.block():
+                ...
+    '''
+
     def __init__(self, inputs, is_scalar_condition=False, name=None):
         for each_input in inputs:
             if not isinstance(each_input, Variable):
...
python/paddle/fluid/layers/detection.py
...
@@ -16,7 +16,7 @@ All layers just related to the detection neural network.
 """

 from layer_function_generator import generate_layer_fn
-from layer_function_generator import autodoc
+from layer_function_generator import autodoc, templatedoc
 from ..layer_helper import LayerHelper
 import tensor
 import nn
...
@@ -155,7 +155,7 @@ def detection_output(loc,
     return nmsed_outs


-@autodoc()
+@templatedoc()
 def detection_map(detect_res,
                   label,
                   class_num,
...
@@ -166,6 +166,47 @@ def detection_map(detect_res,
                   input_states=None,
                   out_states=None,
                   ap_version='integral'):
+    """
+    ${comment}
+
+    Args:
+        detect_res: ${detect_res_comment}
+        label: ${label_comment}
+        class_num: ${class_num_comment}
+        background_label: ${background_label_comment}
+        overlap_threshold: ${overlap_threshold_comment}
+        evaluate_difficult: ${evaluate_difficult_comment}
+        has_state: ${has_state_comment}
+        input_states: If not None, it contains 3 elements:
+            1. pos_count ${pos_count_comment}.
+            2. true_pos ${true_pos_comment}.
+            3. false_pos ${false_pos_comment}.
+        out_states: If not None, it contains 3 elements:
+            1. accum_pos_count ${accum_pos_count_comment}.
+            2. accum_true_pos ${accum_true_pos_comment}.
+            3. accum_false_pos ${accum_false_pos_comment}.
+        ap_version: ${ap_type_comment}
+
+    Returns:
+        ${map_comment}
+
+    Examples:
+        .. code-block:: python
+
+            detect_res = fluid.layers.data(
+                name='detect_res',
+                shape=[10, 6],
+                append_batch_size=False,
+                dtype='float32')
+            label = fluid.layers.data(
+                name='label',
+                shape=[10, 6],
+                append_batch_size=False,
+                dtype='float32')
+
+            map_out = fluid.layers.detection_map(detect_res, label, 21)
+    """
     helper = LayerHelper("detection_map", **locals())

     def __create_var(type):
...
python/paddle/fluid/layers/nn.py
...
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
 All layers just related to the neural network.
 """

 from ..layer_helper import LayerHelper
...
@@ -93,6 +93,7 @@ __all__ = [
     'mean_iou',
     'relu',
     'log',
+    'crop',
 ]
...
@@ -108,14 +109,14 @@ def fc(input,
     """
     **Fully Connected Layer**

     This function creates a fully connected layer in the network. It can take
     multiple tensors as its inputs. It creates a variable called weights for
     each input tensor, which represents a fully connected weight matrix from
     each input unit to each output unit. The fully connected layer multiplies
     each input tensor with its corresponding weight to produce an output Tensor.
     If multiple input tensors are given, the results of multiple multiplications
     will be summed up. If bias_attr is not None, a bias variable will be created
     and added to the output. Finally, if activation is not None, it will be applied
     to the output as well.

     This process can be formulated as follows:
...
@@ -197,7 +198,10 @@ def fc(input,
     else:
         pre_bias = helper.create_tmp_variable(dtype)
         helper.append_op(
-            type="sum", inputs={"X": mul_results}, outputs={"Out": pre_bias})
+            type="sum",
+            inputs={"X": mul_results},
+            outputs={"Out": pre_bias},
+            attrs={"use_mkldnn": use_mkldnn})
     # add bias
     pre_activation = helper.append_bias_op(pre_bias, dim_start=num_flatten_dims)
     # add activation
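The multi-input behaviour described in the fc docstring, one weight matrix per input with the products summed before the bias, is easiest to see in a short sketch (layer sizes are arbitrary):

import paddle.fluid as fluid

# fc creates a separate weight for each input and sums the two products,
# then adds the bias and applies the activation, as described above.
a = fluid.layers.data(name='a', shape=[32], dtype='float32')
b = fluid.layers.data(name='b', shape=[64], dtype='float32')
out = fluid.layers.fc(input=[a, b], size=10, act='relu')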
...
@@ -846,7 +850,7 @@ def crf_decoding(input, param_attr, label=None):
     Returns:
         Variable: ${viterbi_path_comment}

     Examples:
         .. code-block:: python
...
@@ -1084,7 +1088,7 @@ def chunk_eval(input,
     Here is a NER example of labeling for these tagging schemes:

     .. code-block:: python

        ====== ====== ====== ===== == ============ ===== ===== ===== == =========
        Li     Ming   works  at    Agricultural Bank of  China in    Beijing.
        ====== ====== ====== ===== == ============ ===== ===== ===== == =========
...
@@ -1110,7 +1114,7 @@ def chunk_eval(input,
     is the num of chunk types, and `tag_type` get its value from the following table.

     .. code-block:: python

        Scheme Begin Inside End Single
        plain  0     -      -   -
        IOB    0     1      -   -
...
@@ -1146,7 +1150,7 @@ def chunk_eval(input,
         tuple: tuple containing: precision, recall, f1_score,
         num_infer_chunks, num_label_chunks,
         num_correct_chunks

     Examples:
         .. code-block:: python
...
@@ -1246,7 +1250,7 @@ def sequence_softmax(input, param_attr=None, bias_attr=None, use_cudnn=True):
     """
     This function computes the softmax activation among all time-steps for each
     sequence. The dimension of each time-step should be 1. Thus, the shape of
     input Tensor can be either :math:`[N, 1]` or :math:`[N]`, where :math:`N`
     is the sum of the length of all sequences.

     For i-th sequence in a mini-batch:
...
@@ -1266,7 +1270,7 @@ def sequence_softmax(input, param_attr=None, bias_attr=None, use_cudnn=True):
         param_attr (ParamAttr|None): attributes for parameter
         use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn \
             library is installed. Default: True

     Returns:
         Variable: output of sequence_softmax
...
@@ -1827,11 +1831,11 @@ def pool2d(input,
     ${comment}

     Args:
         input (Variable): The input tensor of pooling operator. The format of
             input tensor is NCHW, where N is batch size, C is
             the number of channels, H is the height of the
             feature, and W is the width of the feature.
         pool_size (int): The side length of pooling windows. All pooling
             windows are squares with pool_size on a side.
         pool_type: ${pooling_type_comment}
         pool_stride (int): stride of the pooling layer.
...
@@ -1840,7 +1844,7 @@ def pool2d(input,
         use_cudnn: ${use_cudnn_comment}
         ceil_mode: ${ceil_mode_comment}
         use_mkldnn: ${use_mkldnn_comment}
         name (str|None): A name for this layer(optional). If set None, the
             layer will be named automatically.

     Returns:
...
@@ -1858,10 +1862,10 @@ def pool2d(input,
           data = fluid.layers.data(
               name='data', shape=[3, 32, 32], dtype='float32')
           conv2d = fluid.layers.pool2d(
               input=data,
               pool_size=2,
               pool_type='max',
               pool_stride=1,
               global_pooling=False)
     """
     if pool_type not in ["max", "avg"]:
...
@@ -2226,14 +2230,14 @@ def beam_search_decode(ids, scores, name=None):
    This layer packs the output of the beam search layer into sentences and
    associated scores. It is usually called after the beam search layer.
    Typically, the output of the beam search layer is a tensor of selected ids,
    with a tensor of the score of each id. The beam search layer's output ids,
    however, are generated directly during the tree search, and they are stacked
    by each level of the search tree. Thus we need to reorganize them into
    sentences, based on the score of each id. This layer takes the output of the
    beam search layer as input and repacks it into sentences.

    Args:
        ids (Variable): The selected ids, output of the beam search layer.
        scores (Variable): The associated scores of the ids, output of the beam
            search layer.
        name (str): The name of this layer. It is optional.
...
@@ -2241,7 +2245,7 @@ def beam_search_decode(ids, scores, name=None):
    Returns:
        tuple(Variable): a tuple of two output tensors: sentence_ids, sentence_scores.
        sentence_ids is a tensor with shape [size, length], where size is the
        beam size of beam search, and length is the length of each sentence.
        Note that the length of sentences may vary.
        sentence_scores is a tensor with the same shape as sentence_ids.
...
@@ -2674,18 +2678,35 @@ def sequence_expand(x, y, ref_level=-1, name=None):
def beam_search(pre_ids, ids, scores, beam_size, end_id, level=0):
    '''
    **beam search**

    This function implements the beam search algorithm.

    Beam search is a classical algorithm for selecting candidate words
    in a machine translation task.

    Refer to `Beam search <https://en.wikipedia.org/wiki/Beam_search>`_
    for more details.

    Args:
        pre_ids (Variable): ids in previous step.
        ids (Variable): a LoDTensor of shape of [None, k]
        scores (Variable): a LoDTensor that has the same shape and LoD with `ids`
        beam_size (int): beam size for beam search
        end_id (int): the token id which indicates the end of a sequence
        level (int): the level of LoDTensor

    Returns:
        tuple: a tuple of beam_search output variables: `selected_ids`, `selected_scores`

    Examples:
        .. code-block:: python

            # current_score is a Tensor of shape (num_batch_size, embed_size), which
            # consists of the score of each candidate word.
            topk_scores, topk_indices = pd.topk(current_score, k=50)
            selected_ids, selected_scores = pd.beam_search(
                pre_ids, topk_indices, topk_scores, beam_size, end_id=10, level=0)
    '''
    helper = LayerHelper('beam_search', **locals())
    score_type = scores.dtype
...
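To make the pruning idea behind the layer concrete, here is a minimal pure-Python sketch of one beam-search step over per-beam top-k candidates. It is an illustration only, not the Fluid operator; the function and variable names (`beam_search_step`, `prev_beams`, `topk_ids`, `topk_scores`) are hypothetical.

.. code-block:: python

    import heapq

    def beam_search_step(prev_beams, topk_ids, topk_scores, beam_size, end_id):
        # prev_beams: list of (sequence, cumulative_score) kept from the last step.
        # topk_ids / topk_scores: per-beam candidate word ids and their scores.
        candidates = []
        for (seq, seq_score), ids, scores in zip(prev_beams, topk_ids, topk_scores):
            if seq and seq[-1] == end_id:          # finished beams are carried over unchanged
                candidates.append((seq_score, seq))
                continue
            for wid, s in zip(ids, scores):
                candidates.append((seq_score + s, seq + [wid]))
        # keep only the beam_size highest-scoring candidates
        best = heapq.nlargest(beam_size, candidates, key=lambda c: c[0])
        return [(seq, score) for score, seq in best]

    beams = beam_search_step([([1], 0.0)], [[4, 7]], [[-0.1, -0.9]],
                             beam_size=2, end_id=10)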
@@ -2901,7 +2922,7 @@ def reduce_mean(input, dim=None, keep_dim=False, name=None):
        `None`, compute the mean over all elements of :attr:`input`
        and return a variable with a single element, otherwise it
        must be in the range :math:`[-rank(input), rank(input))`. If
        :math:`dim[i] < 0`, the dimension to reduce is
        :math:`rank(input) + dim[i]`.
        keep_dim (bool): Whether to reserve the reduced dimension in the
        output Tensor. The result tensor will have one fewer dimension
...
@@ -3372,16 +3393,16 @@ def topk(input, k, name=None):
    Args:
        input(Variable): The input variable which can be a vector or Tensor with
            higher rank.
        k(int): The number of top elements to look for along the last dimension
            of input.
        name(str|None): A name for this layer(optional). If set None, the layer
            will be named automatically.
            Default: None

    Returns:
        Tuple[Variable]: A tuple with two elements. Each element is a Variable.
            The first one is k largest elements along each last
            dimensional slice. The second one is indices of values
            within the last dimension of input.

    Raises:
...
@@ -3576,15 +3597,15 @@ def warpctc(input, label, blank=0, norm_by_times=False):
        Its shape is [Lp, num_classes + 1], where Lp is the sum of all input
        sequences' lengths and num_classes is the true number of classes
        (not including the blank label).
        label (Variable): The ground truth of variable-length sequence,
            which is a 2-D Tensor with LoD information. It is of the shape [Lg, 1],
            where Lg is the sum of all labels' lengths.
        blank (int, default 0): The blank label index of Connectionist
            Temporal Classification (CTC) loss, which is in the
            half-opened interval [0, num_classes + 1).
        norm_by_times(bool, default false): Whether to normalize the gradients
            by the number of time-steps, which is also the sequence's length.
            There is no need to normalize the gradients if the warpctc layer is
            followed by a mean_op.

    Returns:
...
@@ -3690,8 +3711,8 @@ def nce(input,
        input (Variable): input variable.
        label (Variable): label.
        num_total_classes (int): ${num_total_classes_comment}
        sample_weight (Variable|None): A Variable of shape [batch_size, 1]
            storing a weight for each sample. The default weight for each
            sample is 1.0.
        param_attr (ParamAttr|None): attributes for parameter
        bias_attr (ParamAttr|None): attributes for bias
...
@@ -4081,7 +4102,7 @@ def smooth_l1(x, y, inside_weight=None, outside_weight=None, sigma=None):
    This layer computes the smooth L1 loss for Variable :attr:`x` and :attr:`y`.
    It takes the first dimension of :attr:`x` and :attr:`y` as batch size.
    For each instance, it computes the smooth L1 loss element by element first
    and then sums all the losses. So the shape of the output Variable is
    [batch_size, 1].

    Args:
...
@@ -4090,14 +4111,14 @@ def smooth_l1(x, y, inside_weight=None, outside_weight=None, sigma=None):
        y (Variable): A tensor with rank at least 2. The target value of smooth
            L1 loss op with same shape as :attr:`x`.
        inside_weight (Variable|None): A tensor with rank at least 2. This
            input is optional and should have same shape with :attr:`x`. If
            provided, the result of (:attr:`x` - :attr:`y`) will be multiplied
            by this tensor element by element.
        outside_weight (Variable|None): A tensor with rank at least 2. This
            input is optional and should have same shape with :attr:`x`. If
            provided, the out smooth L1 loss will be multiplied by this tensor
            element by element.
        sigma (float|None): Hyper parameter of smooth L1 loss layer. A float
            scalar with default value 1.0.

    Returns:
...
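For reference, a small NumPy sketch of the per-instance smooth L1 computation described above, using the common Fast R-CNN style definition with a `sigma` hyper parameter. This is a standalone illustration under that assumption, not the Fluid kernel itself.

.. code-block:: python

    import numpy as np

    def smooth_l1(x, y, sigma=1.0):
        # element-wise smooth L1 (Fast R-CNN style), summed per instance
        d = x - y
        sigma2 = sigma ** 2
        abs_d = np.abs(d)
        elementwise = np.where(abs_d < 1.0 / sigma2,
                               0.5 * sigma2 * d ** 2,
                               abs_d - 0.5 / sigma2)
        # sum over everything except the batch dimension -> shape [batch_size, 1]
        return elementwise.reshape(x.shape[0], -1).sum(axis=1, keepdims=True)

    loss = smooth_l1(np.random.rand(4, 3), np.random.rand(4, 3))  # shape (4, 1)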
@@ -4143,7 +4164,7 @@ def one_hot(input, depth):
    Examples:
        .. code-block:: python

            label = layers.data(name="label", shape=[1], dtype="float32")
            one_hot_label = layers.one_hot(input=label, depth=10)
    """
...
@@ -4297,10 +4318,10 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None):
def lod_reset(x, y=None, target_lod=None):
    """
    Set LoD of :attr:`x` to a new one specified by :attr:`y` or
    :attr:`target_lod`. When :attr:`y` is provided, :attr:`y.lod` will be
    considered as target LoD first, otherwise :attr:`y.data` will be
    considered as target LoD. If :attr:`y` is not provided, target LoD should
    be specified by :attr:`target_lod`. If target LoD is specified by
    :attr:`y.data` or :attr:`target_lod`, only one level LoD is supported.

    .. code-block:: text
...
@@ -4354,7 +4375,7 @@ def lod_reset(x, y=None, target_lod=None):
    Args:
        x (Variable): Input variable which could be a Tensor or LoDTensor.
        y (Variable|None): If provided, output's LoD would be derived
            from :attr:`y`.
        target_lod (list|tuple|None): One level LoD which should be considered
            as target LoD when :attr:`y` not provided.
...
@@ -4670,7 +4691,7 @@ def image_resize(input,
    """
    **Resize a Batch of Images**

    The input must be a tensor of the shape (num_batches, channels, in_h, in_w),
    and the resizing only applies on the last two dimensions (height and width).

    Supporting resample methods:
...
@@ -4766,9 +4787,9 @@ def resize_bilinear(input, out_shape=None, scale=None, name=None):
def image_resize_short(input, out_short_len, resample='BILINEAR'):
    """
    Resize a batch of images. The short edge of input images will be
    resized to the given 'out_short_len'. The long edge of input images
    will be resized proportionately to make images' length-width ratio
    constant.

    Args:
...
@@ -4801,7 +4822,7 @@ def gather(input, index):
    """
    **Gather Layer**

    Output is obtained by gathering entries of the outer-most dimension
    of X indexed by `index` and concatenating them together.

    .. math::
...
@@ -4826,7 +4847,7 @@ def gather(input, index):
            [5, 6]]

    Args:
        input (Variable): The source input with rank>=1.
        index (Variable): The index input with rank=1.

    Returns:
...
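The gather semantics described above amount to indexing the outer-most dimension; a tiny NumPy sketch, purely for illustration:

.. code-block:: python

    import numpy as np

    x = np.array([[1, 2], [3, 4], [5, 6]])
    index = np.array([1, 2])
    out = x[index]   # gathers rows 1 and 2 -> [[3, 4], [5, 6]]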
@@ -4862,7 +4883,7 @@ def random_crop(x, shape, seed=None):
    Returns:
        ${out_comment}

    Examples:
        >>> img = fluid.layers.data("img", [3, 256, 256])
        >>> cropped_img = fluid.layers.random_crop(img, shape=[3, 224, 224])
...
@@ -4908,7 +4929,7 @@ def log(x):
        Out = \\ln(x)

    Args:
        x (Variable): Input tensor.

    Returns:
        Variable: The natural log of the input tensor computed element-wise.
...
@@ -4937,7 +4958,7 @@ def relu(x):
        Out = \\max(0, x)

    Args:
        x (Variable): The input tensor.

    Returns:
        Variable: The output tensor with the same shape as input.
...
@@ -4958,15 +4979,15 @@ def relu(x):
def mean_iou(input, label, num_classes):
    """
    Mean Intersection-Over-Union is a common evaluation metric for
    semantic image segmentation, which first computes the IOU for each
    semantic class and then computes the average over classes.
    IOU is defined as follows:

    .. math::

        IOU = \\frac{true\_positive}{true\_positive + false\_positive + false\_negative}

    The predictions are accumulated in a confusion matrix and mean-IOU
    is then calculated from it.
...
@@ -4979,12 +5000,12 @@ def mean_iou(input, label, num_classes):
    Returns:
        mean_iou (Variable): A Tensor representing the mean intersection-over-union with shape [1].
        out_wrong (Variable): A Tensor with shape [num_classes]. The wrong numbers of each class.
        out_correct (Variable): A Tensor with shape [num_classes]. The correct numbers of each class.

    Examples:
        .. code-block:: python

            iou, wrongs, corrects = fluid.layers.mean_iou(predict, label, num_classes)
    """
    helper = LayerHelper('mean_iou', **locals())
...
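A compact NumPy sketch of the confusion-matrix based mean-IOU described above; a standalone illustration, not the Fluid kernel, and the function name is hypothetical:

.. code-block:: python

    import numpy as np

    def mean_iou(pred, label, num_classes):
        # accumulate a confusion matrix, then average per-class IOU
        cm = np.zeros((num_classes, num_classes), dtype=np.int64)
        for p, l in zip(pred.ravel(), label.ravel()):
            cm[l, p] += 1
        true_pos = np.diag(cm).astype(np.float64)
        false_pos = cm.sum(axis=0) - true_pos
        false_neg = cm.sum(axis=1) - true_pos
        denom = true_pos + false_pos + false_neg
        iou = np.where(denom > 0, true_pos / np.maximum(denom, 1), 0.0)
        return iou[denom > 0].mean()

    print(mean_iou(np.array([0, 1, 1, 2]), np.array([0, 1, 2, 2]), num_classes=3))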
@@ -5003,3 +5024,101 @@ def mean_iou(input, label, num_classes):
        },
        attrs={"num_classes": num_classes})
    return out_mean_iou, out_wrong, out_correct
def crop(x, shape=None, offsets=None, name=None):
    """
Crop input into output, as specified by offsets and shape.
.. code-block:: text
* Case 1:
Given
X = [[0, 1, 2, 0, 0]
[0, 3, 4, 0, 0]
[0, 0, 0, 0, 0]],
and
shape = [2, 2],
offsets = [0, 1],
output is:
Out = [[1, 2],
[3, 4]].
* Case 2:
Given
X = [[0, 1, 2, 5, 0]
[0, 3, 4, 6, 0]
[0, 0, 0, 0, 0]],
and shape is tensor
shape = [[0, 0, 0]
[0, 0, 0]]
and
offsets = [0, 1],
output is:
Out = [[1, 2, 5],
[3, 4, 6]].
Args:
x (Variable): The input tensor variable.
        shape (Variable|list/tuple of integer): The output shape is specified
            by `shape`, which can be a Variable or a list/tuple of integers.
            If a tensor Variable, its rank must be the same as `x`. This way
            is suitable for the case that the output shape may be changed each
            iteration. If a list/tuple of integers, its length must be the same
            as the rank of `x`.
        offsets (Variable|list/tuple of integer|None): Specifies the cropping
            offsets at each dimension. It can be a Variable or a list/tuple
            of integers. If a tensor Variable, its rank must be the same as `x`.
            This way is suitable for the case that the offsets may be changed
            each iteration. If a list/tuple of integers, its length must be the
            same as the rank of `x`. If None, the offsets are 0 at each
            dimension.
name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically.
Returns:
Variable: The cropped tensor variable.
Raises:
ValueError: If shape is not a list, tuple or Variable.
Examples:
.. code-block:: python
x = fluid.layers.data(name="x", shape=[3, 5], dtype="float32")
y = fluid.layers.data(name="y", shape=[2, 3], dtype="float32")
crop = fluid.layers.crop(x, shape=y)
# or
z = fluid.layers.data(name="z", shape=[3, 5], dtype="float32")
crop = fluid.layers.crop(z, shape=[2, 3])
"""
    helper = LayerHelper('crop', **locals())
    if not (isinstance(shape, list) or isinstance(shape, tuple) or \
            isinstance(shape, Variable)):
        raise ValueError("The shape should be a list, tuple or Variable.")

    if offsets is None:
        offsets = [0] * len(x.shape)

    out = helper.create_tmp_variable(x.dtype)
    ipts = {'X': x}
    attrs = {}
    if isinstance(shape, Variable):
        ipts['Y'] = shape
    else:
        attrs['shape'] = shape
    if isinstance(offsets, Variable):
        ipts['Offsets'] = offsets
    else:
        attrs['offsets'] = offsets

    helper.append_op(
        type='crop',
        inputs=ipts,
        outputs={'Out': out},
        attrs=None if len(attrs) == 0 else attrs)
    return out
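For reference, the cropping behaviour in Case 1 of the docstring above corresponds to simple slicing; a minimal NumPy sketch, illustrative only and not the crop op:

.. code-block:: python

    import numpy as np

    X = np.array([[0, 1, 2, 0, 0],
                  [0, 3, 4, 0, 0],
                  [0, 0, 0, 0, 0]])
    shape, offsets = [2, 2], [0, 1]
    slices = tuple(slice(o, o + s) for o, s in zip(offsets, shape))
    out = X[slices]   # [[1, 2], [3, 4]], as in Case 1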
python/paddle/fluid/layers/tensor.py
...
@@ -230,7 +230,11 @@ def sums(input, out=None):
    helper = LayerHelper('sum', **locals())
    if out is None:
        out = helper.create_tmp_variable(dtype=helper.input_dtype())
    helper.append_op(
        type='sum',
        inputs={'X': input},
        outputs={'Out': out},
        attrs={'use_mkldnn': False})
    return out
...
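A minimal usage sketch for the layer touched above, assuming two same-shape tensors are created in the program first; illustrative only, against the Fluid 0.x API:

.. code-block:: python

    import paddle.fluid as fluid

    a = fluid.layers.fill_constant(shape=[2, 3], dtype='float32', value=1.0)
    b = fluid.layers.fill_constant(shape=[2, 3], dtype='float32', value=2.0)
    s = fluid.layers.sums(input=[a, b])   # element-wise sum of the list of tensors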
@@ -380,7 +384,7 @@ def argmin(x, axis=0):
    """
    **argmin**

    This function computes the indices of the min elements
    of the input tensor's element along the provided axis.

    Args:
...
@@ -395,7 +399,7 @@ def argmin(x, axis=0):
        .. code-block:: python

            out = fluid.layers.argmin(x=in, axis=0)
            out = fluid.layers.argmin(x=in, axis=-1)
    """
    helper = LayerHelper("arg_min", **locals())
    out = helper.create_tmp_variable(VarDesc.VarType.INT64)
...
@@ -411,7 +415,7 @@ def argmax(x, axis=0):
    """
    **argmax**

    This function computes the indices of the max elements
    of the input tensor's element along the provided axis.

    Args:
...
@@ -426,7 +430,7 @@ def argmax(x, axis=0):
        .. code-block:: python

            out = fluid.layers.argmax(x=in, axis=0)
            out = fluid.layers.argmax(x=in, axis=-1)
    """
    helper = LayerHelper("arg_max", **locals())
    out = helper.create_tmp_variable(VarDesc.VarType.INT64)
...
@@ -495,9 +499,9 @@ def reverse(x, axis):
    Args:
        x(Variable): the input to be reversed.
        axis(int|tuple|list): Axis along which the order of elements
            is reversed. If it is a tuple or a list, reversing
            will be applied on each axis in the tuple or list.

    Returns:
        Variable: The reversed tensor.
...
@@ -528,9 +532,9 @@ def save(x, file_path, overwrite=True):
    Args:
        x(variable): The Tensor/LoDTensor to be saved.
        file_path(str): The file path where the variable will be saved.
        overwrite(bool): Whether or not to overwrite the given file when it
            already exists. If it's set to 'False' and the file exists, a runtime
            error will be thrown.
    """
    helper = LayerHelper("save", **locals())
    helper.append_op(
...
@@ -550,8 +554,8 @@ def save_combine(x, file_path, overwrite=True):
        a single file.
        file_path(str): The file path where variables will be saved.
        overwrite(bool): Whether or not to overwrite the given file when it
            already exists. If it's set to 'False' and the file exists, a runtime
            error will be thrown.

    Returns:
        There is no return value.
...
...
python/paddle/fluid/optimizer.py
...
@@ -26,10 +26,10 @@ from clip import append_gradient_clip_ops, error_clip_callback
from contextlib import contextmanager

__all__ = [
    'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad', 'Ftrl',
    'SGDOptimizer', 'MomentumOptimizer', 'AdagradOptimizer', 'AdamOptimizer',
    'AdamaxOptimizer', 'DecayedAdagradOptimizer', 'RMSPropOptimizer',
    'FtrlOptimizer', 'Adadelta', 'ModelAverage', 'Optimizer', 'RMSPropOptimizer'
]
...
@@ -192,15 +192,15 @@ class Optimizer(object):
        """Add optimization operators to update gradients to variables.

        Args:
            loss(Variable): the target that this optimization is for.
            parameters_and_grads(list(tuple(Variable, Variable))):
                a list of (variable, gradient) pair to update.

        Returns:
            return_op_list: a list of operators that will complete one step of
                optimization. This will include parameter update ops, global step
                update ops and any other custom ops required by subclasses to manage
                their internal state.
        """
        # This is a default implementation of create_optimization_pass that
        # can be shared by most optimizers. This implementation assumes that
...
@@ -268,7 +268,22 @@ class Optimizer(object):
class SGDOptimizer(Optimizer):
    """
    Optimizer of the stochastic gradient descent algorithm.

    .. math::

        param\_out = param - learning\_rate * grad

    Args:
        learning_rate (float|Variable): the learning rate used to update parameters. \
            Can be a float value or a Variable with one float value as data element.

    Examples:
        .. code-block:: python

            sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.2)
            sgd_optimizer.minimize(cost)
    """

    def __init__(self, learning_rate, **kwargs):
...
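The SGD update above is the plain gradient step; a one-line NumPy sketch, illustrative only and not the Fluid kernel:

.. code-block:: python

    import numpy as np

    def sgd_update(param, grad, lr=0.2):
        return param - lr * grad   # param_out = param - learning_rate * grad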
@@ -294,7 +309,37 @@ class SGDOptimizer(Optimizer):
class MomentumOptimizer(Optimizer):
    """
    Simple Momentum optimizer with velocity state

    This optimizer has a flag for Nesterov Momentum.

    The update equations are as follows:

    .. math::

        & velocity = mu * velocity + gradient

        & if (use\_nesterov):

        &\quad param = param - gradient * learning\_rate + mu * velocity * learning\_rate

        & else:

        &\quad param = param - learning\_rate * velocity

    Args:
        learning_rate (float|Variable): the learning rate used to update parameters. \
            Can be a float value or a Variable with one float value as data element.
        momentum (float): momentum factor
        use_nesterov (bool): enables Nesterov momentum

    Examples:
        .. code-block:: python

            optimizer = fluid.optimizer.Momentum(learning_rate=0.2, momentum=0.1)
            optimizer.minimize(cost)
    """
    _velocity_acc_str = "velocity"
...
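A small NumPy sketch of the momentum update equations above, including the Nesterov branch; an illustration only, not the Fluid kernel:

.. code-block:: python

    import numpy as np

    def momentum_update(param, grad, velocity, lr=0.2, mu=0.1, use_nesterov=False):
        velocity = mu * velocity + grad
        if use_nesterov:
            param = param - grad * lr + mu * velocity * lr
        else:
            param = param - lr * velocity
        return param, velocity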
@@ -338,7 +383,32 @@ class MomentumOptimizer(Optimizer):
class AdagradOptimizer(Optimizer):
    """
    **Adaptive Gradient Algorithm (Adagrad)**

    The update is done as follows:

    .. math::

        moment\_out &= moment + grad * grad

        param\_out &= param - \\frac{learning\_rate * grad}{\sqrt{moment\_out} + \epsilon}

    The original paper(http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
    does not have the epsilon attribute. It is added here in our implementation
    as also proposed here: http://cs231n.github.io/neural-networks-3/#ada
    for numerical stability to avoid the division by zero error.

    Args:
        learning_rate (float|Variable): the learning rate used to update parameters. \
            Can be a float value or a Variable with one float value as data element.
        epsilon (float): a small float value for numerical stability.

    Examples:
        .. code-block:: python

            optimizer = fluid.optimizer.Adagrad(learning_rate=0.2)
            optimizer.minimize(cost)
    """
    _moment_acc_str = "moment"
...
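A minimal NumPy sketch of the Adagrad update above; illustrative only, not the Fluid kernel:

.. code-block:: python

    import numpy as np

    def adagrad_update(param, grad, moment, lr=0.2, epsilon=1e-6):
        moment = moment + grad * grad
        param = param - lr * grad / (np.sqrt(moment) + epsilon)
        return param, moment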
@@ -379,7 +449,40 @@ class AdagradOptimizer(Optimizer):
class AdamOptimizer(Optimizer):
    """
    This implements the Adam optimizer from Section 2 of the Adam
    paper: https://arxiv.org/abs/1412.6980.
    Adam is a first-order gradient-based optimization method based on
    adaptive estimates of lower-order moments.

    Adam updates:

    .. math::

        t & = t + 1

        moment\_1\_out & = {\\beta}_1 * moment\_1 + (1 - {\\beta}_1) * grad

        moment\_2\_out & = {\\beta}_2 * moment\_2 + (1 - {\\beta}_2) * grad * grad

        learning\_rate & = learning\_rate * \\frac{\sqrt{1 - {\\beta}_2^t}}{1 - {\\beta}_1^t}

        param\_out & = param - learning\_rate * \\frac{moment\_1}{\sqrt{moment\_2} + \epsilon}

    Args:
        learning_rate (float|Variable): the learning rate used to update parameters. \
            Can be a float value or a Variable with one float value as data element.
        beta1 (float): The exponential decay rate for the 1st moment estimates.
        beta2 (float): The exponential decay rate for the 2nd moment estimates.
        epsilon (float): a small float value for numerical stability.

    Examples:
        .. code-block:: python

            optimizer = fluid.optimizer.Adam(learning_rate=0.2)
            optimizer.minimize(cost)
    """
    _moment1_acc_str = "moment1"
    _moment2_acc_str = "moment2"
...
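A minimal NumPy sketch of the Adam update equations above; illustrative only, not the Fluid kernel:

.. code-block:: python

    import numpy as np

    def adam_update(param, grad, m1, m2, t, lr=0.2,
                    beta1=0.9, beta2=0.999, epsilon=1e-8):
        t += 1
        m1 = beta1 * m1 + (1 - beta1) * grad
        m2 = beta2 * m2 + (1 - beta2) * grad * grad
        lr_t = lr * np.sqrt(1 - beta2 ** t) / (1 - beta1 ** t)
        param = param - lr_t * m1 / (np.sqrt(m2) + epsilon)
        return param, m1, m2, t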
@@ -484,7 +587,42 @@ class AdamOptimizer(Optimizer):
class AdamaxOptimizer(Optimizer):
    """
    We implement the Adamax optimizer from Section 7 of the Adam
    paper: https://arxiv.org/abs/1412.6980. Adamax is a variant of the
    Adam algorithm based on the infinity norm.

    Adamax updates:

    .. math::

        t & = t + 1

        moment\_out & = {\\beta}_1 * moment + (1 - {\\beta}_1) * grad

        inf\_norm\_out & = max({\\beta}_2 * inf\_norm + \epsilon, |grad|)

        learning\_rate & = \\frac{learning\_rate}{1 - {\\beta}_1^t}

        param\_out & = param - learning\_rate * \\frac{moment\_out}{inf\_norm\_out}

    The original paper does not have an epsilon attribute.
    However, it is added here for numerical stability to prevent the
    division by 0 error.

    Args:
        learning_rate (float|Variable): the learning rate used to update parameters. \
            Can be a float value or a Variable with one float value as data element.
        beta1 (float): The exponential decay rate for the 1st moment estimates.
        beta2 (float): The exponential decay rate for the 2nd moment estimates.
        epsilon (float): a small float value for numerical stability.

    Examples:
        .. code-block:: python

            optimizer = fluid.optimizer.Adamax(learning_rate=0.2)
            optimizer.minimize(cost)
    """
    _moment_acc_str = "moment"
    _inf_norm_acc_str = "inf_norm"
...
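A minimal NumPy sketch of the Adamax update equations above; illustrative only, not the Fluid kernel:

.. code-block:: python

    import numpy as np

    def adamax_update(param, grad, moment, inf_norm, t, lr=0.2,
                      beta1=0.9, beta2=0.999, epsilon=1e-8):
        t += 1
        moment = beta1 * moment + (1 - beta1) * grad
        inf_norm = np.maximum(beta2 * inf_norm + epsilon, np.abs(grad))
        lr_t = lr / (1 - beta1 ** t)
        param = param - lr_t * moment / inf_norm
        return param, moment, inf_norm, t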
@@ -568,7 +706,34 @@ class AdamaxOptimizer(Optimizer):
class DecayedAdagradOptimizer(Optimizer):
    """
    **Decayed Adagrad Optimizer**

    The original paper(http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)

    The update is done as follows:

    .. math::

        moment\_out & = decay * moment + (1 - decay) * grad * grad

        param\_out & = param - \\frac{learning\_rate * grad}{\sqrt{moment\_out} + \epsilon}

    The original paper(http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
    does not have an epsilon attribute. It is added here for numerical
    stability to avoid the division by zero error.

    Args:
        learning_rate (float|Variable): the learning rate used to update parameters. \
            Can be a float value or a Variable with one float value as data element.
        decay (float): decay rate.
        epsilon (float): a small float value for numerical stability.

    Examples:
        .. code-block:: python

            optimizer = fluid.optimizer.DecayedAdagrad(learning_rate=0.2)
            optimizer.minimize(cost)
    """
    _moment_acc_str = "moment"
...
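The Decayed Adagrad step only differs from Adagrad by the exponential decay of the accumulator; a minimal NumPy sketch, illustrative only:

.. code-block:: python

    import numpy as np

    def decayed_adagrad_update(param, grad, moment, lr=0.2, decay=0.95, epsilon=1e-6):
        moment = decay * moment + (1 - decay) * grad * grad
        param = param - lr * grad / (np.sqrt(moment) + epsilon)
        return param, moment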
@@ -614,6 +779,7 @@ class DecayedAdagradOptimizer(Optimizer):
class AdadeltaOptimizer(Optimizer):
    """
    **Adadelta Optimizer**

    Simple Adadelta optimizer with average squared grad state and
    average squared update state.
    The details of adadelta please refer to this
...
@@ -628,7 +794,7 @@ class AdadeltaOptimizer(Optimizer):
        E(dx_t^2) &= \\rho * E(dx_{t-1}^2) + (1 - \\rho) * (-g * learning\_rate)^2

    Args:
        learning_rate(float): global learning rate
        rho(float): rho in equation
        epsilon(float): epsilon in equation
...
@@ -703,37 +869,37 @@ class RMSPropOptimizer(Optimizer):
    .. math::

        r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2

        w & = w - \\frac{\\eta} {\\sqrt{r(w,t) + \\epsilon}} \\nabla Q_{i}(w)

    The first equation calculates the moving average of the squared gradient for
    each weight. Then dividing the gradient by :math:`sqrt{v(w,t)}`.

    In some cases, adding a momentum term :math:`\\beta` is beneficial.
    In our implementation, Nesterov momentum is used:

    .. math::

        r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2

        v(w, t) & = \\beta v(w, t-1) + \\frac{\\eta} {\\sqrt{v(w,t) + \\epsilon}} \\nabla Q_{i}(w)

        w & = w - v(w, t)

    where, :math:`\\rho` is a hyperparameter and typical values are 0.9, 0.95
    and so on. :math:`\\beta` is the momentum term. :math:`\\epsilon` is a
    smoothing term to avoid division by zero, usually set somewhere in range
    from 1e-4 to 1e-8.

    Args:
        learning_rate(float): global learning rate.
        rho(float): rho is :math:`\\rho` in equation, set 0.95 by default.
        epsilon(float): :math:`\\epsilon` in equation is smoothing term to
            avoid division by zero, set 1e-6 by default.
        momentum(float): :math:`\\beta` in equation is the momentum term,
            set 0.0 by default.

    Raises:
...
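A minimal NumPy sketch of the momentum form of the RMSProp update above; illustrative only, not the Fluid kernel:

.. code-block:: python

    import numpy as np

    def rmsprop_update(param, grad, mean_square, velocity,
                       lr=0.01, rho=0.95, epsilon=1e-6, momentum=0.0):
        mean_square = rho * mean_square + (1 - rho) * grad * grad
        velocity = momentum * velocity + lr * grad / np.sqrt(mean_square + epsilon)
        param = param - velocity
        return param, mean_square, velocity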
@@ -810,6 +976,113 @@ class RMSPropOptimizer(Optimizer):
        return rmsprop_op


class FtrlOptimizer(Optimizer):
    """
    FTRL (Follow The Regularized Leader) Optimizer.

    The paper that proposed Follow The Regularized Leader (FTRL):
    (https://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf)

    .. math::

        &new\_accum = squared\_accum + grad^2

        &if (lr\_power == -0.5):

        &\quad linear\_accum += grad - \\frac{\\sqrt{new\_accum} - \\sqrt{squared\_accum}}{learning\_rate * param}

        &else:

        &\quad linear\_accum += grad - \\frac{new\_accum^{-lr\_power} - accum^{-lr\_power}}{learning\_rate * param}

        &x = l1 * sign(linear\_accum) - linear\_accum

        &if (lr\_power == -0.5):

        &\quad y = \\frac{\\sqrt{new\_accum}}{learning\_rate} + (2 * l2)

        &\quad pre\_shrink = \\frac{x}{y}

        &\quad param = (abs(linear\_accum) > l1).select(pre\_shrink, 0.0)

        &else:

        &\quad y = \\frac{new\_accum^{-lr\_power}}{learning\_rate} + (2 * l2)

        &\quad pre\_shrink = \\frac{x}{y}

        &\quad param = (abs(linear\_accum) > l1).select(pre\_shrink, 0.0)

        &squared\_accum += grad^2

    Args:
        learning_rate (float|Variable): global learning rate.
        l1 (float):
        l2 (float):
        lr_power (float):

    Raises:
        ValueError: If learning_rate, rho, epsilon, momentum are None.

    Examples:
        .. code-block:: python

            optimizer = fluid.optimizer.Ftrl(0.0001)
            _, params_grads = optimizer.minimize(cost)
    """
    _squared_acc_str = "squared"
    _linear_acc_str = "linear"

    def __init__(self, learning_rate, l1=0.0, l2=0.0, lr_power=-0.5, **kwargs):
        super(FtrlOptimizer, self).__init__(
            learning_rate=learning_rate, **kwargs)
        if learning_rate is None:
            raise ValueError("learning_rate is not set.")

        self.type = "ftrl"
        self._l1 = l1
        self._l2 = l2
        self._lr_power = lr_power

    def _create_accumulators(self, block, parameters):
        if not isinstance(block, framework.Block):
            raise TypeError("block is not instance of framework.Block.")

        for p in parameters:
            self._add_accumulator(self._squared_acc_str, p)
            self._add_accumulator(self._linear_acc_str, p)

    def _append_optimize_op(self, block, param_and_grad):
        if not isinstance(block, framework.Block):
            raise TypeError("block is not instance of framework.Block.")

        squared_acc = self._get_accumulator(self._squared_acc_str,
                                            param_and_grad[0])
        linear_acc = self._get_accumulator(self._linear_acc_str,
                                           param_and_grad[0])
        ftrl_op = block.append_op(
            type=self.type,
            inputs={
                "Param": param_and_grad[0],
                "Grad": param_and_grad[1],
                "SquaredAccumulator": squared_acc,
                "LinearAccumulator": linear_acc,
                "LearningRate": self._create_param_lr(param_and_grad),
            },
            outputs={
                "ParamOut": param_and_grad[0],
                "SquaredAccumOut": squared_acc,
                "LinearAccumOut": linear_acc
            },
            attrs={"l1": self._l1,
                   "l2": self._l2,
                   "lr_power": self._lr_power})

        return ftrl_op
# We short the class name, since users will use the optimizer with the package
# name. The sample code:
#
...
@@ -826,6 +1099,7 @@ Adamax = AdamaxOptimizer
DecayedAdagrad = DecayedAdagradOptimizer
Adadelta = AdadeltaOptimizer
RMSProp = RMSPropOptimizer
Ftrl = FtrlOptimizer


class ModelAverage(Optimizer):
...
@@ -844,7 +1118,9 @@ class ModelAverage(Optimizer):
        max_average_window: The maximum size of average window.

    Examples:

      .. code-block:: python

        optimizer = fluid.optimizer.Momentum()
        _, params_grads = optimizer.minimize(cost)
        model_average = fluid.optimizer.ModelAverage(params_grads, 0.15,
...
...
python/paddle/fluid/profiler.py
...
@@ -42,6 +42,9 @@ def cuda_profiler(output_file, output_mode=None, config=None):
    counters/options for profiling by `config` argument. The default config
    is ['gpustarttimestamp', 'gpustarttimestamp', 'gridsize3d',
    'threadblocksize', 'streamid', 'enableonstart 0', 'conckerneltrace'].

    Then users can use NVIDIA Visual Profiler
    (https://developer.nvidia.com/nvidia-visual-profiler) tools to load
    this output file to visualize results.

    Args:
        output_file (string) : The output file name, the result will be
...
@@ -50,6 +53,33 @@ def cuda_profiler(output_file, output_mode=None, config=None):
        Comma separated values format. It should be 'kvp' or 'csv'.
        config (list of string) : The profiler options and counters can refer
        to "Compute Command Line Profiler User Guide".

    Raises:
        ValueError: If `output_mode` is not in ['kvp', 'csv'].

    Examples:

        .. code-block:: python

            import paddle.fluid as fluid
            import paddle.fluid.profiler as profiler

            epoc = 8
            dshape = [4, 3, 28, 28]
            data = fluid.layers.data(name='data', shape=[3, 28, 28], dtype='float32')
            conv = fluid.layers.conv2d(data, 20, 3, stride=[1, 1], padding=[1, 1])

            place = fluid.CUDAPlace(0)
            exe = fluid.Executor(place)
            exe.run(fluid.default_startup_program())

            output_file = 'cuda_profiler.txt'
            with profiler.cuda_profiler(output_file, 'csv') as nvprof:
                for i in range(epoc):
                    input = np.random.random(dshape).astype('float32')
                    exe.run(fluid.default_main_program(), feed={'data': input})
            # then use NVIDIA Visual Profiler (nvvp) to load this output file
            # to visualize results.
    """
    if output_mode is None:
        output_mode = 'csv'
...
@@ -69,19 +99,52 @@ def cuda_profiler(output_file, output_mode=None, config=None):
def reset_profiler():
    """
    Clear the previous time record. This interface does not work for
    `fluid.profiler.cuda_profiler`, it only works for
    `fluid.profiler.start_profiler`, `fluid.profiler.stop_profiler`,
    and `fluid.profiler.profiler`.

    Examples:

        .. code-block:: python

            import paddle.fluid.profiler as profiler
            with profiler.profiler(state, 'total', '/tmp/profile'):
                for iter in range(10):
                    if iter == 2:
                        profiler.reset_profiler()
                    # ...
    """
    core.reset_profiler()
def start_profiler(state):
    """
    Enable the profiler. Users can use `fluid.profiler.start_profiler` and
    `fluid.profiler.stop_profiler` to insert the code, except the usage of
    `fluid.profiler.profiler` interface.

    Args:
        state (string) : The profiling state, which should be 'CPU', 'GPU'
            or 'All'. 'CPU' means only profile CPU. 'GPU' means profiling
            GPU as well. 'All' also generates timeline.

    Raises:
        ValueError: If `state` is not in ['CPU', 'GPU', 'All'].

    Examples:

        .. code-block:: python

            import paddle.fluid.profiler as profiler

            profiler.start_profiler('GPU')
            for iter in range(10):
                if iter == 2:
                    profiler.reset_profiler()
                # except each iteration
            profiler.stop_profiler('total', '/tmp/profile')
    """
    if core.is_profiler_enabled():
        return
...
@@ -97,7 +160,10 @@ def start_profiler(state):
def stop_profiler(sorted_key=None, profile_path='/tmp/profile'):
    """
    Stop the profiler. Users can use `fluid.profiler.start_profiler` and
    `fluid.profiler.stop_profiler` to insert the code, except the usage of
    `fluid.profiler.profiler` interface.

    Args:
        sorted_key (string) : If None, the profiling results will be printed
...
@@ -111,6 +177,23 @@ def stop_profiler(sorted_key=None, profile_path='/tmp/profile'):
        The `ave` means sorting by the average execution time.
        profile_path (string) : If state == 'All', it will write a profile
        proto output file.

    Raises:
        ValueError: If `sorted_key` is not in
            ['calls', 'total', 'max', 'min', 'ave'].

    Examples:

        .. code-block:: python

            import paddle.fluid.profiler as profiler

            profiler.start_profiler('GPU')
            for iter in range(10):
                if iter == 2:
                    profiler.reset_profiler()
                # except each iteration
            profiler.stop_profiler('total', '/tmp/profile')
    """
    if not core.is_profiler_enabled():
        return
...
@@ -137,7 +220,12 @@ def profiler(state, sorted_key=None, profile_path='/tmp/profile'):
    Different from cuda_profiler, this profiler can be used to profile both CPU
    and GPU program. By default, it records the CPU and GPU operator kernels,
    if you want to profile other program, you can refer the profiling tutorial
    to add more records in C++ code.

    If the state == 'All', a profile proto file will be written to
    `profile_path`. This file records timeline information during the execution.
    Then users can visualize this file to see the timeline, please refer
    https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/howto/optimization/timeline.md

    Args:
        state (string) : The profiling state, which should be 'CPU' or 'GPU',
...
@@ -156,6 +244,25 @@ def profiler(state, sorted_key=None, profile_path='/tmp/profile'):
        The `ave` means sorting by the average execution time.
        profile_path (string) : If state == 'All', it will write a profile
        proto output file.

    Raises:
        ValueError: If `state` is not in ['CPU', 'GPU', 'All']. If `sorted_key` is
            not in ['calls', 'total', 'max', 'min', 'ave'].

    Examples:

        .. code-block:: python

            import paddle.fluid.profiler as profiler

            with profiler.profiler('All', 'total', '/tmp/profile') as prof:
                for pass_id in range(pass_num):
                    for batch_id, data in enumerate(train_reader()):
                        exe.run(fluid.default_main_program(),
                                feed=feeder.feed(data),
                                fetch_list=[],
                                use_program_cache=True)
                    # ...
    """
    start_profiler(state)
    yield
...
...
python/paddle/fluid/regularizer.py
...
@@ -16,8 +16,8 @@ import framework
from . import core

__all__ = [
    'append_regularization_ops', 'L1Decay', 'L2Decay', 'L1DecayRegularizer',
    'L2DecayRegularizer'
]
...
@@ -36,7 +36,8 @@ def append_regularization_ops(parameters_and_grads, regularization=None):
        set. It will be applied with regularizer.

    Returns:
        list[(Variable, Variable)]: list of (parameters, gradients) \
        pair with the regularized gradient

    Raises:
        Exception: Unknown regularization type
...
@@ -100,6 +101,24 @@ class WeightDecayRegularizer(object):
class L2DecayRegularizer(WeightDecayRegularizer):
    """Implements the L2 Weight Decay Regularization

    Small values of L2 can help prevent over fitting the training data.

    .. math::

        L2WeightDecay = reg\_coeff * parameter

    Args:
        regularization_coeff(float): regularization coeff

    Examples:
        .. code-block:: python

            optimizer = fluid.optimizer.Adagrad(
                learning_rate=1e-4,
                regularization=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=0.1))
            optimizer.minimize(avg_cost)
    """

    def __init__(self, regularization_coeff=0.0):
...
@@ -154,6 +173,27 @@ class L2DecayRegularizer(WeightDecayRegularizer):
class L1DecayRegularizer(WeightDecayRegularizer):
    """Implements the L1 Weight Decay Regularization

    L1 regularization encourages sparsity.

    .. math::

        L1WeightDecay = reg\_coeff * sign(parameter)

    Args:
        regularization_coeff(float): regularization coeff

    Examples:
        .. code-block:: python

            program = fluid.framework.Program()
            block = program.global_block()
            mul_x = block.create_parameter(
                dtype="float32",
                shape=[5, 10],
                lod_level=0,
                name="mul.x",
                regularizer=fluid.regularizer.L1DecayRegularizer(0.5))
    """

    def __init__(self, regularization_coeff=0.0):
...
...
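Both decay terms above simply add a penalty gradient to the parameter gradient; a small NumPy sketch, illustrative only and with a hypothetical helper name:

.. code-block:: python

    import numpy as np

    def apply_weight_decay(grad, param, reg_coeff=0.1, kind='l2'):
        if kind == 'l2':
            decay = reg_coeff * param            # L2WeightDecay = reg_coeff * parameter
        else:
            decay = reg_coeff * np.sign(param)   # L1WeightDecay = reg_coeff * sign(parameter)
        return grad + decay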
python/paddle/fluid/tests/book/notest_understand_sentiment.py
...
@@ -194,16 +194,16 @@ def train(word_dict,
    if is_local:
        train_loop(fluid.default_main_program())
    else:
        port = os.getenv("PADDLE_PSERVER_PORT", "6174")
        pserver_ips = os.getenv("PADDLE_PSERVER_IPS")  # ip,ip...
        eplist = []
        for ip in pserver_ips.split(","):
            eplist.append(':'.join([ip, port]))
        pserver_endpoints = ",".join(eplist)  # ip:port,ip:port...
        trainers = int(os.getenv("PADDLE_TRAINERS"))
        current_endpoint = os.getenv("POD_IP") + ":" + port
        trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
        training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")
        t = fluid.DistributeTranspiler()
        t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
        if training_role == "PSERVER":
...
...
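The environment variables renamed in these tests are read the same way in every book example; a small sketch of how a launcher might set them before running one of the scripts (the host addresses and values are hypothetical, for illustration only):

.. code-block:: python

    import os

    # Hypothetical launcher-side settings matching the renamed variables above.
    os.environ.setdefault("PADDLE_PSERVER_PORT", "6174")
    os.environ.setdefault("PADDLE_PSERVER_IPS", "192.168.0.2,192.168.0.3")
    os.environ.setdefault("PADDLE_TRAINERS", "2")
    os.environ.setdefault("PADDLE_TRAINER_ID", "0")
    os.environ.setdefault("PADDLE_TRAINING_ROLE", "TRAINER")
    os.environ.setdefault("POD_IP", "192.168.0.10")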
python/paddle/fluid/tests/book/test_fit_a_line.py
...
@@ -69,16 +69,16 @@ def train(use_cuda, save_dirname, is_local):
    if is_local:
        train_loop(fluid.default_main_program())
    else:
        port = os.getenv("PADDLE_PSERVER_PORT", "6174")
        pserver_ips = os.getenv("PADDLE_PSERVER_IPS")  # ip,ip...
        eplist = []
        for ip in pserver_ips.split(","):
            eplist.append(':'.join([ip, port]))
        pserver_endpoints = ",".join(eplist)  # ip:port,ip:port...
        trainers = int(os.getenv("PADDLE_TRAINERS"))
        current_endpoint = os.getenv("POD_IP") + ":" + port
        trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
        training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")
        t = fluid.DistributeTranspiler()
        t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
        if training_role == "PSERVER":
...
...
python/paddle/fluid/tests/book/test_image_classification.py
...
@@ -178,16 +178,16 @@ def train(net_type, use_cuda, save_dirname, is_local):
    if is_local:
        train_loop(fluid.default_main_program())
    else:
        port = os.getenv("PADDLE_PSERVER_PORT", "6174")
        pserver_ips = os.getenv("PADDLE_PSERVER_IPS")  # ip,ip...
        eplist = []
        for ip in pserver_ips.split(","):
            eplist.append(':'.join([ip, port]))
        pserver_endpoints = ",".join(eplist)  # ip:port,ip:port...
        trainers = int(os.getenv("PADDLE_TRAINERS"))
        current_endpoint = os.getenv("POD_IP") + ":" + port
        trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
        training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")
        t = fluid.DistributeTranspiler()
        t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
        if training_role == "PSERVER":
...
...

python/paddle/fluid/tests/book/test_label_semantic_roles.py (view file @ 28ff4bdd)

@@ -209,16 +209,16 @@ def train(use_cuda, save_dirname=None, is_local=True):
     if is_local:
         train_loop(fluid.default_main_program())
     else:
-        port = os.getenv("PADDLE_INIT_PORT", "6174")
-        pserver_ips = os.getenv("PADDLE_INIT_PSERVERS")  # ip,ip...
+        port = os.getenv("PADDLE_PSERVER_PORT", "6174")
+        pserver_ips = os.getenv("PADDLE_PSERVER_IPS")  # ip,ip...
         eplist = []
         for ip in pserver_ips.split(","):
             eplist.append(':'.join([ip, port]))
         pserver_endpoints = ",".join(eplist)  # ip:port,ip:port...
-        trainers = int(os.getenv("TRAINERS"))
+        trainers = int(os.getenv("PADDLE_TRAINERS"))
         current_endpoint = os.getenv("POD_IP") + ":" + port
-        trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID"))
-        training_role = os.getenv("TRAINING_ROLE", "TRAINER")
+        trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
+        training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")
         t = fluid.DistributeTranspiler()
         t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
         if training_role == "PSERVER":

python/paddle/fluid/tests/book/test_machine_translation.py (view file @ 28ff4bdd)

@@ -200,16 +200,16 @@ def train_main(use_cuda, is_sparse, is_local=True):
     if is_local:
         train_loop(framework.default_main_program())
     else:
-        port = os.getenv("PADDLE_INIT_PORT", "6174")
-        pserver_ips = os.getenv("PADDLE_INIT_PSERVERS")  # ip,ip...
+        port = os.getenv("PADDLE_PSERVER_PORT", "6174")
+        pserver_ips = os.getenv("PADDLE_PSERVER_IPS")  # ip,ip...
         eplist = []
         for ip in pserver_ips.split(","):
             eplist.append(':'.join([ip, port]))
         pserver_endpoints = ",".join(eplist)  # ip:port,ip:port...
-        trainers = int(os.getenv("TRAINERS"))
+        trainers = int(os.getenv("PADDLE_TRAINERS"))
         current_endpoint = os.getenv("POD_IP") + ":" + port
-        trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID"))
-        training_role = os.getenv("TRAINING_ROLE", "TRAINER")
+        trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
+        training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")
         t = fluid.DistributeTranspiler()
         t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
         if training_role == "PSERVER":

python/paddle/fluid/tests/book/test_recognize_digits.py (view file @ 28ff4bdd)

@@ -151,16 +151,16 @@ def train(nn_type,
     if is_local:
         train_loop(fluid.default_main_program())
     else:
-        port = os.getenv("PADDLE_INIT_PORT", "6174")
-        pserver_ips = os.getenv("PADDLE_INIT_PSERVERS")  # ip,ip...
+        port = os.getenv("PADDLE_PSERVER_PORT", "6174")
+        pserver_ips = os.getenv("PADDLE_PSERVER_IPS")  # ip,ip...
         eplist = []
         for ip in pserver_ips.split(","):
             eplist.append(':'.join([ip, port]))
         pserver_endpoints = ",".join(eplist)  # ip:port,ip:port...
-        trainers = int(os.getenv("TRAINERS"))
+        trainers = int(os.getenv("PADDLE_TRAINERS"))
         current_endpoint = os.getenv("POD_IP") + ":" + port
-        trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID"))
-        training_role = os.getenv("TRAINING_ROLE", "TRAINER")
+        trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
+        training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")
         t = fluid.DistributeTranspiler()
         t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
         if training_role == "PSERVER":

python/paddle/fluid/tests/book/test_recommender_system.py (view file @ 28ff4bdd)

@@ -220,16 +220,16 @@ def train(use_cuda, save_dirname, is_local=True):
     if is_local:
         train_loop(fluid.default_main_program())
     else:
-        port = os.getenv("PADDLE_INIT_PORT", "6174")
-        pserver_ips = os.getenv("PADDLE_INIT_PSERVERS")  # ip,ip...
+        port = os.getenv("PADDLE_PSERVER_PORT", "6174")
+        pserver_ips = os.getenv("PADDLE_PSERVER_IPS")  # ip,ip...
         eplist = []
         for ip in pserver_ips.split(","):
             eplist.append(':'.join([ip, port]))
         pserver_endpoints = ",".join(eplist)  # ip:port,ip:port...
-        trainers = int(os.getenv("TRAINERS"))
+        trainers = int(os.getenv("PADDLE_TRAINERS"))
         current_endpoint = os.getenv("POD_IP") + ":" + port
-        trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID"))
-        training_role = os.getenv("TRAINING_ROLE", "TRAINER")
+        trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
+        training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")
         t = fluid.DistributeTranspiler()
         t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
         if training_role == "PSERVER":

python/paddle/fluid/tests/book/test_word2vec.py (view file @ 28ff4bdd)

@@ -125,16 +125,16 @@ def train(use_cuda, is_sparse, is_parallel, save_dirname, is_local=True):
     if is_local:
         train_loop(fluid.default_main_program())
     else:
-        port = os.getenv("PADDLE_INIT_PORT", "6174")
-        pserver_ips = os.getenv("PADDLE_INIT_PSERVERS")  # ip,ip...
+        port = os.getenv("PADDLE_PSERVER_PORT", "6174")
+        pserver_ips = os.getenv("PADDLE_PSERVER_IPS")  # ip,ip...
         eplist = []
         for ip in pserver_ips.split(","):
             eplist.append(':'.join([ip, port]))
         pserver_endpoints = ",".join(eplist)  # ip:port,ip:port...
-        trainers = int(os.getenv("TRAINERS"))
+        trainers = int(os.getenv("PADDLE_TRAINERS"))
         current_endpoint = os.getenv("POD_IP") + ":" + port
-        trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID"))
-        training_role = os.getenv("TRAINING_ROLE", "TRAINER")
+        trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
+        training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")
         t = fluid.DistributeTranspiler()
         t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
         if training_role == "PSERVER":

python/paddle/fluid/tests/unittests/test_concat_op.py (view file @ 28ff4bdd)

@@ -43,7 +43,7 @@ class TestConcatOp(OpTest):
         self.axis = 1


-class TestConcatOp2(OpTest):
+class TestConcatOp2(TestConcatOp):
     def init_test_data(self):
         self.x0 = np.random.random((2, 3, 4, 5)).astype('float32')
         self.x1 = np.random.random((2, 3, 4, 5)).astype('float32')
@@ -51,5 +51,16 @@ class TestConcatOp2(OpTest):
         self.axis = 1


+class TestConcatOp3(TestConcatOp):
+    def init_test_data(self):
+        self.x0 = np.random.random((1, 256, 170, 256)).astype('float32')
+        self.x1 = np.random.random((1, 128, 170, 256)).astype('float32')
+        self.x2 = np.random.random((1, 128, 170, 256)).astype('float32')
+        self.axis = 1
+
+    def test_check_grad(self):
+        pass
+
+
 if __name__ == '__main__':
     unittest.main()

python/paddle/fluid/tests/unittests/test_gaussian_random_mkldnn_op.py (new file, 0 → 100644, view file @ 28ff4bdd)

# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest

from test_gaussian_random_op import TestGaussianRandomOp


class TestMKLDNN(TestGaussianRandomOp):
    def init_kernel_type(self):
        self.use_mkldnn = True


if __name__ == '__main__':
    unittest.main()

python/paddle/fluid/tests/unittests/test_gaussian_random_op.py (view file @ 28ff4bdd)

@@ -25,7 +25,15 @@ class TestGaussianRandomOp(unittest.TestCase):
     def setUp(self):
         self.op_type = "gaussian_random"
         self.inputs = {}
-        self.attrs = {"shape": [1000, 784], "mean": .0, "std": 1., "seed": 10}
+        self.use_mkldnn = False
+        self.init_kernel_type()
+        self.attrs = {
+            "shape": [1000, 784],
+            "mean": .0,
+            "std": 1.,
+            "seed": 10,
+            "use_mkldnn": self.use_mkldnn
+        }

         self.outputs = ["Out"]
@@ -58,6 +66,9 @@ class TestGaussianRandomOp(unittest.TestCase):
         self.assertAlmostEqual(numpy.mean(tensor), .0, delta=0.1)
         self.assertAlmostEqual(numpy.std(tensor), 1., delta=0.1)

+    def init_kernel_type(self):
+        pass
+

 if __name__ == "__main__":
     unittest.main()

python/paddle/fluid/tests/unittests/test_layers.py (view file @ 28ff4bdd)

@@ -401,6 +401,15 @@ class TestBook(unittest.TestCase):
             self.assertIsNotNone(output)
         print(str(program))

+    def test_maxout(self):
+        program = Program()
+        with program_guard(program):
+            x = layers.data(name='x', shape=[3, 5], dtype="float32")
+            y = layers.data(name='y', shape=[2, 3], dtype="float32")
+            output = layers.crop(x, shape=y)
+            self.assertIsNotNone(output)
+        print(str(program))
+

 if __name__ == '__main__':
     unittest.main()

python/paddle/fluid/tests/unittests/test_optimizer.py (view file @ 28ff4bdd)

@@ -434,5 +434,71 @@ class TestDecayedAdagradOptimizer(unittest.TestCase):
         self.assertAlmostEqual(init_ops[1].attr('value'), 0.0)


+class TestFtrlOptimizer(unittest.TestCase):
+    class MockFtrl(optimizer.FtrlOptimizer):
+        def get_accumulators(self):
+            return self._accumulators
+
+        def get_squared_str(self):
+            return self._squared_acc_str
+
+        def get_linear_str(self):
+            return self._linear_acc_str
+
+    def test_ftrl_optimizer(self):
+        init_program = framework.Program()
+        program = framework.Program()
+        block = program.global_block()
+        mul_x = block.create_parameter(
+            dtype="float32",
+            shape=[5, 10],
+            lod_level=0,
+            name="mul.x",
+            optimize_attr={'learning_rate': 1.1})
+        mul_y = block.create_var(
+            dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
+        mul_out = block.create_var(
+            dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
+        block.append_op(
+            type="mul",
+            inputs={"X": mul_x,
+                    "Y": mul_y},
+            outputs={"Out": mul_out},
+            attrs={"x_num_col_dims": 1})
+        mean_out = block.create_var(
+            dtype="float32", shape=[1], lod_level=0, name="mean.out")
+        block.append_op(
+            type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
+        learning_rate = 0.01
+        ftrl_optimizer = self.MockFtrl(
+            learning_rate=learning_rate, l1=0.0, l2=0.0, lr_power=-0.5)
+        params_grads = append_backward(mean_out)
+        self.assertEqual(len(params_grads), 1)
+        self.assertEqual(len(ftrl_optimizer.get_accumulators()), 0)
+        opts = ftrl_optimizer.create_optimization_pass(params_grads, mul_out,
+                                                       init_program)
+        self.assertEqual(len(opts), 3)
+        self.assertEqual([op.type for op in opts],
+                         ["fill_constant", "elementwise_mul", "ftrl"])
+
+        # Check accumulators
+        accumulators = ftrl_optimizer.get_accumulators()
+        self.assertEqual(len(accumulators), 2)
+        self.assertTrue(ftrl_optimizer.get_squared_str() in accumulators)
+        self.assertTrue(ftrl_optimizer.get_linear_str() in accumulators)
+        squared_acc = accumulators[ftrl_optimizer.get_squared_str()]
+        linear_acc = accumulators[ftrl_optimizer.get_linear_str()]
+        self.assertEqual(len(squared_acc), 1)
+        self.assertEqual(len(linear_acc), 1)
+        self.assertTrue(mul_x.name in squared_acc)
+        self.assertTrue(mul_x.name in linear_acc)
+
+        # Check init_program
+        init_ops = init_program.global_block().ops
+        self.assertEqual(len(init_ops), 3)
+        self.assertEqual(init_ops[0].type, "fill_constant")
+        self.assertAlmostEqual(init_ops[0].attr('value'), learning_rate)
+
+
 if __name__ == '__main__':
     unittest.main()
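
The new test drives FtrlOptimizer through the low-level block API. For reference, a hedged sketch of the user-facing path follows, assuming fluid.optimizer.FtrlOptimizer takes the same constructor arguments the test exercises (learning_rate, l1, l2, lr_power) and exposes the usual minimize() entry point; the small regression network is illustrative.

import paddle.fluid as fluid

# Minimal sketch: a linear regression loss minimized with FTRL.
x = fluid.layers.data(name='x', shape=[13], dtype='float32')
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
y_predict = fluid.layers.fc(input=x, size=1)
cost = fluid.layers.square_error_cost(input=y_predict, label=y)
avg_cost = fluid.layers.mean(cost)

ftrl = fluid.optimizer.FtrlOptimizer(
    learning_rate=0.01, l1=0.0, l2=0.0, lr_power=-0.5)
ftrl.minimize(avg_cost)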

python/paddle/fluid/tests/unittests/test_sum_mkldnn_op.py (new file, 0 → 100644, view file @ 28ff4bdd)

# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest

from test_sum_op import TestSumOp


class TestMKLDNN(TestSumOp):
    def init_kernel_type(self):
        self.use_mkldnn = True


if __name__ == '__main__':
    unittest.main()

python/paddle/fluid/tests/unittests/test_sum_op.py (view file @ 28ff4bdd)

@@ -20,12 +20,15 @@ from op_test import OpTest
 class TestSumOp(OpTest):
     def setUp(self):
         self.op_type = "sum"
+        self.use_mkldnn = False
+        self.init_kernel_type()
         x0 = np.random.random((3, 4)).astype('float32')
         x1 = np.random.random((3, 4)).astype('float32')
         x2 = np.random.random((3, 4)).astype('float32')
         self.inputs = {"X": [("x0", x0), ("x1", x1), ("x2", x2)]}
         y = x0 + x1 + x2
         self.outputs = {'Out': y}
+        self.attrs = {'use_mkldnn': self.use_mkldnn}

     def test_check_output(self):
         self.check_output()
@@ -33,6 +36,9 @@ class TestSumOp(OpTest):
     def test_check_grad(self):
         self.check_grad(['x0'], 'Out')

+    def init_kernel_type(self):
+        pass
+

 if __name__ == "__main__":
     unittest.main()
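
test_sum_op.py now follows the same hook pattern introduced for gaussian_random above: the base OpTest case owns a use_mkldnn flag, routes it through init_kernel_type(), and forwards it as an op attribute, so the MKLDNN file only overrides the hook. Condensed into one self-contained sketch (a trimmed merge of test_sum_op.py and test_sum_mkldnn_op.py shown above, with fewer inputs; nothing here goes beyond the diff):

import unittest
import numpy as np
from op_test import OpTest  # test helper that lives next to these files


class TestSumOpSketch(OpTest):
    def setUp(self):
        self.op_type = "sum"
        self.use_mkldnn = False
        self.init_kernel_type()  # the MKLDNN variant flips the flag here
        x0 = np.random.random((3, 4)).astype('float32')
        x1 = np.random.random((3, 4)).astype('float32')
        self.inputs = {"X": [("x0", x0), ("x1", x1)]}
        self.outputs = {'Out': x0 + x1}
        self.attrs = {'use_mkldnn': self.use_mkldnn}

    def test_check_output(self):
        self.check_output()

    def init_kernel_type(self):
        pass


class TestSumOpSketchMKLDNN(TestSumOpSketch):
    def init_kernel_type(self):
        self.use_mkldnn = True


if __name__ == '__main__':
    unittest.main()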

python/paddle/fluid/transpiler/distribute_transpiler.py (view file @ 28ff4bdd)

@@ -824,7 +824,8 @@ class DistributeTranspiler:
             table_opt_block.append_op(
                 type="sum",
                 inputs={"X": pserver_side_table_grad_list},
-                outputs={"Out": [grad_var]})
+                outputs={"Out": [grad_var]},
+                attrs={"use_mkldnn": False})
         else:
             # in async_mode, for table gradient, it also need to be splited to each parameter server
             origin_grad_name = grad_var.name
@@ -1056,7 +1057,8 @@ class DistributeTranspiler:
             optimize_block.append_op(
                 type="sum",
                 inputs={"X": vars2merge},
-                outputs={"Out": merged_var})
+                outputs={"Out": merged_var},
+                attrs={"use_mkldnn": False})
             # TODO(panyx0718): What if it's SELECTED_ROWS.
             if not merged_var.type == core.VarDesc.VarType.SELECTED_ROWS:
                 optimize_block.append_op(
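
With use_mkldnn now declared on the sum op, the transpiler spells the attribute out wherever it appends raw sum ops, since these ops are built directly on a block rather than through a layer helper. A minimal sketch of that call shape follows; the variable names, shapes, and the use of the default main program are illustrative placeholders, not transpiler code.

import paddle.fluid as fluid

# Sketch of appending a raw "sum" op with the attribute made explicit,
# mirroring the two transpiler call sites above.
block = fluid.default_main_program().global_block()
x = block.create_var(name="x", shape=[3, 4], dtype="float32")
y = block.create_var(name="y", shape=[3, 4], dtype="float32")
out = block.create_var(name="out", shape=[3, 4], dtype="float32")
block.append_op(
    type="sum",
    inputs={"X": [x, y]},
    outputs={"Out": out},
    attrs={"use_mkldnn": False})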

python/paddle/reader/decorator.py (view file @ 28ff4bdd)

@@ -336,7 +336,7 @@ def _buf2lines(buf, line_break="\n"):
 class PipeReader:
     """
         PipeReader read data by stream from a command, take it's
         stdout into a pipe buffer and redirect it to the parser to
         parse, then yield data as your desired format.
@@ -352,7 +352,7 @@ class PipeReader:
         An example:

         .. code-block:: python

            def example_reader():
                for f in myfiles:
                    pr = PipeReader("cat %s"%f)

tools/print_signatures.py (new file, 0 → 100644, view file @ 28ff4bdd)

# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Print all signature of a python module in alphabet order.

Usage:
    ./print_signature "paddle.fluid" > signature.txt
"""
import importlib
import inspect
import collections
import sys
import pydoc

member_dict = collections.OrderedDict()


def visit_member(parent_name, member):
    cur_name = ".".join([parent_name, member.__name__])
    if inspect.isclass(member):
        for name, value in inspect.getmembers(member):
            if hasattr(value, '__name__') and (not name.startswith("_") or
                                               name == "__init__"):
                visit_member(cur_name, value)
    elif callable(member):
        try:
            member_dict[cur_name] = inspect.getargspec(member)
        except TypeError:
            # special for PyBind method
            member_dict[cur_name] = "  ".join([
                line.strip() for line in pydoc.render_doc(member).split('\n')
                if "->" in line
            ])
    else:
        raise RuntimeError("Unsupported generate signature of member, type {0}".
                           format(str(type(member))))


def visit_all_module(mod):
    for member_name in (
            name
            for name in (mod.__all__ if hasattr(mod, "__all__") else dir(mod))
            if not name.startswith("_")):
        instance = getattr(mod, member_name, None)
        if instance is None:
            continue
        if inspect.ismodule(instance):
            visit_all_module(instance)
        else:
            visit_member(mod.__name__, instance)


visit_all_module(importlib.import_module(sys.argv[1]))

for name in member_dict:
    print name, member_dict[name]
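
The script walks a module tree and prints one line per callable: the dotted name followed by its inspect.getargspec result, or, for PyBind methods, the "->" lines of the rendered doc. A hedged illustration of that output format with a stand-in function (paddle.fluid.layers.fc here is only a plausible dotted name, not output captured from the tool):

import collections
import inspect


def fc(input, size, act=None):  # stand-in for a real paddle.fluid callable
    pass


member_dict = collections.OrderedDict()
member_dict["paddle.fluid.layers.fc"] = inspect.getargspec(fc)
for name in member_dict:
    print("%s %s" % (name, member_dict[name]))
# -> paddle.fluid.layers.fc ArgSpec(args=['input', 'size', 'act'],
#        varargs=None, keywords=None, defaults=(None,))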