Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into argmin_argmax

c2a00f01 · sneaxiy · 4b6d584f · 831909ce · c2a00f01 · c2a00f01
48 changed file
--- a/doc/fluid/dev/api_doc_std_cn.md
+++ b/doc/fluid/dev/api_doc_std_cn.md
 # API注释撰写标准

- [API注释模块](#API注释模块)
- [格式及示例](#格式及示例)
- [完整示例](#完整示例)
+- [API注释撰写标准](#api)
+    - [API注释模块](#api)
+    - [格式及示例](#)
+    - [完整示例](#)


 ## API注释模块
@@ -217,4 +218,4 @@ API文档须使用reStructuredText格式撰写，该格式详情请参考[链接

 ## 完整示例

-fc 的完整注释见[示例](src/fc.py)。
+fc 的完整注释见[示例](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/src/fc.py)。
--- a/doc/fluid/dev/api_doc_std_en.md
+++ b/doc/fluid/dev/api_doc_std_en.md
 # API Doc Standard

- [API Doc Structure](#API Doc Structure)
- [Format and Examples](#Format and Examples)
- [Complete Example](#Complete Example)
+- [API Doc Standard](#api-doc-standard)
+    - [API Doc Structure](#api-doc-structure)
+    - [Format and Examples](#format-and-examples)
+    - [Complete Example](#complete-example)


 ## API Doc Structure
@@ -223,4 +224,4 @@ Format and examples of each part of API documantation are as follows: (take fc f

 ## Complete Example

-Complete Example of fc please see [here](src/fc.py)。
+Complete Example of fc please see [here](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/src/fc.py)。
--- a/paddle/contrib/inference/test_paddle_inference_api_impl.cc
+++ b/paddle/contrib/inference/test_paddle_inference_api_impl.cc
@@ -109,7 +109,6 @@ void MainWord2Vec(bool use_gpu) {

 void MainImageClassification(bool use_gpu) {
  int batch_size = 2;
-  bool use_mkldnn = false;
  bool repeat = false;
  NativeConfig config = GetConfig();
  config.use_gpu = use_gpu;
@@ -134,12 +133,8 @@ void MainImageClassification(bool use_gpu) {
  std::vector<framework::LoDTensor*> cpu_fetchs1;
  cpu_fetchs1.push_back(&output1);

-  TestInference<platform::CPUPlace, false, true>(config.model_dir,
-                                                 cpu_feeds,
-                                                 cpu_fetchs1,
-                                                 repeat,
-                                                 is_combined,
-                                                 use_mkldnn);
+  TestInference<platform::CPUPlace, false, true>(
+      config.model_dir, cpu_feeds, cpu_fetchs1, repeat, is_combined);

  auto predictor = CreatePaddlePredictor(config);
  std::vector<PaddleTensor> paddle_tensor_feeds;

--- a/paddle/fluid/framework/data_type.cc
+++ b/paddle/fluid/framework/data_type.cc
@@ -28,6 +28,9 @@ struct DataTypeMap {
 };

 static DataTypeMap* InitDataTypeMap();
+// C++11 removes the need for manual locking. Concurrent execution shall wait if
+// a static local variable is already being initialized.
+// https://stackoverflow.com/questions/11711920/how-to-implement-multithread-safe-singleton-in-c11-without-using-mutex
 static DataTypeMap& gDataTypeMap() {
  static DataTypeMap* g_data_type_map_ = InitDataTypeMap();
  return *g_data_type_map_;

--- a/paddle/fluid/framework/details/fuse_vars_op_handle.cc
+++ b/paddle/fluid/framework/details/fuse_vars_op_handle.cc
@@ -42,7 +42,7 @@ void FuseVarsOpHandle::RunImpl() {
    out_t->ShareDataWith(out_tensor->Slice(s, s + numel));
    s += numel;
  }
-  this->RunAndRecordEvent([this] {});
+  this->RunAndRecordEvent([] {});
 }

 std::string FuseVarsOpHandle::Name() const { return "fuse vars"; }

--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -24,6 +24,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/profiler.h"

 DECLARE_bool(benchmark);
+DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run");

 namespace paddle {
 namespace framework {
@@ -115,6 +116,7 @@ void Executor::CreateVariables(const ProgramDesc& pdesc, Scope* scope,
 void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
                   bool create_local_scope, bool create_vars) {
  platform::RecordBlock b(block_id);
+  if (FLAGS_use_mkldnn) EnableMKLDNN(pdesc);
  auto ctx = Prepare(pdesc, block_id);
  RunPreparedContext(ctx.get(), scope, create_local_scope, create_vars);
 }
@@ -214,6 +216,7 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
                   const std::string& feed_holder_name,
                   const std::string& fetch_holder_name) {
  platform::RecordBlock b(kProgramId);
+  if (FLAGS_use_mkldnn) EnableMKLDNN(program);
  bool has_feed_ops =
      has_feed_operators(program.Block(0), *feed_targets, feed_holder_name);
  bool has_fetch_ops =
@@ -225,7 +228,6 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
    unique_ptr_of_copy_program.reset(new ProgramDesc(program));
    copy_program = unique_ptr_of_copy_program.get();
  }
-
  auto* global_block = copy_program->MutableBlock(0);

  if (!has_feed_ops) {
@@ -378,5 +380,19 @@ void Executor::RunPreparedContext(
  }
 }

+void Executor::EnableMKLDNN(const ProgramDesc& program) {
+#ifdef PADDLE_WITH_MKLDNN
+  VLOG(3) << "use_mkldnn=True";
+  for (size_t bid = 0; bid < program.Size(); ++bid) {
+    auto* block = const_cast<ProgramDesc&>(program).MutableBlock(bid);
+    for (auto* op : block->AllOps()) {
+      if (op->HasAttr("use_mkldnn")) {
+        op->SetAttr("use_mkldnn", true);
+      }
+    }
+  }
+#endif
+}
+
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/executor.h
+++ b/paddle/fluid/framework/executor.h
@@ -81,6 +81,8 @@ class Executor {
                          const std::string& feed_holder_name = "feed",
                          const std::string& fetch_holder_name = "fetch");

+  void EnableMKLDNN(const ProgramDesc& program);
+
 private:
  const platform::Place place_;
 };

--- a/paddle/fluid/inference/tests/book/test_inference_image_classification.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_image_classification.cc
@@ -21,7 +21,6 @@ DEFINE_string(fp16_dirname, "", "Directory of the float16 inference model.");
 DEFINE_int32(batch_size, 1, "Batch size of input data");
 DEFINE_int32(repeat, 1, "Running the inference program repeat times");
 DEFINE_bool(skip_cpu, false, "Skip the cpu test");
-DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run inference");

 TEST(inference, image_classification) {
  if (FLAGS_dirname.empty() || FLAGS_batch_size < 1 || FLAGS_repeat < 1) {
@@ -59,10 +58,8 @@ TEST(inference, image_classification) {
    // Run inference on CPU
    LOG(INFO) << "--- CPU Runs: ---";
    LOG(INFO) << "Batch size is " << FLAGS_batch_size;
-    LOG(INFO) << "FLAGS_use_mkldnn: " << FLAGS_use_mkldnn;
    TestInference<paddle::platform::CPUPlace, false, true>(
-        dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, is_combined,
-        FLAGS_use_mkldnn);
+        dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, is_combined);
    LOG(INFO) << output1.dims();
  }


--- a/paddle/fluid/inference/tests/book/test_inference_nlp.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_nlp.cc
@@ -27,7 +27,6 @@ limitations under the License. */
 DEFINE_string(model_path, "", "Directory of the inference model.");
 DEFINE_string(data_file, "", "File of input index data.");
 DEFINE_int32(repeat, 100, "Running the inference program repeat times");
-DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run inference");
 DEFINE_bool(prepare_vars, true, "Prepare variables before executor");
 DEFINE_int32(num_threads, 1, "Number of threads should be used");

@@ -190,9 +189,6 @@ TEST(inference, nlp) {
    std::unique_ptr<paddle::framework::ProgramDesc> inference_program;
    inference_program = InitProgram(&executor, scope.get(), FLAGS_model_path,
                                    /*model combined*/ false);
-    if (FLAGS_use_mkldnn) {
-      EnableMKLDNN(inference_program);
-    }
    // always prepare context
    std::unique_ptr<paddle::framework::ExecutorPrepareContext> ctx;
    ctx = executor.Prepare(*inference_program, 0);

--- a/paddle/fluid/inference/tests/test_helper.h
+++ b/paddle/fluid/inference/tests/test_helper.h
@@ -22,6 +22,8 @@ limitations under the License. */
 #include "paddle/fluid/inference/io.h"
 #include "paddle/fluid/platform/profiler.h"

+DECLARE_bool(use_mkldnn);
+
 template <typename T>
 void SetupTensor(paddle::framework::LoDTensor* input,
                 paddle::framework::DDim dims, T lower, T upper) {
@@ -133,24 +135,11 @@ std::vector<std::vector<int64_t>> GetFeedTargetShapes(
  return feed_target_shapes;
 }

-void EnableMKLDNN(
-    const std::unique_ptr<paddle::framework::ProgramDesc>& program) {
-  for (size_t bid = 0; bid < program->Size(); ++bid) {
-    auto* block = program->MutableBlock(bid);
-    for (auto* op : block->AllOps()) {
-      if (op->HasAttr("use_mkldnn")) {
-        op->SetAttr("use_mkldnn", true);
-      }
-    }
-  }
-}
-
 template <typename Place, bool CreateVars = true, bool PrepareContext = false>
 void TestInference(const std::string& dirname,
                   const std::vector<paddle::framework::LoDTensor*>& cpu_feeds,
                   const std::vector<paddle::framework::LoDTensor*>& cpu_fetchs,
-                   const int repeat = 1, const bool is_combined = false,
-                   const bool use_mkldnn = false) {
+                   const int repeat = 1, const bool is_combined = false) {
  // 1. Define place, executor, scope
  auto place = Place();
  auto executor = paddle::framework::Executor(place);
@@ -182,9 +171,6 @@ void TestInference(const std::string& dirname,
        "init_program",
        paddle::platform::DeviceContextPool::Instance().Get(place));
    inference_program = InitProgram(&executor, scope, dirname, is_combined);
-    if (use_mkldnn) {
-      EnableMKLDNN(inference_program);
-    }
  }
  // Disable the profiler and print the timing information
  paddle::platform::DisableProfiler(paddle::platform::EventSortingKey::kDefault,
@@ -210,7 +196,10 @@ void TestInference(const std::string& dirname,
    fetch_targets[fetch_target_names[i]] = cpu_fetchs[i];
  }

-  // 6. Run the inference program
+  // 6. If export Flags_use_mkldnn=True, use mkldnn related ops.
+  if (FLAGS_use_mkldnn) executor.EnableMKLDNN(*inference_program);
+
+  // 7. Run the inference program
  {
    if (!CreateVars) {
      // If users don't want to create and destroy variables every time they

--- a/paddle/fluid/operators/arg_max_op.cc
+++ b/paddle/fluid/operators/arg_max_op.cc
@@ -12,24 +12,22 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "paddle/fluid/operators/arg_max_op.h"
+#include "paddle/fluid/operators/arg_min_max_op_base.h"

-REGISTER_OPERATOR(arg_max, paddle::operators::ArgMaxOp,
+REGISTER_OPERATOR(arg_max, paddle::operators::ArgMinMaxOp,
                  paddle::operators::ArgMaxOpMaker,
                  paddle::framework::EmptyGradOpMaker);

 REGISTER_OP_CPU_KERNEL(
-    arg_max, paddle::operators::ArgMaxKernel<paddle::platform::CPUDeviceContext,
-                                             float, int64_t>,
-    paddle::operators::ArgMaxKernel<paddle::platform::CPUDeviceContext, double,
+    arg_max,
+    paddle::operators::ArgMaxKernel<paddle::platform::CPUDeviceContext, float>,
+    paddle::operators::ArgMaxKernel<paddle::platform::CPUDeviceContext, double>,
+    paddle::operators::ArgMaxKernel<paddle::platform::CPUDeviceContext,
                                    int64_t>,
-    paddle::operators::ArgMaxKernel<paddle::platform::CPUDeviceContext, int64_t,
-                                    int64_t>,
-    paddle::operators::ArgMaxKernel<paddle::platform::CPUDeviceContext, int32_t,
-                                    int64_t>,
-    paddle::operators::ArgMaxKernel<paddle::platform::CPUDeviceContext, int16_t,
-                                    int64_t>,
-    paddle::operators::ArgMaxKernel<paddle::platform::CPUDeviceContext, size_t,
-                                    int64_t>,
-    paddle::operators::ArgMaxKernel<paddle::platform::CPUDeviceContext, uint8_t,
-                                    int64_t>);
+    paddle::operators::ArgMaxKernel<paddle::platform::CPUDeviceContext,
+                                    int32_t>,
+    paddle::operators::ArgMaxKernel<paddle::platform::CPUDeviceContext,
+                                    int16_t>,
+    paddle::operators::ArgMaxKernel<paddle::platform::CPUDeviceContext, size_t>,
+    paddle::operators::ArgMaxKernel<paddle::platform::CPUDeviceContext,
+                                    uint8_t>);
--- a/paddle/fluid/operators/arg_max_op.cu
+++ b/paddle/fluid/operators/arg_max_op.cu
@@ -12,21 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "paddle/fluid/operators/arg_max_op.h"
+#include "paddle/fluid/operators/arg_min_max_op_base.h"

 REGISTER_OP_CUDA_KERNEL(
    arg_max,
-    paddle::operators::ArgMaxKernel<paddle::platform::CUDADeviceContext, float,
-                                    int64_t>,
-    paddle::operators::ArgMaxKernel<paddle::platform::CUDADeviceContext, double,
+    paddle::operators::ArgMaxKernel<paddle::platform::CUDADeviceContext, float>,
+    paddle::operators::ArgMaxKernel<paddle::platform::CUDADeviceContext,
+                                    double>,
+    paddle::operators::ArgMaxKernel<paddle::platform::CUDADeviceContext,
                                    int64_t>,
    paddle::operators::ArgMaxKernel<paddle::platform::CUDADeviceContext,
-                                    int64_t, int64_t>,
+                                    int32_t>,
    paddle::operators::ArgMaxKernel<paddle::platform::CUDADeviceContext,
-                                    int32_t, int64_t>,
+                                    int16_t>,
    paddle::operators::ArgMaxKernel<paddle::platform::CUDADeviceContext,
-                                    int16_t, int64_t>,
-    paddle::operators::ArgMaxKernel<paddle::platform::CUDADeviceContext, size_t,
-                                    int64_t>,
+                                    size_t>,
    paddle::operators::ArgMaxKernel<paddle::platform::CUDADeviceContext,
-                                    uint8_t, int64_t>);
+                                    uint8_t>);
--- a/paddle/fluid/operators/arg_max_op.h
+++ b/paddle/fluid/operators/arg_max_op.h
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/operators/arg_min_max_op_base.h"
--- a/paddle/fluid/operators/arg_min_max_op_base.h
+++ b/paddle/fluid/operators/arg_min_max_op_base.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #pragma once
+#include <string>
 #include <type_traits>
 #include <vector>
 #include "paddle/fluid/framework/ddim.h"
@@ -37,9 +38,9 @@ struct ArgMinMaxFunctor {};
  struct ArgMinMaxFunctor<DeviceContext, T, Tout, Rank,                       \
                          enum_argminmax_value> {                             \
    void operator()(const DeviceContext& ctx, const framework::LoDTensor& in, \
-                    framework::LoDTensor& out, int64_t axis) {                \
+                    framework::LoDTensor* out, int64_t axis) {                \
      auto in_eigen = framework::EigenTensor<T, Rank>::From(in);              \
-      auto out_eigen = framework::EigenTensor<Tout, Rank - 1>::From(out);     \
+      auto out_eigen = framework::EigenTensor<Tout, Rank - 1>::From(*out);    \
      out_eigen.device(*(ctx.eigen_device())) =                               \
          in_eigen.eigen_op_type(axis).template cast<Tout>();                 \
    }                                                                         \
@@ -62,7 +63,7 @@ class ArgMinMaxKernel : public framework::OpKernel<T> {
 #define CALL_ARG_MINMAX_FUNCTOR(rank)                                \
  ArgMinMaxFunctor<DeviceContext, T, Tout, rank, EnumArgMinMaxValue> \
      functor##rank;                                                 \
-  functor##rank(dev_ctx, x, out, axis)
+  functor##rank(dev_ctx, x, &out, axis)

    switch (x.dims().size()) {
      case 1:
@@ -89,19 +90,20 @@ class ArgMinMaxKernel : public framework::OpKernel<T> {
            "than 6.",
            (EnumArgMinMaxValue == kArgMin ? "argmin" : "argmax"));
        break;
+#undef CALL_ARG_MINMAX_FUNCTOR
    }
  }
 };

-template <typename DeviceContext, typename T, typename Tout>
+template <typename DeviceContext, typename T>
 using ArgMinKernel =
-    ArgMinMaxKernel<DeviceContext, T, Tout, ArgMinMaxType::kArgMin>;
+    ArgMinMaxKernel<DeviceContext, T, int64_t, ArgMinMaxType::kArgMin>;

-template <typename DeviceContext, typename T, typename Tout>
+template <typename DeviceContext, typename T>
 using ArgMaxKernel =
-    ArgMinMaxKernel<DeviceContext, T, Tout, ArgMinMaxType::kArgMax>;
+    ArgMinMaxKernel<DeviceContext, T, int64_t, ArgMinMaxType::kArgMax>;

-typedef class BaseArgMinMaxOp : public framework::OperatorWithKernel {
+class ArgMinMaxOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

@@ -121,7 +123,7 @@ typedef class BaseArgMinMaxOp : public framework::OperatorWithKernel {
    for (int64_t i = axis + 1; i < x_rank; i++) vec.push_back(x_dims[i]);
    ctx->SetOutputDim("Out", framework::make_ddim(vec));
  }
-} ArgMinOp, ArgMaxOp;
+};

 class BaseArgMinMaxOpMaker : public framework::OpProtoAndCheckerMaker {
 protected:
@@ -133,12 +135,13 @@ class BaseArgMinMaxOpMaker : public framework::OpProtoAndCheckerMaker {
    AddInput("X", "Input tensor.");
    AddOutput("Out", "Output tensor.");
    AddAttr<int64_t>("axis", "The axis in which to compute the arg indics.");
-    AddComment(::paddle::string::Sprintf(R"DOC(
-				%s Operator.
+    AddComment(string::Sprintf(R"DOC(
+      %s Operator.

-				Computes the indices of the %s elements of the input tensor's element along the provided axis.
+      Computes the indices of the %s elements of the input tensor's element
+      along the provided axis.
 )DOC",
-                                         OpName(), Name()));
+                               OpName(), Name()));
  }
 };


--- a/paddle/fluid/operators/arg_min_op.cc
+++ b/paddle/fluid/operators/arg_min_op.cc
@@ -12,24 +12,22 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "paddle/fluid/operators/arg_min_op.h"
+#include "paddle/fluid/operators/arg_min_max_op_base.h"

-REGISTER_OPERATOR(arg_min, paddle::operators::ArgMinOp,
+REGISTER_OPERATOR(arg_min, paddle::operators::ArgMinMaxOp,
                  paddle::operators::ArgMinOpMaker,
                  paddle::framework::EmptyGradOpMaker);

 REGISTER_OP_CPU_KERNEL(
-    arg_min, paddle::operators::ArgMinKernel<paddle::platform::CPUDeviceContext,
-                                             float, int64_t>,
-    paddle::operators::ArgMinKernel<paddle::platform::CPUDeviceContext, double,
+    arg_min,
+    paddle::operators::ArgMinKernel<paddle::platform::CPUDeviceContext, float>,
+    paddle::operators::ArgMinKernel<paddle::platform::CPUDeviceContext, double>,
+    paddle::operators::ArgMinKernel<paddle::platform::CPUDeviceContext,
                                    int64_t>,
-    paddle::operators::ArgMinKernel<paddle::platform::CPUDeviceContext, int64_t,
-                                    int64_t>,
-    paddle::operators::ArgMinKernel<paddle::platform::CPUDeviceContext, int32_t,
-                                    int64_t>,
-    paddle::operators::ArgMinKernel<paddle::platform::CPUDeviceContext, int16_t,
-                                    int64_t>,
-    paddle::operators::ArgMinKernel<paddle::platform::CPUDeviceContext, size_t,
-                                    int64_t>,
-    paddle::operators::ArgMinKernel<paddle::platform::CPUDeviceContext, uint8_t,
-                                    int64_t>);
+    paddle::operators::ArgMinKernel<paddle::platform::CPUDeviceContext,
+                                    int32_t>,
+    paddle::operators::ArgMinKernel<paddle::platform::CPUDeviceContext,
+                                    int16_t>,
+    paddle::operators::ArgMinKernel<paddle::platform::CPUDeviceContext, size_t>,
+    paddle::operators::ArgMinKernel<paddle::platform::CPUDeviceContext,
+                                    uint8_t>);
--- a/paddle/fluid/operators/arg_min_op.cu
+++ b/paddle/fluid/operators/arg_min_op.cu
@@ -12,21 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "paddle/fluid/operators/arg_min_op.h"
+#include "paddle/fluid/operators/arg_min_max_op_base.h"

 REGISTER_OP_CUDA_KERNEL(
    arg_min,
-    paddle::operators::ArgMinKernel<paddle::platform::CUDADeviceContext, float,
-                                    int64_t>,
-    paddle::operators::ArgMinKernel<paddle::platform::CUDADeviceContext, double,
+    paddle::operators::ArgMinKernel<paddle::platform::CUDADeviceContext, float>,
+    paddle::operators::ArgMinKernel<paddle::platform::CUDADeviceContext,
+                                    double>,
+    paddle::operators::ArgMinKernel<paddle::platform::CUDADeviceContext,
                                    int64_t>,
    paddle::operators::ArgMinKernel<paddle::platform::CUDADeviceContext,
-                                    int64_t, int64_t>,
+                                    int32_t>,
    paddle::operators::ArgMinKernel<paddle::platform::CUDADeviceContext,
-                                    int32_t, int64_t>,
+                                    int16_t>,
    paddle::operators::ArgMinKernel<paddle::platform::CUDADeviceContext,
-                                    int16_t, int64_t>,
-    paddle::operators::ArgMinKernel<paddle::platform::CUDADeviceContext, size_t,
-                                    int64_t>,
+                                    size_t>,
    paddle::operators::ArgMinKernel<paddle::platform::CUDADeviceContext,
-                                    uint8_t, int64_t>);
+                                    uint8_t>);
--- a/paddle/fluid/operators/arg_min_op.h
+++ b/paddle/fluid/operators/arg_min_op.h
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/operators/arg_min_max_op_base.h"
--- a/paddle/fluid/operators/batch_size_like.h
+++ b/paddle/fluid/operators/batch_size_like.h
@@ -54,18 +54,18 @@ class BatchSizeLikeOp : public framework::OperatorWithKernel {
 class BatchSizeLikeOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() final {
-    AddInput("Input",
-             "(Tensor) Tensor "
-             "whose input_dim_idx'th dimension specifies the batch_size");
+    AddInput(
+        "Input",
+        "Tensor whose input_dim_idx'th dimension specifies the batch_size");
    AddOutput("Out",
-              "(Tensor) Tensor of specified shape will be filled "
+              "Tensor of specified shape will be filled "
              "with the specified value");
-    AddAttr<std::vector<int>>("shape", "(vector<int>) The shape of the output");
+    AddAttr<std::vector<int>>("shape", "The shape of the output");
    AddAttr<int>("input_dim_idx",
-                 "(int, default 0) The index of input's batch size dimension")
+                 "default 0. The index of input's batch size dimension")
        .SetDefault(0);
    AddAttr<int>("output_dim_idx",
-                 "(int, default 0) The index of output's batch size dimension")
+                 "default 0. The index of output's batch size dimension")
        .SetDefault(0);
    Apply();
  }

--- a/paddle/fluid/operators/bilinear_interp_op.cc
+++ b/paddle/fluid/operators/bilinear_interp_op.cc
@@ -56,17 +56,16 @@ class BilinearInterpOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    AddInput("X",
-             "(Tensor) The input tensor of bilinear interpolation, "
+             "The input tensor of bilinear interpolation, "
             "This is a 4-D tensor with shape of (N x C x h x w)");
    AddInput("OutSize",
-             "(Tensor) This is a 1-D tensor with two number. "
+             "This is a 1-D tensor with two number. "
             "The first number is height and the second number is width.")
        .AsDispensable();
-    AddOutput("Out",
-              "(Tensor) The dimension of output is (N x C x out_h x out_w]");
+    AddOutput("Out", "The dimension of output is (N x C x out_h x out_w)");

-    AddAttr<int>("out_h", "(int) output height of bilinear interpolation op.");
-    AddAttr<int>("out_w", "(int) output width of bilinear interpolation op.");
+    AddAttr<int>("out_h", "output height of bilinear interpolation op.");
+    AddAttr<int>("out_w", "output width of bilinear interpolation op.");
    AddComment(R"DOC(
          Bilinear interpolation is an extension of linear interpolation for 
          interpolating functions of two variables (e.g. H-direction and 

--- a/paddle/fluid/operators/detail/request_handler_impl.cc
+++ b/paddle/fluid/operators/detail/request_handler_impl.cc
@@ -64,13 +64,21 @@ bool RequestSendHandler::Handle(const std::string& varname,
      return false;
    }
    if (invar->IsType<framework::SelectedRows>()) {
-      rpc_server_->RecordSparseVar(invar);
+      std::unique_lock<std::mutex> lock(mutex_sparse_vars_);
+      sparse_vars_.push_back(invar);
    }
  }
-
  return true;
 }

+void RequestSendHandler::ResetSparseVarRecorder() {
+  std::unique_lock<std::mutex> lock(mutex_sparse_vars_);
+  for (auto* var : sparse_vars_) {
+    var->GetMutable<framework::SelectedRows>()->mutable_rows()->clear();
+  }
+  sparse_vars_.clear();
+}
+
 bool RequestGetHandler::Handle(const std::string& varname,
                               framework::Scope* scope,
                               framework::Variable* invar,

--- a/paddle/fluid/operators/detail/request_handler_impl.h
+++ b/paddle/fluid/operators/detail/request_handler_impl.h
@@ -41,6 +41,11 @@ class RequestSendHandler final : public RequestHandler {
  virtual ~RequestSendHandler() {}
  bool Handle(const std::string& varname, framework::Scope* scope,
              framework::Variable* var, framework::Variable** outvar) override;
+  void ResetSparseVarRecorder();
+
+ private:
+  std::mutex mutex_sparse_vars_;
+  std::vector<framework::Variable*> sparse_vars_;
 };

 class RequestGetHandler final : public RequestHandler {

--- a/paddle/fluid/operators/detail/rpc_server.cc
+++ b/paddle/fluid/operators/detail/rpc_server.cc
@@ -73,19 +73,6 @@ void RPCServer::ResetBarrierCounter() {
    t.second = 0;
  }
 }
-void RPCServer::RecordSparseVar(framework::Variable* sparse_var) {
-  std::unique_lock<std::mutex> lock(mutex_sparse_var_recorder_);
-  sparse_vars_.push_back(sparse_var);
-}
-
-void RPCServer::ResetSparseVarsRecorder() {
-  VLOG(3) << "RPCServer reset sparse vars recorder.";
-  std::unique_lock<std::mutex> lock(mutex_sparse_var_recorder_);
-  for (auto* var : sparse_vars_) {
-    var->GetMutable<framework::SelectedRows>()->mutable_rows()->clear();
-  }
-  sparse_vars_.clear();
-}

 void RPCServer::RegisterRPC(const std::string& rpc_name,
                            RequestHandler* handler, int thread_num) {

--- a/paddle/fluid/operators/detail/rpc_server.h
+++ b/paddle/fluid/operators/detail/rpc_server.h
@@ -62,8 +62,6 @@ class RPCServer {
  void IncreaseBatchBarrier(const std::string rpc_name);

  void ResetBarrierCounter();
-  void RecordSparseVar(framework::Variable* sparse_var);
-  void ResetSparseVarsRecorder();

 protected:
  virtual void ShutDownImpl() = 0;
@@ -77,9 +75,6 @@ class RPCServer {
  std::atomic<int> cur_cond_;
  std::condition_variable rpc_cond_;

-  std::vector<framework::Variable*> sparse_vars_;
-  std::mutex mutex_sparse_var_recorder_;
-
 protected:
  std::string bind_address_;
  std::atomic<int> exit_flag_;

--- a/paddle/fluid/operators/fill_constant_batch_size_like_op.cc
+++ b/paddle/fluid/operators/fill_constant_batch_size_like_op.cc
@@ -32,16 +32,16 @@ class FillConstantBatchSizeLikeOp : public BatchSizeLikeOp {
 class FillConstantBatchSizeLikeOpMaker : public BatchSizeLikeOpMaker {
 protected:
  void Apply() override {
-    AddAttr<int>("dtype",
-                 "(int, default 5 (FP32)) "
-                 "Output data type")
+    AddAttr<int>(
+        "dtype",
+        "It could be numpy.dtype. Output data type. Default is float32")
        .SetDefault(framework::proto::VarType::FP32);
-    AddAttr<float>("value", "(float, default 0) The value to be filled")
+    AddAttr<float>("value", "default 0. The value to be filled")
        .SetDefault(0.0f);
    AddComment(R"DOC(
-FillConstantBatchSizeLike Operator.
-
-Fill up a variable with specified constant value.
+This function creates a tensor of specified *shape*, *dtype* and batch size,
+and initializes this with a constant supplied in *value*. The batch size is
+obtained from the `input` tensor.

 )DOC");
  }

--- a/paddle/fluid/operators/gather_test.cc
+++ b/paddle/fluid/operators/gather_test.cc
@@ -43,7 +43,8 @@ TEST(Gather, GatherData) {
  auto* cpu_place = new paddle::platform::CPUPlace();
  paddle::platform::CPUDeviceContext ctx(*cpu_place);
  paddle::operators::CPUGather<int>(ctx, *src, *index, output);
-
+  delete cpu_place;
+  cpu_place = NULL;
  for (int i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], i + 4);
  for (int i = 4; i < 8; ++i) EXPECT_EQ(p_output[i], i - 4);


--- a/paddle/fluid/operators/linear_chain_crf_op.cc
+++ b/paddle/fluid/operators/linear_chain_crf_op.cc
@@ -67,8 +67,6 @@ class LinearChainCRFOpMaker : public framework::OpProtoAndCheckerMaker {
        "mini-batch. Note: S is equal to the sequence number in a mini-batch. "
        "The output is no longer a LoDTensor.");
    AddComment(R"DOC(
-LinearChainCRF Operator.
-
 Conditional Random Field defines an undirected probabilistic graph with nodes
 denoting random variables and edges denoting dependencies between these
 variables. CRF learns the conditional probability $P(Y|X)$, where

--- a/paddle/fluid/operators/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/listen_and_serv_op.cc
@@ -146,7 +146,9 @@ void ListenAndServOp::RunSyncLoop(framework::Executor *executor,
    rpc_service_->SetCond(detail::kRequestGet);
    rpc_service_->WaitBarrier(detail::kRequestGet);
    rpc_service_->ResetBarrierCounter();
-    rpc_service_->ResetSparseVarsRecorder();
+    // reset received sparse vars to avoid reuse it in the next mini-batch
+    dynamic_cast<detail::RequestSendHandler *>(request_send_handler_.get())
+        ->ResetSparseVarRecorder();
  }  // while(true)
 }


--- a/paddle/fluid/operators/load_op.cc
+++ b/paddle/fluid/operators/load_op.cc
@@ -74,25 +74,18 @@ class LoadOp : public framework::OperatorBase {
 class LoadOpProtoMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
-    AddOutput("Out", "(Tensor) The tensor need to be loaded");
+    AddOutput("Out", "The tensor need to be loaded");
    AddAttr<bool>(
        "load_as_fp16",
-        "(boolean, default false)"
        "If true, the tensor will be first loaded and then "
        "converted to float16 data type. Otherwise, the tensor will be "
-        "directly loaded without data type conversion.")
+        "directly loaded without data type conversion. Default is false.")
        .SetDefault(false);
    AddAttr<std::string>("file_path",
-                         "(string) "
-                         "Variable will be loaded from \"file_path\".")
+                         R"(Variable will be loaded from "file_path")")
        .AddCustomChecker(
            [](const std::string &path) { return !path.empty(); });
-    AddComment(R"DOC(
-Load Operator.
-
-Load operator will load a tensor variable from disk file.
-
-)DOC");
+    AddComment("Load operator will load a tensor variable from disk file.");
  }
 };
 }  // namespace operators

--- a/paddle/fluid/operators/math/math_function_test.cc
+++ b/paddle/fluid/operators/math/math_function_test.cc
@@ -77,6 +77,8 @@ TEST(math_function, gemm_trans_clbas) {
  paddle::platform::CPUDeviceContext context(*cpu_place);
  GetBlas<float>(context).GEMM(false, true, m, n, k, 1, input1_ptr, 3,
                               input2_ptr + 3, 3, 1, input3_ptr + 1, 4);
+  delete cpu_place;
+  cpu_place = NULL;

  EXPECT_EQ(input3_ptr[0], 0);
  EXPECT_EQ(input3_ptr[1], 24);

--- a/paddle/fluid/operators/max_sequence_len_op.cc
+++ b/paddle/fluid/operators/max_sequence_len_op.cc
@@ -42,10 +42,15 @@ class MaxSeqenceLenOp : public framework::OperatorBase {
 class MaxSeqenceLenOpProtoMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
-    AddInput("RankTable", "The lod_rank_table.");
-    AddOutput("Out", "The max sequence length.");
-    AddComment(
-        R"DOC(Calculate the max sequence length through lod_rank_table.)DOC");
+    AddInput("RankTable", "Input variable which is a LoDRankTable object");
+    AddOutput("Out", "The max sequence length");
+    AddComment(R"DOC(
+    Given a LoDRankTable object, this layer returns the max length of
+    a batch of sequences. In fact, a LoDRankTable object contains a list of
+    tuples(<sequence index, sequence length>) and the list is already sorted by
+    sequence length in descending order, so the operator just returns the
+    sequence length of the first tuple element
+)DOC");
  }
 };


--- a/paddle/testing/paddle_gtest_main.cc
+++ b/paddle/testing/paddle_gtest_main.cc
@@ -30,7 +30,7 @@ int main(int argc, char** argv) {
  new_argv.push_back(
      strdup("--tryfromenv=fraction_of_gpu_memory_to_use,use_pinned_memory"));
 #else
-  new_argv.push_back(strdup("--tryfromenv=use_pinned_memory"));
+  new_argv.push_back(strdup("--tryfromenv=use_pinned_memory,use_mkldnn"));
 #endif
  int new_argc = static_cast<int>(new_argv.size());
  char** new_argv_address = new_argv.data();

--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -26,6 +26,7 @@ from trainer import BeginEpochEvent
 from trainer import EndEpochEvent
 from trainer import BeginStepEvent
 from trainer import EndStepEvent
+from trainer import CheckpointConfig

 import inferencer
 from inferencer import Inferencer
@@ -116,7 +117,7 @@ def __bootstrap__():

    read_env_flags = [
        'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir',
-        'eager_delete_scope'
+        'eager_delete_scope', 'use_mkldnn'
    ]
    if core.is_compiled_with_cuda():
        read_env_flags += [

--- a/python/paddle/fluid/inferencer.py
+++ b/python/paddle/fluid/inferencer.py
@@ -56,6 +56,8 @@ class Inferencer(object):
        else:
            self.exe = executor.Executor(self.place)

+        self.inference_program = self.inference_program.clone(for_test=True)
+
    def infer(self, inputs, return_numpy=True):
        """
        :param inputs: a map of {"input_name": input_var} that will be feed into the inference program

--- a/python/paddle/fluid/io.py
+++ b/python/paddle/fluid/io.py
@@ -24,7 +24,8 @@ __all__ = [
    'save_vars', 'save_params', 'save_persistables', 'load_vars', 'load_params',
    'load_persistables', 'save_inference_model', 'load_inference_model',
    'get_inference_program', 'save_checkpoint', 'load_checkpoint',
-    'clean_checkpoint'
+    'clean_checkpoint', 'load_persist_vars_without_grad',
+    'save_persist_vars_without_grad', 'get_latest_checkpoint_serial'
 ]


@@ -457,95 +458,161 @@ def get_parameter_value_by_name(name, executor, program=None):

 SUCCESS_MARK_FILENAME = "_SUCCESS"
 CHECKPOINT_PREFIX = "checkpoint"
+MODEL_DIR = "__model__"
+TRAINER_PREFIX = "trainer"
 CHECKPOINT_SEPARATOR = "_"


 def save_checkpoint(executor,
-                    checkpoint_dir=None,
-                    max_num_checkpoints=3,
-                    save_interval_secs=600,
-                    main_program=None):
+                    checkpoint_dir,
+                    trainer_id,
+                    trainer_args=None,
+                    main_program=None,
+                    max_num_checkpoints=3):
    """
    Save Checkpoint will save persistable LodTensor variables from main_program in checkpoint directory,
    the directory named by serial number from 0 to (n -1), save_checkpoint use LRU strategy
    to keep numbers of checkpoint directory,  the numbers of checkpoint directory are max_num_checkpoints at most,
    The interval between two saved checkpoints must greater than save_interval_secs.

-    :param executor
-    :param checkpoint_dir
-    :param max_num_checkpoints
-    :param save_interval_secs
-    :param main_program
+    :param executor executor for save the value
+    :param checkpoint_dir the checkpoint directory 
+    :param trainer_id currect trainer id, if id is equal to 0, the trainer is chief
+    :param main_program   will save all variables in program 
+    :param max_num_checkpoints will keep numbers of checkpoint serials not bigger than max_num_checkpoints
    """
    if checkpoint_dir is None:
-        checkpoint_dir = os.getcwd()
+        raise ValueError("'checkpoint_dir' should not be None")
+
+    if trainer_args:
+        assert isinstance(trainer_args, dict)

    if not os.path.isdir(checkpoint_dir):
        os.makedirs(checkpoint_dir)

-    serial = _get_lastest_checkpoint_dir(checkpoint_dir)
-    if serial >= 0 and not _interval_secs_exceed(
-            _get_serial_dir(serial, checkpoint_dir), save_interval_secs):
-        return
+    serial = get_latest_checkpoint_serial(checkpoint_dir) + 1
+    cur_dir = _get_serial_dir(checkpoint_dir, serial)

-    serial += 1
-    cur_dir = _get_serial_dir(serial, checkpoint_dir)
+    save_trainer_args(cur_dir, trainer_id, trainer_args)

-    save_vars(
-        executor,
-        dirname=cur_dir,
-        main_program=main_program,
-        vars=None,
-        predicate=_is_checkpoint_var,
-        filename=None)
-    _write_success(cur_dir)
-    _lru_delete(checkpoint_dir, max_num_checkpoints)
+    if trainer_id == 0:
+        save_persist_vars_without_grad(executor, cur_dir, main_program)
+
+    _scroll_delete(checkpoint_dir, max_num_checkpoints)


-def load_checkpoint(executor, checkpoint_dir=None, main_program=None):
+def load_checkpoint(executor, checkpoint_dir, serial, main_program):
    """
    Load checkpoint from a directory by executor,
    it will find  the most recent saved checkpoint file and load it auto.

-    :param executor
-    :param checkpoint_dir
-    :param main_program
+    :param executor executor for load the value
+    :param checkpoint_dir  the checkpoint directory 
+    :param serial the serial folder in checkpoint directory will be load
+    :param main_program  will load all variables in program 
    """

    if checkpoint_dir is None:
-        checkpoint_dir = os.getcwd()
+        raise ValueError("'checkpoint_dir' should not be None")

-    serial = _get_lastest_checkpoint_dir(checkpoint_dir)
+    if serial is None or serial < 0:
+        raise ValueError("'serial' should not be None or <0 ")

-    if serial < 0:
-        return
+    if main_program is None:
+        raise ValueError('main_program should not be None.')

-    cur_dir = _get_serial_dir(serial, checkpoint_dir)
-
-    load_vars(
-        executor,
-        dirname=cur_dir,
-        main_program=main_program,
-        predicate=_is_checkpoint_var,
-        filename=None)
+    cur_dir = _get_serial_dir(checkpoint_dir, serial)
+    load_persist_vars_without_grad(executor, cur_dir, main_program, True)


 def clean_checkpoint(checkpoint_dir, delete_dir=False):
    """
    clean the checkpoint dir, when the train exits normally, the trainer will call clean_checkpoint to delete checkpoint directory saved before.
    delete_dir only works when the directory is empty, otherwise, OSError is raised.  
+
+    :param checkpoint_dir
+    :param delete_dir
    """
+
    if checkpoint_dir is None:
-        checkpoint_dir = os.getcwd()
-    _lru_delete(checkpoint_dir, max_num_checkpoints=0)
+        raise ValueError("'checkpoint_dir' should not be None")
+    _scroll_delete(checkpoint_dir, max_num_checkpoints=0)

    if delete_dir and not os.listdir(checkpoint_dir):
        os.rmdir(checkpoint_dir)


-def _get_serial_dir(serial, checkpoint_dir):
-    serial_folder = CHECKPOINT_PREFIX + CHECKPOINT_SEPARATOR + str(serial)
-    return os.path.join(checkpoint_dir, serial_folder)
+def load_persist_vars_without_grad(executor,
+                                   dirname,
+                                   program,
+                                   has_model_dir=False):
+    """
+    load_persist_vars_without_grad will load variables from a directory by an executor,
+    the variable named end with "@GRAD" will not be loaded.
+
+    :param executor  executor for load the value
+    :param dirname the checkpoint directory 
+    :param program   will load all variables in program 
+    :param has_model_dir if has_model_dir is True, will load variables from  sub directory named __model__
+    """
+
+    if has_model_dir:
+        dirname = _get_model_dir(dirname)
+
+    load_vars(
+        executor,
+        dirname=dirname,
+        main_program=program,
+        predicate=_is_checkpoint_var,
+        filename=None)
+
+
+def save_persist_vars_without_grad(executor, dirname, program):
+    """
+    save_persist_vars_without_grad  will save variables to a directory by an executor,
+    the variable named end with "@GRAD" will not be saved.
+
+    :param executor  executor for load the value
+    :param dirname the checkpoint directory 
+    :param program   will load all variables in program
+    """
+    cur_dir = _get_model_dir(dirname)
+    save_vars(
+        executor,
+        dirname=cur_dir,
+        main_program=program,
+        vars=None,
+        predicate=_is_checkpoint_var,
+        filename=None)
+    _write_success(cur_dir)
+
+
+def save_trainer_args(dirname, trainer_id, trainer_args):
+    assert isinstance(trainer_args, dict)
+
+    cur_dir = _get_trainer_dir(dirname, trainer_id)
+
+    for name, value in trainer_args.iteritems():
+        args_file = os.path.join(cur_dir, name)
+        with open(args_file, 'w') as f:
+            f.write(str(value))
+    _write_success(cur_dir)
+
+
+def load_trainer_args(checkpoint_dir, serial, trainer_id, trainer_args):
+    assert isinstance(trainer_args, list)
+
+    cur_dir = _get_serial_dir(checkpoint_dir, serial)
+    cur_dir = _get_trainer_dir(cur_dir, trainer_id)
+
+    ret_values = []
+
+    for arg in trainer_args:
+        cur_file = os.path.join(cur_dir, arg)
+        with open(cur_file, 'r') as f:
+            contents = f.read()
+            ret_values.append(contents.strip())
+    return ret_values


 def _is_checkpoint_var(var):
@@ -559,36 +626,74 @@ def _is_checkpoint_var(var):
            var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \
            var.desc.type() == core.VarDesc.VarType.RAW:
        return False
+    # @GRAD are named for gradient variables, checkpoint will not save it.
+    if "@GRAD" in var.name:
+        return False
+    # .trainer_ are named for distribute train variables, checkpoint will not save it.
+    if ".trainer_" in var.name:
+        return False

-    if var.name.endswith("@GRAD"):
+    # .block is named for distribute train variables, checkpoint will not save it.
+    if ".block" in var.name:
        return False

    return var.persistable


-def _interval_secs_exceed(dirname, save_interval_secs):
-    dir_time = os.path.getmtime(dirname)
-    if save_interval_secs > (time.time() - dir_time):
-        return False
-    return True
+def _get_dir_serial(dirname):
+    _, serial = dirname.split(CHECKPOINT_SEPARATOR)
+
+    try:
+        serial_num = int(serial)
+    except ValueError:
+        serial_num = -1
+    return serial_num
+
+
+def _get_serial_dir(dirname, serial):
+    serial_folder = CHECKPOINT_PREFIX + CHECKPOINT_SEPARATOR + str(serial)
+    serial_dir = os.path.join(dirname, serial_folder)
+
+    if not os.path.isdir(serial_dir):
+        os.makedirs(serial_dir)
+
+    return serial_dir
+

+def _get_model_dir(dirname):
+    model_dir = os.path.join(dirname, MODEL_DIR)

-def _lru_delete(dirname, max_num_checkpoints=3):
+    if not os.path.isdir(model_dir):
+        os.makedirs(model_dir)
+
+    return model_dir
+
+
+def _get_trainer_dir(dirname, trainer_id):
+    trainer_folder = TRAINER_PREFIX + CHECKPOINT_SEPARATOR + str(trainer_id)
+    trainer_dir = os.path.join(dirname, trainer_folder)
+
+    if not os.path.isdir(trainer_dir):
+        os.makedirs(trainer_dir)
+
+    return trainer_dir
+
+
+def _scroll_delete(dirname, max_num_checkpoints=3):
    dirs = os.listdir(dirname)
-    serials = []
+    serial_map = {}
    for serial in dirs:
-        try:
-            serials.append(int(serial))
-        except ValueError:
-            continue
+        serial_num = _get_dir_serial(serial)
+        serial_map[serial_num] = serial

-    if len(serials) <= max_num_checkpoints:
+    if len(serial_map.keys()) <= max_num_checkpoints:
        return

+    serials = serial_map.keys()
    serials.sort(reverse=True)
    serials = serials[max_num_checkpoints:]
    for serial in serials:
-        cur_dir = os.path.join(dirname, str(serial))
+        cur_dir = _get_serial_dir(dirname, serial)
        shutil.rmtree(cur_dir)


@@ -604,33 +709,30 @@ def _write_success(dirname):
        f.write(now)


-def _get_lastest_checkpoint_dir(checkpoint_dir):
+def get_latest_checkpoint_serial(checkpoint_dir):
    """
    get the latest file in checkpoint directory, the _SUCCESS file must exist in the directory

    :param checkpoint_dir
    """
-    if not checkpoint_dir.strip():
+    if not checkpoint_dir:
        return -1

    def has_success(checkpoint_dir, cur_dir):
        """
        is _SUCCESS in this dir
        """
-        _, serial = cur_dir.split(CHECKPOINT_SEPARATOR)
-
-        try:
-            int(serial)
-        except ValueError:
-            return -1

-        if not os.path.isdir(os.path.join(checkpoint_dir, cur_dir)):
+        serial = _get_dir_serial(cur_dir)
+        if serial == -1 or not os.path.isdir(
+                os.path.join(checkpoint_dir, cur_dir)):
            return -1

        success_path = os.path.join(
-            _get_serial_dir(serial, checkpoint_dir), SUCCESS_MARK_FILENAME)
+            _get_serial_dir(checkpoint_dir, serial), MODEL_DIR,
+            SUCCESS_MARK_FILENAME)
        if os.path.isfile(success_path):
-            return int(serial)
+            return serial

    if not os.path.isdir(checkpoint_dir):
        return -1

--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 import contextlib

-from layer_function_generator import autodoc
+from layer_function_generator import autodoc, templatedoc
 from tensor import assign, fill_constant
 from .. import core
 from ..framework import Program, Variable, Operator
@@ -721,26 +721,22 @@ def lod_rank_table(x, level=0):
    return table


+@templatedoc()
 def max_sequence_len(rank_table):
-    """Max Sequence Len Operator. Given a LoDRankTable object, this layer
-    returns the max length of a batch of sequences. In fact, a LoDRankTable
-    object contains a list of tuples(<sequence index, sequence length>) and
-    the list is already sorted by sequence length in descending order, so the
-    operator just returns the sequence length of the first tuple element.
+    """
+    ${comment}
+
+    >>> import paddle.fluid as fluid
+    >>> x = fluid.layers.data(name='x', shape=[10], dtype='float32',
+    >>>                       lod_level=1)
+    >>> rank_table = layers.lod_rank_table(x=x, level=0)
+    >>> max_seq_len = layers.max_sequence_len(rank_table)

    Args:
-        rank_table (Variable): Input variable which is a LoDRankTable object.
+        rank_table(${rank_table_type}): ${rank_table_comment}.

    Returns:
-        Variable: The max length of sequence.
-
-    Examples:
-        .. code-block:: python
-
-            x = fluid.layers.data(name='x', shape=[10],
-                            dtype='float32', lod_level=1)
-            rank_table = layers.lod_rank_table(x=x, level=0)
-            max_seq_len = layers.max_sequence_len(rank_table)
+        ${out_comment}.
    """
    helper = LayerHelper("max_seqence_len", **locals())
    res = helper.create_tmp_variable(dtype="int64")

--- a/python/paddle/fluid/layers/io.py
+++ b/python/paddle/fluid/layers/io.py
@@ -19,11 +19,12 @@ from ..unique_name import generate as unique_name
 from control_flow import BlockGuard
 from ..layer_helper import LayerHelper
 from ..executor import global_scope
+from layer_function_generator import generate_layer_fn, templatedoc

 __all__ = [
    'data', 'BlockGuardServ', 'ListenAndServ', 'Send', 'open_recordio_file',
    'open_files', 'read_file', 'shuffle', 'batch', 'double_buffer',
-    'random_data_generator', 'Preprocessor'
+    'random_data_generator', 'Preprocessor', 'load'
 ]


@@ -662,3 +663,29 @@ class Preprocessor(object):
                "sink_var_names": self.sink_var_names
            })
        return monkey_patch_reader_methods(self.reader)
+
+
+@templatedoc()
+def load(out, file_path, load_as_fp16=None):
+    """
+    ${comment}
+
+    >>> import paddle.fluid as fluid
+    >>> tmp_tensor = fluid.layers.create_tensor(dtype='float32')
+    >>> fluid.layers.load(tmp_tensor, "./tmp_tensor.bin")
+
+    Args:
+        out(${out_type}): ${out_comment}.
+
+        file_path(${file_path_type}): ${file_path_comment}.
+
+        load_as_fp16(${load_as_fp16_type}): ${load_as_fp16_comment}.
+
+    Returns:
+        None
+    """
+    helper = LayerHelper("load", **locals())
+    attrs = {"file_path": file_path}
+    if load_as_fp16 is not None:
+        attrs['load_as_fp16'] = load_as_fp16
+    helper.append_op(type="load", inputs={}, output={"Out": out}, args=attrs)
--- a/python/paddle/fluid/layers/layer_function_generator.py
+++ b/python/paddle/fluid/layers/layer_function_generator.py
@@ -224,7 +224,10 @@ def autodoc(comment=""):
    return __impl__


-def templatedoc():
+_inline_math_single_dollar = re.compile(r"\$([^\$]+)\$")
+
+
+def templatedoc(op_type=None):
    """
    Decorator of layer function. It will use the docstring from the layer
    function as the template. The template arguments are:
@@ -238,32 +241,47 @@ def templatedoc():
        Decorated function.
    """

+    def trim_ending_dot(msg):
+        return msg.rstrip('.')
+
+    def escape_inline_math(msg):
+        return _inline_math_single_dollar.sub(repl=r':math:`\1`', string=msg)
+
    def __impl__(func):
-        op_proto = OpProtoHolder.instance().get_op_proto(func.__name__)
+        if op_type is None:
+            op_type_name = func.__name__
+        else:
+            op_type_name = op_type
+        op_proto = OpProtoHolder.instance().get_op_proto(op_type_name)
        tmpl = string.Template(func.__doc__)

        comment_lines = op_proto.comment.split("\n")
        comment = ""
        for line in comment_lines:
-            line = line.lstrip()
-            comment += line
-            comment += "\n"
-
-        args = {"comment": comment}
+            line = line.strip()
+            if len(line) != 0:
+                comment += escape_inline_math(line)
+                comment += " "
+            elif len(comment) != 0:
+                comment += "\n    \n    "
+
+        args = {"comment": trim_ending_dot(comment)}
        for each_input in op_proto.inputs:
            input_name = _convert_(each_input.name)
-            args["{0}_comment".format(input_name)] = each_input.comment
+            args["{0}_comment".format(input_name)] = trim_ending_dot(
+                each_input.comment)
            args["{0}_type".format(input_name)] = "Variable"
        for each_attr in op_proto.attrs:
            input_name = _convert_(each_attr.name)
-            args["{0}_comment".format(input_name)] = each_attr.comment
+            args["{0}_comment".format(input_name)] = trim_ending_dot(
+                each_attr.comment)
            args["{0}_type".format(input_name)] = _type_to_str_(each_attr.type)

        for each_opt in op_proto.outputs:
            output_name = _convert_(each_opt.name)
-            args["{0}_comment".format(output_name)] = each_opt.comment
+            args["{0}_comment".format(output_name)] = trim_ending_dot(
+                each_opt.comment)
            args["{0}_type".format(output_name)] = "Variable"
-
        func.__doc__ = tmpl.substitute(args)
        return func


--- a/python/paddle/fluid/layers/learning_rate_scheduler.py
+++ b/python/paddle/fluid/layers/learning_rate_scheduler.py
@@ -11,6 +11,14 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+"""
+When training a model, it's often useful to decay the
+learning rate during training process, this is called
+learning_rate_decay. There are many strategies to do
+this, this module will provide some classical method.
+User can also implement their own learning_rate_decay
+strategy according to this module.
+"""

 import control_flow
 import nn
@@ -22,14 +30,6 @@ __all__ = [
    'exponential_decay', 'natural_exp_decay', 'inverse_time_decay',
    'polynomial_decay', 'piecewise_decay', 'noam_decay'
 ]
-"""
-When training a model, it's often useful to decay the
-learning rate during training process, this is called
-learning_rate_decay. There are many strategies to do
-this, this module will provide some classical method.
-User can also implement their own learning_rate_decay
-strategy according to this module.
-"""


 def _decay_step_counter(begin=0):
@@ -41,18 +41,20 @@ def _decay_step_counter(begin=0):


 def noam_decay(d_model, warmup_steps):
-    """Apply decay to learning rate.
-    ```python
-    lr_value = np.power(d_model, -0.5) * np.min([
-            np.power(current_steps, -0.5),
-            np.power(warmup_steps, -1.5) * current_steps
-        ])
-    ```
+    """
+    Noam decay method. The numpy implementation of noam decay as follows.
+
+    >>> import numpy as np
+    >>> lr_value = np.power(d_model, -0.5) * np.min([
+    >>>                         np.power(current_steps, -0.5),
+    >>>                         np.power(warmup_steps, -1.5) * current_steps])
+
+    Please reference `attention is all you need
+    <https://arxiv.org/pdf/1706.03762.pdf>`_.

    Args:
        d_model(Variable): The dimensionality of input and output of model.
-            Reference: attention is all you need
-                https://arxiv.org/pdf/1706.03762.pdf
+
        warmup_steps(Variable): A super parameter.

    Returns:

--- a/python/paddle/fluid/layers/metric.py
+++ b/python/paddle/fluid/layers/metric.py
@@ -64,10 +64,6 @@ def auc(input, label, curve='ROC', num_thresholds=200):
    topk_indices = helper.create_tmp_variable(dtype="int64")
    topk_out, topk_indices = nn.topk(input, k=k)
    auc_out = helper.create_tmp_variable(dtype="float32")
-    if correct is None:
-        correct = helper.create_tmp_variable(dtype="int64")
-    if total is None:
-        total = helper.create_tmp_variable(dtype="int64")
    helper.append_op(
        type="accuracy",
        inputs={

--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -4037,18 +4037,25 @@ def image_resize(input,
    return out


+@templatedoc(op_type="bilinear_interp")
 def resize_bilinear(input, out_shape=None, scale=None, name=None):
    """
-    This is an alias of layer 'image_resize' with bilinear interpolation.
+    ${comment}
+
+    Args:
+        input(${x_type}): ${x_comment}.
+
+        out_shape(${out_size_type}): ${out_size_comment}.

-    The mathematical meaning of resize bilinear layer is
-    Bilinear interpolation.
-    Bilinear interpolation is an extension of linear interpolation for
-    interpolating functions of two variables (e.g. H-direction and
-    W-direction in this layer) on a rectilinear 2D grid.
+        scale(float|None): The multiplier for the input height or width. At
+             least one of out_shape or scale must be set. And out_shape has
+             a higher priority than scale. Default: None.
+
+        name(str|None): The output variable name.
+
+    Returns:

-    For details, please refer to Wikipedia:
-    https://en.wikipedia.org/wiki/Bilinear_interpolation
+        ${out_comment}.
    """

    return image_resize(input, out_shape, scale, name, 'BILINEAR')

--- a/python/paddle/fluid/layers/ops.py
+++ b/python/paddle/fluid/layers/ops.py
@@ -73,6 +73,7 @@ __all__ = [
    'sum',
    'polygon_box_transform',
    'shape',
+    'maxout',
 ] + __activations__

 for _OP in set(__all__):

--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
@@ -18,6 +18,7 @@ from ..framework import convert_np_dtype_to_dtype_
 from ..framework import Variable
 from ..initializer import Constant, force_init_on_cpu
 from ..core import VarDesc
+from layer_function_generator import templatedoc
 import numpy

 __all__ = [
@@ -268,6 +269,7 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None):
    return out


+@templatedoc()
 def fill_constant_batch_size_like(input,
                                  shape,
                                  dtype,
@@ -275,30 +277,28 @@ def fill_constant_batch_size_like(input,
                                  input_dim_idx=0,
                                  output_dim_idx=0):
    """
-    **fill_constant_batch_size_like**
-
-    This function creates a tensor of specified *shape*, *dtype* and batch size,
-    and initializes this with a constant supplied in *value*. The batch size is
-    obtained from the `input` tensor.
+    ${comment}

    It also sets *stop_gradient* to True.

+    >>> data = fluid.layers.fill_constant_batch_size_like(
+    >>>             input=like, shape=[1], value=0, dtype='int64')
+
    Args:
-        input(Variable): Tensor whose dimensions will be used to get batch size
-        shape(tuple|list|None): Shape of output tensor
-        dtype(np.dtype|core.VarDesc.VarType|str): Data type of output tensor
-        value(float): Constant value to initialize the output tensor
-        input_dim_idx(int): Index of input's batch size dimension
-        output_dim_idx(int): Index of output's batch size dimension
+        input(${input_type}): ${input_comment}.

-    Returns:
-        Variable: The tensor variable storing the output
+        shape(${shape_type}): ${shape_comment}.

-    Examples:
-        .. code-block:: python
+        dtype(${dtype_type}): ${dtype_comment}.
+
+        value(${value_type}): ${value_comment}.

-          data = fluid.layers.fill_constant_batch_size_like(
-              input=like, shape=[1], value=0, dtype='int64')
+        input_dim_idx(${input_dim_idx_type}): ${input_dim_idx_comment}.
+
+        output_dim_idx(${output_dim_idx_type}): ${output_dim_idx_comment}.
+
+    Returns:
+        ${out_comment}.
    """
    helper = LayerHelper("fill_constant_batch_size_like", **locals())
    out = helper.create_tmp_variable(dtype=dtype)
@@ -501,22 +501,6 @@ def save_combine(x, file_path, overwrite=True):
              "overwrite": overwrite})


-def load(out, file_path):
-    """
-    Loads a variable from a given file.
-
-    Args:
-        out(variable): The variable to be read from the disk file.
-        file_path(str): The path of the disk file.
-    """
-    helper = LayerHelper("load", **locals())
-    helper.append_op(
-        type="load",
-        inputs={},
-        output={"Out": out},
-        args={"file_path": file_path})
-
-
 def load_combine(out, file_path):
    """
    Loads a list of vairables from a single file.

--- a/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py
+++ b/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py
@@ -38,7 +38,7 @@ def inference_program():
    return y_predict


-def linear():
+def train_program():
    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
    y_predict = inference_program()

@@ -104,7 +104,7 @@ def main(use_cuda):
    # Directory for saving the trained model
    params_dirname = "fit_a_line.inference.model"

-    train(use_cuda, linear, params_dirname)
+    train(use_cuda, train_program, params_dirname)
    infer(use_cuda, inference_program, params_dirname)



--- a/python/paddle/fluid/tests/unittests/test_checkpoint.py
+++ b/python/paddle/fluid/tests/unittests/test_checkpoint.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.fluid as fluid
+import unittest
+import os
+import tempfile
+
+
+class TestCheckpoint(unittest.TestCase):
+    def setUp(self):
+        self.dirname = tempfile.mktemp()
+        self.max_num_checkpoints = 3
+        self.epoch_interval = 1
+        self.step_interval = 1
+        self.trainer_id = 0
+        self.chief = self.trainer_id == 0
+        self.place = fluid.CPUPlace()
+        self.epoch_id = 100
+        self.step_id = 20
+
+    def test_checkpoint(self):
+        self.save_checkpoint()
+        serial = fluid.io.get_latest_checkpoint_serial(self.dirname)
+        self.assertTrue(serial >= 0)
+        trainer_args = ["epoch_id", "step_id"]
+        epoch_id, step_id = fluid.io.load_trainer_args(
+            self.dirname, serial, self.trainer_id, trainer_args)
+        self.assertEqual(self.step_id, int(step_id))
+        self.assertEqual(self.epoch_id, int(epoch_id))
+
+        program = fluid.Program()
+        with fluid.program_guard(program):
+            exe = fluid.Executor(self.place)
+            fluid.io.load_checkpoint(exe, self.dirname, serial, program)
+
+        fluid.io.clean_checkpoint(self.dirname, delete_dir=True)
+        self.assertFalse(os.path.isdir(self.dirname))
+
+    def save_checkpoint(self):
+        config = fluid.CheckpointConfig(self.dirname, self.max_num_checkpoints,
+                                        self.epoch_interval, self.step_interval)
+
+        trainer_args = {}
+        trainer_args["epoch_id"] = self.epoch_id
+        trainer_args["step_id"] = self.step_id
+
+        program = fluid.Program()
+        with fluid.program_guard(program):
+            program.global_block().create_var(
+                name="scale_0",
+                psersistable=True,
+                dtype="float32",
+                shape=[32, 32])
+
+            exe = fluid.Executor(self.place)
+            for i in xrange(10):
+                fluid.io.save_checkpoint(exe, config.checkpoint_dir,
+                                         self.trainer_id, trainer_args, program,
+                                         config.max_num_checkpoints)
+
+
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py
+++ b/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py
@@ -30,9 +30,6 @@ class Memory(object):
        assert val.dtype == self.ex.dtype
        self.cur = val

-    def ex(self):
-        return self.ex
-
    def next(self):
        self.ex = self.cur
        self.cur = None

--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -387,6 +387,14 @@ class TestBook(unittest.TestCase):
            self.assertIsNotNone(output)
        print(str(program))

+    def test_maxout(self):
+        program = Program()
+        with program_guard(program):
+            data = layers.data(name='x', shape=[8, 6, 6], dtype="float32")
+            output = layers.maxout(x=data, groups=2)
+            self.assertIsNotNone(output)
+        print(str(program))
+

 if __name__ == '__main__':
    unittest.main()
--- a/python/paddle/fluid/trainer.py
+++ b/python/paddle/fluid/trainer.py
@@ -27,11 +27,8 @@ import parallel_executor
 from transpiler import distribute_transpiler

 __all__ = [
-    'Trainer',
-    'BeginEpochEvent',
-    'EndEpochEvent',
-    'BeginStepEvent',
-    'EndStepEvent',
+    'Trainer', 'BeginEpochEvent', 'EndEpochEvent', 'BeginStepEvent',
+    'EndStepEvent', 'CheckpointConfig'
 ]


@@ -59,6 +56,35 @@ class EndStepEvent(object):
        self.metrics = metrics


+class CheckpointConfig(object):
+    def __init__(self,
+                 checkpoint_dir=None,
+                 max_num_checkpoints=3,
+                 epoch_interval=1,
+                 step_interval=10):
+        if checkpoint_dir is None:
+            self.checkpoint_dir = os.getcwd()
+        else:
+            self.checkpoint_dir = checkpoint_dir
+
+        self.max_num_checkpoints = max_num_checkpoints
+
+        if epoch_interval < 1:
+            self.epoch_interval = 1
+        else:
+            self.epoch_interval = epoch_interval
+
+        if step_interval < 1:
+            self.step_interval = 10
+        else:
+            self.step_interval = step_interval
+
+        self.epoch_id = 0
+        self.step_id = 0
+        self.load_serial = None
+        self.is_pserver = False
+
+
 def check_and_get_place(place):
    """
    Check the type of place or get the default place
@@ -99,13 +125,24 @@ class Trainer(object):
                 optimizer_func,
                 param_path=None,
                 place=None,
-                 parallel=False):
+                 parallel=False,
+                 checkpoint_config=None):
        self.__stop = False
        self.parallel = parallel
        # 1. we need to generate a framework.Program by calling
        # program_func. Reference: fluid.program_guard in
        # test_word2vec.py

+        # config for checkpoint
+        # only chief worker will save variables
+        self.trainer_id = 0
+        self.checkpoint_cfg = checkpoint_config
+        if self.checkpoint_cfg:
+            assert isinstance(self.checkpoint_cfg, CheckpointConfig)
+            serial = io.get_latest_checkpoint_serial(
+                self.checkpoint_cfg.checkpoint_dir)
+            self.checkpoint_cfg.load_serial = serial if serial >= 0 else None
+
        self.scope = core.Scope()

        self.startup_program = framework.Program()
@@ -115,9 +152,9 @@ class Trainer(object):
            program_func_outs = train_func()
            self.train_func_outputs = program_func_outs if isinstance(
                program_func_outs, list) else [program_func_outs]
-            self.test_program = self.train_program.clone()
+            self.test_program = self.train_program.clone(for_test=True)

-            # The fisrt element of program_func_outs is loss.
+            # The first element of program_func_outs is loss.
            loss = self.train_func_outputs[0]

            optimizer = optimizer_func()
@@ -137,9 +174,25 @@ class Trainer(object):
            exe = executor.Executor(place)
            exe.run(self.startup_program)

-        if param_path:
+        if self.checkpoint_cfg and self.checkpoint_cfg.load_serial:
+            with self._prog_and_scope_guard():
+                exe = executor.Executor(place)
+                io.load_checkpoint(exe, self.checkpoint_cfg.checkpoint_dir,
+                                   self.checkpoint_cfg.load_serial,
+                                   self.startup_program)
+
+            if not self.checkpoint_cfg.is_pserver:
+                epoch_id, step_id = io.load_trainer_args(
+                    self.checkpoint_cfg.checkpoint_dir,
+                    self.checkpoint_cfg.load_serial, self.trainer_id,
+                    self._get_checkpoint_load_args())
+                self.checkpoint_cfg.epoch_id = int(epoch_id)
+                self.checkpoint_cfg.step_id = int(step_id)
+
+        if param_path and os.path.isdir(param_path):
            # load params from param_path into scope
-            io.load_persistables(exe, dirname=param_path)
+            io.load_persist_vars_without_grad(
+                exe, dirname=param_path, program=self.startup_program)

    def _transpile_nccl2_dist(self):
        # PADDLE_TRAINER_IPS
@@ -194,14 +247,18 @@ class Trainer(object):
        current_endpoint = os.getenv("PADDLE_CURRENT_IP", "") + ":" + port
        # the unique trainer id, starting from 0, needed by trainer
        # only
-        trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
+        self.trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
+
        # the role, should be either PSERVER or TRAINER
        training_role = os.getenv("PADDLE_TRAINING_ROLE")
        with self._prog_and_scope_guard():
            t = distribute_transpiler.DistributeTranspiler()
            t.transpile(
-                trainer_id, pservers=pserver_endpoints, trainers=trainers)
+                self.trainer_id, pservers=pserver_endpoints, trainers=trainers)
            if training_role == "PSERVER":
+                if self.checkpoint_cfg:
+                    self.is_pserver = True
+
                self.train_program = t.get_pserver_program(current_endpoint)
                self.startup_program = t.get_startup_program(current_endpoint,
                                                             self.train_program)
@@ -294,11 +351,26 @@ class Trainer(object):
            self._train_by_any_executor(event_handler, exe, num_epochs, reader)

    def _train_by_any_executor(self, event_handler, exe, num_epochs, reader):
-        for epoch_id in range(num_epochs):
+        if self.checkpoint_cfg:
+            epochs = [
+                epoch_id for epoch_id in range(num_epochs)
+                if epoch_id >= self.checkpoint_cfg.epoch_id
+            ]
+        else:
+            epochs = [epoch_id for epoch_id in range(num_epochs)]
+
+        for epoch_id in epochs:
            event_handler(BeginEpochEvent(epoch_id))
            for step_id, data in enumerate(reader()):
                if self.__stop:
+                    if self.checkpoint_cfg:
+                        self._clean_checkpoint()
                    return
+
+                if self.checkpoint_cfg and self.checkpoint_cfg.load_serial \
+                    and self.checkpoint_cfg.step_id >= step_id and self.checkpoint_cfg.epoch_id == epoch_id:
+                    continue
+
                begin_event = BeginStepEvent(epoch_id, step_id)
                event_handler(begin_event)
                if begin_event.fetch_metrics:
@@ -309,8 +381,13 @@ class Trainer(object):
                                      ])
                else:
                    metrics = exe.run(feed=data, fetch_list=[])
+
+                if self.checkpoint_cfg:
+                    self._save_checkpoint(epoch_id, step_id)
                event_handler(EndStepEvent(epoch_id, step_id, metrics))
            event_handler(EndEpochEvent(epoch_id))
+        if self.checkpoint_cfg:
+            self._clean_checkpoint()

    def _test_by_executor(self, reader, feed_order, fetch_list):
        with executor.scope_guard(self.scope):
@@ -349,6 +426,38 @@ class Trainer(object):
                loss_name=self.train_func_outputs[0].name)
        return self._get_parallel_executor()

+    def _clean_checkpoint(self):
+        assert self.checkpoint_cfg
+        io.clean_checkpoint(checkpoint_dir=self.checkpoint_cfg.checkpoint_dir)
+
+    def _get_checkpoint_load_args(self):
+        """
+        epoch_id and step_id are runtime arguments, they are not variables, will load them independently.
+        """
+        return ["epoch_id", "step_id"]
+
+    def _get_checkpoint_save_args(self, epoch_id, step_id):
+        """
+        epoch_id and step_id are runtime arguments, they are not variables, will save them independently.
+        """
+        trainer_args = {}
+        trainer_args["epoch_id"] = epoch_id
+        trainer_args["step_id"] = step_id
+        return trainer_args
+
+    def _save_checkpoint(self, epoch_id, step_id):
+        assert self.checkpoint_cfg
+
+        if epoch_id % self.checkpoint_cfg.epoch_interval == 0 and step_id % self.checkpoint_cfg.step_interval == 0:
+            exe = executor.Executor(self.place)
+            io.save_checkpoint(
+                executor=exe,
+                checkpoint_dir=self.checkpoint_cfg.checkpoint_dir,
+                trainer_id=self.trainer_id,
+                trainer_args=self._get_checkpoint_save_args(epoch_id, step_id),
+                main_program=self.train_program,
+                max_num_checkpoints=self.checkpoint_cfg.max_num_checkpoints)
+

 def build_feed_var_list(program, feed_order):
    if not isinstance(program, framework.Program):

--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -177,6 +177,7 @@ class DistributeTranspiler:
                        dtype=table_grad_var.dtype)
                    for index in range(len(self.pserver_endpoints))
                ]
+        return param_list, grad_list

    def _init_splited_vars(self, slice_var_up):
        # update these mappings for further transpile:
@@ -199,8 +200,8 @@ class DistributeTranspiler:
                grad_list.append(g)
                param_grad_set.add(g.name)

-        self._update_dist_lookup_table_vars(param_list, grad_list,
-                                            self.params_grads)
+        param_list, grad_list = self._update_dist_lookup_table_vars(
+            param_list, grad_list, self.params_grads)

        if slice_var_up:
            # when we slice var up into blocks, we will slice the var according to